stop leaking FDs when guessing doctypes

This commit is contained in:
Martin A. Brown 2016-03-14 20:32:42 -07:00
parent a2daee9425
commit 26de64a2bb
2 changed files with 16 additions and 17 deletions

View File

@ -42,18 +42,16 @@ def depends(*predecessors):
class SignatureChecker(object):
@classmethod
def signatureLocation(cls, f):
f.seek(0)
buf = f.read(1024)
def signatureLocation(cls, buf, fname):
for sig in cls.signatures:
try:
sindex = buf.index(sig)
logger.debug("YES FOUND signature %r in %s at %s; doctype %s.",
sig, f.name, sindex, cls)
sig, fname, sindex, cls)
return sindex
except ValueError:
logger.debug("not found signature %r in %s for type %s",
sig, f.name, cls.__name__)
sig, fname, cls.__name__)
return None

View File

@ -4,11 +4,10 @@
from __future__ import absolute_import, division, print_function
import os
import errno
import inspect
import logging
from tldp.utils import makefh
import tldp.doctypes
logger = logging.getLogger(__name__)
@ -32,10 +31,10 @@ def getDoctypeClasses():
return getDoctypeMembers(inspect.isclass)
def guess(thing):
def guess(fname):
'''return a tldp.doctype class which is a best guess for document type
thing: Could be a filename or an open file.
:param fname: A filename.
The guess function will try to guess the document type (doctype) from the
file extension. If extension matching produces multiple possible doctype
@ -55,11 +54,10 @@ def guess(thing):
* It could/should use heuristics or something richer than signatures.
'''
try:
f = makefh(thing)
except TypeError:
stem, ext = os.path.splitext(fname)
except AttributeError:
return None
stem, ext = os.path.splitext(f.name)
if not ext:
logger.debug("%s no file extension, skipping %s.", stem, ext)
return None
@ -77,19 +75,22 @@ def guess(thing):
# -- for this extension, multiple document types, probably SGML, XML
#
logger.debug("%s multiple possible doctypes for extension %s on file %s.",
stem, ext, f.name)
stem, ext, fname)
for doctype in possible:
logger.debug("%s extension %s could be %s.", stem, ext, doctype)
with open(fname) as f:
buf = f.read(1024)
guesses = list()
for doctype in possible:
sindex = doctype.signatureLocation(f)
sindex = doctype.signatureLocation(buf, fname)
if sindex is not None:
guesses.append((sindex, doctype))
if not guesses:
logger.warning("%s no matching signature found for %s.",
stem, f.name)
stem, fname)
return None
if len(guesses) == 1:
_, doctype = guesses.pop()
@ -100,10 +101,10 @@ def guess(thing):
# first signature in the file as the more likely document type.
#
guesses.sort()
logger.info("%s multiple doctype guesses for file %s", stem, f.name)
logger.info("%s multiple doctype guesses for file %s", stem, fname)
for sindex, doctype in guesses:
logger.info("%s could be %s (sig at pos %s)", stem, doctype, sindex)
logger.info("%s going to guess %s for %s", stem, doctype, f.name)
logger.info("%s going to guess %s for %s", stem, doctype, fname)
_, doctype = guesses.pop(0)
return doctype