stop leaking FDs when guessing doctypes

This commit is contained in:
Martin A. Brown 2016-03-14 20:32:42 -07:00
parent a2daee9425
commit 26de64a2bb
2 changed files with 16 additions and 17 deletions

View File

@ -42,18 +42,16 @@ def depends(*predecessors):
class SignatureChecker(object): class SignatureChecker(object):
@classmethod @classmethod
def signatureLocation(cls, f): def signatureLocation(cls, buf, fname):
f.seek(0)
buf = f.read(1024)
for sig in cls.signatures: for sig in cls.signatures:
try: try:
sindex = buf.index(sig) sindex = buf.index(sig)
logger.debug("YES FOUND signature %r in %s at %s; doctype %s.", logger.debug("YES FOUND signature %r in %s at %s; doctype %s.",
sig, f.name, sindex, cls) sig, fname, sindex, cls)
return sindex return sindex
except ValueError: except ValueError:
logger.debug("not found signature %r in %s for type %s", logger.debug("not found signature %r in %s for type %s",
sig, f.name, cls.__name__) sig, fname, cls.__name__)
return None return None

View File

@ -4,11 +4,10 @@
from __future__ import absolute_import, division, print_function from __future__ import absolute_import, division, print_function
import os import os
import errno
import inspect import inspect
import logging import logging
from tldp.utils import makefh
import tldp.doctypes import tldp.doctypes
logger = logging.getLogger(__name__) logger = logging.getLogger(__name__)
@ -32,10 +31,10 @@ def getDoctypeClasses():
return getDoctypeMembers(inspect.isclass) return getDoctypeMembers(inspect.isclass)
def guess(thing): def guess(fname):
'''return a tldp.doctype class which is a best guess for document type '''return a tldp.doctype class which is a best guess for document type
thing: Could be a filename or an open file. :param fname: A filename.
The guess function will try to guess the document type (doctype) from the The guess function will try to guess the document type (doctype) from the
file extension. If extension matching produces multiple possible doctype file extension. If extension matching produces multiple possible doctype
@ -55,11 +54,10 @@ def guess(thing):
* It could/should use heuristics or something richer than signatures. * It could/should use heuristics or something richer than signatures.
''' '''
try: try:
f = makefh(thing) stem, ext = os.path.splitext(fname)
except TypeError: except AttributeError:
return None return None
stem, ext = os.path.splitext(f.name)
if not ext: if not ext:
logger.debug("%s no file extension, skipping %s.", stem, ext) logger.debug("%s no file extension, skipping %s.", stem, ext)
return None return None
@ -77,19 +75,22 @@ def guess(thing):
# -- for this extension, multiple document types, probably SGML, XML # -- for this extension, multiple document types, probably SGML, XML
# #
logger.debug("%s multiple possible doctypes for extension %s on file %s.", logger.debug("%s multiple possible doctypes for extension %s on file %s.",
stem, ext, f.name) stem, ext, fname)
for doctype in possible: for doctype in possible:
logger.debug("%s extension %s could be %s.", stem, ext, doctype) logger.debug("%s extension %s could be %s.", stem, ext, doctype)
with open(fname) as f:
buf = f.read(1024)
guesses = list() guesses = list()
for doctype in possible: for doctype in possible:
sindex = doctype.signatureLocation(f) sindex = doctype.signatureLocation(buf, fname)
if sindex is not None: if sindex is not None:
guesses.append((sindex, doctype)) guesses.append((sindex, doctype))
if not guesses: if not guesses:
logger.warning("%s no matching signature found for %s.", logger.warning("%s no matching signature found for %s.",
stem, f.name) stem, fname)
return None return None
if len(guesses) == 1: if len(guesses) == 1:
_, doctype = guesses.pop() _, doctype = guesses.pop()
@ -100,10 +101,10 @@ def guess(thing):
# first signature in the file as the more likely document type. # first signature in the file as the more likely document type.
# #
guesses.sort() guesses.sort()
logger.info("%s multiple doctype guesses for file %s", stem, f.name) logger.info("%s multiple doctype guesses for file %s", stem, fname)
for sindex, doctype in guesses: for sindex, doctype in guesses:
logger.info("%s could be %s (sig at pos %s)", stem, doctype, sindex) logger.info("%s could be %s (sig at pos %s)", stem, doctype, sindex)
logger.info("%s going to guess %s for %s", stem, doctype, f.name) logger.info("%s going to guess %s for %s", stem, doctype, fname)
_, doctype = guesses.pop(0) _, doctype = guesses.pop(0)
return doctype return doctype