From 26de64a2bb3046c4add6e7d8fefdb3c85f0c9237 Mon Sep 17 00:00:00 2001 From: "Martin A. Brown" Date: Mon, 14 Mar 2016 20:32:42 -0700 Subject: [PATCH] stop leaking FDs when guessing doctypes --- tldp/doctypes/common.py | 8 +++----- tldp/typeguesser.py | 25 +++++++++++++------------ 2 files changed, 16 insertions(+), 17 deletions(-) diff --git a/tldp/doctypes/common.py b/tldp/doctypes/common.py index c2d926d..e90024b 100644 --- a/tldp/doctypes/common.py +++ b/tldp/doctypes/common.py @@ -42,18 +42,16 @@ def depends(*predecessors): class SignatureChecker(object): @classmethod - def signatureLocation(cls, f): - f.seek(0) - buf = f.read(1024) + def signatureLocation(cls, buf, fname): for sig in cls.signatures: try: sindex = buf.index(sig) logger.debug("YES FOUND signature %r in %s at %s; doctype %s.", - sig, f.name, sindex, cls) + sig, fname, sindex, cls) return sindex except ValueError: logger.debug("not found signature %r in %s for type %s", - sig, f.name, cls.__name__) + sig, fname, cls.__name__) return None diff --git a/tldp/typeguesser.py b/tldp/typeguesser.py index fdb7836..7ddff09 100644 --- a/tldp/typeguesser.py +++ b/tldp/typeguesser.py @@ -4,11 +4,10 @@ from __future__ import absolute_import, division, print_function import os +import errno import inspect import logging -from tldp.utils import makefh - import tldp.doctypes logger = logging.getLogger(__name__) @@ -32,10 +31,10 @@ def getDoctypeClasses(): return getDoctypeMembers(inspect.isclass) -def guess(thing): +def guess(fname): '''return a tldp.doctype class which is a best guess for document type - thing: Could be a filename or an open file. + :param fname: A filename. The guess function will try to guess the document type (doctype) from the file extension. If extension matching produces multiple possible doctype @@ -55,11 +54,10 @@ def guess(thing): * It could/should use heuristics or something richer than signatures. 
''' try: - f = makefh(thing) - except TypeError: + stem, ext = os.path.splitext(fname) + except AttributeError: return None - stem, ext = os.path.splitext(f.name) if not ext: logger.debug("%s no file extension, skipping %s.", stem, ext) return None @@ -77,19 +75,22 @@ def guess(thing): # -- for this extension, multiple document types, probably SGML, XML # logger.debug("%s multiple possible doctypes for extension %s on file %s.", - stem, ext, f.name) + stem, ext, fname) for doctype in possible: logger.debug("%s extension %s could be %s.", stem, ext, doctype) + with open(fname) as f: + buf = f.read(1024) + guesses = list() for doctype in possible: - sindex = doctype.signatureLocation(f) + sindex = doctype.signatureLocation(buf, fname) if sindex is not None: guesses.append((sindex, doctype)) if not guesses: logger.warning("%s no matching signature found for %s.", - stem, f.name) + stem, fname) return None if len(guesses) == 1: _, doctype = guesses.pop() @@ -100,10 +101,10 @@ def guess(thing): # first signature in the file as the more likely document type. # guesses.sort() - logger.info("%s multiple doctype guesses for file %s", stem, f.name) + logger.info("%s multiple doctype guesses for file %s", stem, fname) for sindex, doctype in guesses: logger.info("%s could be %s (sig at pos %s)", stem, doctype, sindex) - logger.info("%s going to guess %s for %s", stem, doctype, f.name) + logger.info("%s going to guess %s for %s", stem, doctype, fname) _, doctype = guesses.pop(0) return doctype