python-tldp/tldp/typeguesser.py

125 lines
3.9 KiB
Python
Raw Normal View History

2016-02-11 03:22:23 +00:00
#! /usr/bin/python
2016-02-18 21:25:02 +00:00
# -*- coding: utf8 -*-
2016-02-11 03:22:23 +00:00
from __future__ import absolute_import, division, print_function
2016-02-11 03:22:23 +00:00
import os
import inspect
2016-02-25 17:51:17 +00:00
from tldp.utils import logger, makefh
2016-02-11 03:22:23 +00:00
2016-02-25 17:51:17 +00:00
import tldp.doctypes
2016-02-11 03:22:23 +00:00
2016-02-25 17:51:17 +00:00
def getDoctypeMembers(membertype):
    '''collect the members of tldp.doctypes accepted by a predicate

    membertype: predicate callable passed straight to inspect.getmembers
      (e.g. inspect.ismodule or inspect.isclass); its __name__ appears in
      the debug log output.

    Returns a list holding each matching member object, in the order that
    inspect.getmembers reported them.
    '''
    matches = inspect.getmembers(tldp.doctypes, membertype)
    # -- one debug line per discovered member, to ease troubleshooting
    for label, obj in matches:
        logger.debug("Located %s %s (%r).", membertype.__name__, label, obj)
    # -- callers only want the objects, not the (name, object) pairs
    return [obj for _, obj in matches]
def getDoctypeModules():
    '''list every module found inside tldp.doctypes

    The result is the canonical list of doctype modules which are recognized
    and capable of being processed into outputs.  See tldp.doctypes for more
    information.
    '''
    modules = getDoctypeMembers(inspect.ismodule)
    return modules
def getDoctypeClasses():
    '''list every class found inside tldp.doctypes

    The result is the canonical list of doctype classes which are recognized
    and capable of being processed into outputs.  See tldp.doctypes for more
    information.
    '''
    classes = getDoctypeMembers(inspect.isclass)
    return classes
2016-02-11 03:22:23 +00:00
def guess(thing):
    '''return a tldp.doctype class which is a best guess for document type

    thing: Could be a filename or an open file.

    Returns the guessed doctype class, or None when:
      * makefh() rejects thing with a TypeError,
      * the file name carries no extension,
      * no known doctype class lists the extension, or
      * several doctype classes share the extension but none of their
        signatures is found in the file.

    The guess function will try to guess the document type (doctype) from the
    file extension.  If extension matching produces multiple possible doctype
    matches (e.g. .xml or .sgml), the guess function will then use signature
    matching to find the earliest match in the file for a signature.

    If there are multiple signature matches, it will choose the signature
    matching at the earliest position in the file.  When two signatures are
    reported at the same position, the doctype appearing first in
    knowndoctypeclasses wins.

    Bugs/shortcomings:

      * This is only a guesser.
      * When signature matching, it reports first signature it discovers in
        any input file.
      * It could/should read more than 1024 bytes (cf. SignatureChecker)
        especially if it cannot return any result.
      * It could/should use heuristics or something richer than signatures.
    '''
    try:
        f = makefh(thing)
    except TypeError:
        return None

    stem, ext = os.path.splitext(f.name)
    if not ext:
        logger.debug("%s no file extension, skipping %s.", stem, ext)
        return None

    possible = [t for t in knowndoctypeclasses if ext in t.extensions]
    logger.debug("Possible: %r", possible)
    if not possible:
        logger.debug("%s unknown extension %s.", stem, ext)
        return None

    if len(possible) == 1:
        return possible[0]

    # -- for this extension, multiple document types, probably SGML, XML
    #
    logger.debug("%s multiple possible doctypes for extension %s on file %s.",
                 stem, ext, f.name)
    for doctype in possible:
        logger.debug("%s extension %s could be %s.", stem, ext, doctype)

    # -- ask each candidate where (if anywhere) its signature occurs
    #
    guesses = list()
    for doctype in possible:
        sindex = doctype.signatureLocation(f)
        if sindex is not None:
            guesses.append((sindex, doctype))

    if not guesses:
        logger.warning("%s no matching signature found for %s.",
                       stem, f.name)
        return None

    if len(guesses) == 1:
        _, doctype = guesses[0]
        return doctype

    # -- OK, this is unusual; we still found multiple document type
    #    signatures.  This should be rare, so we choose the first
    #    signature in the file as the more likely document type.
    #
    #    Sort on the position only: comparing whole (position, class)
    #    tuples would fall through to comparing the classes themselves on
    #    a tie, which raises TypeError on Python 3.  The sort is stable,
    #    so ties retain the order of knowndoctypeclasses.
    #
    guesses.sort(key=lambda pair: pair[0])
    logger.info("%s multiple doctype guesses for file %s", stem, f.name)
    for sindex, doctype in guesses:
        logger.info("%s could be %s (sig at pos %s)", stem, doctype, sindex)

    # -- select the winner BEFORE logging it; after the loop above, the
    #    loop variable holds the last (latest) candidate, not the chosen one
    #
    _, doctype = guesses[0]
    logger.info("%s going to guess %s for %s", stem, doctype, f.name)
    return doctype
2016-02-25 17:51:17 +00:00
# -- module-level registries, populated once when this module is imported
#
knowndoctypemodules = getDoctypeModules()
knowndoctypeclasses = getDoctypeClasses()

# -- the union of all extensions listed by the known doctype classes
#
knownextensions = set()
for x in knowndoctypeclasses:
    knownextensions |= set(x.extensions)
#
# -- end of file