python-tldp/tldp/typeguesser.py

76 lines
2.1 KiB
Python
Raw Normal View History

2016-02-11 03:22:23 +00:00
#! /usr/bin/python
from __future__ import absolute_import, division, print_function
2016-02-11 03:22:23 +00:00
import os
import inspect
2016-02-12 18:31:50 +00:00
from .utils import logger, makefh
2016-02-11 03:22:23 +00:00
from . import doctypes
def listDoctypes():
    """Return a list of every class found in the ``doctypes`` module.

    Each discovered class is reported at debug level, and the total
    count is reported at info level before the list is returned.
    """
    members = inspect.getmembers(doctypes, inspect.isclass)
    for clsname, cls in members:
        logger.debug("Located class %s (%r).", clsname, cls)
    found = [cls for _, cls in members]
    logger.info("Capable of handling %s document classes.", len(found))
    return found
def guess(thing):
    """Guess which known document type class handles ``thing``.

    ``thing`` is passed to ``makefh`` to obtain a file object; if that
    raises TypeError the function gives up and returns None.

    Selection happens in two stages:

    1. By extension: keep every class in ``knowndoctypes`` whose
       ``extensions`` contains the extension of the file's name.
    2. By signature (only when several classes share the extension): ask
       each candidate's ``signatureLocation`` for the position of its
       signature in the file and prefer the earliest position.

    Returns the chosen doctype class, or None when no class matches the
    extension or no candidate reports a signature.
    """
    # NOTE(review): the handle obtained here is never closed.  Whether
    # makefh opens a new file or hands back the caller's object is not
    # visible from this module -- confirm ownership before adding close().
    try:
        f = makefh(thing)
    except TypeError:
        return None

    _, ext = os.path.splitext(f.name)

    possible = [t for t in knowndoctypes if ext in t.extensions]
    if not possible:
        return None

    if len(possible) == 1:
        return possible[0]

    # -- for this extension, multiple document types, probably SGML, XML
    #
    logger.debug("Extension is %s for %s; multiple possible document types.",
                 ext, f.name)
    for doctype in possible:
        logger.debug("Extension is %s for %s; %s.", ext, f.name, doctype)

    # -- collect (position, doctype) for every candidate that recognizes
    #    its own signature in the file
    #
    guesses = list()
    for doctype in possible:
        sindex = doctype.signatureLocation(f)
        if sindex is not None:
            guesses.append((sindex, doctype))

    if not guesses:
        logger.warning("Extension is %s for %s; no matching signature found.",
                       ext, f.name)
        return None

    if len(guesses) == 1:
        _, doctype = guesses[0]
        return doctype

    # -- OK, this is unusual; we still found multiple document type
    #    signatures.  Seems rare but possible, so we should choose the
    #    first signature in the file as the more likely document type.
    #
    # Sort on the position only: comparing whole (position, class) tuples
    # would fall through to comparing the classes themselves whenever two
    # positions are equal, which raises TypeError on Python 3.  The sort is
    # stable, so ties keep the order in which the candidates were examined.
    guesses.sort(key=lambda candidate: candidate[0])
    logger.info("Multiple guesses for file %s", f.name)
    for sindex, doctype in guesses:
        logger.info("Could be %s (file position %s)", doctype, sindex)

    # The earliest signature is at the front of the ascending sort.
    _, doctype = guesses[0]
    logger.info("Going to guess that it is %s", doctype)
    return doctype
# -- computed once at import time: the document type classes discovered by
#    listDoctypes() and the union of every extension they declare
#
knowndoctypes = listDoctypes()
knownextensions = set()
for x in knowndoctypes:
    knownextensions |= set(x.extensions)
#
# -- end of file