python-tldp/tldp/guess.py

87 lines
2.4 KiB
Python

#! /usr/bin/python
from __future__ import print_function
import os
import inspect
from . import doctypes
from .utils import logger
def makefh(thing):
if isinstance(thing, file):
f = thing
elif isinstance(thing, str) and os.path.isfile(thing):
f = open(thing)
else:
raise TypeError("Cannot make file from type %s of %r" %
(type(thing), thing,))
return f
def listDoctypes():
knowndoctypes = list()
for name, member in inspect.getmembers(doctypes, inspect.isclass):
logger.info("Located class %s (%r).", name, member)
knowndoctypes.append(member)
logger.info("Capable of handling %s document classes.", len(knowndoctypes))
return knowndoctypes
def guess(thing):
try:
fin = makefh(thing)
except TypeError:
return None
_, ext = os.path.splitext(fin.name)
possible = [t for t in knowndoctypes if ext in t.extensions]
if not possible:
return None
if len(possible) == 1:
doctype = possible.pop()
return doctype
# -- for this extension, multiple document types, probably SGML, XML
#
logger.info("Extension is %s for %s; multiple possible document types.",
ext, fin.name)
for doctype in possible:
logger.debug("Extension is %s for %s; %s.", ext, fin.name, doctype)
guesses = list()
for doctype in possible:
sindex = doctype.signatureLocation(fin.name, fin)
if sindex is not None:
guesses.append((sindex, doctype))
if not guesses:
logger.info("Extension is %s for %s; but no matching signature found.",
ext, fin.name)
return None
if len(guesses) == 1:
_, doctype = guesses.pop()
return doctype
# -- OK, this is unusual; we still found multiple document type
# signatures. Seems rare but unlikely, so we should choose the
# first signature in the file as the more likely document type.
#
guesses.sort()
logger.info("Multiple guesses for file %s", fin.name)
for sindex, doctype in guesses:
logger.info("Could be %s (file position %s)", doctype, sindex)
logger.info("Going to guess that it is %s", doctype)
_, doctype = guesses.pop()
return doctype
knowndoctypes = listDoctypes()
knownextensions = set()
for x in knowndoctypes:
knownextensions.update(x.extensions)
#
# -- end of file