python-tldp/tldp/typeguesser.py

84 lines
2.3 KiB
Python
Raw Normal View History

2016-02-11 03:22:23 +00:00
#! /usr/bin/python
2016-02-18 21:25:02 +00:00
# -*- coding: utf8 -*-
2016-02-11 03:22:23 +00:00
from __future__ import absolute_import, division, print_function
2016-02-11 03:22:23 +00:00
import os
import inspect
2016-02-12 18:31:50 +00:00
from .utils import logger, makefh
2016-02-11 03:22:23 +00:00
from . import doctypes
def listDoctypes():
    """Collect every class found in the doctypes module.

    Each located class is reported at debug level, followed by a final
    debug message giving the total count.

    :returns: list of class objects, in the order inspect.getmembers()
        yields them
    """
    located = inspect.getmembers(doctypes, inspect.isclass)
    for clsname, cls in located:
        logger.debug("Located class %s (%r).", clsname, cls)
    classes = [cls for _, cls in located]
    logger.debug("Capable of handling %s document classes.", len(classes))
    return classes
2016-02-11 03:22:23 +00:00
def guess(thing):
    """Guess which known document type class handles a file.

    The file extension narrows the candidates from knowndoctypes; when
    several candidates share the extension, each is asked for the location
    of its signature in the file and the earliest signature wins.

    :param thing: passed straight to makefh(); a TypeError raised by
        makefh() is treated as "cannot guess"
    :returns: a class from knowndoctypes, or None when the file has no
        extension, the extension is unknown, or no candidate signature
        is found
    """
    try:
        f = makefh(thing)
    except TypeError:
        return None
    # NOTE(review): f is never closed in this function; whether guess()
    # owns the handle depends on makefh() -- confirm before adding a close.

    stem, ext = os.path.splitext(f.name)
    if not ext:
        logger.debug("%s no file extension, skipping %s.", stem, ext)
        return None

    possible = [t for t in knowndoctypes if ext in t.extensions]
    logger.debug("Possible: %r", possible)
    if not possible:
        logger.debug("%s unknown extension %s.", stem, ext)
        return None

    if len(possible) == 1:
        return possible[0]

    # -- for this extension, multiple document types, probably SGML, XML
    #
    logger.debug("%s multiple possible doctypes for extension %s on file %s.",
                 stem, ext, f.name)
    for doctype in possible:
        logger.debug("%s extension %s could be %s.", stem, ext, doctype)

    guesses = list()
    for doctype in possible:
        sindex = doctype.signatureLocation(f)
        if sindex is not None:
            guesses.append((sindex, doctype))

    if not guesses:
        logger.warning("%s no matching signature found for %s.",
                       stem, f.name)
        return None

    if len(guesses) == 1:
        _, doctype = guesses[0]
        return doctype

    # -- OK, this is unusual; we still found multiple document type
    #    signatures.  Seems unlikely, but possible, so we choose the
    #    first signature in the file as the more likely document type.
    #
    # -- sort on the signature position only; comparing whole tuples would
    #    fall through to comparing the classes themselves when two
    #    positions are equal (a TypeError on Python 3).  The sort is
    #    stable, so ties keep the order of knowndoctypes.
    #
    guesses.sort(key=lambda pair: pair[0])
    logger.info("%s multiple doctype guesses for file %s", stem, f.name)
    for sindex, doctype in guesses:
        logger.info("%s could be %s (sig at pos %s)", stem, doctype, sindex)

    # -- select the winner before logging it, so the message names the
    #    doctype actually returned rather than the last loop value
    #
    _, doctype = guesses[0]
    logger.info("%s going to guess %s for %s", stem, doctype, f.name)
    return doctype
# -- module-level registry, computed once at import time: every class found
#    in doctypes, and the union of the extensions those classes declare
#
knowndoctypes = listDoctypes()
knownextensions = set()
for x in knowndoctypes:
    knownextensions |= set(x.extensions)
#
# -- end of file