2016-02-11 03:22:23 +00:00
|
|
|
#! /usr/bin/python
|
2016-02-18 21:25:02 +00:00
|
|
|
# -*- coding: utf8 -*-
|
2016-02-11 03:22:23 +00:00
|
|
|
|
2016-02-11 19:28:38 +00:00
|
|
|
from __future__ import absolute_import, division, print_function
|
2016-02-11 03:22:23 +00:00
|
|
|
|
|
|
|
import os
|
|
|
|
import inspect
|
|
|
|
|
2016-02-12 18:31:50 +00:00
|
|
|
from .utils import logger, makefh
|
2016-02-11 03:22:23 +00:00
|
|
|
from . import doctypes
|
|
|
|
|
|
|
|
|
|
|
|
def listDoctypes():
    """Return the classes found in the ``doctypes`` module.

    Every class reported by ``inspect.getmembers(doctypes, inspect.isclass)``
    is logged at debug level and included in the result, in the order
    ``getmembers`` yields them.

    Returns:
        list: the class objects discovered in ``doctypes``.
    """
    members = inspect.getmembers(doctypes, inspect.isclass)
    for clsname, cls in members:
        logger.debug("Located class %s (%r).", clsname, cls)
    found = [cls for _, cls in members]
    logger.debug("Capable of handling %s document classes.", len(found))
    return found
|
2016-02-11 03:22:23 +00:00
|
|
|
|
|
|
|
|
|
|
|
def guess(thing):
    """Guess the document type class for a file from its name and content.

    The file extension is matched against the ``extensions`` attribute of
    every class in the module-level ``knowndoctypes``.  If exactly one class
    claims the extension, that class is returned.  If several do, each
    candidate's ``signatureLocation(f)`` is consulted; candidates returning
    ``None`` are dropped and, among the rest, the one with the smallest
    returned value is chosen.

    Args:
        thing: whatever ``makefh`` accepts (presumably a path or an open
            file object -- confirm against ``utils.makefh``).

    Returns:
        The selected document type class, or ``None`` when ``makefh`` raises
        ``TypeError``, the file name has no extension, no known class lists
        the extension, or no candidate reports a signature.
    """
    try:
        f = makefh(thing)
    except TypeError:
        return None

    # NOTE(review): f is never closed in this function; makefh may hand back
    # a handle owned by the caller, so closing it here is not obviously
    # safe -- confirm against utils.makefh before changing.

    stem, ext = os.path.splitext(f.name)
    if not ext:
        logger.debug("%s no file extension, skipping %s.", stem, ext)
        return None

    possible = [t for t in knowndoctypes if ext in t.extensions]
    logger.debug("Possible: %r", possible)
    if not possible:
        logger.debug("%s unknown extension %s.", stem, ext)
        return None

    if len(possible) == 1:
        return possible[0]

    # -- for this extension, multiple document types, probably SGML, XML
    #
    logger.debug("%s multiple possible doctypes for extension %s on file %s.",
                 stem, ext, f.name)
    for doctype in possible:
        logger.debug("%s extension %s could be %s.", stem, ext, doctype)

    guesses = list()
    for doctype in possible:
        sindex = doctype.signatureLocation(f)
        if sindex is not None:
            guesses.append((sindex, doctype))

    if not guesses:
        logger.warning("%s no matching signature found for %s.",
                       stem, f.name)
        return None
    if len(guesses) == 1:
        _, doctype = guesses[0]
        return doctype

    # -- OK, this is unusual; we still found multiple document type
    #    signatures.  Seems possible but unlikely, so we choose the first
    #    signature in the file as the more likely document type.
    #
    # Sort on the signature position only: comparing whole tuples would fall
    # back to comparing the class objects on a tie, which raises TypeError
    # on Python 3.  The sort is stable, so ties keep their original order.
    guesses.sort(key=lambda pair: pair[0])
    logger.info("%s multiple doctype guesses for file %s", stem, f.name)
    for sindex, doctype in guesses:
        logger.info("%s could be %s (sig at pos %s)", stem, doctype, sindex)

    # Pick the winner *before* logging it, so the message names the class
    # that is actually returned rather than the last one listed above.
    _, chosen = guesses[0]
    logger.info("%s going to guess %s for %s", stem, chosen, f.name)
    return chosen
|
|
|
|
|
|
|
|
|
|
|
|
# -- module-level registries, built once when the module is imported:
#    knowndoctypes   - the classes returned by listDoctypes()
#    knownextensions - union of every class's `extensions`
#
knowndoctypes = listDoctypes()
knownextensions = set()
for x in knowndoctypes:
    knownextensions |= set(x.extensions)
|
|
|
|
|
|
|
|
#
|
|
|
|
# -- end of file
|