python-tldp/tldp/typeguesser.py

#! /usr/bin/python
# -*- coding: utf8 -*-

from __future__ import absolute_import, division, print_function

import os
import inspect

from tldp.utils import logger, makefh

import tldp.doctypes


def getDoctypeMembers(membertype):
    '''collect the members of tldp.doctypes accepted by a predicate

    membertype: a predicate such as inspect.ismodule or inspect.isclass; it
      is handed to inspect.getmembers as the filter and its __name__ is used
      in the debug log line.

    Returns a list holding only the member objects (names are dropped), in
    the order inspect.getmembers produced them.
    '''
    members = inspect.getmembers(tldp.doctypes, membertype)
    predicate = membertype.__name__
    for label, obj in members:
        logger.debug("Located %s %s (%r).", predicate, label, obj)
    return [obj for _, obj in members]


def getDoctypeModules():
    '''list every module found in the tldp.doctypes namespace

    Thin wrapper: asks getDoctypeMembers for everything in tldp.doctypes that
    satisfies inspect.ismodule and hands that list back unchanged.  These are
    the doctypes that are recognized and can be processed into outputs; see
    tldp.doctypes for more information.
    '''
    modules = getDoctypeMembers(inspect.ismodule)
    return modules


def getDoctypeClasses():
    '''list every class found in the tldp.doctypes namespace

    Thin wrapper: asks getDoctypeMembers for everything in tldp.doctypes that
    satisfies inspect.isclass and hands that list back unchanged.  These are
    the doctypes that are recognized and can be processed into outputs; see
    tldp.doctypes for more information.
    '''
    classes = getDoctypeMembers(inspect.isclass)
    return classes


def guess(thing):
    '''return a tldp.doctype class which is a best guess for document type

    thing: Could be a filename or an open file.

    Returns the guessed doctype class, or None when no guess can be made:
    makefh() rejects thing with a TypeError, the file name has no extension,
    no known doctype claims the extension, or several doctypes claim the
    extension but none of their signatures is found in the file.

    The guess function will try to guess the document type (doctype) from the
    file extension.  If extension matching produces multiple possible doctype
    matches (e.g. .xml or .sgml), the guess function will then use signature
    matching to find the earliest match in the file for a signature.

    If there are multiple signature matches, it will choose the signature
    matching at the earliest position in the file.  Candidates whose
    signatures sit at the same position keep their relative order from
    knowndoctypeclasses.

    Bugs/shortcomings:

      * This is only a guesser.
      * When signature matching, it reports first signature it discovers in
        any input file.
      * It could/should read more than 1024 bytes (cf. SignatureChecker)
        especially if it cannot return any result.
      * It could/should use heuristics or something richer than signatures.
    '''
    try:
        f = makefh(thing)
    except TypeError:
        return None

    stem, ext = os.path.splitext(f.name)
    if not ext:
        logger.debug("%s no file extension, skipping %s.", stem, ext)
        return None

    possible = [t for t in knowndoctypeclasses if ext in t.extensions]
    logger.debug("Possible:  %r", possible)
    if not possible:
        logger.debug("%s unknown extension %s.", stem, ext)
        return None

    if len(possible) == 1:
        return possible[0]

    # -- for this extension, multiple document types, probably SGML, XML
    #
    logger.debug("%s multiple possible doctypes for extension %s on file %s.",
                 stem, ext, f.name)
    for doctype in possible:
        logger.debug("%s extension %s could be %s.", stem, ext, doctype)

    # -- keep only the candidates whose signature was actually located,
    #    remembering where in the file each one was found
    #
    guesses = list()
    for doctype in possible:
        sindex = doctype.signatureLocation(f)
        if sindex is not None:
            guesses.append((sindex, doctype))

    if not guesses:
        logger.warning("%s no matching signature found for %s.",
                       stem, f.name)
        return None
    if len(guesses) == 1:
        _, doctype = guesses[0]
        return doctype

    # -- OK, this is unusual; we still found multiple document type
    #    signatures.  This should be rare, so we choose the first
    #    signature in the file as the more likely document type.
    #
    #    Sort on the position only: comparing whole (position, class)
    #    tuples would fall back to comparing the classes themselves when
    #    two positions are equal, which raises TypeError on Python 3.
    #
    guesses.sort(key=lambda found: found[0])
    logger.info("%s multiple doctype guesses for file %s", stem, f.name)
    for sindex, doctype in guesses:
        logger.info("%s could be %s (sig at pos %s)", stem, doctype, sindex)

    # -- select the winner BEFORE logging it; after the loop above, the
    #    loop variable holds the last (latest-position) candidate, not the
    #    one being returned
    #
    _, doctype = guesses[0]
    logger.info("%s going to guess %s for %s", stem, doctype, f.name)
    return doctype


# -- module-level registries, built when this module is executed; they must
#    follow the function definitions above because they call them
#
knowndoctypemodules = getDoctypeModules()
knowndoctypeclasses = getDoctypeClasses()

# -- union of the extensions attribute of every known doctype class
#
knownextensions = set()
for x in knowndoctypeclasses:
    knownextensions |= set(x.extensions)

#
# -- end of file
initial commit 2016-02-11 03:22:23 +00:00			`#! /usr/bin/python`
adding # -- coding: utf8 -- 2016-02-18 21:25:02 +00:00			`# -- coding: utf8 --`
initial commit 2016-02-11 03:22:23 +00:00
changing to __future__ (consistency across project) 2016-02-11 19:28:38 +00:00			`from __future__ import absolute_import, division, print_function`
initial commit 2016-02-11 03:22:23 +00:00
			`import os`
			`import inspect`

inspect for classes and modules 2016-02-25 17:51:17 +00:00			`from tldp.utils import logger, makefh`
initial commit 2016-02-11 03:22:23 +00:00
inspect for classes and modules 2016-02-25 17:51:17 +00:00			`import tldp.doctypes`
initial commit 2016-02-11 03:22:23 +00:00
inspect for classes and modules 2016-02-25 17:51:17 +00:00
			`def getDoctypeMembers(membertype):`
			`'''returns a list of tldp.doctypes; convenience function'''`
			`found = list()`
			`for name, member in inspect.getmembers(tldp.doctypes, membertype):`
			`logger.debug("Located %s %s (%r).", membertype.__name__, name, member)`
			`found.append(member)`
			`return found`


			`def getDoctypeModules():`
			`'''returns a list of the modules known in tldp.doctypes`

			`This is the canonical list of doctypes which are recognized and capable of`
			`being processed into outputs. See tldp.doctypes for more information.`
			`'''`
			`return getDoctypeMembers(inspect.ismodule)`


			`def getDoctypeClasses():`
			`'''returns a list of the classes known in tldp.doctypes`
docstrings for functions 2016-02-19 07:07:44 +00:00
			`This is the canonical list of doctypes which are recognized and capable of`
			`being processed into outputs. See tldp.doctypes for more information.`
			`'''`
inspect for classes and modules 2016-02-25 17:51:17 +00:00			`return getDoctypeMembers(inspect.isclass)`
initial commit 2016-02-11 03:22:23 +00:00

			`def guess(thing):`
docstrings for functions 2016-02-19 07:07:44 +00:00			`'''return a tldp.doctype class which is a best guess for document type`

			`thing: Could be a filename or an open file.`

			`The guess function will try to guess the document type (doctype) from the`
			`file extension. If extension matching produces multiple possible doctype`
			`matches (e.g. .xml or .sgml), the guess function will then use signature`
			`matching to find the earliest match in the file for a signature.`

			`If there are multiple signature matches, it will choose the signature`
			`matching at the earliest position in the file.`

			`Bugs/shortcomings:`

			`* This is only a guesser.`
			`* When signature matching, it reports first signature it discovers in`
			`any input file.`
			`* It could/should read more than 1024 bytes (cf. SignatureChecker)`
			`especially if it cannot return any result.`
			`* It could/should use heuristics or something richer than signatures.`
			`'''`
initial commit 2016-02-11 03:22:23 +00:00			`try:`
renaming fin to f 2016-02-12 20:25:16 +00:00			`f = makefh(thing)`
initial commit 2016-02-11 03:22:23 +00:00			`except TypeError:`
			`return None`

add stem to logging entries for consistency, make sure that stem is included on logging lines 2016-02-18 17:16:45 +00:00			`stem, ext = os.path.splitext(f.name)`
skip files with no extension (and complain) 2016-02-13 07:14:51 +00:00			`if not ext:`
add stem to logging entries for consistency, make sure that stem is included on logging lines 2016-02-18 17:16:45 +00:00			`logger.debug("%s no file extension, skipping %s.", stem, ext)`
skip files with no extension (and complain) 2016-02-13 07:14:51 +00:00			`return None`

inspect for classes and modules 2016-02-25 17:51:17 +00:00			`possible = [t for t in knowndoctypeclasses if ext in t.extensions]`
skip files with no extension (and complain) 2016-02-13 07:14:51 +00:00			`logger.debug("Possible: %r", possible)`
initial commit 2016-02-11 03:22:23 +00:00			`if not possible:`
add stem to logging entries for consistency, make sure that stem is included on logging lines 2016-02-18 17:16:45 +00:00			`logger.debug("%s unknown extension %s.", stem, ext)`
initial commit 2016-02-11 03:22:23 +00:00			`return None`
skip files with no extension (and complain) 2016-02-13 07:14:51 +00:00
initial commit 2016-02-11 03:22:23 +00:00			`if len(possible) == 1:`
			`doctype = possible.pop()`
			`return doctype`

			`# -- for this extension, multiple document types, probably SGML, XML`
			`#`
add stem to logging entries for consistency, make sure that stem is included on logging lines 2016-02-18 17:16:45 +00:00			`logger.debug("%s multiple possible doctypes for extension %s on file %s.",`
			`stem, ext, f.name)`
initial commit 2016-02-11 03:22:23 +00:00			`for doctype in possible:`
add stem to logging entries for consistency, make sure that stem is included on logging lines 2016-02-18 17:16:45 +00:00			`logger.debug("%s extension %s could be %s.", stem, ext, doctype)`
initial commit 2016-02-11 03:22:23 +00:00
			`guesses = list()`
			`for doctype in possible:`
renaming fin to f 2016-02-12 20:25:16 +00:00			`sindex = doctype.signatureLocation(f)`
initial commit 2016-02-11 03:22:23 +00:00			`if sindex is not None:`
			`guesses.append((sindex, doctype))`

			`if not guesses:`
add stem to logging entries for consistency, make sure that stem is included on logging lines 2016-02-18 17:16:45 +00:00			`logger.warning("%s no matching signature found for %s.",`
			`stem, f.name)`
initial commit 2016-02-11 03:22:23 +00:00			`return None`
			`if len(guesses) == 1:`
			`_, doctype = guesses.pop()`
			`return doctype`

			`# -- OK, this is unusual; we still found multiple document type`
			`# signatures. Seems rare but unlikely, so we should choose the`
			`# first signature in the file as the more likely document type.`
			`#`
			`guesses.sort()`
add stem to logging entries for consistency, make sure that stem is included on logging lines 2016-02-18 17:16:45 +00:00			`logger.info("%s multiple doctype guesses for file %s", stem, f.name)`
initial commit 2016-02-11 03:22:23 +00:00			`for sindex, doctype in guesses:`
add stem to logging entries for consistency, make sure that stem is included on logging lines 2016-02-18 17:16:45 +00:00			`logger.info("%s could be %s (sig at pos %s)", stem, doctype, sindex)`
			`logger.info("%s going to guess %s for %s", stem, doctype, f.name)`
returning the wrong answer... get the FIRST answer (yay for testing) 2016-02-13 07:00:30 +00:00			`_, doctype = guesses.pop(0)`
initial commit 2016-02-11 03:22:23 +00:00			`return doctype`


inspect for classes and modules 2016-02-25 17:51:17 +00:00			`knowndoctypemodules = getDoctypeModules()`
			`knowndoctypeclasses = getDoctypeClasses()`
initial commit 2016-02-11 03:22:23 +00:00			`knownextensions = set()`
inspect for classes and modules 2016-02-25 17:51:17 +00:00			`for x in knowndoctypeclasses:`
initial commit 2016-02-11 03:22:23 +00:00			`knownextensions.update(x.extensions)`

			`#`
			`# -- end of file`