python-tldp/tldp/typeguesser.py

#! /usr/bin/python
# -*- coding: utf8 -*-
#
# Copyright (c) 2016 Linux Documentation Project

from __future__ import absolute_import, division, print_function

import os
import codecs
import inspect
import logging

import tldp.doctypes

logger = logging.getLogger(__name__)


def getDoctypeMembers(membertype):
    '''collect the members of tldp.doctypes accepted by a predicate

    :param membertype: a predicate as accepted by inspect.getmembers (for
        example inspect.isclass); its __name__ is used in the debug log.
    :returns: a list holding the matching member objects (names dropped),
        in the order inspect.getmembers reports them.
    '''
    pairs = inspect.getmembers(tldp.doctypes, membertype)
    for label, obj in pairs:
        logger.debug("Located %s %s (%r).", membertype.__name__, label, obj)
    # -- callers only want the objects, not the (name, object) pairs
    return [obj for _, obj in pairs]


def getDoctypeClasses():
    '''return every class found in tldp.doctypes as a list

    The returned list is the canonical set of doctypes which are recognized
    and can be processed into outputs.  See tldp.doctypes for more
    information.
    '''
    classes = getDoctypeMembers(inspect.isclass)
    return classes


def guess(fname):
    '''return a tldp.doctype class which is a best guess for document type

    :param fname: A filename.
    :returns: one of the classes in knowndoctypes, or None when fname is not
        something os.path.splitext accepts, has no extension, has an
        extension no known doctype claims, or (for ambiguous extensions)
        contains none of the candidate signatures.

    The guess function will try to guess the document type (doctype) from the
    file extension.  If extension matching produces multiple possible doctype
    matches (e.g. .xml or .sgml), the guess function will then use signature
    matching to find the earliest match in the file for a signature.

    If there are multiple signature matches, it will choose the signature
    matching at the earliest position in the file.  If two signatures match
    at the same position, the doctype listed first in knowndoctypes wins.

    The file is only opened when the extension is ambiguous; errors raised
    while opening or reading it (other than the UTF-8 decoding failure
    handled below) are not caught here.

    Bugs/shortcomings:

      * This is only a guesser.
      * When signature matching, it reports first signature it discovers in
        any input file.
      * It could/should read more than 1024 bytes (cf. SignatureChecker)
        especially if it cannot return any result.
      * It could/should use heuristics or something richer than signatures.
    '''
    try:
        stem, ext = os.path.splitext(fname)
    except (AttributeError, TypeError):
        # -- not a path-like value (e.g. None or an int); nothing to guess
        return None

    if not ext:
        logger.debug("%s no file extension, skipping %s.", stem, ext)
        return None

    possible = [t for t in knowndoctypes if ext in t.extensions]
    logger.debug("Possible:  %r", possible)
    if not possible:
        logger.debug("%s unknown extension %s.", stem, ext)
        return None

    if len(possible) == 1:
        return possible[0]

    # -- for this extension, multiple document types, probably SGML, XML
    #
    logger.debug("%s multiple possible doctypes for extension %s on file %s.",
                 stem, ext, fname)
    for doctype in possible:
        logger.debug("%s extension %s could be %s.", stem, ext, doctype)

    try:
        with codecs.open(fname, encoding='utf-8') as f:
            buf = f.read(1024)
    except UnicodeDecodeError:
        # -- a wee bit ugly, but many SGML docs used iso-8859-1, so fall back
        with codecs.open(fname, encoding='iso-8859-1') as f:
            buf = f.read(1024)

    guesses = list()
    for doctype in possible:
        sindex = doctype.signatureLocation(buf, fname)
        if sindex is not None:
            guesses.append((sindex, doctype))

    if not guesses:
        logger.warning("%s no matching signature found for %s.",
                       stem, fname)
        return None
    if len(guesses) == 1:
        return guesses[0][1]

    # -- OK, this is unusual; we still found multiple document type
    #    signatures.  This should be rare, so we choose the first
    #    signature in the file as the more likely document type.
    #
    #    Sort on the position only: comparing whole (position, class)
    #    tuples would fall through to comparing the classes themselves
    #    when two positions are equal, which raises TypeError on Python 3.
    #    The sort is stable, so ties keep the knowndoctypes order.
    #
    guesses.sort(key=lambda pair: pair[0])
    logger.info("%s multiple doctype guesses for file %s", stem, fname)
    for sindex, candidate in guesses:
        logger.info("%s could be %s (sig at pos %s)", stem, candidate, sindex)

    # -- pick the winner before logging it, so the log names the doctype
    #    that is actually returned (not the last candidate of the loop)
    _, doctype = guesses[0]
    logger.info("%s going to guess %s for %s", stem, doctype, fname)
    return doctype


# -- module-level registry, built once at import time:
#    knowndoctypes    every class found in tldp.doctypes
#    knownextensions  union of the extensions attribute of those classes
knowndoctypes = getDoctypeClasses()
knownextensions = set()
for x in knowndoctypes:
    knownextensions |= set(x.extensions)

#
# -- end of file
initial commit 2016-02-11 03:22:23 +00:00			`#! /usr/bin/python`
adding # -- coding: utf8 -- 2016-02-18 21:25:02 +00:00			`# -- coding: utf8 --`
adding 2016 LDP copyright to each file 2016-04-29 15:02:02 +00:00			`#`
			`# Copyright (c) 2016 Linux Documentation Project`
initial commit 2016-02-11 03:22:23 +00:00
changing to __future__ (consistency across project) 2016-02-11 19:28:38 +00:00			`from __future__ import absolute_import, division, print_function`
initial commit 2016-02-11 03:22:23 +00:00
			`import os`
switch to codecs.open and expect UTF-8 data 2016-03-15 04:42:21 +00:00			`import codecs`
initial commit 2016-02-11 03:22:23 +00:00			`import inspect`
pull logger straight from logging; use abs imports 2016-02-25 19:39:18 +00:00			`import logging`
initial commit 2016-02-11 03:22:23 +00:00
inspect for classes and modules 2016-02-25 17:51:17 +00:00			`import tldp.doctypes`
initial commit 2016-02-11 03:22:23 +00:00
pep8/pyflakes 2016-03-01 04:33:14 +00:00			`logger = logging.getLogger(__name__)`

inspect for classes and modules 2016-02-25 17:51:17 +00:00
			`def getDoctypeMembers(membertype):`
			`'''returns a list of tldp.doctypes; convenience function'''`
			`found = list()`
			`for name, member in inspect.getmembers(tldp.doctypes, membertype):`
			`logger.debug("Located %s %s (%r).", membertype.__name__, name, member)`
			`found.append(member)`
			`return found`


			`def getDoctypeClasses():`
			`'''returns a list of the classes known in tldp.doctypes`
docstrings for functions 2016-02-19 07:07:44 +00:00
			`This is the canonical list of doctypes which are recognized and capable of`
			`being processed into outputs. See tldp.doctypes for more information.`
			`'''`
inspect for classes and modules 2016-02-25 17:51:17 +00:00			`return getDoctypeMembers(inspect.isclass)`
initial commit 2016-02-11 03:22:23 +00:00

stop leaking FDs when guessing doctypes 2016-03-15 03:32:42 +00:00			`def guess(fname):`
docstrings for functions 2016-02-19 07:07:44 +00:00			`'''return a tldp.doctype class which is a best guess for document type`

stop leaking FDs when guessing doctypes 2016-03-15 03:32:42 +00:00			`:parama fname: A filename.`
docstrings for functions 2016-02-19 07:07:44 +00:00
			`The guess function will try to guess the document type (doctype) from the`
			`file extension. If extension matching produces multiple possible doctype`
			`matches (e.g. .xml or .sgml), the guess function will then use signature`
			`matching to find the earliest match in the file for a signature.`

			`If there are multiple signature matches, it will choose the signature`
			`matching at the earliest position in the file.`

			`Bugs/shortcomings:`

			`* This is only a guesser.`
			`* When signature matching, it reports first signature it discovers in`
			`any input file.`
			`* It could/should read more than 1024 bytes (cf. SignatureChecker)`
			`especially if it cannot return any result.`
			`* It could/should use heuristics or something richer than signatures.`
			`'''`
initial commit 2016-02-11 03:22:23 +00:00			`try:`
stop leaking FDs when guessing doctypes 2016-03-15 03:32:42 +00:00			`stem, ext = os.path.splitext(fname)`
fix guess(non-string) with Python 3.6 The os.path.* functions now consistently raise TypeError rather than something more random when called with inappropriate types. Fixes #6 2017-07-13 23:15:25 +00:00			`except (AttributeError, TypeError):`
initial commit 2016-02-11 03:22:23 +00:00			`return None`

skip files with no extension (and complain) 2016-02-13 07:14:51 +00:00			`if not ext:`
add stem to logging entries for consistency, make sure that stem is included on logging lines 2016-02-18 17:16:45 +00:00			`logger.debug("%s no file extension, skipping %s.", stem, ext)`
skip files with no extension (and complain) 2016-02-13 07:14:51 +00:00			`return None`

revert to name knowndoctypes (classes only) no need to itemize knowndoctypemodules after moving the config_fragment functions to the staticmethod argparse on each class 2016-02-25 18:37:45 +00:00			`possible = [t for t in knowndoctypes if ext in t.extensions]`
skip files with no extension (and complain) 2016-02-13 07:14:51 +00:00			`logger.debug("Possible: %r", possible)`
initial commit 2016-02-11 03:22:23 +00:00			`if not possible:`
add stem to logging entries for consistency, make sure that stem is included on logging lines 2016-02-18 17:16:45 +00:00			`logger.debug("%s unknown extension %s.", stem, ext)`
initial commit 2016-02-11 03:22:23 +00:00			`return None`
skip files with no extension (and complain) 2016-02-13 07:14:51 +00:00
initial commit 2016-02-11 03:22:23 +00:00			`if len(possible) == 1:`
			`doctype = possible.pop()`
			`return doctype`

			`# -- for this extension, multiple document types, probably SGML, XML`
			`#`
add stem to logging entries for consistency, make sure that stem is included on logging lines 2016-02-18 17:16:45 +00:00			`logger.debug("%s multiple possible doctypes for extension %s on file %s.",`
stop leaking FDs when guessing doctypes 2016-03-15 03:32:42 +00:00			`stem, ext, fname)`
initial commit 2016-02-11 03:22:23 +00:00			`for doctype in possible:`
add stem to logging entries for consistency, make sure that stem is included on logging lines 2016-02-18 17:16:45 +00:00			`logger.debug("%s extension %s could be %s.", stem, ext, doctype)`
initial commit 2016-02-11 03:22:23 +00:00
need to fall back to iso-8859-1 for SGML docs 2016-03-15 20:26:03 +00:00			`try:`
			`with codecs.open(fname, encoding='utf-8') as f:`
			`buf = f.read(1024)`
			`except UnicodeDecodeError:`
			`# -- a wee bit ugly, but many SGML docs used iso-8859-1, so fall back`
			`with codecs.open(fname, encoding='iso-8859-1') as f:`
			`buf = f.read(1024)`
stop leaking FDs when guessing doctypes 2016-03-15 03:32:42 +00:00
initial commit 2016-02-11 03:22:23 +00:00			`guesses = list()`
			`for doctype in possible:`
stop leaking FDs when guessing doctypes 2016-03-15 03:32:42 +00:00			`sindex = doctype.signatureLocation(buf, fname)`
initial commit 2016-02-11 03:22:23 +00:00			`if sindex is not None:`
			`guesses.append((sindex, doctype))`

			`if not guesses:`
add stem to logging entries for consistency, make sure that stem is included on logging lines 2016-02-18 17:16:45 +00:00			`logger.warning("%s no matching signature found for %s.",`
stop leaking FDs when guessing doctypes 2016-03-15 03:32:42 +00:00			`stem, fname)`
initial commit 2016-02-11 03:22:23 +00:00			`return None`
			`if len(guesses) == 1:`
			`_, doctype = guesses.pop()`
			`return doctype`

			`# -- OK, this is unusual; we still found multiple document type`
			`# signatures. Seems rare but unlikely, so we should choose the`
			`# first signature in the file as the more likely document type.`
			`#`
			`guesses.sort()`
stop leaking FDs when guessing doctypes 2016-03-15 03:32:42 +00:00			`logger.info("%s multiple doctype guesses for file %s", stem, fname)`
initial commit 2016-02-11 03:22:23 +00:00			`for sindex, doctype in guesses:`
add stem to logging entries for consistency, make sure that stem is included on logging lines 2016-02-18 17:16:45 +00:00			`logger.info("%s could be %s (sig at pos %s)", stem, doctype, sindex)`
stop leaking FDs when guessing doctypes 2016-03-15 03:32:42 +00:00			`logger.info("%s going to guess %s for %s", stem, doctype, fname)`
returning the wrong answer... get the FIRST answer (yay for testing) 2016-02-13 07:00:30 +00:00			`_, doctype = guesses.pop(0)`
initial commit 2016-02-11 03:22:23 +00:00			`return doctype`


revert to name knowndoctypes (classes only) no need to itemize knowndoctypemodules after moving the config_fragment functions to the staticmethod argparse on each class 2016-02-25 18:37:45 +00:00			`knowndoctypes = getDoctypeClasses()`
initial commit 2016-02-11 03:22:23 +00:00			`knownextensions = set()`
revert to name knowndoctypes (classes only) no need to itemize knowndoctypemodules after moving the config_fragment functions to the staticmethod argparse on each class 2016-02-25 18:37:45 +00:00			`for x in knowndoctypes:`
initial commit 2016-02-11 03:22:23 +00:00			`knownextensions.update(x.extensions)`

			`#`
			`# -- end of file`