python-tldp/tldp/sources.py

#! /usr/bin/python
# -*- coding: utf8 -*-
#
# Copyright (c) 2016 Linux Documentation Project

from __future__ import absolute_import, division, print_function
from __future__ import unicode_literals

import os
import sys
import errno
import logging

from tldp.ldpcollection import LDPDocumentCollection

from tldp.utils import md5files, stem_and_ext
from tldp.typeguesser import guess, knownextensions

logger = logging.getLogger(__name__)

# -- basenames of files that are never treated as source documents; the
#    trailing comma matters:  without it this is a plain string and the
#    `in` test in arg_issourcedoc() becomes a substring match (so a file
#    named e.g. "index" or "x.sgml" would be silently ignored)
IGNORABLE_SOURCE = ('index.sgml',)


def scansourcedirs(dirnames):
    '''return a dict() of all SourceDocuments discovered in dirnames
    dirnames:  a list of directories containing SourceDocuments.

    scansourcedirs ensures it is operating on the absolute filesystem path for
    each of the source directories.

    If any of the supplied dirnames does not exist, the function will log
    each missing source directory name (and only the missing ones) and then
    will raise an IOError (ENOENT) naming the first missing directory.

    For each document that it finds in a source directory, it creates a
    SourceDocument entry using the stem name as a key.

    The rules for identifying possible SourceDocuments go as follows.

      - Within any source directory, a source document can consist of a single
        file with an extension or a directory.

      - If the candidate entry is a directory, then, the stem is the full
        directory name, e.g. Masquerading-Simple-HOWTO

      - If the candidate entry is a file, the stem is the filename minus
        extension, e.g. Encrypted-Root-Filesystem-HOWTO

    Because the function accepts (and will scan) many source directories, it
    is possible that there will be stem name collisions.  If it discovers a
    stem collision, it will issue a warning and skip the duplicated stem(s);
    the first document found for a stem wins.  [It also tries to process the
    source directories and candidates in a stable order between runs, by
    sorting both the directories and the entries within each directory.]
    '''
    found = dict()
    dirs = [os.path.abspath(x) for x in dirnames]
    missing = [x for x in dirs if not os.path.exists(x)]

    if missing:
        # -- report every missing directory before giving up, so the user
        #    can fix them all in one pass
        for sdir in missing:
            logger.critical("Source collection dir must already exist: %s",
                            sdir)
        raise IOError(errno.ENOENT, os.strerror(errno.ENOENT), missing[0])

    for sdir in sorted(dirs):
        logger.debug("Scanning for source documents in %s.", sdir)
        for fname in sorted(os.listdir(sdir)):
            possible = arg_issourcedoc(os.path.join(sdir, fname))
            if not possible:
                logger.warning("Skipping non-document %s", fname)
                continue
            candy = SourceDocument(possible)
            if candy.stem in found:
                # -- stem collision; keep the entry that was found first
                dup = found[candy.stem].filename
                logger.warning("Ignoring duplicate is %s", candy.filename)
                logger.warning("Existing dup-entry is %s", dup)
            else:
                found[candy.stem] = candy
    logger.debug("Discovered %s source documents", len(found))
    return found


def arg_issourcedoc(filename):
    '''return the path of the main source document for filename, or None

    filename is made absolute first.  A directory is handed off to
    sourcedoc_fromdir() and its answer is returned unchanged.  A regular
    file is returned (as an absolute path) unless its basename is matched
    by IGNORABLE_SOURCE.  Anything else (nonexistent path, fifo, ...)
    yields None.
    '''
    path = os.path.abspath(filename)
    if os.path.isdir(path):
        return sourcedoc_fromdir(path)
    if not os.path.isfile(path):
        return None
    if os.path.basename(path) in IGNORABLE_SOURCE:
        return None
    return path


def sourcedoc_fromdir(name):
    '''return the main document file inside directory name, or None

    The main document is the file called <stem><ext> directly inside the
    directory, where stem is the directory's own basename and ext is any of
    the entries in knownextensions.

    Returns None if name is not a directory or no such file exists.  Raises
    Exception if more than one such file exists, because there is no way to
    choose between them.
    '''
    if not os.path.isdir(name):
        return None
    stem = os.path.basename(name)
    paths = [os.path.join(name, stem + ext) for ext in knownextensions]
    matches = [path for path in paths if os.path.isfile(path)]
    if not matches:
        return None
    if len(matches) > 1:
        logger.warning("%s multiple document choices in dir %s, bailing....",
                       stem, name)
        raise Exception("multiple document choices in " + name)
    doc = matches[0]
    logger.debug("%s identified main document %s.", stem, doc)
    return doc


class SourceCollection(LDPDocumentCollection):
    '''a dict-like container for SourceDocument objects

    Entries are keyed by the stem name of each document.  The stem gives
    convenient access and cannot collide, and it is the same keying strategy
    the OutputCollection uses for its OutputDirectory entries.
    '''
    def __init__(self, dirnames=None):
        '''construct a SourceCollection

        dirnames:  optional list of source directories; when given, the
        collection is filled with whatever scansourcedirs discovers there.
        When omitted (None), nothing is scanned.
        '''
        if dirnames is not None:
            self.update(scansourcedirs(dirnames))


class SourceDocument(object):
    '''a class providing a container for each set of source documents

    Attributes set by the constructor:

      filename  absolute path of the main document file
      dirname   directory holding the main document file
      basename  file name of the main document (no directory part)
      stem,ext  basename split by stem_and_ext()
      doctype   whatever guess() returned for filename
      status    initially the string 'source'
      output    initially None
      working   initially None
      differing initially an empty set(); detail() iterates it as
                (why, filename) pairs
      md5sums   result of md5files() over the document file or directory
    '''
    def __repr__(self):
        return '<%s:%s (%s)>' % \
               (self.__class__.__name__, self.filename, self.doctype)

    def __init__(self, filename):
        '''construct a SourceDocument

        filename is a required parameter

        The filename is the main (and sometimes sole) document representing
        the source of the LDP HOWTO or Guide.  It is the document that is
        passed by name to be handled by any document processing toolchains
        (see also tldp.doctypes).  A directory name is also accepted; in
        that case sourcedoc_fromdir() is used to locate the main document
        inside it.

        Each instantiation will raise an IOError if the supplied filename
        does not exist.  It will raise a ValueError if the filename is
        neither a file nor a directory with an identifiable main document
        (symlink to a file is fine, fifo is not).

        The remainder of the instantiation will set attributes that are useful
        later in the processing phase, for example, stem, status, enclosing
        directory name and file extension.

        There are two important attributes.  First, the document type guesser
        will try to infer the doctype (from file extension and signature).
        Note that it is not a fatal error if document type cannot be guessed,
        but the document will not be able to be processed.  Second, it is
        useful during the decision-making process to know if any of the source
        files differ from those used for the output files.  Thus, md5files()
        is run over the source document directory (or just the single source
        document file) and the result is kept in md5sums.
        '''
        self.filename = os.path.abspath(filename)

        if not os.path.exists(self.filename):
            fn = self.filename
            logger.critical("Missing source document: %s", fn)
            raise IOError(errno.ENOENT, os.strerror(errno.ENOENT), fn)

        if os.path.isdir(self.filename):
            # -- may come back as None if no main document can be identified
            self.filename = sourcedoc_fromdir(self.filename)
        elif os.path.isfile(self.filename):
            pass
        else:
            # -- we did not receive a useable document file or directory name
            self.filename = None

        if self.filename is None:
            fn = filename
            logger.critical("Source document is not a plain file: %s", fn)
            raise ValueError(fn + " not identifiable as a document")

        self.doctype = guess(self.filename)
        self.status = 'source'
        self.output = None
        self.working = None
        self.differing = set()
        self.dirname, self.basename = os.path.split(self.filename)
        self.stem, self.ext = stem_and_ext(self.basename)
        parentbase = os.path.basename(self.dirname)
        logger.debug("%s found source %s", self.stem, self.filename)
        if parentbase == self.stem:
            # -- document lives in a directory named after its stem, so the
            #    whole directory belongs to the document
            parentdir = os.path.dirname(self.dirname)
            self.md5sums = md5files(self.dirname, relative=parentdir)
        else:
            # -- single-file document; only that one file is summed
            self.md5sums = md5files(self.filename, relative=self.dirname)

    def detail(self, widths, verbose, file=None):
        '''produce a small tabular output about the document

        widths:   object with attributes status, doctype and stem, each used
                  as the format width of the corresponding column
        verbose:  if true, also print the doctype, output directory (when
                  self.output is set), source file, every entry in
                  self.differing and every entry in self.output.missing
        file:     file-like object to print to; defaults to whatever
                  sys.stdout is at the time of the call
        '''
        if file is None:
            # -- looked up at call time so that a redirected sys.stdout is
            #    honored (a default of file=sys.stdout is bound only once)
            file = sys.stdout
        # -- doctype may not have a __name__ (e.g. if the type could not be
        #    guessed); fall back to its string form rather than crashing
        doctype = getattr(self.doctype, '__name__', str(self.doctype))
        template = ' '.join(('{s.status:{w.status}}',
                             '{doctype:{w.doctype}}',
                             '{s.stem:{w.stem}}'))
        outstr = template.format(s=self, w=widths, doctype=doctype)
        print(outstr, file=file)
        if verbose:
            print('         doctype {}'.format(self.doctype), file=file)
            if self.output:
                print('      output dir {}'.format(self.output.dirname),
                      file=file)
            print('     source file {}'.format(self.filename), file=file)
            for why, f in sorted(self.differing):
                fname = os.path.join(self.dirname, f)
                print('  {:>7} source {}'.format(why, fname), file=file)
            if self.output:
                for f in sorted(self.output.missing):
                    print('  missing output {}'.format(f), file=file)

#
# -- end of file