python-tldp/tldp/sources.py

#! /usr/bin/python
# -*- coding: utf8 -*-

from __future__ import absolute_import, division, print_function

import os
import errno
import collections

try:
    from collections.abc import MutableMapping
except ImportError:  # Python 2: the ABCs live directly in collections
    from collections import MutableMapping

from .utils import logger, statfiles
from .typeguesser import guess, knownextensions


def scansourcedirs(dirnames):
    '''return a dict() of all SourceDocuments discovered in dirnames

    dirnames:  a list of directories containing SourceDocuments.

    returns:  a dict() mapping each document stem to its SourceDocument.

    raises:  IOError if any entry in dirnames is not an existing directory
    (ENOENT if the path is missing, ENOTDIR if it exists but is not a
    directory).  Every offending path is logged before the exception is
    raised; the exception itself names the first offending path.

    scansourcedirs ensures it is operating on the absolute filesystem path for
    each of the source directories.

    For each document that it finds in a source directory, it creates a
    SourceDocument entry using the stem name as a key.

    The rules for identifying possible SourceDocuments go as follows.

      - Within any source directory, a source document can consist of a single
        plain file or a directory.

      - If the candidate entry is a directory, then the stem is the full
        directory name, e.g. Masquerading-Simple-HOWTO; the directory is
        searched for a plain file named stem + ext for each ext in
        knownextensions.  A directory holding no such file contributes
        nothing.

      - If the candidate entry is a file, the stem is the filename minus
        extension, e.g. Encrypted-Root-Filesystem-HOWTO

      - Anything that is neither a plain file nor a directory is skipped
        with a warning.

    Because the function accepts (and will scan) many source directories, it
    is possible that there will be stem name collisions.  If it discovers a
    stem collision, scansourcedirs keeps the first document found, issues
    a warning and skips the duplicated stem(s).  [It also processes the
    source directories and their entries in sorted order so that results are
    stable between runs.]
    '''
    found = dict()
    dirs = [os.path.abspath(x) for x in dirnames]

    # -- complain only about the entries that are actually unusable, and
    #    report all of them before raising
    unusable = [sdir for sdir in dirs if not os.path.isdir(sdir)]
    if unusable:
        for sdir in unusable:
            logger.critical("Source collection dir must already exist: %s",
                            sdir)
        culprit = unusable[0]
        code = errno.ENOTDIR if os.path.exists(culprit) else errno.ENOENT
        raise IOError(code, os.strerror(code), culprit)

    for sdir in sorted(dirs):
        for fname in sorted(os.listdir(sdir)):
            candidates = list()
            possible = os.path.join(sdir, fname)
            if os.path.isfile(possible):
                candidates.append(SourceDocument(possible))
            elif os.path.isdir(possible):
                # -- os.listdir yields bare names, so fname is the stem
                for ext in knownextensions:
                    inner = os.path.join(possible, fname + ext)
                    if os.path.isfile(inner):
                        candidates.append(SourceDocument(inner))
            else:
                logger.warning("Skipping non-directory, non-plain file %s",
                               possible)
                continue
            for candy in candidates:
                if candy.stem in found:
                    # -- first one wins; later duplicates are only reported
                    dup = found[candy.stem].filename
                    logger.warning("Ignoring duplicate is %s", candy.filename)
                    logger.warning("Existing dup-entry is %s", dup)
                else:
                    found[candy.stem] = candy
    logger.debug("Discovered %s documents total", len(found))
    return found


class SourceCollection(MutableMapping):
    '''a dict-like container for SourceDocument objects

    The key in the SourceCollection is the stem name of the document, which
    allows convenient access and guarantees non-collision.

    The use of the stem as a key works conveniently with the
    OutputCollection which uses the same strategy on OutputDirectory.

    The base class is MutableMapping from collections.abc; the old alias
    collections.MutableMapping no longer exists as of Python 3.10.

    Entries are stored in the instance __dict__, so every key is also
    reachable as an attribute of the collection.
    NOTE(review): a stem equal to a mapping method name (e.g. 'items')
    would shadow that method on the instance -- confirm stems cannot take
    such values, or move to a private dict.
    '''
    def __repr__(self):
        return '<%s:(%s docs)>' % (self.__class__.__name__, len(self))

    def __init__(self, dirnames=None):
        '''construct a SourceCollection

        dirnames:  optional list of source directories; when omitted (None)
        the collection starts out empty.

        delegates most responsibility to function scansourcedirs, and so
        propagates whatever scansourcedirs raises.
        '''
        if dirnames is None:
            return
        self.update(scansourcedirs(dirnames))

    def __delitem__(self, key):
        '''remove the document stored under key; KeyError if absent'''
        del self.__dict__[key]

    def __getitem__(self, key):
        '''return the document stored under key; KeyError if absent'''
        return self.__dict__[key]

    def __setitem__(self, key, value):
        '''store value under key, replacing any existing entry'''
        self.__dict__[key] = value

    def __iter__(self):
        '''iterate over the keys (stems) of the collection'''
        return iter(self.__dict__)

    def __len__(self):
        '''return the number of documents in the collection'''
        return len(self.__dict__)


class SourceDocument(object):
    '''a container describing one source document (a single file, or the
    main file of a document that lives in its own directory)
    '''
    def __repr__(self):
        fields = (self.__class__.__name__, self.filename, self.doctype)
        return '<%s:%s (%s)>' % fields

    def __init__(self, filename):
        '''construct a SourceDocument

        filename:  required; path to the main (and sometimes sole) file of
        the document.  It is stored as an absolute path.

        raises:  IOError (ENOENT) if the path does not exist, and TypeError
        if it exists but is not a plain file (os.path.isfile follows
        symlinks, so a symlink to a file is fine; a directory or fifo is
        not).  Both conditions are logged at critical level first.

        Attributes set on a successful instantiation:

          - filename:  absolute path of the document
          - doctype:  whatever guess() returns for the filename
          - status:  always the string 'source'
          - dirname, basename:  os.path.split() of filename
          - stem, ext:  os.path.splitext() of basename
          - statinfo:  the result of statfiles(); collected over the whole
            enclosing directory when that directory is named after the
            stem, otherwise over the single file.  In both cases dirname
            is passed as the 'relative' argument.
        '''
        fullpath = os.path.abspath(filename)
        self.filename = fullpath
        if not os.path.exists(fullpath):
            logger.critical("Missing source document: %s", fullpath)
            raise IOError(errno.ENOENT, os.strerror(errno.ENOENT), fullpath)
        if not os.path.isfile(fullpath):
            logger.critical("Source document is not a plain file: %s",
                            fullpath)
            raise TypeError("Wrong type, not a plain file: " + fullpath)

        self.doctype = guess(fullpath)
        self.status = 'source'
        self.dirname, self.basename = os.path.split(fullpath)
        self.stem, self.ext = os.path.splitext(self.basename)
        logger.debug("%s found source %s", self.stem, fullpath)

        # -- a document housed in a directory named after its stem is
        #    stat'ed as a whole directory; otherwise only the file itself
        owns_directory = os.path.basename(self.dirname) == self.stem
        target = self.dirname if owns_directory else fullpath
        self.statinfo = statfiles(target, relative=self.dirname)

#
# -- end of file
initial commit 2016-02-11 03:22:23 +00:00			`#! /usr/bin/python`
adding # -- coding: utf8 -- 2016-02-18 21:25:02 +00:00			`# -- coding: utf8 --`
initial commit 2016-02-11 03:22:23 +00:00
changing to __future__ (consistency across project) 2016-02-11 19:28:38 +00:00			`from __future__ import absolute_import, division, print_function`
initial commit 2016-02-11 03:22:23 +00:00
			`import os`
adjust error-raising invocations (and tests) 2016-02-16 05:32:35 +00:00			`import errno`
adjust SourceDirs to behave like a dictionary 2016-02-16 05:51:56 +00:00			`import collections`
initial commit 2016-02-11 03:22:23 +00:00
switch to using statfiles 2016-02-17 19:19:48 +00:00			`from .utils import logger, statfiles`
correct the reference to the renamed guess function 2016-02-12 21:24:21 +00:00			`from .typeguesser import guess, knownextensions`
initial commit 2016-02-11 03:22:23 +00:00

move logic from SourceCollection to scansourcedirs moving the source dir scanning logic into a function (in preparation for further refactoring of single-file or entire-directory source document detection) adapting tests (by changing the name from SourceCollection to scansourcedirs). no other tests required added new test to ensure that an empty SourceCollection() returned as expected 2016-02-19 07:31:18 +00:00			`def scansourcedirs(dirnames):`
			`'''return a dict() of all SourceDocuments discovered in dirnames`
			`dirnames: a list of directories containing SourceDocuments.`

			`scansourcedirs ensures it is operating on the absolute filesystem path for`
			`each of the source directories.`

			`If any of the supplied dirnames does not exist as a directory, the function`
			`will log the missing source directory names and then will raise an IOError`
			`and quit.`

			`For each document that it finds in a source directory, it creates a`
			`SourceDocument entry using the stem name as a key.`

			`The rules for identifying possible SourceDocuments go as follows.`

			`- Within any source directory, a source document can consist of a single`
			`file with an extension or a directory.`

			`- If the candidate entry is a directory, then, the stem is the full`
			`directory name, e.g. Masquerading-Simple-HOWTO`

			`- If the candidate entry is a file, the stem is the filename minus`
			`extension, e.g. Encrypted-Root-Filesystem-HOWTO`

			`Because the function accepts (and will scan) many source directories, it`
			`is possible that there will be stem name collisions. If it discovers a`
			`stem collision, SourceCollection will issue a warning and skip the`
			`duplicated stem(s). [It also tries to process the source directories and`
			`candidates in a stable order between runs.]`
			`'''`
			`found = dict()`
			`dirs = [os.path.abspath(x) for x in dirnames]`
			`results = [os.path.exists(x) for x in dirs]`

			`if not all(results):`
			`for result, sdir in zip(results, dirs):`
			`logger.critical("Source collection dir must already exist: %s",`
			`sdir)`
			`raise IOError(errno.ENOENT, os.strerror(errno.ENOENT), sdir)`

			`for sdir in sorted(dirs):`
			`for fname in sorted(os.listdir(sdir)):`
			`candidates = list()`
			`possible = os.path.join(sdir, fname)`
			`if os.path.isfile(possible):`
			`candidates.append(SourceDocument(possible))`
			`elif os.path.isdir(possible):`
			`stem = os.path.basename(fname)`
			`for ext in knownextensions:`
			`possible = os.path.join(sdir, fname, stem + ext)`
			`if os.path.isfile(possible):`
			`candidates.append(SourceDocument(possible))`
			`else:`
			`logger.warning("Skipping non-directory, non-plain file %s",`
			`possible)`
			`continue`
			`for candy in candidates:`
			`if candy.stem in found:`
			`dup = found[candy.stem].filename`
			`logger.warning("Ignoring duplicate is %s", candy.filename)`
			`logger.warning("Existing dup-entry is %s", dup)`
			`else:`
			`found[candy.stem] = candy`
			`logger.debug("Discovered %s documents total", len(found))`
			`return found`


renaming OutputDirs more appropriately to OutputCollection 2016-02-16 07:52:08 +00:00			`class SourceCollection(collections.MutableMapping):`
adding a bunch of docstring docs 2016-02-18 21:58:53 +00:00			`'''a dict-like container for SourceDocument objects`
initial commit 2016-02-11 03:22:23 +00:00
adding a bunch of docstring docs 2016-02-18 21:58:53 +00:00			`The key in the SourceCollection is the stem name of the document, which`
			`allows convenient access and guarantees non-collision.`

			`The use of the stem as a key works conveniently with the`
			`OutputCollection which uses the same strategy on OutputDirectory.`
			`'''`
initial commit 2016-02-11 03:22:23 +00:00			`def __repr__(self):`
shorter __repr__ can fit on one line 2016-02-17 16:35:53 +00:00			`return '<%s:(%s docs)>' % (self.__class__.__name__, len(self))`
initial commit 2016-02-11 03:22:23 +00:00
move logic from SourceCollection to scansourcedirs moving the source dir scanning logic into a function (in preparation for further refactoring of single-file or entire-directory source document detection) adapting tests (by changing the name from SourceCollection to scansourcedirs). no other tests required added new test to ensure that an empty SourceCollection() returned as expected 2016-02-19 07:31:18 +00:00			`def __init__(self, dirnames=None):`
adding a bunch of docstring docs 2016-02-18 21:58:53 +00:00			`'''construct a SourceCollection`

move logic from SourceCollection to scansourcedirs moving the source dir scanning logic into a function (in preparation for further refactoring of single-file or entire-directory source document detection) adapting tests (by changing the name from SourceCollection to scansourcedirs). no other tests required added new test to ensure that an empty SourceCollection() returned as expected 2016-02-19 07:31:18 +00:00			`delegates most responsibility to function scansourcedirs`
adding a bunch of docstring docs 2016-02-18 21:58:53 +00:00			`'''`
move logic from SourceCollection to scansourcedirs moving the source dir scanning logic into a function (in preparation for further refactoring of single-file or entire-directory source document detection) adapting tests (by changing the name from SourceCollection to scansourcedirs). no other tests required added new test to ensure that an empty SourceCollection() returned as expected 2016-02-19 07:31:18 +00:00			`if dirnames is None:`
allow creation of empty SourceCollection; fixes Allow creation of an empty SourceCollection, which can be handed around in the driver to allow for higher-level document wrangling fix bad, always-failing directory check (thank you, testing) clarify handling of documents living in a directory and the generation of the fileset 2016-02-17 07:40:09 +00:00			`return`
move logic from SourceCollection to scansourcedirs moving the source dir scanning logic into a function (in preparation for further refactoring of single-file or entire-directory source document detection) adapting tests (by changing the name from SourceCollection to scansourcedirs). no other tests required added new test to ensure that an empty SourceCollection() returned as expected 2016-02-19 07:31:18 +00:00			`self.update(scansourcedirs(dirnames))`
adjust SourceDirs to behave like a dictionary 2016-02-16 05:51:56 +00:00
			`def __delitem__(self, key):`
			`del self.__dict__[key]`

			`def __getitem__(self, key):`
			`return self.__dict__[key]`

			`def __setitem__(self, key, value):`
			`self.__dict__[key] = value`

			`def __iter__(self):`
			`return iter(self.__dict__)`

			`def __len__(self):`
			`return len(self.__dict__)`
initial commit 2016-02-11 03:22:23 +00:00

			`class SourceDocument(object):`
adding a bunch of docstring docs 2016-02-18 21:58:53 +00:00			`'''a class providing a container for each set of source documents`
			`'''`
initial commit 2016-02-11 03:22:23 +00:00			`def __repr__(self):`
better __repr__ and doctype @property Include a better __repr__ for the SourceDocument object and make the doctype attribute a @property 2016-02-11 16:12:16 +00:00			`return '<%s:%s (%s)>' % \`
			`(self.__class__.__name__, self.filename, self.doctype)`
initial commit 2016-02-11 03:22:23 +00:00
			`def __init__(self, filename):`
adding a bunch of docstring docs 2016-02-18 21:58:53 +00:00			`'''construct a SourceDocument`

			`filename is a required parameter`

			`The filename is the main (and sometimes sole) document representing`
			`the source of the LDP HOWTO or Guide. It is the document that is`
			`passed by name to be handled by any document processing toolchains`
			`(see also tldp.doctypes).`

			`Each instantiation will raise an IOError if the supplied filename does`
			`not exist or if the filename isn't a file (symlink is fine, directory`
			`or fifo is not).`

			`The remainder of the instantiation will set attributes that are useful`
			`later in the processing phase, for example, stem, status, enclosing`
			`directory name and file extension.`

			`There are two important attributes. First, the document type guesser`
			`will try to infer the doctype (from file extension and signature).`
			`Note that it is not a fatal error if document type cannot be guessed,`
			`but the document will not be able to be processed. Second, it is`
			`useful during the decision-making process to know if any of the source`
			`files are newer than the output files. Thus, the stat() information`
			`for every file in the source document directory (or just the single`
			`source document file) will be collected.`
			`'''`
initial commit 2016-02-11 03:22:23 +00:00			`self.filename = os.path.abspath(filename)`
			`if not os.path.exists(self.filename):`
adjusting some logging and exceptions for verbosity/clarity 2016-02-16 05:15:29 +00:00			`logger.critical("Missing source document: %s", self.filename)`
adjust error-raising invocations (and tests) 2016-02-16 05:32:35 +00:00			`raise IOError(errno.ENOENT, os.strerror(errno.ENOENT), self.filename)`
check for plain file type, too 2016-02-13 07:59:13 +00:00			`if not os.path.isfile(self.filename):`
adjusting some logging and exceptions for verbosity/clarity 2016-02-16 05:15:29 +00:00			`logger.critical("Source document is not a plain file: %s", self.filename)`
			`raise TypeError("Wrong type, not a plain file: " + self.filename)`
initial commit 2016-02-11 03:22:23 +00:00
adding a bunch of docstring docs 2016-02-18 21:58:53 +00:00			`self.doctype = guess(self.filename)`
adding support for documents to know their status 2016-02-17 08:17:49 +00:00			`self.status = 'source'`
handling multiple source dirs and renaming SourceDir to, simply Sources 2016-02-11 23:16:12 +00:00			`self.dirname, self.basename = os.path.split(self.filename)`
initial commit 2016-02-11 03:22:23 +00:00			`self.stem, self.ext = os.path.splitext(self.basename)`
allow creation of empty SourceCollection; fixes Allow creation of an empty SourceCollection, which can be handed around in the driver to allow for higher-level document wrangling fix bad, always-failing directory check (thank you, testing) clarify handling of documents living in a directory and the generation of the fileset 2016-02-17 07:40:09 +00:00			`parentbase = os.path.basename(self.dirname)`
put stem in logging like many other logging lines 2016-02-18 03:03:37 +00:00			`logger.debug("%s found source %s", self.stem, self.filename)`
allow creation of empty SourceCollection; fixes Allow creation of an empty SourceCollection, which can be handed around in the driver to allow for higher-level document wrangling fix bad, always-failing directory check (thank you, testing) clarify handling of documents living in a directory and the generation of the fileset 2016-02-17 07:40:09 +00:00			`if parentbase == self.stem:`
switch to using statfiles 2016-02-17 19:19:48 +00:00			`self.statinfo = statfiles(self.dirname, relative=self.dirname)`
allow creation of empty SourceCollection; fixes Allow creation of an empty SourceCollection, which can be handed around in the driver to allow for higher-level document wrangling fix bad, always-failing directory check (thank you, testing) clarify handling of documents living in a directory and the generation of the fileset 2016-02-17 07:40:09 +00:00			`else:`
switch to using statfiles 2016-02-17 19:19:48 +00:00			`self.statinfo = statfiles(self.filename, relative=self.dirname)`
initial commit 2016-02-11 03:22:23 +00:00
adjust SourceDirs to behave like a dictionary 2016-02-16 05:51:56 +00:00			`#`
initial commit 2016-02-11 03:22:23 +00:00			`# -- end of file`