python-tldp/tldp/inventory.py

#! /usr/bin/python
# -*- coding: utf8 -*-
#
# Copyright (c) 2016 Linux Documentation Project

from __future__ import absolute_import, division, print_function
from __future__ import unicode_literals

import copy
import logging
from collections import OrderedDict

from tldp.sources import SourceCollection
from tldp.outputs import OutputCollection

logger = logging.getLogger(__name__)

# -- each individual document (source or output) has a status taken from
#    the status_types below; stypes maps every status type to a short
#    human-readable description, in a fixed order
#
stypes = OrderedDict([
    ('source', 'found in source repository'),
    ('output', 'found in output repository'),
    ('published', 'matching stem in source/output; doc is up to date'),
    ('stale', 'matching stem in source/output; but source is newer'),
    ('orphan', 'stem located in output, but no source found (i.e. old?)'),
    ('broken', 'output is missing an expected output format (e.g. PDF)'),
    ('new', 'stem located in source, but missing in output; unpublished'),
])

status_types = stypes.keys()

# -- a user is rarely interested in a listing of every published document
#    and every source document, and is more likely to want the documents
#    grouped by status; a status class is simply a named list of
#    status_types.  Every status type is also a class containing only
#    itself; the remaining classes are aliases and groupings.
#
status_classes = OrderedDict((name, [name]) for name in status_types)
status_classes.update([
    ('outputs', ['output']),
    ('sources', ['source']),
    ('orphans', ['orphan']),
    ('orphaned', ['orphan']),
    ('problems', ['orphan', 'broken', 'stale']),
    ('work', ['new', 'orphan', 'broken', 'stale']),
    ('all', ['published', 'new', 'orphan', 'broken', 'stale']),
])


class Inventory(object):
    '''a container for classifying documents by their status

    Every SourceDocument has no more than one matching OutputDirectory.

    The Inventory class encodes the logic for identifying the following
    different status possibilities for an arbitrary set of SourceDocuments and
    OutputDirectorys.

    The following are possible values for status:
       - 'source':  a source document before any status detection
       - 'output':  an output document before any status detection
       - 'new':  a source document without any matching output stem
       - 'published':  a pair of source/output documents with matching stems
       - 'orphan':  an output document without any matching source stem
       - 'broken':  a published document with missing output files
       - 'stale':  a published document with new(er) source files

    The Inventory object is intended to be used to identify work that needs to
    be done on individual source documents to produce up-to-date output
    documents.

    After construction, the following collections are available as
    attributes: source, output, orphan, new, published, broken and stale.
    Note that broken and stale documents are NOT removed from published;
    the published collection holds every source document with a matching
    output stem, whatever its final status.
    '''
    def __repr__(self):
        '''summarize the Inventory as a count of documents per status'''
        return '<%s: %d published, %d orphan, %d new, %d stale, %d broken>' % (
               self.__class__.__name__,
               len(self.published),
               len(self.orphan),
               len(self.new),
               len(self.stale),
               len(self.broken),)

    def __init__(self, pubdir, sourcedirs):
        '''construct an Inventory

        pubdir: path to the OutputCollection

        sourcedirs: a list of directories which could be passed to the
          SourceCollection object; essentially a directory containing
          SourceDocuments; for example LDP/LDP/howto/linuxdoc and
          LDP/LDP/guide/docbook

        The status of each document is assigned in successive passes
        (published, then broken, then stale), so a later pass overwrites
        the status set by an earlier one; a document which is both broken
        and stale ends with status 'stale', though it is recorded in both
        the broken and the stale collections.
        '''
        # -- NOTE(review): both collection types are used here as dict-like
        #    mappings keyed by document stem; confirm against tldp.sources
        #    and tldp.outputs
        self.output = OutputCollection(pubdir)
        self.source = SourceCollection(sourcedirs)

        # -- classification below deletes entries and rewrites status, so
        #    it operates on deep copies; self.source and self.output are
        #    not modified by this method
        s = copy.deepcopy(self.source)
        o = copy.deepcopy(self.output)
        sset = set(s.keys())
        oset = set(o.keys())

        # -- orphan identification; stems only present in the output
        #
        self.orphan = OutputCollection()
        for doc in oset.difference(sset):
            self.orphan[doc] = o[doc]
            del o[doc]
            self.orphan[doc].status = 'orphan'
        logger.debug("Identified %d orphan documents: %r.", len(self.orphan),
                     self.orphan.keys())

        # -- unpublished ('new') identification; stems only in the source
        #
        self.new = SourceCollection()
        for doc in sset.difference(oset):
            self.new[doc] = s[doc]
            del s[doc]
            self.new[doc].status = 'new'
        logger.debug("Identified %d new documents: %r.", len(self.new),
                     self.new.keys())

        # -- published identification; source and output should be same size
        #    because orphan and new stems were removed above; each remaining
        #    source document is cross-linked with its output document
        assert len(s) == len(o)
        for stem, odoc in o.items():
            sdoc = s[stem]
            sdoc.output = odoc
            odoc.source = sdoc
            sdoc.status = sdoc.output.status = 'published'
        self.published = s
        logger.debug("Identified %d published documents.", len(self.published))

        # -- broken identification; the output reports itself incomplete
        #
        self.broken = SourceCollection()
        for stem, sdoc in s.items():
            if not sdoc.output.iscomplete:
                self.broken[stem] = sdoc
                sdoc.status = sdoc.output.status = 'broken'
        logger.debug("Identified %d broken documents: %r.", len(self.broken),
                     self.broken.keys())

        # -- stale identification; compare the md5sums mapping recorded on
        #    the output with the one on the source (presumably file name to
        #    MD5 digest; verify against the document classes)
        #
        self.stale = SourceCollection()
        for stem, sdoc in s.items():
            odoc = sdoc.output
            omd5, smd5 = odoc.md5sums, sdoc.md5sums
            if omd5 != smd5:
                logger.debug("%s differing MD5 sets %r %r", stem, smd5, omd5)
                # -- collect (reason, name) tuples, where reason is one of
                #    'gone' (only in output), 'new' (only in source) or
                #    'changed' (in both, with differing values)
                changed = set()
                for gone in set(omd5.keys()).difference(smd5.keys()):
                    logger.debug("%s gone %s", stem, gone)
                    changed.add(('gone', gone))
                for new in set(smd5.keys()).difference(omd5.keys()):
                    changed.add(('new', new))
                for sfn in set(smd5.keys()).intersection(omd5.keys()):
                    if smd5[sfn] != omd5[sfn]:
                        changed.add(('changed', sfn))
                for why, sfn in changed:
                    logger.debug("%s differing source %s (%s)", stem, sfn, why)
                odoc.status = sdoc.status = 'stale'
                sdoc.differing = changed
                self.stale[stem] = sdoc
        logger.debug("Identified %d stale documents: %r.", len(self.stale),
                     self.stale.keys())

    def getByStatusClass(self, status_class):
        '''return the documents belonging to a named status class

        status_class: a key of the module-level status_classes mapping,
          for example 'work', 'problems', 'all' or any single status type

        Returns a new SourceCollection which has been updated, in order,
        with the collection stored on this Inventory under each status
        type listed for the requested class.

        Raises AssertionError if status_class is not a known status class.
        '''
        desired = status_classes.get(status_class, None)
        # -- an explicit raise (instead of an assert statement) keeps this
        #    validation in place under "python -O"; the exception type is
        #    unchanged for any caller already expecting AssertionError
        if not isinstance(desired, list):
            raise AssertionError(
                "Unknown status class %r." % (status_class,))
        collection = SourceCollection()
        for status_type in desired:
            collection.update(getattr(self, status_type))
        return collection

    @property
    def outputs(self):
        '''documents in the 'outputs' status class'''
        return self.getByStatusClass('outputs')

    @property
    def sources(self):
        '''documents in the 'sources' status class'''
        return self.getByStatusClass('sources')

    @property
    def problems(self):
        '''documents in the 'problems' status class'''
        return self.getByStatusClass('problems')

    @property
    def work(self):
        '''documents in the 'work' status class'''
        return self.getByStatusClass('work')

    @property
    def orphans(self):
        '''documents in the 'orphans' status class'''
        return self.getByStatusClass('orphans')

    @property
    def orphaned(self):
        '''documents in the 'orphaned' status class'''
        return self.getByStatusClass('orphaned')

    @property
    def all(self):
        '''documents in the 'all' status class'''
        return self.getByStatusClass('all')

#
# -- end of file
inital commit of driver file for handling the porcelain logic 2016-02-17 08:22:31 +00:00			`#! /usr/bin/python`
adding # -- coding: utf8 -- 2016-02-18 21:25:02 +00:00			`# -- coding: utf8 --`
adding 2016 LDP copyright to each file 2016-04-29 15:02:02 +00:00			`#`
			`# Copyright (c) 2016 Linux Documentation Project`
inital commit of driver file for handling the porcelain logic 2016-02-17 08:22:31 +00:00
			`from __future__ import absolute_import, division, print_function`
everybody gets unicode_literals 2016-03-15 05:18:09 +00:00			`from __future__ import unicode_literals`
inital commit of driver file for handling the porcelain logic 2016-02-17 08:22:31 +00:00
adjust logic into an inventory object 2016-02-17 16:35:36 +00:00			`import copy`
pull logger straight from logging; use abs imports 2016-02-25 19:39:18 +00:00			`import logging`
add support for --doctypes and --statustypes provide CLI-discoverable listing of supported source document types and status types 2016-03-06 19:29:13 +00:00			`from collections import OrderedDict`
adjust logic into an inventory object 2016-02-17 16:35:36 +00:00
pull logger straight from logging; use abs imports 2016-02-25 19:39:18 +00:00			`from tldp.sources import SourceCollection`
			`from tldp.outputs import OutputCollection`
inital commit of driver file for handling the porcelain logic 2016-02-17 08:22:31 +00:00
pep8/pyflakes 2016-03-01 04:33:14 +00:00			`logger = logging.getLogger(__name__)`
inital commit of driver file for handling the porcelain logic 2016-02-17 08:22:31 +00:00
add status_classes for user interaction 2016-02-23 16:59:56 +00:00			`# -- any individual document (source or output) will have a status`
			`# from the following list of status_types`
			`#`
add support for --doctypes and --statustypes provide CLI-discoverable listing of supported source document types and status types 2016-03-06 19:29:13 +00:00			`stypes = OrderedDict()`
			`stypes['source'] = 'found in source repository'`
			`stypes['output'] = 'found in output repository'`
			`stypes['published'] = 'matching stem in source/output; doc is up to date'`
			`stypes['stale'] = 'matching stem in source/output; but source is newer'`
			`stypes['orphan'] = 'stem located in output, but no source found (i.e. old?)'`
			`stypes['broken'] = 'output is missing an expected output format (e.g. PDF)'`
			`stypes['new'] = 'stem located in source, but missing in output; unpublished'`

			`status_types = stypes.keys()`
itemize valid types for status codes and standardize on name 'orphan' (not 'orphans' or 'orphaned') assume that SourceCollection and OutputCollection keys are always sorted 2016-02-23 04:24:19 +00:00
add status_classes for user interaction 2016-02-23 16:59:56 +00:00			`# -- the user probably doesn't usually care (too much) about listing`
			`# every single published document and source document, but is probably`
			`# mostly interested in specific documents grouped by status; so the`
			`# status_classes are just sets of status_types`
			`#`
add support for --doctypes and --statustypes provide CLI-discoverable listing of supported source document types and status types 2016-03-06 19:29:13 +00:00			`status_classes = OrderedDict(zip(status_types, [[x] for x in status_types]))`
add status_classes for user interaction 2016-02-23 16:59:56 +00:00			`status_classes['outputs'] = ['output']`
			`status_classes['sources'] = ['source']`
			`status_classes['orphans'] = ['orphan']`
			`status_classes['orphaned'] = ['orphan']`
add support for --doctypes and --statustypes provide CLI-discoverable listing of supported source document types and status types 2016-03-06 19:29:13 +00:00			`status_classes['problems'] = ['orphan', 'broken', 'stale']`
			`status_classes['work'] = ['new', 'orphan', 'broken', 'stale']`
add status_classes for user interaction 2016-02-23 16:59:56 +00:00			`status_classes['all'] = ['published', 'new', 'orphan', 'broken', 'stale']`

inital commit of driver file for handling the porcelain logic 2016-02-17 08:22:31 +00:00
adjust logic into an inventory object 2016-02-17 16:35:36 +00:00			`class Inventory(object):`
adding some docstrings 2016-02-19 07:07:53 +00:00			`'''a container for classifying documents by their status`
adjust logic into an inventory object 2016-02-17 16:35:36 +00:00
adding some docstrings 2016-02-19 07:07:53 +00:00			`Every SourceDocument has no more than one matching OutputDirectory.`

			`The Inventory class encodes the logic for identifying the following`
			`different status possibilities for an arbitrary set of SourceDocuments and`
			`OutputDirectorys.`

			`The following are possible values for status:`
			`- 'source': a source document before any status detection`
			`- 'output': an output document before any status detection`
			`- 'new': a source document without any matching output stem`
			`- 'published': a pair of source/output documents with matching stems`
			`- 'orphan': an output document without any matching source stem`
			`- 'broken': a published document with missing output files`
			`- 'stale': a published document with new(er) source files`

			`The Inventory object is intended to be used to identify work that needs to`
			`be done on individual source documents to produce up-to-date output`
			`documents.`
			`'''`
repair mtime comparison logic repair mtime comparison logic for source and output directories after generalizing the statfiles() function in utils.py add a __repr__ to the Inventory object 2016-02-17 20:04:37 +00:00			`def __repr__(self):`
itemize valid types for status codes and standardize on name 'orphan' (not 'orphans' or 'orphaned') assume that SourceCollection and OutputCollection keys are always sorted 2016-02-23 04:24:19 +00:00			`return '<%s: %d published, %d orphan, %d new, %d stale, %d broken>' % (`
repair mtime comparison logic repair mtime comparison logic for source and output directories after generalizing the statfiles() function in utils.py add a __repr__ to the Inventory object 2016-02-17 20:04:37 +00:00			`self.__class__.__name__,`
			`len(self.published),`
itemize valid types for status codes and standardize on name 'orphan' (not 'orphans' or 'orphaned') assume that SourceCollection and OutputCollection keys are always sorted 2016-02-23 04:24:19 +00:00			`len(self.orphan),`
repair mtime comparison logic repair mtime comparison logic for source and output directories after generalizing the statfiles() function in utils.py add a __repr__ to the Inventory object 2016-02-17 20:04:37 +00:00			`len(self.new),`
			`len(self.stale),`
pyflakes/pep8 adjustments 2016-04-28 16:12:41 +00:00			`len(self.broken),)`
repair mtime comparison logic repair mtime comparison logic for source and output directories after generalizing the statfiles() function in utils.py add a __repr__ to the Inventory object 2016-02-17 20:04:37 +00:00
adjust logic into an inventory object 2016-02-17 16:35:36 +00:00			`def __init__(self, pubdir, sourcedirs):`
adding some docstrings 2016-02-19 07:07:53 +00:00			`'''construct an Inventory`

			`pubdir: path to the OutputCollection`

			`sourcedirs: a list of directories which could be passed to the`
			`SourceCollection object; essentially a directory containing`
			`SourceDocuments; for example LDP/LDP/howto/linuxdoc and`
			`LDP/LDP/guide/docbook`
			`'''`
rename sources/outputs to singular; remember newer files 2016-02-23 05:22:17 +00:00			`self.output = OutputCollection(pubdir)`
			`self.source = SourceCollection(sourcedirs)`
			`s = copy.deepcopy(self.source)`
			`o = copy.deepcopy(self.output)`
adjust logic into an inventory object 2016-02-17 16:35:36 +00:00			`sset = set(s.keys())`
			`oset = set(o.keys())`
pep8 fixes 2016-02-17 17:12:07 +00:00
repair mtime comparison logic repair mtime comparison logic for source and output directories after generalizing the statfiles() function in utils.py add a __repr__ to the Inventory object 2016-02-17 20:04:37 +00:00			`# -- orphan identification`
			`#`
itemize valid types for status codes and standardize on name 'orphan' (not 'orphans' or 'orphaned') assume that SourceCollection and OutputCollection keys are always sorted 2016-02-23 04:24:19 +00:00			`self.orphan = OutputCollection()`
adjust logic into an inventory object 2016-02-17 16:35:36 +00:00			`for doc in oset.difference(sset):`
itemize valid types for status codes and standardize on name 'orphan' (not 'orphans' or 'orphaned') assume that SourceCollection and OutputCollection keys are always sorted 2016-02-23 04:24:19 +00:00			`self.orphan[doc] = o[doc]`
adjust logic into an inventory object 2016-02-17 16:35:36 +00:00			`del o[doc]`
itemize valid types for status codes and standardize on name 'orphan' (not 'orphans' or 'orphaned') assume that SourceCollection and OutputCollection keys are always sorted 2016-02-23 04:24:19 +00:00			`self.orphan[doc].status = 'orphan'`
decrease logging yak; remove unused functions 2016-02-24 17:26:23 +00:00			`logger.debug("Identified %d orphan documents: %r.", len(self.orphan),`
pep8/pyflakes 2016-03-01 04:33:14 +00:00			`self.orphan.keys())`
pep8 fixes 2016-02-17 17:12:07 +00:00
repair mtime comparison logic repair mtime comparison logic for source and output directories after generalizing the statfiles() function in utils.py add a __repr__ to the Inventory object 2016-02-17 20:04:37 +00:00			`# -- unpublished ('new') identification`
			`#`
adjust logic into an inventory object 2016-02-17 16:35:36 +00:00			`self.new = SourceCollection()`
			`for doc in sset.difference(oset):`
			`self.new[doc] = s[doc]`
			`del s[doc]`
			`self.new[doc].status = 'new'`
decrease logging yak; remove unused functions 2016-02-24 17:26:23 +00:00			`logger.debug("Identified %d new documents: %r.", len(self.new),`
pep8/pyflakes 2016-03-01 04:33:14 +00:00			`self.new.keys())`
adjust logic into an inventory object 2016-02-17 16:35:36 +00:00
rename sources/outputs to singular; remember newer files 2016-02-23 05:22:17 +00:00			`# -- published identification; source and output should be same size`
adjust logic into an inventory object 2016-02-17 16:35:36 +00:00			`assert len(s) == len(o)`
			`for stem, odoc in o.items():`
			`sdoc = s[stem]`
			`sdoc.output = odoc`
			`odoc.source = sdoc`
record the 'broken' field in sdoc/odoc and add function to list_broken (analogically appropriate) 2016-02-19 08:54:39 +00:00			`sdoc.status = sdoc.output.status = 'published'`
adjust logic into an inventory object 2016-02-17 16:35:36 +00:00			`self.published = s`
decrease logging yak; remove unused functions 2016-02-24 17:26:23 +00:00			`logger.debug("Identified %d published documents.", len(self.published))`
pep8 fixes 2016-02-17 17:12:07 +00:00
calculate stale by MD5s; swap stale/broken move the stanza that identifies the broken output directories up higher in the file; it's a simpler chunk of code adjust the detection of stale-ness by referring to an output MD5 file and compare with the available source files 2016-04-02 17:47:45 +00:00			`# -- broken identification`
			`#`
			`self.broken = SourceCollection()`
			`for stem, sdoc in s.items():`
			`if not sdoc.output.iscomplete:`
			`self.broken[stem] = sdoc`
			`sdoc.status = sdoc.output.status = 'broken'`
			`logger.debug("Identified %d broken documents: %r.", len(self.broken),`
			`self.broken.keys())`

repair mtime comparison logic repair mtime comparison logic for source and output directories after generalizing the statfiles() function in utils.py add a __repr__ to the Inventory object 2016-02-17 20:04:37 +00:00			`# -- stale identification`
			`#`
adjust logic into an inventory object 2016-02-17 16:35:36 +00:00			`self.stale = SourceCollection()`
			`for stem, sdoc in s.items():`
			`odoc = sdoc.output`
calculate stale by MD5s; swap stale/broken move the stanza that identifies the broken output directories up higher in the file; it's a simpler chunk of code adjust the detection of stale-ness by referring to an output MD5 file and compare with the available source files 2016-04-02 17:47:45 +00:00			`omd5, smd5 = odoc.md5sums, sdoc.md5sums`
			`if omd5 != smd5:`
			`logger.debug("%s differing MD5 sets %r %r", stem, smd5, omd5)`
			`changed = set()`
			`for gone in set(omd5.keys()).difference(smd5.keys()):`
			`logger.debug("%s gone %s", stem, gone)`
			`changed.add(('gone', gone))`
			`for new in set(smd5.keys()).difference(omd5.keys()):`
			`changed.add(('new', new))`
			`for sfn in set(smd5.keys()).intersection(omd5.keys()):`
			`if smd5[sfn] != omd5[sfn]:`
			`changed.add(('changed', sfn))`
			`for why, sfn in changed:`
			`logger.debug("%s differing source %s (%s)", stem, sfn, why)`
adjust logic into an inventory object 2016-02-17 16:35:36 +00:00			`odoc.status = sdoc.status = 'stale'`
calculate stale by MD5s; swap stale/broken move the stanza that identifies the broken output directories up higher in the file; it's a simpler chunk of code adjust the detection of stale-ness by referring to an output MD5 file and compare with the available source files 2016-04-02 17:47:45 +00:00			`sdoc.differing = changed`
adjust logic into an inventory object 2016-02-17 16:35:36 +00:00			`self.stale[stem] = sdoc`
decrease logging yak; remove unused functions 2016-02-24 17:26:23 +00:00			`logger.debug("Identified %d stale documents: %r.", len(self.stale),`
pep8/pyflakes 2016-03-01 04:33:14 +00:00			`self.stale.keys())`
inital commit of driver file for handling the porcelain logic 2016-02-17 08:22:31 +00:00
access inventory by status class name create method getByStatusClass() and add testing 2016-03-01 02:03:16 +00:00			`def getByStatusClass(self, status_class):`
			`desired = status_classes.get(status_class, None)`
			`assert isinstance(desired, list)`
			`collection = SourceCollection()`
			`for status_type in desired:`
			`collection.update(getattr(self, status_type))`
			`return collection`

			`@property`
			`def outputs(self):`
			`return self.getByStatusClass('outputs')`

			`@property`
			`def sources(self):`
			`return self.getByStatusClass('sources')`

			`@property`
			`def problems(self):`
			`return self.getByStatusClass('problems')`

			`@property`
			`def work(self):`
			`return self.getByStatusClass('work')`

			`@property`
			`def orphans(self):`
			`return self.getByStatusClass('orphans')`

			`@property`
			`def orphaned(self):`
			`return self.getByStatusClass('orphaned')`

			`@property`
			`def all(self):`
			`return self.getByStatusClass('all')`

inital commit of driver file for handling the porcelain logic 2016-02-17 08:22:31 +00:00			`#`
			`# -- end of file`