python-tldp/tldp/outputs.py

#! /usr/bin/python
# -*- coding: utf8 -*-
#
# Copyright (c) 2016 Linux Documentation Project

from __future__ import absolute_import, division, print_function
from __future__ import unicode_literals

import os
import sys
import errno
import codecs
import logging

from tldp.ldpcollection import LDPDocumentCollection
from tldp.utils import logdir

logger = logging.getLogger(__name__)


class OutputNamingConvention(object):
    '''Consistent file naming for the output documents of one source.

    Every path is derived from two values: the output directory
    (dirname) and the document stem.  Keeping the naming rules in one
    place makes the names independent of the source document type and of
    the processing toolchain that produced the files.

    The class attribute ``expected`` lists the names of the properties
    whose files must all be present before the directory is reported as
    complete (see the ``iscomplete`` and ``missing`` properties).
    '''
    expected = ['name_txt', 'name_pdf', 'name_htmls', 'name_html',
                'name_indexhtml']

    def __init__(self, dirname, stem):
        '''constructor
        :param dirname: directory holding the output documents
        :param stem: base name shared by the output files
        '''
        self.dirname = dirname
        self.stem = stem

    @property
    def MD5SUMS(self):
        '''path of the checksum file, '.LDP-source-MD5SUMS' in dirname'''
        return os.path.join(self.dirname, '.LDP-source-MD5SUMS')

    @property
    def name_txt(self):
        '''path of the '<stem>.txt' output'''
        return os.path.join(self.dirname, self.stem + '.txt')

    @property
    def name_fo(self):
        '''path of the '<stem>.fo' output'''
        return os.path.join(self.dirname, self.stem + '.fo')

    @property
    def name_pdf(self):
        '''path of the '<stem>.pdf' output'''
        return os.path.join(self.dirname, self.stem + '.pdf')

    @property
    def name_html(self):
        '''path of the '<stem>.html' output'''
        return os.path.join(self.dirname, self.stem + '.html')

    @property
    def name_htmls(self):
        '''path of the '<stem>-single.html' output'''
        return os.path.join(self.dirname, self.stem + '-single.html')

    @property
    def name_epub(self):
        '''path of the '<stem>.epub' output'''
        return os.path.join(self.dirname, self.stem + '.epub')

    @property
    def name_indexhtml(self):
        '''path of 'index.html'; the only name not built from the stem'''
        return os.path.join(self.dirname, 'index.html')

    @property
    def validsource(self):
        '''path of '<stem>.xml' in dirname'''
        # NOTE(review): the original author marked this line "burp";
        # presumably the hard-coded '.xml' suffix is a known wart.
        return os.path.join(self.dirname, self.stem + '.xml')

    @property
    def iscomplete(self):
        '''True if every expected document is present as a regular file

        Defined in terms of ``missing`` so that the two properties can
        never disagree (e.g. when a directory occupies an expected path).
        '''
        return not self.missing

    @property
    def missing(self):
        '''returns the set of expected paths that are not regular files

        :raises AttributeError: if ``expected`` names an attribute that
            does not exist on this object
        '''
        paths = (getattr(self, prop) for prop in self.expected)
        return set(path for path in paths if not os.path.isfile(path))

    @property
    def md5sums(self):
        '''returns a dict mapping file name to hash value

        The data is read from the MD5SUMS file; each record is expected
        to hold a hash value, whitespace, and then the file name.  Blank
        lines and lines starting with '#' are ignored.  A missing
        MD5SUMS file yields an empty dict.

        :raises IOError: for any read failure other than ENOENT
        :raises ValueError: if a record holds no file name
        '''
        sums = dict()
        try:
            with codecs.open(self.MD5SUMS, encoding='utf-8') as f:
                for line in f:
                    record = line.strip()
                    if not record or record.startswith('#'):
                        continue
                    # -- split only once; the file name itself may
                    #    contain whitespace
                    hashval, fname = record.split(None, 1)
                    sums[fname] = hashval
        except IOError as e:
            # -- an absent checksum file simply means "no sums yet"
            if e.errno != errno.ENOENT:
                raise
        return sums


class OutputDirectory(OutputNamingConvention):
    '''A class providing a container for each set of output documents
    for a given source document and general methods for operating on
    and preparing the output directory for a document processor.
    For example, the process of generating each document type for a single
    source (e.g. 'Unicode-HOWTO') would be managed by this object.

    An important element of the OutputDirectory is the stem, determined
    from the directory name when __init__() is called.
    '''
    def __repr__(self):
        return '<%s:%s>' % (self.__class__.__name__, self.dirname)

    @classmethod
    def fromsource(cls, dirname, source):
        '''alternate constructor; builds the directory name from a source
        :param dirname: the output collection directory
        :param source: object with a ``stem`` attribute; also stored on
            the new instance as ``source``
        '''
        newname = os.path.join(dirname, source.stem)
        return cls(newname, source=source)

    def __init__(self, dirname, source=None):
        '''constructor
        :param dirname: directory name for all output documents
        :param source: optional source object, stored as ``self.source``

        This directory name is expected to end with the document stem name,
        for example '/path/to/the/collection/Unicode-HOWTO'.  The parent
        directory (e.g. '/path/to/the/collection') must exist already.
        The constructor only checks the parent; it does not create or
        modify the output directory itself.

        :raises IOError: (ENOENT) if the parent directory does not exist
        '''
        # -- abspath() also drops any trailing separator, so that
        #    basename() yields the final path component (the stem)
        dirname = os.path.abspath(dirname)
        stem = os.path.basename(dirname)
        super(OutputDirectory, self).__init__(dirname, stem)
        parent = os.path.dirname(self.dirname)
        if not os.path.isdir(parent):
            logger.critical("Missing output collection directory %s.", parent)
            raise IOError(errno.ENOENT, os.strerror(errno.ENOENT), parent)
        self.status = 'output'
        self.source = source
        self.logdir = os.path.join(self.dirname, logdir)

    def detail(self, widths, verbose, file=None):
        '''print a one-line summary (status, doctype, stem) of this object
        :param widths: object whose ``status``, ``doctype`` and ``stem``
            attributes supply the column widths
        :param verbose: if true, also print a 'missing source' line
        :param file: stream to write to; None (the default) means the
            sys.stdout in effect at call time, so that redirection of
            sys.stdout after import is honored
        '''
        template = ' '.join(('{s.status:{w.status}}',
                             '{u:{w.doctype}}',
                             '{s.stem:{w.stem}}'))
        outstr = template.format(s=self, w=widths, u="<unknown>")
        # -- print() itself resolves file=None to the current sys.stdout
        print(outstr, file=file)
        if verbose:
            print('  missing source', file=file)


class OutputCollection(LDPDocumentCollection):
    '''a dict-like container of OutputDirectory objects, keyed by stem

    Each entry is stored under the stem name of its document.  The stem
    is taken from a subdirectory name, so keys cannot collide, and the
    same keying strategy is used by the SourceCollection for its
    SourceDocuments.
    '''
    def __init__(self, dirname=None):
        '''construct an OutputCollection

        Without a dirname, the collection starts out empty.  With a
        dirname, every subdirectory found directly inside dirname gets
        its own OutputDirectory, stored under the subdirectory name (the
        stem).  Entries that are not directories are skipped.

        For example, consider the following directory tree:

            en
            ├── Latvian-HOWTO
            ├── Scanner-HOWTO
            ├── UUCP-HOWTO
            └── Wireless-HOWTO

        If called like OutputCollection("en"), the result in memory would be
        a structure resembling this:

            OutputCollection("/path/en") = {
              "Latvian-HOWTO":  OutputDirectory("/path/en/Latvian-HOWTO")
              "Scanner-HOWTO":  OutputDirectory("/path/en/Scanner-HOWTO")
              "UUCP-HOWTO":     OutputDirectory("/path/en/UUCP-HOWTO")
              "Wireless-HOWTO": OutputDirectory("/path/en/Wireless-HOWTO")
              }

        :raises IOError: (ENOENT) if dirname is given but is not an
            existing directory
        '''
        if dirname is None:
            return
        if not os.path.isdir(dirname):
            logger.critical("Output collection dir %s must already exist.",
                            dirname)
            raise IOError(errno.ENOENT, os.strerror(errno.ENOENT), dirname)

        # -- visit the entries in case-insensitive alphabetical order
        entries = os.listdir(dirname)
        entries.sort(key=lambda entry: entry.lower())
        for entry in entries:
            path = os.path.join(dirname, entry)
            if os.path.isdir(path):
                logger.debug("Found directory %s (in %s)", path, dirname)
                outputdir = OutputDirectory(path)
                assert outputdir.stem not in self
                self[outputdir.stem] = outputdir
            else:
                logger.info("Skipping non-directory %s (in %s)",
                            path, dirname)


#
# -- end of file