python-tldp/tldp/sources.py

228 lines
8.6 KiB
Python

#! /usr/bin/python
# -*- coding: utf8 -*-
#
# Copyright (c) 2016 Linux Documentation Project
from __future__ import absolute_import, division, print_function
from __future__ import unicode_literals
import os
import sys
import errno
import logging
from tldp.ldpcollection import LDPDocumentCollection
from tldp.utils import md5files, stem_and_ext
from tldp.typeguesser import guess, knownextensions
logger = logging.getLogger(__name__)

# -- basenames which are never treated as stand-alone source documents;
#    the trailing comma is required: without it the parentheses produce a
#    plain string and "name in IGNORABLE_SOURCE" becomes a substring test
#    (so that, e.g., a file called "index" would be ignored as well)
IGNORABLE_SOURCE = ('index.sgml',)
def scansourcedirs(dirnames):
    '''return a dict() of all SourceDocuments discovered in dirnames

    dirnames:  a list of directories containing SourceDocuments.

    scansourcedirs ensures it is operating on the absolute filesystem path
    for each of the source directories.

    If any of the supplied dirnames does not exist, the function logs each
    missing source directory name (and only the missing ones) and then
    raises an IOError (errno ENOENT) naming a missing directory.

    For each document that it finds in a source directory, it creates a
    SourceDocument entry using the stem name as a key.

    The rules for identifying possible SourceDocuments go as follows.

      - Within any source directory, a source document can consist of a
        single file with an extension or a directory.

      - If the candidate entry is a directory, then, the stem is the full
        directory name, e.g. Masquerading-Simple-HOWTO

      - If the candidate entry is a file, the stem is the filename minus
        extension, e.g. Encrypted-Root-Filesystem-HOWTO

    Because the function accepts (and will scan) many source directories,
    it is possible that there will be stem name collisions.  If it
    discovers a stem collision, it issues a warning and skips the
    duplicated stem(s); the first document found for a stem wins.  Source
    directories and their entries are processed in sorted order so that
    the result is stable between runs.
    '''
    found = dict()
    dirs = [os.path.abspath(x) for x in dirnames]

    # -- complain about (and report) only the directories that are missing
    missing = [sdir for sdir in dirs if not os.path.exists(sdir)]
    if missing:
        for sdir in missing:
            logger.critical("Source collection dir must already exist: %s",
                            sdir)
        raise IOError(errno.ENOENT, os.strerror(errno.ENOENT), missing[-1])

    for sdir in sorted(dirs):
        logger.debug("Scanning for source documents in %s.", sdir)
        for fname in sorted(os.listdir(sdir)):
            possible = arg_issourcedoc(os.path.join(sdir, fname))
            if not possible:
                logger.warning("Skipping non-document %s", fname)
                continue
            candy = SourceDocument(possible)
            if candy.stem in found:
                # -- keep the first occurrence; later ones are only reported
                dup = found[candy.stem].filename
                logger.warning("Ignoring duplicate is %s", candy.filename)
                logger.warning("Existing dup-entry is %s", dup)
            else:
                found[candy.stem] = candy
    logger.debug("Discovered %s source documents", len(found))
    return found
def arg_issourcedoc(filename):
    '''return the main source document path for filename, or None

    The supplied name is first converted to an absolute path.

      - A directory is handed to sourcedoc_fromdir, whose result is
        returned unchanged.

      - A regular file is returned (as an absolute path) unless its
        basename is matched by IGNORABLE_SOURCE, in which case None is
        returned.

      - Anything else yields None.
    '''
    path = os.path.abspath(filename)
    if os.path.isdir(path):
        return sourcedoc_fromdir(path)
    if not os.path.isfile(path):
        return None
    if os.path.basename(path) in IGNORABLE_SOURCE:
        return None
    return path
def sourcedoc_fromdir(name):
    '''locate the main document file inside the document directory name

    The main document is expected to be called <stem><ext>, where stem is
    the basename of the directory and ext is one of knownextensions.

    Returns None if name is not a directory or if no such file exists,
    and the path to the file if exactly one exists.  If several files
    match, a warning is logged and an Exception is raised.
    '''
    if not os.path.isdir(name):
        return None
    stem = os.path.basename(name)
    wanted = [os.path.join(name, stem + ext) for ext in knownextensions]
    matches = [path for path in wanted if os.path.isfile(path)]
    if not matches:
        return None
    if len(matches) > 1:
        logger.warning("%s multiple document choices in dir %s, bailing....",
                       stem, name)
        raise Exception("multiple document choices in " + name)
    doc = matches[0]
    logger.debug("%s identified main document %s.", stem, doc)
    return doc
class SourceCollection(LDPDocumentCollection):
    '''a dict-like container for SourceDocument objects

    Each SourceDocument is stored under the stem name of the document,
    which gives convenient access and guarantees that no two entries
    collide.

    Keying on the stem also lines up with the OutputCollection, which
    uses the same strategy for its OutputDirectory entries.
    '''

    def __init__(self, dirnames=None):
        '''construct a SourceCollection

        dirnames:  an optional list of source directories; when it is
        None nothing is scanned and nothing is added here.  Otherwise the
        work is delegated to scansourcedirs and its result is merged into
        this collection.
        '''
        if dirnames is not None:
            self.update(scansourcedirs(dirnames))
class SourceDocument(object):
    '''a class providing a container for each set of source documents
    '''

    def __repr__(self):
        return '<%s:%s (%s)>' % \
            (self.__class__.__name__, self.filename, self.doctype)

    def __init__(self, filename):
        '''construct a SourceDocument

        filename is a required parameter

        The filename is the main (and sometimes sole) document representing
        the source of the LDP HOWTO or Guide.  It is the document that is
        passed by name to be handled by any document processing toolchains
        (see also tldp.doctypes).  If a directory name is supplied, the
        main document is looked up inside it with sourcedoc_fromdir.

        Each instantiation will raise an IOError if the supplied filename
        does not exist, and a ValueError if it cannot be resolved to a
        plain file (symlink is fine, fifo or a directory without an
        identifiable main document is not).

        The remainder of the instantiation will set attributes that are
        useful later in the processing phase, for example, stem, status,
        enclosing directory name and file extension.

        There are two important attributes.  First, the document type
        guesser will try to infer the doctype (from file extension and
        signature).  Note that it is not a fatal error if document type
        cannot be guessed, but the document will not be able to be
        processed.  Second, it is useful during the decision-making process
        to know if any of the source files are newer than the output files.
        Thus, checksum information (md5files) for every file in the source
        document directory (or just the single source document file) will
        be collected.
        '''
        self.filename = os.path.abspath(filename)

        if not os.path.exists(self.filename):
            fn = self.filename
            logger.critical("Missing source document: %s", fn)
            raise IOError(errno.ENOENT, os.strerror(errno.ENOENT), fn)

        if os.path.isdir(self.filename):
            # -- may be None if the directory holds no main document
            self.filename = sourcedoc_fromdir(self.filename)
        elif not os.path.isfile(self.filename):
            # -- we did not receive a useable document file or directory name
            self.filename = None

        if self.filename is None:
            fn = filename
            logger.critical("Source document is not a plain file: %s", fn)
            raise ValueError(fn + " not identifiable as a document")

        self.doctype = guess(self.filename)
        self.status = 'source'
        self.output = None
        self.working = None
        self.differing = set()
        self.dirname, self.basename = os.path.split(self.filename)
        self.stem, self.ext = stem_and_ext(self.basename)
        parentbase = os.path.basename(self.dirname)
        logger.debug("%s found source %s", self.stem, self.filename)
        if parentbase == self.stem:
            # -- document lives in a directory named after its stem, so
            #    checksum the whole directory, relative to its parent
            parentdir = os.path.dirname(self.dirname)
            self.md5sums = md5files(self.dirname, relative=parentdir)
        else:
            # -- stand-alone file; checksum only the document itself
            self.md5sums = md5files(self.filename, relative=self.dirname)

    def detail(self, widths, verbose, file=sys.stdout):
        '''produce a small tabular output about the document

        widths:  an object with attributes status, doctype and stem, each
        used as the format width of the corresponding column
        verbose:  if true, also print the doctype, the output directory
        (when an output is attached), the source file, every entry of
        self.differing and every missing output file
        file:  the stream to print to (default sys.stdout)
        '''
        # -- an unguessable doctype has no __name__; fall back to its
        #    string form instead of failing with an AttributeError
        doctype = getattr(self.doctype, '__name__', str(self.doctype))
        template = ' '.join(('{s.status:{w.status}}',
                             '{d:{w.doctype}}',
                             '{s.stem:{w.stem}}'))
        outstr = template.format(s=self, w=widths, d=doctype)
        print(outstr, file=file)
        if verbose:
            print(' doctype {}'.format(self.doctype), file=file)
            if self.output:
                print(' output dir {}'.format(self.output.dirname),
                      file=file)
            print(' source file {}'.format(self.filename), file=file)
            for why, f in sorted(self.differing):
                fname = os.path.join(self.dirname, f)
                print(' {:>7} source {}'.format(why, fname), file=file)
            if self.output:
                for f in sorted(self.output.missing):
                    print(' missing output {}'.format(f), file=file)
#
# -- end of file