mirror of https://github.com/tLDP/python-tldp
226 lines
8.5 KiB
Python
226 lines
8.5 KiB
Python
#! /usr/bin/python
|
|
# -*- coding: utf8 -*-
|
|
|
|
from __future__ import absolute_import, division, print_function
|
|
from __future__ import unicode_literals
|
|
|
|
import os
|
|
import sys
|
|
import errno
|
|
import logging
|
|
|
|
from tldp.ldpcollection import LDPDocumentCollection
|
|
|
|
from tldp.utils import md5files, stem_and_ext
|
|
from tldp.typeguesser import guess, knownextensions
|
|
|
|
logger = logging.getLogger(__name__)

# -- basenames that are never treated as standalone source documents;
#    the trailing comma matters: without it this is a plain string and
#    the "in" test used by arg_issourcedoc degrades to substring matching
#    (so files named e.g. "index" or ".sgml" would be ignored as well)
IGNORABLE_SOURCE = ('index.sgml',)
|
|
|
|
|
|
def scansourcedirs(dirnames):
    '''return a dict() of all SourceDocuments discovered in dirnames

    dirnames:  a list of directories containing SourceDocuments.

    scansourcedirs ensures it is operating on the absolute filesystem path for
    each of the source directories.

    If any of the supplied dirnames does not exist as a directory, the function
    will log each missing source directory name and then will raise an IOError
    (errno ENOENT) naming the first missing directory.

    For each document that it finds in a source directory, it creates a
    SourceDocument entry using the stem name as a key.

    The rules for identifying possible SourceDocuments go as follows.

      - Within any source directory, a source document can consist of a single
        file with an extension or a directory.

      - If the candidate entry is a directory, then, the stem is the full
        directory name, e.g. Masquerading-Simple-HOWTO

      - If the candidate entry is a file, the stem is the filename minus
        extension, e.g. Encrypted-Root-Filesystem-HOWTO

    Because the function accepts (and will scan) many source directories, it
    is possible that there will be stem name collisions.  If it discovers a
    stem collision, it will issue a warning and skip the duplicated stem(s);
    the first document found for a stem wins.  [It also tries to process the
    source directories and candidates in a stable order between runs.]
    '''
    found = dict()
    dirs = [os.path.abspath(x) for x in dirnames]

    # -- only complain about the directories that are really absent, and
    #    report one of those (not an arbitrary entry) in the exception
    missing = [x for x in dirs if not os.path.isdir(x)]
    if missing:
        for sdir in missing:
            logger.critical("Source collection dir must already exist: %s",
                            sdir)
        raise IOError(errno.ENOENT, os.strerror(errno.ENOENT), missing[0])

    # -- sorted() at both levels keeps the scan order (and therefore the
    #    winner of any stem collision) stable between runs
    for sdir in sorted(dirs):
        logger.debug("Scanning for source documents in %s.", sdir)
        for fname in sorted(os.listdir(sdir)):
            possible = arg_issourcedoc(os.path.join(sdir, fname))
            if not possible:
                logger.warning("Skipping non-document %s", fname)
                continue
            candy = SourceDocument(possible)
            if candy.stem in found:
                dup = found[candy.stem].filename
                logger.warning("Ignoring duplicate is %s", candy.filename)
                logger.warning("Existing dup-entry is %s", dup)
            else:
                found[candy.stem] = candy
    logger.debug("Discovered %s source documents", len(found))
    return found
|
|
|
|
|
|
def arg_issourcedoc(filename):
    '''return the path of the source document named by filename, or None

    filename is first made absolute.  A regular file is returned as-is,
    unless its basename is listed in IGNORABLE_SOURCE.  A directory is
    handed to sourcedoc_fromdir, whose result is returned.  Anything else
    (including a nonexistent path) yields None.
    '''
    path = os.path.abspath(filename)
    if not os.path.isfile(path):
        if os.path.isdir(path):
            return sourcedoc_fromdir(path)
        return None
    ignorable = os.path.basename(path) in IGNORABLE_SOURCE
    return None if ignorable else path
|
|
|
|
|
|
def sourcedoc_fromdir(name):
    '''return the main document file inside directory name, or None

    The main document is a regular file named <stem><ext>, where stem is
    the basename of the directory and ext is one of knownextensions.

    Returns None if name is not a directory or if no such file exists.
    Raises Exception if more than one file matches, since there is no way
    to choose between them.
    '''
    if not os.path.isdir(name):
        return None

    stem = os.path.basename(name)
    paths = (os.path.join(name, stem + ext) for ext in knownextensions)
    matches = [path for path in paths if os.path.isfile(path)]

    if not matches:
        return None

    if len(matches) > 1:
        logger.warning("%s multiple document choices in dir %s, bailing....",
                       stem, name)
        raise Exception("multiple document choices in " + name)

    doc = matches[0]
    logger.debug("%s identified main document %s.", stem, doc)
    return doc
|
|
|
|
|
|
class SourceCollection(LDPDocumentCollection):
    '''a dict-like container mapping stem names to SourceDocument objects

    Each document is stored under its stem name, which gives convenient
    access and guarantees that keys do not collide.

    The OutputCollection uses the same stem-as-key strategy for its
    OutputDirectory entries, so the two collections line up naturally.
    '''
    def __init__(self, dirnames=None):
        '''construct a SourceCollection

        dirnames: optional list of source directories; when it is None the
        collection is left unpopulated, otherwise the collection is updated
        with whatever scansourcedirs discovers in those directories.
        '''
        if dirnames is not None:
            self.update(scansourcedirs(dirnames))
|
|
|
|
|
|
class SourceDocument(object):
    '''a class providing a container for each set of source documents
    '''
    def __repr__(self):
        return '<%s:%s (%s)>' % \
            (self.__class__.__name__, self.filename, self.doctype)

    def __init__(self, filename):
        '''construct a SourceDocument

        filename is a required parameter

        The filename is the main (and sometimes sole) document representing
        the source of the LDP HOWTO or Guide.  It is the document that is
        passed by name to be handled by any document processing toolchains
        (see also tldp.doctypes).  If filename is a directory, the main
        document is looked up inside it with sourcedoc_fromdir.

        Each instantiation will raise an IOError if the supplied filename does
        not exist, and a ValueError if it cannot be resolved to a plain file
        (symlink is fine; a fifo, or a directory without a recognizable main
        document, is not).

        The remainder of the instantiation will set attributes that are useful
        later in the processing phase, for example, stem, status, enclosing
        directory name and file extension.

        There are two important attributes.  First, the document type guesser
        will try to infer the doctype (from file extension and signature).
        Note that it is not a fatal error if document type cannot be guessed,
        but the document will not be able to be processed.  Second, it is
        useful during the decision-making process to know if any of the source
        files are newer than the output files.  Thus, md5files() is run over
        the source document directory (or just the single source document
        file) and the result is kept in the md5sums attribute.
        '''
        self.filename = os.path.abspath(filename)

        if not os.path.exists(self.filename):
            fn = self.filename
            logger.critical("Missing source document: %s", fn)
            raise IOError(errno.ENOENT, os.strerror(errno.ENOENT), fn)

        if os.path.isdir(self.filename):
            # -- may come back as None if no main document is identifiable
            self.filename = sourcedoc_fromdir(self.filename)
        elif not os.path.isfile(self.filename):
            # -- we did not receive a usable document file or directory name
            self.filename = None

        if self.filename is None:
            fn = filename
            logger.critical("Source document is not a plain file: %s", fn)
            raise ValueError(fn + " not identifiable as a document")

        self.doctype = guess(self.filename)
        self.status = 'source'
        self.output = None
        self.working = None
        self.differing = set()
        self.dirname, self.basename = os.path.split(self.filename)
        self.stem, self.ext = stem_and_ext(self.basename)
        parentbase = os.path.basename(self.dirname)
        logger.debug("%s found source %s", self.stem, self.filename)
        if parentbase == self.stem:
            # -- document lives in a directory named after its stem, so
            #    checksum the whole directory, relative to its parent
            parentdir = os.path.dirname(self.dirname)
            self.md5sums = md5files(self.dirname, relative=parentdir)
        else:
            self.md5sums = md5files(self.filename, relative=self.dirname)

    def detail(self, widths, verbose, file=None):
        '''produce a small tabular output about the document

        widths: object whose status, doctype and stem attributes supply the
        column widths for the summary line.
        verbose: if true, also print the doctype, the output directory (when
        an output is attached), the source file, each differing source file
        and each missing output file.
        file: stream to print to; None (the default) means whatever
        sys.stdout is at the time of the call.
        '''
        if file is None:
            # -- resolved at call time so that a redirected sys.stdout is
            #    honored (a default of sys.stdout is frozen at def time)
            file = sys.stdout
        # -- doctype is None when the type could not be guessed, which is
        #    explicitly non-fatal; fall back to its string form in that case
        doctype = getattr(self.doctype, '__name__', str(self.doctype))
        template = ' '.join(('{s.status:{w.status}}',
                             '{doctype:{w.doctype}}',
                             '{s.stem:{w.stem}}'))
        outstr = template.format(s=self, w=widths, doctype=doctype)
        print(outstr, file=file)
        if verbose:
            print(' doctype {}'.format(self.doctype), file=file)
            if self.output:
                print(' output dir {}'.format(self.output.dirname),
                      file=file)
            print(' source file {}'.format(self.filename), file=file)
            for why, f in sorted(self.differing):
                fname = os.path.join(self.dirname, f)
                print(' {:>7} source {}'.format(why, fname), file=file)
            if self.output:
                for f in sorted(self.output.missing):
                    print(' missing output {}'.format(f), file=file)
|
|
|
|
#
|
|
# -- end of file
|