allow creation of empty SourceCollection; fixes

- Allow creation of an empty SourceCollection, which can be handed around in
  the driver to allow for higher-level document wrangling.
- Fix a bad, always-failing directory check (thank you, testing).
- Clarify the handling of documents living in a directory and the generation
  of the fileset.
This commit is contained in:
Martin A. Brown 2016-02-16 23:40:09 -08:00
parent f39237d307
commit f17d164b52
1 changed file with 14 additions and 18 deletions

View File

@ -6,7 +6,7 @@ import os
import errno import errno
import collections import collections
from .utils import logger from .utils import logger, getfileset
from .typeguesser import guess, knownextensions from .typeguesser import guess, knownextensions
@ -16,7 +16,9 @@ class SourceCollection(collections.MutableMapping):
return '<%s:(%s docs)>' % \ return '<%s:(%s docs)>' % \
(self.__class__.__name__, len(self)) (self.__class__.__name__, len(self))
def __init__(self, args): def __init__(self, args=None):
if args is None:
return
dirs = [os.path.abspath(x) for x in args] dirs = [os.path.abspath(x) for x in args]
results = [os.path.exists(x) for x in dirs] results = [os.path.exists(x) for x in dirs]
@ -26,13 +28,12 @@ class SourceCollection(collections.MutableMapping):
raise IOError(errno.ENOENT, os.strerror(errno.ENOENT), sdir) raise IOError(errno.ENOENT, os.strerror(errno.ENOENT), sdir)
for sdir in dirs: for sdir in dirs:
docs = dict()
candidates = list()
for fname in os.listdir(sdir): for fname in os.listdir(sdir):
candidates = list()
possible = os.path.join(sdir, fname) possible = os.path.join(sdir, fname)
if os.path.isfile(possible): if os.path.isfile(possible):
candidates.append(SourceDocument(possible)) candidates.append(SourceDocument(possible))
elif os.path.isdir(fname): elif os.path.isdir(possible):
stem = os.path.basename(fname) stem = os.path.basename(fname)
for ext in knownextensions: for ext in knownextensions:
possible = os.path.join(sdir, fname, stem + ext) possible = os.path.join(sdir, fname, stem + ext)
@ -44,12 +45,11 @@ class SourceCollection(collections.MutableMapping):
continue continue
for candy in candidates: for candy in candidates:
if candy.stem in self: if candy.stem in self:
logger.warning("Duplicate stems: %s and %s", logger.warning("Ignoring duplicate is %s", candy.filename)
self[candy.stem].filename, candy.filename) logger.warning("Existing dup-entry is %s", self[candy.stem].filename)
logger.warning("Ignoring %s", candy.filename)
else: else:
self[candy.stem] = candy self[candy.stem] = candy
logger.info("Discovered %s documents total", len(self)) logger.debug("Discovered %s documents total", len(self))
def __delitem__(self, key): def __delitem__(self, key):
del self.__dict__[key] del self.__dict__[key]
@ -87,16 +87,12 @@ class SourceDocument(object):
self.doctype = self._doctype() self.doctype = self._doctype()
self.dirname, self.basename = os.path.split(self.filename) self.dirname, self.basename = os.path.split(self.filename)
self.stem, self.ext = os.path.splitext(self.basename) self.stem, self.ext = os.path.splitext(self.basename)
self.stat = os.stat(self.filename)
self.resources = False # -- assume no ./images/, ./resources/ self.resources = False # -- assume no ./images/, ./resources/
self.singlefile = True # -- assume only one file parentbase = os.path.basename(self.dirname)
parentdir = os.path.basename(self.dirname) if parentbase == self.stem:
if parentdir == self.stem: self.fileset = getfileset(self.dirname)
self.singlefile = False else:
for rdir in ('resources', 'images'): self.fileset = set([self.basename])
if os.path.exists(os.path.join(self.dirname, rdir)):
self.resources = True
def _doctype(self): def _doctype(self):
return guess(self.filename) return guess(self.filename)