moving from txt2db to wt2db

This commit is contained in:
david 2002-02-02 04:47:19 +00:00
parent 5c1733174b
commit abd7408625
2 changed files with 592 additions and 0 deletions

LDP/wt2db/README Normal file
View File

@ -0,0 +1,69 @@
This is a utility to convert text files in a specific format into valid
DocBook. Just pass it the input filename on the command line and you'll
get a .sgml file out. It won't be a complete valid document, as it will
have no header information or dtd specification. It's just a DocBook
fragment, not a complete document.
The following constructs are currently supported. If you need support for
an additional construct, write if you're subscribed,
or if you're not.
Or just add it in the cvs. :-)
Foo <para>Foo</para>
=Title= <sect1><title>Title</title>
=Title|id= <sect1 id='id'><title>Title</title>
works for other sect levels as well, and many other
tags. It is either the "id" value, or the "title"
value, depending on the semantics of the particular
tag. Usage should be obvious in context.
==Title== <sect2><title>Title</title>
===Title=== <sect3><title>Title</title>
#Foo <orderedlist>
#Bar <listitem><para>Foo</para></listitem>
#Baz <listitem><para>Bar</para></listitem>
/# <listitem><para>Baz</para></listitem>
*Foo <itemizedlist>
*Bar <listitem><para>Foo</para></listitem>
*Baz <listitem><para>Bar</para></listitem>
/* <listitem><para>Baz</para></listitem>
[[]] <ulink url=''>
[[ Foo]] <ulink url=''>
[|Foo]] You can also delimit with the pipe character "|".
This works on any of these [[]] tags.
[[file:Foo]] <filename>Foo</filename>
'''Foo''' <emphasis role='bold'>Foo</emphasis>
A few DocBook structures do not have <para> tags wrapped around them. They
are <para> itself (duh!), <sect?> and <programlisting>. If you insert anything
using these tags, no <para> tags will be wrapped around it or inserted into it.
So if you want fine control over your <para> tags, insert them yourself.
These tags include:

LDP/wt2db/wt2db Executable file
View File

@ -0,0 +1,523 @@
#Converts txt files into docbook.
# Requirements:
# If you use the "ldp:" namespace, you must have wget installed.
# Wget is used to request an xml record from the LDP database.
use File::Basename;
use HTML::Entities;
# Input text file and output DocBook file names.  Both are initialised
# explicitly; assigning a lone '' to the list would only set $txtfile.
my($txtfile, $dbfile) = ('', '');
#These keep track of which constructs we're in the middle of
my($id, $title);
# Set to 1 below when the command line is unusable.
$error = 0;
# read in cmd-line arguments
# NOTE(review): the option bodies, the loop exit and all closing braces of
# this section are not visible in this excerpt -- confirm against the full
# file before relying on the control flow shown here.
while (1) {
if($ARGV[0] eq "-o" or $ARGV[0] eq "--output-to") {
# NOTE(review): as shown this stores the option string itself; presumably
# @ARGV is shifted first -- confirm.
$dbfile = $ARGV[0];
} elsif($ARGV[0] eq "-h" or $ARGV[0] eq "--help") {
} elsif($ARGV[0] eq "-v" or $ARGV[0] eq "--verbose") {
} else {
# anything that is not a recognised option is taken as the input file
$txtfile = $ARGV[0];
if ($ARGV[0] eq '') {
# abort if no input file given
if($txtfile eq '') {
print "txt2db: ERROR text file not specified.\n\n";
$error = 1;
} elsif( !(-r $txtfile) ) {
# report the file that was actually tested; $f is only declared inside
# proc_txt, so it would interpolate as an empty string here
print "txt2db: ERROR cannot read $txtfile ($!)\n\n";
$error = 1;
# no output file chosen: derive its name from the input file's basename
unless ($dbfile) {
($basename, $path, $ext) = fileparse($txtfile);
$dbfile = $basename;
# replace everything from the first '.' with ".sgml"; when the name has no
# '.' at all, append the suffix so the output never shares the input's name
$dbfile .= '.sgml' unless $dbfile =~ s/\..*?$/\.sgml/;
# $buf accumulates the generated DocBook text before it is written out
$buf = '';
open(DB, "> $dbfile") || die "txt2db: cannot write to $dbfile ($!)\n";
print DB $buf, "\n";
# -----------------------------------------------------------
# proc_txt($f): read the text file $f line by line and append the
# corresponding DocBook markup to the global $buf.
#
# Parameters:
#   $f - path of the text file to convert.
# Dies when $f, or the temporary file used for "ldp:" links, cannot be
# opened.  Problems found in the input (unterminated '[[', runon blocks,
# unterminated nopara tags) are reported by appending "ERROR ..." text to
# $buf rather than by dying.
#
# NOTE(review): this excerpt does not show every line of the sub -- closing
# braces, calls to the close*/trimline/splittitle helpers and some statements
# are missing.  The visible statements are kept in their original order.
sub proc_txt {
my($f) = @_;
# number of the input line being processed (used in error messages)
$linenumber = 0;
# State of the current "nopara" block: its tag name, its nesting depth and
# the line on which it started.
# NOTE(review): the remainder of this declaration list is not visible.
my ($noparatag,
$noparadepth = 0;
$noparaline = 0;
# read in the text file
open(TXT, "$f") || die "txt2db: cannot open $f ($!)\n";
while ($originalline = <TXT>) {
# work on a copy; the untouched line is restored for nopara blocks below
$line = $originalline;
# blank lines
if ($line eq '') {
if ($noparadepth == 0) {
# capitalize hints that can be entered in lowercase
$line =~ s/^q:/Q:/;
$line =~ s/^a:/A:/;
# encode entities
# while ($line =~ //) {
# }
# decode_entities($line);
# inline docbook
# ulink
# Each pass rewrites the first remaining [[...]] tag on the line.
while ($line =~ /\[\[/) {
unless ($line =~ /\]\]/) {
$buf .= "ERROR unterminated '[[' tag on line $linenumber.\n";
# separate link url from link name
# $link becomes the text between the first '[[' and the following ']]'
$link = $line;
$link=~ s/\n//g;
$link =~ s/.*?\[\[//;
$link =~ s/\]\].*?$//;
if ($link =~ /\|/) {
# "target|name": the target is the text before the pipe
$linkname = $link;
$link =~ s/\|.+$//;
$linkname =~ s/^\S+\|//;
} else {
$linkname = $link;
# kill quotes, they mess us up
# (the URL is emitted inside a single-quoted attribute below)
$link =~ s/'/%27/g;
# namespaces are handled differently
print "$link\n" if ($verbose);
if ($link =~ /^http:/) {
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
} elsif ($link =~ /^mailto:/) {
$linkname =~ s/^mailto://;
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
} elsif ($link =~ /^wiki:/) {
$linkname =~ s/^wiki://;
$link =~ s/^wiki:/http:\/\/www\.wikipedia\.com\/wiki\.phtml\?title=/;
# encode every space in the title, not just the first one
$link =~ s/\ /+/g;
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
} elsif ($link =~ /^ldp:/) {
# "ldp:" links are resolved by fetching a record with wget and reading
# its <identifier> element from a temporary file.
$linkname =~ s/^ldp://;
$link =~ s/^ldp://;
# NOTE(review): the assignment of $rand is not visible in this excerpt.
$tempfile = "/tmp/txt2db-" . $rand;
# NOTE(review): "-q" and $link are joined without a space, and the command
# is not visibly executed here; $link comes from the input file and is
# interpolated into a command string unescaped -- confirm against the full
# file.
$cmd = "wget -q$link -O $tempfile";
open(URL, "$tempfile") || die "txt2db: cannot open temporary file ($!)\n";
$link = "";
while ($url_line = <URL>) {
$url_line =~ s/\n//;
# keep only the line(s) mentioning "identifier"
if ($url_line =~ /identifier/) {
$link .= $url_line;
unlink $tempfile;
# reduce the collected text to the content of <identifier>...</identifier>
$link =~ s/^.*?<identifier>//;
$link =~ s/<\/identifier>.*?$//;
if ($link eq '') {
$linkname = "ERROR: LDP namespace resolution failure on $linkname";
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
} elsif ($link =~ /^file:/) {
$linkname =~ s/^file://;
$line =~ s/\[\[.*?\]\]/<filename>$linkname<\/filename>/;
} else {
# no recognised namespace: treated like a file name
$line =~ s/\[\[.*?\]\]/<filename>$linkname<\/filename>/;
# emphasis
# each pass turns the first two ''' markers into an open/close tag pair
while ($line =~ /'''.*'''/) {
$line =~ s/'''/<emphasis role='bold'>/;
$line =~ s/'''/<\/emphasis>/;
# this block defines DocBook structures that won't be broken up with
# paragraphs when we hit empty lines:
# <para>
# <sect1>
# <sect2>
# <sect3>
# <programlisting>
# <literallayout>
# forget about nopara
if ($noparadepth == 0) {
$noparatag = "";
# start a new nopara section
if ((($line =~ /^<para>/) or
($line =~ /^<sect/) or
($line =~ /^<screen>/) or
($line =~ /^<literallayout>/) or
($line =~ /^<programlisting>/)) and
($noparadepth == 0)) {
# remember the tag name (text between the first '<' and the next '>')
$noparatag = $line;
$noparatag =~ s/^.*?<//;
$noparatag =~ s/>.*?$//;
$noparaline = $linenumber;
# a <screen> outside any paragraph gets a <para> opened in front of it
if ($line =~ /^<screen>/) {
unless ($para) {
$line = "<para>" . $line;
$para = 1;
# count noparadepth
# opening tags on this line raise the depth, closing tags lower it
if ($noparatag ne '') {
$temp = $line;
while ($temp =~ /<$noparatag>/) {
$temp =~ s/<?$noparatag>//;
$noparadepth ++;
while ($temp =~ /<\/$noparatag>/) {
$temp =~ s/<?\/$noparatag>//;
$noparadepth --;
if ($noparadepth == 0) {
# NOTE(review): this is a comparison, not an assignment, so it has no
# effect.  Left unchanged because resetting $noparaline here could make the
# runon check below fire on any line number >= 100 -- confirm the intent.
$noparaline == 0;
# runon protection
if ($linenumber >= ($noparaline + 100)) {
$buf .= "ERROR: runon block starting on line $noparaline\n";
# recover original line -- no whitespace modifiers
$line = $originalline;
# sect3
# NOTE(review): $title and $id used below are the globals written by
# splittitle; the call itself is not visible in this excerpt.
} elsif ($line =~ /^===/) {
if ($id eq '') {
$line = "<sect3><title>$title</title>\n";
} else {
$line = "<sect3 id='$id'><title id='$id-title'>$title</title>\n";
$level3 = 1;
# sect2
} elsif ($line =~ /^==/) {
if ($id eq '') {
$line = "<sect2><title>$title</title>\n";
} else {
$line = "<sect2 id='$id'><title id='$id-title'>$title</title>\n";
$level2 = 1;
# sect1
} elsif ($line =~ /^=/) {
if ($id eq '') {
$line = "<sect1><title>$title</title>\n";
} else {
$line = "<sect1 id='$id'><title id='$id-title'>$title</title>\n";
$level1 = 1;
# orderedlist
} elsif ($line =~ /^#/) {
# the first '#' item opens the list
if ($orderedlist == 0) {
$buf .= "\n<orderedlist>\n";
$orderedlist = 1;
$line =~ s/^#//;
$line =~ s/^/<listitem><para>/;
$listitem = 1;
$para = 1;
# "/#" at the start of a line ends the ordered list
} elsif ($line =~ /^\/#/) {
$line =~ s/^\/#//;
# itemizedlist
} elsif ($line =~ /^\*/) {
# the first '*' item opens the list
if ($itemizedlist == 0) {
$buf .= "\n<itemizedlist>\n";
$itemizedlist = 1;
$line =~ s/^\*//;
$line =~ s/^/<listitem><para>/;
$listitem = 1;
$para = 1;
# "/*" ends the itemized list; anchored to the start of the line like the
# "/#" terminator, so a "/*" in the middle of ordinary text is not mistaken
# for it
} elsif ($line =~ /^\/\*/) {
$line =~ s/^\/\*//;
# question
} elsif ($line =~ /^Q:/) {
$line =~ s/^Q://;
if ($id eq '') {
$line = "<question><para>" . $title . "</para></question>\n";
} else {
$line = "<question id='$id'><para>" . $title . "</para></question>\n";
# wrap the question in <qandaentry>, and in <qandaset> for the first one
unless ($qandaentry == 1) {
$line = "<qandaentry>\n" . $line;
$qandaentry = 1;
if ($qandaset == 0) {
$line = "<qandaset defaultlabel='qanda'>\n". $line;
$qandaset = 1;
# answer
} elsif ($line =~ /^A:/) {
$line =~ s/^A://;
$line = "<answer><para>" . $line;
$answer = 1;
$para = 1;
# a line holding only "----" produces no output of its own
} elsif ($line =~ /^\s*----\s*$/) {
$line = '';
# para
} else {
# open a paragraph unless one is already open or we are in a nopara block
if (($para == 0) and ($noparatag eq '')) {
$line = "<para>" . $line;
$para = 1;
} else {
$line .= " ";
$buf .= "$line ";
# close nesting
if ($noparadepth > 0) {
$buf .= "ERROR tag $noparatag on line $noparaline unterminated.\n";
# close1: when a <sect1> is open ($level1 == 1), append "</sect1>" to the
# global $buf and mark the section closed.
# NOTE(review): the end of this sub is not visible in this excerpt.
sub close1 {
if ($level1 == 1) {
$buf .= "</sect1>\n";
$level1 = 0;
# close2: when a <sect2> is open ($level2 == 1), append "</sect2>" to the
# global $buf and mark the section closed.
# NOTE(review): the end of this sub is not visible in this excerpt.
sub close2 {
if ($level2 == 1) {
$buf .= "</sect2>\n";
$level2 = 0;
# close3: when a <sect3> is open ($level3 == 1), append "</sect3>" to the
# global $buf and mark the section closed.
# NOTE(review): the end of this sub is not visible in this excerpt.
sub close3 {
if ($level3 == 1) {
$buf .= "</sect3>\n";
$level3 = 0;
# closenonsect: presumably closes every open construct that is not a
# section -- TODO confirm.
# NOTE(review): only the two disabled list-closing calls below are visible
# in this excerpt; any active statements of this sub are not shown.
sub closenonsect {
# &closeorderedlist;
# &closeitemizedlist;
# closelistitem: when a list item is open ($listitem == 1), append
# "</listitem>" to the global $buf and mark the item closed.
# NOTE(review): the end of this sub is not visible in this excerpt.
sub closelistitem {
if ($listitem == 1 ) {
$buf .= "</listitem>\n";
$listitem = 0;
# closeorderedlist: when an ordered list is open ($orderedlist == 1), append
# "</orderedlist>" to the global $buf and mark the list closed.
# NOTE(review): the end of this sub is not visible in this excerpt.
sub closeorderedlist {
if ($orderedlist == 1 ) {
$buf .= "</orderedlist>\n";
$orderedlist = 0;
# closeitemizedlist: when an itemized list is open ($itemizedlist == 1),
# append "</itemizedlist>" to the global $buf and mark the list closed.
# NOTE(review): the end of this sub is not visible in this excerpt.
sub closeitemizedlist {
if ($itemizedlist == 1 ) {
$buf .= "</itemizedlist>\n";
$itemizedlist = 0;
# closelists: presumably closes any open list constructs -- TODO confirm.
# NOTE(review): the body of this sub is not visible in this excerpt.
sub closelists {
# closeanswer: when an answer is open ($answer == 1), append "</answer>" to
# the global $buf and mark the answer closed.
# NOTE(review): the end of this sub is not visible in this excerpt.
sub closeanswer {
if ($answer == 1) {
$buf .= "</answer>\n";
$answer = 0;
# closeqandaentry: when a Q&A entry is open ($qandaentry == 1), append
# "</qandaentry>" to the global $buf and mark the entry closed.
# NOTE(review): the end of this sub is not visible in this excerpt.
sub closeqandaentry {
if ($qandaentry == 1) {
$buf .= "</qandaentry>\n";
$qandaentry = 0;
# closeqandaset: when a Q&A set is open ($qandaset == 1), append
# "</qandaset>" to the global $buf and mark the set closed.
# NOTE(review): the end of this sub is not visible in this excerpt.
sub closeqandaset {
if ($qandaset == 1) {
$buf .= "</qandaset>\n";
$qandaset = 0;
# closepara: when a paragraph is open ($para == 1), append "</para>" to the
# global $buf and mark the paragraph closed.
# NOTE(review): the end of this sub is not visible in this excerpt.
sub closepara {
if ($para == 1) {
$buf .= "</para>\n";
$para = 0;
# trimline: strip trailing and then leading whitespace from the global $line
# in place.  Takes no arguments.
# NOTE(review): the end of this sub is not visible in this excerpt.
sub trimline {
$line =~ s/\s+$//;
$line =~ s/^\s+//;
# splittitle: split the global $line into the globals $title and $id.
# Takes no arguments; $line itself is modified (its '=' markers are removed).
# Without a '|' in the line, $title is the whole remaining line and $id is
# the empty string.
# NOTE(review): closing braces are not visible in this excerpt, so it is
# unclear which of the trimming statements sit inside the '|' branch.
sub splittitle {
# drop the leading and trailing '=' heading markers
$line =~ s/^=+//;
$line =~ s/=+$//;
$title = $line;
$id = "";
if ($line =~ /\|/) {
# title: the text before the first '|'
$title =~ s/\|.+//;
# id: the text after the last '|'
$id = $line;
$id =~ s/^.+\|//;
# trim surrounding whitespace from both parts
$title =~ s/\s+$//;
$title =~ s/^\s+//;
$id =~ s/\s+$//;
$id =~ s/^\s+//;
# usage: print the command-line synopsis and the meaning of each option to
# standard output.  Takes no arguments.
# NOTE(review): the end of this sub is not visible in this excerpt.
sub usage {
print "Usage: txt2db [-v] [-h|-o <sgml file>] <text file>\n";
print "-o, --output-to write to the specified file.\n";
print "-v, --verbose show diagnostic output.\n";
print "-h, --help show this usage message.\n";