diff --git a/LDP/wt2db/README b/LDP/wt2db/README new file mode 100644 index 00000000..120edf37 --- /dev/null +++ b/LDP/wt2db/README @@ -0,0 +1,69 @@ +This is a utility to convert text files in a specific format into valid +DocBook. Just pass it the input filename on the command line and you'll +get a .sgml file out. It won't be a complete valid document, as it will +have no header information or dtd specification. It's just a DocBook +fragment, not a complete document. + +The following constructs are currently supported. If you need support for +an additional construct, write discuss@linuxdoc.org if you're subscribed, +or feedback@linuxdoc.org if you're not. + +Or just add it in the cvs. :-) + +Foo Foo + +=Title= Title + + +=Title|id= Title + + + works for other sect levels as well, and many other + tags. It is either the "id" value, or the "title" + value, depending on the semantics of the particular + tag. Usage should be obvious in context. + +==Title== Title + + +===Title=== Title + + + +#Foo +#Bar Foo +#Baz Bar +/# Baz + + +*Foo +*Bar Foo +*Baz Bar +/* Baz + + +[[http://foo.org]] + http://foo.org + + +[[http://foo.org Foo]] + Foo + + +[[http://foo.org|Foo]] You can also delimit with the pipe character "|". + This works on any of these [[]] tags. + +[[file:Foo]] Foo + +'''Foo''' Foo + +A few DocBook structures do not have tags wrapped around them. They +are itself (duh!), and . If you insert anything +using these tags, no tags will be wrapped around it or inserted into it. +So if you want fine control over your tags, insert them yourself. + +These tags include: + + + + diff --git a/LDP/wt2db/wt2db b/LDP/wt2db/wt2db new file mode 100755 index 00000000..064ee394 --- /dev/null +++ b/LDP/wt2db/wt2db @@ -0,0 +1,523 @@ +#!/usr/bin/perl +# +#Converts txt files into docbook. +# +# Requirements: +# +# If you use the "ldp:" namespace, you must have wget installed. +# Wget is used to request an xml record from the LDP database, +# http://db.linuxdoc.org.
#
# NOTE(review): the DocBook tag literals emitted and matched below were
# missing from the reviewed copy of this script.  They have been rebuilt
# from the closing tags that were still present (citetitle, ulink,
# filename, emphasis, identifier), from the section comments, and from the
# names of the close* subs.  Every spot that could not be rebuilt with
# confidence carries its own NOTE(review); confirm those against the
# repository version before relying on the output.
#

use strict;
use warnings;

use File::Basename;
use HTML::Entities;

# Input text file and output DocBook file (the latter is derived from the
# former when -o is not given).
my $txtfile = '';
my $dbfile  = '';

# These keep track of which constructs we're in the middle of
# (1 = an opening tag has been written and not yet closed).
my ($level1, $level2, $level3)              = (0, 0, 0);
my ($orderedlist, $itemizedlist, $listitem) = (0, 0, 0);
my ($qandaset, $qandaentry, $answer)        = (0, 0, 0);
my $para = 0;

# The line currently being converted.  Shared with trimline(),
# splittitle() and expand_links(), which all edit it in place.
my $line = '';

# Results of the most recent splittitle() call.
my ($id, $title) = ('', '');

my $verbose = 0;
my $error   = 0;    # exit status used by usage()
my $buf     = '';   # DocBook output accumulated by proc_txt()

# Characters encode_entities() must leave alone: newline, CR, tab and the
# printable ASCII range except '"', '&', '<' and '>'.  The apostrophe is
# kept raw on purpose: the '''emphasis''' markup and the %27 quoting of
# link URLs below both look for a literal "'", and the default unsafe set
# of current HTML::Entities releases would turn it into "&#39;" first.
my $SAFE_CHARS = '^\n\r\t\x20\x21\x23-\x25\x27-\x3B\x3D\x3F-\x7E';

# Tags whose content is copied through verbatim instead of being wrapped
# in paragraphs at blank lines.
# NOTE(review): assumed list -- the four tag names tested by the original
# could not be recovered; confirm.
my @NOPARA_TAGS  = qw(para screen literallayout programlisting);
my $NOPARA_START = do {
    my $alternatives = join '|', map { quotemeta } @NOPARA_TAGS;
    qr/^<(?:$alternatives)>/;
};

# The one nopara tag that gets an enclosing <para> opened for it.
# NOTE(review): assumed to be "screen"; confirm.
my $NOPARA_WRAPPED_TAG = 'screen';

# read in cmd-line arguments
#
# Parsing stops at the first empty argument, as it always has.
while (@ARGV and $ARGV[0] ne '') {
    my $arg = shift @ARGV;
    if ($arg eq '-o' or $arg eq '--output-to') {
        $dbfile = @ARGV ? shift(@ARGV) : '';
    }
    elsif ($arg eq '-h' or $arg eq '--help') {
        usage();
    }
    elsif ($arg eq '-v' or $arg eq '--verbose') {
        $verbose++;
    }
    else {
        $txtfile = $arg;
    }
}

# abort if no input file given
#
if ($txtfile eq '') {
    print "txt2db: ERROR text file not specified.\n\n";
    $error = 1;
    usage();
}
elsif (!-r $txtfile) {
    # The message used to interpolate $f, which only exists inside
    # proc_txt(), so the file name was never shown.
    print "txt2db: ERROR cannot read $txtfile ($!)\n\n";
    $error = 1;
    usage();
}

# Default output name: the input's base name with everything from the
# first dot replaced by ".sgml", written to the current directory.
unless ($dbfile) {
    my ($basename) = fileparse($txtfile);
    $dbfile = $basename;

    # A name with no dot used to come through unchanged, which made the
    # output file name identical to the input file name.
    $dbfile .= '.sgml' unless $dbfile =~ s/\..*?$/.sgml/;
}

proc_txt($txtfile);

open(my $db, '>', $dbfile) or die "txt2db: cannot write to $dbfile ($!)\n";
print {$db} $buf, "\n";
close($db) or die "txt2db: cannot write to $dbfile ($!)\n";

exit(0);

# -----------------------------------------------------------

# proc_txt($file)
#
# Reads $file line by line, converts the wiki-style markup on each line to
# DocBook and appends the result to $buf.  Conversion problems are written
# into $buf as "ERROR ..." lines rather than raised.  Dies only if $file
# cannot be opened (or if wget cannot be started for an "ldp:" link).
sub proc_txt {
    my ($f) = @_;

    my $linenumber  = 0;
    my $noparatag   = '';   # name of the verbatim tag we are inside, if any
    my $noparadepth = 0;    # nesting depth of that tag
    my $noparaline  = 0;    # line on which the verbatim block started

    # read in the text file
    #
    open(my $txt, '<', $f) or die "txt2db: cannot open $f ($!)\n";

  LINE:
    while (my $originalline = <$txt>) {
        $line = $originalline;
        $linenumber++;

        trimline();

        # blank lines end the current paragraph, except inside a
        # verbatim block where they are passed through
        if ($line eq '' and $noparadepth == 0) {
            closenonsect();
            next LINE;
        }

        # capitalize hints that can be entered in lowercase
        #
        $line =~ s/^q:/Q:/;
        $line =~ s/^a:/A:/;

        # encode entities
        #
        encode_entities($line, $SAFE_CHARS);

        # inline docbook
        #
        expand_links($linenumber);

        # emphasis
        #
        while ($line =~ /'''.*'''/) {
            $line =~ s/'''/<emphasis>/;
            $line =~ s/'''/<\/emphasis>/;
        }

        # forget about nopara
        $noparatag = '' if $noparadepth == 0;

        # start a new nopara section
        #
        # NOTE(review): encode_entities() above has already turned a
        # leading "<" into "&lt;", so as written this test looks unable to
        # match.  Confirm whether detection was meant to run on the
        # unencoded line.
        if ($noparadepth == 0 and $line =~ $NOPARA_START) {
            closepara();
            $noparatag = $line;
            $noparatag =~ s/^.*?<//;
            $noparatag =~ s/>.*?$//;
            $noparaline = $linenumber;
            if ($line =~ /^<\Q$NOPARA_WRAPPED_TAG\E>/) {
                # closepara() has just run, so no paragraph is open.  The
                # opener is written straight to $buf because $line is
                # replaced by the raw input line further down; prefixing
                # $line, as before, lost the opener while still recording
                # the paragraph as open.
                $buf .= "<para>";
                $para = 1;
            }
        }

        if ($noparatag ne '') {

            # count noparadepth
            #
            my $temp = $line;
            while ($temp =~ /<\Q$noparatag\E>/) {
                $temp =~ s/<\Q$noparatag\E>//;
                $noparadepth++;
            }
            while ($temp =~ /<\/\Q$noparatag\E>/) {
                $temp =~ s/<\/\Q$noparatag\E>//;
                $noparadepth--;
                $noparaline = 0 if $noparadepth == 0;
            }

            # runon protection
            #
            # Only meaningful while the block is still open; without the
            # depth test, resetting $noparaline above would report a
            # runon for any block closed on line 100 or later.
            if ($noparadepth > 0 and $linenumber >= ($noparaline + 100)) {
                $buf .= "ERROR: runon block starting on line $noparaline\n";
                last LINE;
            }

            # recover original line -- no whitespace modifiers
            #
            $line = $originalline;
        }

        # sect3
        #
        elsif ($line =~ /^===/) {
            close3();
            splittitle();
            $line   = open_tag('sect3') . "<title>$title</title>\n";
            $level3 = 1;
        }

        # sect2
        #
        elsif ($line =~ /^==/) {
            close2();
            splittitle();
            $line   = open_tag('sect2') . "<title>$title</title>\n";
            $level2 = 1;
        }

        # sect1
        #
        elsif ($line =~ /^=/) {
            close1();
            splittitle();
            $line   = open_tag('sect1') . "<title>$title</title>\n";
            $level1 = 1;
        }

        # orderedlist
        #
        elsif ($line =~ /^#/) {
            closeitemizedlist();
            if ($orderedlist == 0) {
                $buf .= "\n<orderedlist>\n";
                $orderedlist = 1;
            }
            closelistitem();
            $line =~ s/^#//;
            trimline();
            $line     = "<listitem><para>" . $line;
            $listitem = 1;
            $para     = 1;
        }
        elsif ($line =~ /^\/#/) {
            $line =~ s/^\/#//;
            trimline();
            closeorderedlist();
        }

        # itemizedlist
        #
        elsif ($line =~ /^\*/) {
            closeorderedlist();
            if ($itemizedlist == 0) {
                $buf .= "\n<itemizedlist>\n";
                $itemizedlist = 1;
            }
            closelistitem();
            $line =~ s/^\*//;
            trimline();
            $line     = "<listitem><para>" . $line;
            $listitem = 1;
            $para     = 1;
        }

        # Anchored like the "/#" terminator above; unanchored, any line
        # that merely contained "/*" ended the list.
        elsif ($line =~ /^\/\*/) {
            $line =~ s/^\/\*//;
            trimline();
            closeitemizedlist();
        }

        # question
        #
        elsif ($line =~ /^Q:/) {
            closelists();
            closeqandaentry();
            $line =~ s/^Q://;
            trimline();
            splittitle();
            $line = open_tag('question')
                  . "<para>" . $title . "</para></question>\n";
            unless ($qandaentry == 1) {
                $line       = "<qandaentry>\n" . $line;
                $qandaentry = 1;
            }
            if ($qandaset == 0) {
                $line     = "<qandaset>\n" . $line;
                $qandaset = 1;
            }
        }

        # answer
        #
        elsif ($line =~ /^A:/) {
            $line =~ s/^A://;
            trimline();
            closeanswer();
            $line   = "<answer><para>" . $line;
            $answer = 1;
            $para   = 1;
        }

        # a line holding only "----" is dropped
        elsif ($line =~ /^\s*----\s*$/) {
            $line = '';
        }

        # para
        #
        # $noparatag is always empty on this branch, so only $para
        # decides whether a paragraph has to be opened.
        else {
            if ($para == 0) {
                $line = "<para>" . $line;
                $para = 1;
            }
            else {
                $line .= " ";
            }
        }

        $buf .= "$line ";
    }
    close($txt);

    # close nesting
    #
    close1();

    if ($noparadepth > 0) {
        $buf .= "ERROR tag $noparatag on line $noparaline unterminated.\n";
    }
    return;
}

# expand_links($linenumber)
#
# Replaces every [[target]] / [[target|name]] in $line with DocBook
# markup chosen by the target's namespace prefix.  An opening "[[" with
# no "]]" after it is reported in $buf and left in place; the loop then
# stops, where it previously repeated the report without end.
sub expand_links {
    my ($linenumber) = @_;

    while ($line =~ /\[\[/) {
        my ($link) = $line =~ /\[\[(.*?)\]\]/;
        unless (defined $link) {
            $buf .= "ERROR unterminated '[[' tag on line $linenumber.\n";
            last;
        }

        # separate link url from link name
        #
        my $linkname = $link;
        if ($link =~ /\|/) {
            $link     =~ s/\|.+$//;
            $linkname =~ s/^\S+\|//;
        }

        # kill quotes, they mess us up
        # (the url is written inside a single-quoted attribute)
        #
        $link =~ s/'/%27/g;

        # namespaces are handled differently
        #
        print "$link\n" if ($verbose);

        my $markup;
        if ($link =~ /^https?:/) {
            # https: is accepted here alongside http:
            $markup = ulink($link, $linkname);
        }
        elsif ($link =~ /^mailto:/) {
            $linkname =~ s/^mailto://;
            $markup = ulink($link, $linkname);
        }
        elsif ($link =~ /^wiki:/) {
            $linkname =~ s/^wiki://;
            $link =~ s{^wiki:}{http://www.wikipedia.com/wiki.phtml?title=};
            $link =~ s/ /+/g;    # every space, not only the first
            $markup = ulink($link, $linkname);
        }
        elsif ($link =~ /^ldp:/) {
            $linkname =~ s/^ldp://;
            $link     =~ s/^ldp://;
            $link = resolve_ldp_link($link);
            if ($link eq '') {
                $linkname = "ERROR: LDP namespace resolution failure on $linkname";
            }
            $markup = ulink($link, $linkname);
        }
        else {
            # "file:" targets and anything unrecognised are both
            # rendered as a file name
            $linkname =~ s/^file:// if $link =~ /^file:/;
            $markup = "<filename>$linkname</filename>";
        }

        $line =~ s/\[\[.*?\]\]/$markup/;
    }
    return;
}

# ulink($url, $name)
#
# Returns the DocBook markup for a hyperlink to $url labelled $name.
sub ulink {
    my ($url, $name) = @_;
    return "<ulink url='$url'><citetitle>$name</citetitle></ulink>";
}

# resolve_ldp_link($name)
#
# Asks the LDP database for the record named $name and returns the text
# of its <identifier> element, or '' when none comes back.
#
# wget is started without a shell and read through a pipe.  The previous
# code wrote to "/tmp/txt2db-" . $rand with $rand never assigned, and
# interpolated the link text from the document into a shell command.
# Dies if wget cannot be started.
sub resolve_ldp_link {
    my ($name) = @_;

    my $url = "http://db.linuxdoc.org/cgi-pub/ldp-xml.pl?name=$name";
    open(my $wget, '-|', 'wget', '-q', '-O', '-', $url)
        or die "txt2db: cannot run wget ($!)\n";

    my $identifier = '';
    while (my $url_line = <$wget>) {
        $url_line =~ s/\n//;
        $identifier .= $url_line if $url_line =~ /identifier/;
    }

    # A failed download leaves $identifier empty, which the caller
    # reports, so wget's exit status is not examined here.
    close($wget);

    $identifier =~ s/^.*?<identifier>//;
    $identifier =~ s/<\/identifier>.*?$//;
    return $identifier;
}

# open_tag($tag)
#
# Returns the opening tag for $tag, carrying the id found by the last
# splittitle() call when there was one.
# NOTE(review): the quoting of the id attribute is assumed (single quotes,
# matching the url attribute of ulink); confirm.
sub open_tag {
    my ($tag) = @_;
    return $id eq '' ? "<$tag>" : "<$tag id='$id'>";
}

# close_if_open(\$flag, $tag)
#
# Writes the closing tag for $tag and clears $flag, but only when $flag
# says the construct is currently open.
sub close_if_open {
    my ($flag_ref, $tag) = @_;
    return unless $$flag_ref == 1;
    $buf .= "</$tag>\n";
    $$flag_ref = 0;
    return;
}

# Closes the current sect1 and everything nested inside it.
sub close1 {
    close2();
    close_if_open(\$level1, 'sect1');
    return;
}

# Closes the current sect2 and everything nested inside it.
sub close2 {
    close3();
    close_if_open(\$level2, 'sect2');
    return;
}

# Closes any open lists, paragraph and Q&A set, then the current sect3.
sub close3 {
    closeorderedlist();
    closeitemizedlist();
    closepara();
    closeqandaset();
    close_if_open(\$level3, 'sect3');
    return;
}

# Called at a blank line: ends the paragraph but leaves lists open.
sub closenonsect {
    closepara();
    return;
}

sub closelistitem {
    closepara();
    close_if_open(\$listitem, 'listitem');
    return;
}

sub closeorderedlist {
    closepara();
    closelistitem();
    close_if_open(\$orderedlist, 'orderedlist');
    return;
}

sub closeitemizedlist {
    closepara();
    closelistitem();
    close_if_open(\$itemizedlist, 'itemizedlist');
    return;
}

sub closelists {
    closeitemizedlist();
    closeorderedlist();
    return;
}

sub closeanswer {
    closepara();
    close_if_open(\$answer, 'answer');
    return;
}

sub closeqandaentry {
    closeanswer();
    close_if_open(\$qandaentry, 'qandaentry');
    return;
}

sub closeqandaset {
    closeqandaentry();
    close_if_open(\$qandaset, 'qandaset');
    return;
}

sub closepara {
    close_if_open(\$para, 'para');
    return;
}

# Strips leading and trailing whitespace from $line.
sub trimline {
    $line =~ s/\s+$//;
    $line =~ s/^\s+//;
    return;
}

# splittitle()
#
# Strips the "=" markers from $line and splits what is left at "|" into
# $title (up to the first "|") and $id (after the last "|"), both
# trimmed.  $id is '' when there is no "|".
sub splittitle {
    $line =~ s/^=+//;
    $line =~ s/=+$//;
    $title = $line;
    $id    = '';
    if ($line =~ /\|/) {
        $title =~ s/\|.+//;
        $id = $line;
        $id =~ s/^.+\|//;
    }
    for my $field ($title, $id) {
        $field =~ s/\s+$//;
        $field =~ s/^\s+//;
    }
    return;
}

# Prints the usage summary and exits with the current $error status.
# NOTE(review): the <sgmlfile>/<txtfile> placeholders are assumed; the
# reviewed copy had lost whatever stood in those positions.
sub usage {
    print "Usage: txt2db [-v] [-h|-o <sgmlfile>] <txtfile>\n";
    print "-o, --output-to write to the specified file.\n";
    print "-v, --verbose show diagnostic output.\n";
    print "-h, --help show this usage message.\n";
    exit($error);
}