LDP/LDP/txt2db/txt2db.pl

524 lines
9.8 KiB
Perl
Executable File

#!/usr/bin/perl
#
#Converts txt files into docbook.
#
# Requirements:
#
# If you use the "ldp:" namespace, you must have wget installed.
# Wget is used to request an xml record from the LDP # database,
# http://db.linuxdoc.org.
#
use File::Basename;
use HTML::Entities;
my($txtfile, $dbfile) = '';
#These keep track of which constructs we're in the middle of
my($level1,
$level2,
$level3,
$orderedlist,
$listitem,
$itemizedlist,
$para,
$qandaset,
$qandaentry,
$answer);
my($line);
my($id, $title);
my($verbose);
my($error);
$error = 0;
# read in cmd-line arguments
#
while (1) {
if($ARGV[0] eq "-o" or $ARGV[0] eq "--output-to") {
shift(@ARGV);
$dbfile = $ARGV[0];
shift(@ARGV);
} elsif($ARGV[0] eq "-h" or $ARGV[0] eq "--help") {
&usage;
} elsif($ARGV[0] eq "-v" or $ARGV[0] eq "--verbose") {
$verbose++;
shift(@ARGV);
} else {
$txtfile = $ARGV[0];
shift(@ARGV);
}
if ($ARGV[0] eq '') {
last;
}
}
# abort if no input file given
#
if($txtfile eq '') {
print "txt2db: ERROR text file not specified.\n\n";
$error = 1;
&usage();
} elsif( !(-r $txtfile) ) {
print "txt2db: ERROR cannot read $f ($!)\n\n";
$error = 1;
&usage();
}
unless ($dbfile) {
($basename, $path, $ext) = fileparse($txtfile);
$dbfile = $basename;
$dbfile =~ s/\..*?$/\.sgml/;
}
$buf = '';
&proc_txt($txtfile);
open(DB, "> $dbfile") || die "txt2db: cannot write to $dbfile ($!)\n";
print DB $buf, "\n";
close(DB);
exit(0);
# -----------------------------------------------------------
sub proc_txt {
my($f) = @_;
my($linenumber);
$linenumber = 0;
my ($noparatag,
$noparadepth);
$noparadepth = 0;
$noparaline = 0;
# read in the text file
#
open(TXT, "$f") || die "txt2db: cannot open $f ($!)\n";
while ($originalline = <TXT>) {
$line = $originalline;
$linenumber++;
&trimline;
# blank lines
if ($line eq '') {
if ($noparadepth == 0) {
&closenonsect;
next;
}
}
# capitalize hints that can be entered in lowercase
#
$line =~ s/^q:/Q:/;
$line =~ s/^a:/A:/;
# encode entities
#
# while ($line =~ //) {
# }
# decode_entities($line);
encode_entities($line);
# inline docbook
#
# ulink
#
while ($line =~ /\[\[/) {
unless ($line =~ /\]\]/) {
$buf .= "ERROR unterminated '[[' tag on line $linenumber.\n";
}
# separate link url from link name
#
$link = $line;
$link=~ s/\n//g;
$link =~ s/.*?\[\[//;
$link =~ s/\]\].*?$//;
if ($link =~ /\|/) {
$linkname = $link;
$link =~ s/\|.+$//;
$linkname =~ s/^\S+\|//;
} else {
$linkname = $link;
}
# kill quotes, they mess us up
#
$link =~ s/'/%27/g;
# namespaces are handled differently
#
print "$link\n" if ($verbose);
if ($link =~ /^http:/) {
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
} elsif ($link =~ /^mailto:/) {
$linkname =~ s/^mailto://;
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
} elsif ($link =~ /^wiki:/) {
$linkname =~ s/^wiki://;
$link =~ s/^wiki:/http:\/\/www\.wikipedia\.com\/wiki\.phtml\?title=/;
$link =~ s/\ /+/;
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
} elsif ($link =~ /^ldp:/) {
$linkname =~ s/^ldp://;
$link =~ s/^ldp://;
$tempfile = "/tmp/txt2db-" . $rand;
$cmd = "wget -q http://db.linuxdoc.org/cgi-pub/ldp-xml.pl?name=$link -O $tempfile";
system("$cmd");
open(URL, "$tempfile") || die "txt2db: cannot open temporary file ($!)\n";
$link = "";
while ($url_line = <URL>) {
$url_line =~ s/\n//;
if ($url_line =~ /identifier/) {
$link .= $url_line;
}
}
close(URL);
unlink $tempfile;
$link =~ s/^.*?<identifier>//;
$link =~ s/<\/identifier>.*?$//;
if ($link eq '') {
$linkname = "ERROR: LDP namespace resolution failure on $linkname";
}
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
} elsif ($link =~ /^file:/) {
$linkname =~ s/^file://;
$line =~ s/\[\[.*?\]\]/<filename>$linkname<\/filename>/;
} else {
$line =~ s/\[\[.*?\]\]/<filename>$linkname<\/filename>/;
}
}
# emphasis
#
while ($line =~ /'''.*'''/) {
$line =~ s/'''/<emphasis role='bold'>/;
$line =~ s/'''/<\/emphasis>/;
}
# this block defines DocBook structures that won't be broken up with
# paragraphs when we hit empty lines:
#
# <para>
# <sect1>
# <sect2>
# <sect3>
# <programlisting>
# <literallayout>
# forget about nopara
if ($noparadepth == 0) {
$noparatag = "";
}
# start a new nopara section
#
if ((($line =~ /^<para>/) or
($line =~ /^<sect/) or
($line =~ /^<screen>/) or
($line =~ /^<literallayout>/) or
($line =~ /^<programlisting>/)) and
($noparadepth == 0)) {
&closepara;
$noparatag = $line;
$noparatag =~ s/^.*?<//;
$noparatag =~ s/>.*?$//;
$noparaline = $linenumber;
if ($line =~ /^<screen>/) {
unless ($para) {
$line = "<para>" . $line;
$para = 1;
}
}
}
# count noparadepth
#
if ($noparatag ne '') {
$temp = $line;
while ($temp =~ /<$noparatag>/) {
$temp =~ s/<?$noparatag>//;
$noparadepth ++;
}
while ($temp =~ /<\/$noparatag>/) {
$temp =~ s/<?\/$noparatag>//;
$noparadepth --;
if ($noparadepth == 0) {
$noparaline == 0;
}
}
# runon protection
#
if ($linenumber >= ($noparaline + 100)) {
$buf .= "ERROR: runon block starting on line $noparaline\n";
last;
}
# recover original line -- no whitespace modifiers
#
$line = $originalline;
# sect3
#
} elsif ($line =~ /^===/) {
&close3;
&splittitle;
if ($id eq '') {
$line = "<sect3><title>$title</title>\n";
} else {
$line = "<sect3 id='$id'><title id='$id-title'>$title</title>\n";
}
$level3 = 1;
# sect2
#
} elsif ($line =~ /^==/) {
&close2;
&splittitle;
if ($id eq '') {
$line = "<sect2><title>$title</title>\n";
} else {
$line = "<sect2 id='$id'><title id='$id-title'>$title</title>\n";
}
$level2 = 1;
# sect1
#
} elsif ($line =~ /^=/) {
&close1;
&splittitle;
if ($id eq '') {
$line = "<sect1><title>$title</title>\n";
} else {
$line = "<sect1 id='$id'><title id='$id-title'>$title</title>\n";
}
$level1 = 1;
# orderedlist
#
} elsif ($line =~ /^#/) {
&closeitemizedlist;
if ($orderedlist == 0) {
$buf .= "\n<orderedlist>\n";
$orderedlist = 1;
}
&closelistitem;
$line =~ s/^#//;
&trimline;
$line =~ s/^/<listitem><para>/;
$listitem = 1;
$para = 1;
} elsif ($line =~ /^\/#/) {
$line =~ s/^\/#//;
&trimline;
&closeorderedlist;
# itemizedlist
#
} elsif ($line =~ /^\*/) {
&closeorderedlist;
if ($itemizedlist == 0) {
$buf .= "\n<itemizedlist>\n";
$itemizedlist = 1;
}
&closelistitem;
$line =~ s/^\*//;
&trimline;
$line =~ s/^/<listitem><para>/;
$listitem = 1;
$para = 1;
} elsif ($line =~ /\/\*/) {
$line =~ s/^\/\*//;
&trimline;
&closeitemizedlist;
# question
#
} elsif ($line =~ /^Q:/) {
&closelists;
&closeqandaentry;
$line =~ s/^Q://;
&trimline;
&splittitle;
if ($id eq '') {
$line = "<question><para>" . $title . "</para></question>\n";
} else {
$line = "<question id='$id'><para>" . $title . "</para></question>\n";
}
unless ($qandaentry == 1) {
$line = "<qandaentry>\n" . $line;
$qandaentry = 1;
}
if ($qandaset == 0) {
$line = "<qandaset defaultlabel='qanda'>\n". $line;
$qandaset = 1;
}
# answer
#
} elsif ($line =~ /^A:/) {
$line =~ s/^A://;
&trimline;
&closeanswer;
$line = "<answer><para>" . $line;
$answer = 1;
$para = 1;
} elsif ($line =~ /^\s*----\s*$/) {
$line = '';
# para
#
} else {
if (($para == 0) and ($noparatag eq '')) {
$line = "<para>" . $line;
$para = 1;
} else {
$line .= " ";
}
}
$buf .= "$line ";
}
# close nesting
#
&close1;
if ($noparadepth > 0) {
$buf .= "ERROR tag $noparatag on line $noparaline unterminated.\n";
}
}
sub close1 {
&close2;
if ($level1 == 1) {
$buf .= "</sect1>\n";
$level1 = 0;
}
}
sub close2 {
&close3;
if ($level2 == 1) {
$buf .= "</sect2>\n";
$level2 = 0;
}
}
sub close3 {
&closeorderedlist;
&closeitemizedlist;
&closepara;
&closeqandaset;
if ($level3 == 1) {
$buf .= "</sect3>\n";
$level3 = 0;
}
}
sub closenonsect {
&closepara;
# &closeorderedlist;
# &closeitemizedlist;
}
sub closelistitem {
&closepara;
if ($listitem == 1 ) {
$buf .= "</listitem>\n";
$listitem = 0;
}
}
sub closeorderedlist {
&closepara;
&closelistitem;
if ($orderedlist == 1 ) {
$buf .= "</orderedlist>\n";
$orderedlist = 0;
}
}
sub closeitemizedlist {
&closepara;
&closelistitem;
if ($itemizedlist == 1 ) {
$buf .= "</itemizedlist>\n";
$itemizedlist = 0;
}
}
sub closelists {
&closeitemizedlist;
&closeorderedlist;
}
sub closeanswer {
&closepara;
if ($answer == 1) {
$buf .= "</answer>\n";
$answer = 0;
}
}
sub closeqandaentry {
&closeanswer;
if ($qandaentry == 1) {
$buf .= "</qandaentry>\n";
$qandaentry = 0;
}
}
sub closeqandaset {
&closeqandaentry;
if ($qandaset == 1) {
$buf .= "</qandaset>\n";
$qandaset = 0;
}
}
sub closepara {
if ($para == 1) {
$buf .= "</para>\n";
$para = 0;
}
}
sub trimline {
$line =~ s/\s+$//;
$line =~ s/^\s+//;
}
sub splittitle {
$line =~ s/^=+//;
$line =~ s/=+$//;
$title = $line;
$id = "";
if ($line =~ /\|/) {
$title =~ s/\|.+//;
$id = $line;
$id =~ s/^.+\|//;
}
$title =~ s/\s+$//;
$title =~ s/^\s+//;
$id =~ s/\s+$//;
$id =~ s/^\s+//;
}
sub usage {
print "Usage: txt2db [-v] [-h|-o <sgml file>] <text file>\n";
print "-o, --output-to write to the specified file.\n";
print "-v, --verbose show diagnostic output.\n";
print "-h, --help show this usage message.\n";
exit($error);
}