mirror of https://github.com/tLDP/LDP
523 lines
9.7 KiB
Perl
Executable File
523 lines
9.7 KiB
Perl
Executable File
#!/usr/bin/perl
|
|
#
|
|
#Converts txt files into docbook.
|
|
#
|
|
# Requirements:
|
|
#
|
|
# If you use the "ldp:" namespace, you must have wget installed.
|
|
# Wget is used to request an xml record from the LDP # database,
|
|
# http://db.linuxdoc.org.
|
|
#
|
|
|
|
use File::Basename;
|
|
use HTML::Entities;
|
|
|
|
my($txtfile, $dbfile) = '';
|
|
|
|
#These keep track of which constructs we're in the middle of
|
|
my($level1,
|
|
$level2,
|
|
$level3,
|
|
$orderedlist,
|
|
$listitem,
|
|
$itemizedlist,
|
|
$para,
|
|
$qandaset,
|
|
$qandaentry,
|
|
$answer);
|
|
|
|
my($line);
|
|
my($id, $title);
|
|
|
|
my($verbose);
|
|
|
|
my($error);
|
|
$error = 0;
|
|
|
|
# read in cmd-line arguments
|
|
#
|
|
while (1) {
|
|
if($ARGV[0] eq "-o" or $ARGV[0] eq "--output-to") {
|
|
shift(@ARGV);
|
|
$dbfile = $ARGV[0];
|
|
shift(@ARGV);
|
|
} elsif($ARGV[0] eq "-h" or $ARGV[0] eq "--help") {
|
|
&usage;
|
|
} elsif($ARGV[0] eq "-v" or $ARGV[0] eq "--verbose") {
|
|
$verbose++;
|
|
shift(@ARGV);
|
|
} else {
|
|
$txtfile = $ARGV[0];
|
|
shift(@ARGV);
|
|
}
|
|
|
|
if ($ARGV[0] eq '') {
|
|
last;
|
|
}
|
|
}
|
|
|
|
# abort if no input file given
|
|
#
|
|
if($txtfile eq '') {
|
|
$error = 1;
|
|
&usage();
|
|
} elsif( !(-r $txtfile) ) {
|
|
print "txt2db: ERROR cannot read $f ($!)\n\n";
|
|
$error = 1;
|
|
&usage();
|
|
}
|
|
|
|
unless ($dbfile) {
|
|
($basename, $path, $ext) = fileparse($txtfile);
|
|
$dbfile = $basename;
|
|
$dbfile =~ s/\..*?$/\.sgml/;
|
|
}
|
|
|
|
$buf = '';
|
|
|
|
&proc_txt($txtfile);
|
|
|
|
open(DB, "> $dbfile") || die "txt2db: cannot write to $dbfile ($!)\n";
|
|
print DB $buf, "\n";
|
|
close(DB);
|
|
|
|
exit(0);
|
|
|
|
# -----------------------------------------------------------
|
|
|
|
sub proc_txt {
|
|
my($f) = @_;
|
|
|
|
my($linenumber);
|
|
$linenumber = 0;
|
|
|
|
my ($noparatag,
|
|
$noparadepth);
|
|
$noparadepth = 0;
|
|
$noparaline = 0;
|
|
|
|
# read in the text file
|
|
#
|
|
open(TXT, "$f") || die "txt2db: cannot open $f ($!)\n";
|
|
while ($originalline = <TXT>) {
|
|
$line = $originalline;
|
|
$linenumber++;
|
|
|
|
&trimline;
|
|
|
|
# blank lines
|
|
if ($line eq '') {
|
|
if ($noparadepth == 0) {
|
|
&closenonsect;
|
|
next;
|
|
}
|
|
}
|
|
|
|
# capitalize hints that can be entered in lowercase
|
|
#
|
|
$line =~ s/^q:/Q:/;
|
|
$line =~ s/^a:/A:/;
|
|
|
|
# encode entities
|
|
#
|
|
# while ($line =~ //) {
|
|
# }
|
|
# decode_entities($line);
|
|
encode_entities($line);
|
|
|
|
# inline docbook
|
|
#
|
|
# ulink
|
|
#
|
|
while ($line =~ /\[\[/) {
|
|
unless ($line =~ /\]\]/) {
|
|
$buf .= "ERROR unterminated '[[' tag on line $linenumber.\n";
|
|
}
|
|
|
|
# separate link url from link name
|
|
#
|
|
$link = $line;
|
|
$link=~ s/\n//g;
|
|
$link =~ s/.*?\[\[//;
|
|
$link =~ s/\]\].*?$//;
|
|
if ($link =~ /\|/) {
|
|
$linkname = $link;
|
|
$link =~ s/\|.+$//;
|
|
$linkname =~ s/^\S+\|//;
|
|
} else {
|
|
$linkname = $link;
|
|
}
|
|
|
|
# kill quotes, they mess us up
|
|
#
|
|
$link =~ s/'/%27/g;
|
|
|
|
# namespaces are handled differently
|
|
#
|
|
print "$link\n" if ($verbose);
|
|
if ($link =~ /^http:/) {
|
|
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
|
|
} elsif ($link =~ /^mailto:/) {
|
|
$linkname =~ s/^mailto://;
|
|
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
|
|
} elsif ($link =~ /^wiki:/) {
|
|
$linkname =~ s/^wiki://;
|
|
$link =~ s/^wiki:/http:\/\/www\.wikipedia\.com\/wiki\.phtml\?title=/;
|
|
$link =~ s/\ /+/;
|
|
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
|
|
} elsif ($link =~ /^ldp:/) {
|
|
$linkname =~ s/^ldp://;
|
|
$link =~ s/^ldp://;
|
|
$tempfile = "/tmp/txt2db-" . $rand;
|
|
$cmd = "wget -q http://db.linuxdoc.org/cgi-pub/ldp-xml.pl?name=$link -O $tempfile";
|
|
system("$cmd");
|
|
open(URL, "$tempfile") || die "txt2db: cannot open temporary file ($!)\n";
|
|
$link = "";
|
|
while ($url_line = <URL>) {
|
|
$url_line =~ s/\n//;
|
|
if ($url_line =~ /identifier/) {
|
|
$link .= $url_line;
|
|
}
|
|
}
|
|
close(URL);
|
|
unlink $tempfile;
|
|
$link =~ s/^.*?<identifier>//;
|
|
$link =~ s/<\/identifier>.*?$//;
|
|
if ($link eq '') {
|
|
$linkname = "ERROR: LDP namespace resolution failure on $linkname";
|
|
}
|
|
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
|
|
} elsif ($link =~ /^file:/) {
|
|
$linkname =~ s/^file://;
|
|
$line =~ s/\[\[.*?\]\]/<filename>$linkname<\/filename>/;
|
|
} else {
|
|
$line =~ s/\[\[.*?\]\]/<filename>$linkname<\/filename>/;
|
|
}
|
|
}
|
|
|
|
# emphasis
|
|
#
|
|
while ($line =~ /'''.*'''/) {
|
|
$line =~ s/'''/<emphasis role='bold'>/;
|
|
$line =~ s/'''/<\/emphasis>/;
|
|
}
|
|
|
|
# this block defines DocBook structures that won't be broken up with
|
|
# paragraphs when we hit empty lines:
|
|
#
|
|
# <para>
|
|
# <sect1>
|
|
# <sect2>
|
|
# <sect3>
|
|
# <programlisting>
|
|
# <literallayout>
|
|
|
|
# forget about nopara
|
|
if ($noparadepth == 0) {
|
|
$noparatag = "";
|
|
}
|
|
|
|
# start a new nopara section
|
|
#
|
|
if ((($line =~ /^<para>/) or
|
|
($line =~ /^<sect/) or
|
|
($line =~ /^<screen>/) or
|
|
($line =~ /^<literallayout>/) or
|
|
($line =~ /^<programlisting>/)) and
|
|
($noparadepth == 0)) {
|
|
&closepara;
|
|
$noparatag = $line;
|
|
$noparatag =~ s/^.*?<//;
|
|
$noparatag =~ s/>.*?$//;
|
|
$noparaline = $linenumber;
|
|
if ($line =~ /^<screen>/) {
|
|
unless ($para) {
|
|
$line = "<para>" . $line;
|
|
$para = 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
# count noparadepth
|
|
#
|
|
if ($noparatag ne '') {
|
|
$temp = $line;
|
|
while ($temp =~ /<$noparatag>/) {
|
|
$temp =~ s/<?$noparatag>//;
|
|
$noparadepth ++;
|
|
}
|
|
while ($temp =~ /<\/$noparatag>/) {
|
|
$temp =~ s/<?\/$noparatag>//;
|
|
$noparadepth --;
|
|
if ($noparadepth == 0) {
|
|
$noparaline == 0;
|
|
}
|
|
}
|
|
|
|
# runon protection
|
|
#
|
|
if ($linenumber >= ($noparaline + 100)) {
|
|
$buf .= "ERROR: runon block starting on line $noparaline\n";
|
|
last;
|
|
}
|
|
|
|
# recover original line -- no whitespace modifiers
|
|
#
|
|
$line = $originalline;
|
|
|
|
# sect3
|
|
#
|
|
} elsif ($line =~ /^===/) {
|
|
&close3;
|
|
&splittitle;
|
|
if ($id eq '') {
|
|
$line = "<sect3><title>$title</title>\n";
|
|
} else {
|
|
$line = "<sect3 id='$id'><title id='$id-title'>$title</title>\n";
|
|
}
|
|
$level3 = 1;
|
|
|
|
# sect2
|
|
#
|
|
} elsif ($line =~ /^==/) {
|
|
&close2;
|
|
&splittitle;
|
|
if ($id eq '') {
|
|
$line = "<sect2><title>$title</title>\n";
|
|
} else {
|
|
$line = "<sect2 id='$id'><title id='$id-title'>$title</title>\n";
|
|
}
|
|
$level2 = 1;
|
|
|
|
# sect1
|
|
#
|
|
} elsif ($line =~ /^=/) {
|
|
&close1;
|
|
&splittitle;
|
|
if ($id eq '') {
|
|
$line = "<sect1><title>$title</title>\n";
|
|
} else {
|
|
$line = "<sect1 id='$id'><title id='$id-title'>$title</title>\n";
|
|
}
|
|
$level1 = 1;
|
|
|
|
# orderedlist
|
|
#
|
|
} elsif ($line =~ /^#/) {
|
|
&closeitemizedlist;
|
|
if ($orderedlist == 0) {
|
|
$buf .= "\n<orderedlist>\n";
|
|
$orderedlist = 1;
|
|
}
|
|
&closelistitem;
|
|
$line =~ s/^#//;
|
|
&trimline;
|
|
$line =~ s/^/<listitem><para>/;
|
|
$listitem = 1;
|
|
$para = 1;
|
|
} elsif ($line =~ /^\/#/) {
|
|
$line =~ s/^\/#//;
|
|
&trimline;
|
|
&closeorderedlist;
|
|
|
|
# itemizedlist
|
|
#
|
|
} elsif ($line =~ /^\*/) {
|
|
&closeorderedlist;
|
|
if ($itemizedlist == 0) {
|
|
$buf .= "\n<itemizedlist>\n";
|
|
$itemizedlist = 1;
|
|
}
|
|
&closelistitem;
|
|
$line =~ s/^\*//;
|
|
&trimline;
|
|
$line =~ s/^/<listitem><para>/;
|
|
$listitem = 1;
|
|
$para = 1;
|
|
} elsif ($line =~ /\/\*/) {
|
|
$line =~ s/^\/\*//;
|
|
&trimline;
|
|
&closeitemizedlist;
|
|
|
|
# question
|
|
#
|
|
} elsif ($line =~ /^Q:/) {
|
|
&closelists;
|
|
&closeqandaentry;
|
|
$line =~ s/^Q://;
|
|
&trimline;
|
|
&splittitle;
|
|
if ($id eq '') {
|
|
$line = "<question><para>" . $title . "</para></question>\n";
|
|
} else {
|
|
$line = "<question id='$id'><para>" . $title . "</para></question>\n";
|
|
}
|
|
unless ($qandaentry == 1) {
|
|
$line = "<qandaentry>\n" . $line;
|
|
$qandaentry = 1;
|
|
}
|
|
if ($qandaset == 0) {
|
|
$line = "<qandaset defaultlabel='qanda'>\n". $line;
|
|
$qandaset = 1;
|
|
}
|
|
|
|
# answer
|
|
#
|
|
} elsif ($line =~ /^A:/) {
|
|
$line =~ s/^A://;
|
|
&trimline;
|
|
&closeanswer;
|
|
$line = "<answer><para>" . $line;
|
|
$answer = 1;
|
|
$para = 1;
|
|
|
|
} elsif ($line =~ /^\s*----\s*$/) {
|
|
$line = '';
|
|
|
|
# para
|
|
#
|
|
} else {
|
|
if (($para == 0) and ($noparatag eq '')) {
|
|
$line = "<para>" . $line;
|
|
$para = 1;
|
|
} else {
|
|
$line .= " ";
|
|
}
|
|
}
|
|
|
|
$buf .= "$line ";
|
|
}
|
|
# close nesting
|
|
#
|
|
&close1;
|
|
|
|
if ($noparadepth > 0) {
|
|
$buf .= "ERROR tag $noparatag on line $noparaline unterminated.\n";
|
|
}
|
|
}
|
|
|
|
sub close1 {
|
|
&close2;
|
|
if ($level1 == 1) {
|
|
$buf .= "</sect1>\n";
|
|
$level1 = 0;
|
|
}
|
|
}
|
|
|
|
sub close2 {
|
|
&close3;
|
|
if ($level2 == 1) {
|
|
$buf .= "</sect2>\n";
|
|
$level2 = 0;
|
|
}
|
|
}
|
|
|
|
sub close3 {
|
|
&closeorderedlist;
|
|
&closeitemizedlist;
|
|
&closepara;
|
|
&closeqandaset;
|
|
if ($level3 == 1) {
|
|
$buf .= "</sect3>\n";
|
|
$level3 = 0;
|
|
}
|
|
}
|
|
|
|
sub closenonsect {
|
|
&closepara;
|
|
# &closeorderedlist;
|
|
# &closeitemizedlist;
|
|
}
|
|
|
|
sub closelistitem {
|
|
&closepara;
|
|
if ($listitem == 1 ) {
|
|
$buf .= "</listitem>\n";
|
|
$listitem = 0;
|
|
}
|
|
}
|
|
|
|
sub closeorderedlist {
|
|
&closepara;
|
|
&closelistitem;
|
|
if ($orderedlist == 1 ) {
|
|
$buf .= "</orderedlist>\n";
|
|
$orderedlist = 0;
|
|
}
|
|
}
|
|
|
|
sub closeitemizedlist {
|
|
&closepara;
|
|
&closelistitem;
|
|
if ($itemizedlist == 1 ) {
|
|
$buf .= "</itemizedlist>\n";
|
|
$itemizedlist = 0;
|
|
}
|
|
}
|
|
|
|
sub closelists {
|
|
&closeitemizedlist;
|
|
&closeorderedlist;
|
|
}
|
|
|
|
sub closeanswer {
|
|
&closepara;
|
|
if ($answer == 1) {
|
|
$buf .= "</answer>\n";
|
|
$answer = 0;
|
|
}
|
|
}
|
|
|
|
sub closeqandaentry {
|
|
&closeanswer;
|
|
if ($qandaentry == 1) {
|
|
$buf .= "</qandaentry>\n";
|
|
$qandaentry = 0;
|
|
}
|
|
}
|
|
|
|
sub closeqandaset {
|
|
&closeqandaentry;
|
|
if ($qandaset == 1) {
|
|
$buf .= "</qandaset>\n";
|
|
$qandaset = 0;
|
|
}
|
|
}
|
|
|
|
sub closepara {
|
|
if ($para == 1) {
|
|
$buf .= "</para>\n";
|
|
$para = 0;
|
|
}
|
|
}
|
|
|
|
sub trimline {
|
|
$line =~ s/\s+$//;
|
|
$line =~ s/^\s+//;
|
|
}
|
|
|
|
sub splittitle {
|
|
$line =~ s/^=+//;
|
|
$line =~ s/=+$//;
|
|
$title = $line;
|
|
$id = "";
|
|
if ($line =~ /\|/) {
|
|
$title =~ s/\|.+//;
|
|
$id = $line;
|
|
$id =~ s/^.+\|//;
|
|
}
|
|
$title =~ s/\s+$//;
|
|
$title =~ s/^\s+//;
|
|
$id =~ s/\s+$//;
|
|
$id =~ s/^\s+//;
|
|
}
|
|
|
|
sub usage {
|
|
print "Usage: txt2db [-v] [-h|-o <sgml file>] <text file>\n";
|
|
print "-o, --output-to write to the specified file.\n";
|
|
print "-v, --verbose show diagnostic output.\n";
|
|
print "-h, --help show this usage message.\n";
|
|
exit($error);
|
|
}
|
|
|