LDP/LDP/wt2db/lib/Wt2Db.pm

665 lines
13 KiB
Perl
Executable File

#!/usr/bin/perl
#
# Converts WikiText files into docbook.
#
package Wt2Db;
use File::Basename;
use HTML::Entities;
use FileHandle;
use Exporter;
@ISA = qw(Exporter);
@EXPORT = qw(
new
ProcessFile
ProcessLine
ProcessEnd
Buffer
Reset
);
&Reset;
# Call this before rerunning ProcessLine to clear state.
#
sub Reset {
$level1 = 0;
$level2 = 0;
$level3 = 0;
$orderedlist = 0;
$listitem = 0;
$itemizedlist = 0;
$para = 0;
$qandaset = 0;
$qandaentry = 0;
$answer = 0;
# These are passed in by the caller
#
$txtfile = '';
$dbfile = '';
$verbose = 0;
$doctype = 0;
$nonet = 0;
# These maintain state
#
$line = '';
$linenumber = 0;
$id = '';
$title = '';
$buf = '';
$noparatag = 0;
$noparadepth = 0;
$noparaline = 0;
}
# -----------------------------------------------------------
sub new {
my $that = shift;
my $class = ref($that) || $that;
my $self = {};
bless $self, $class;
return $self;
}
sub ProcessFile {
($self, $txtfile, $dbfile, $verbose, $doctype, $nonet, $encoding) = @_;
# Read from STDIN if no input file given
#
if ($txtfile) {
if( !(-r $txtfile) ) {
print "wt2db: ERROR cannot read $f ($!)\n\n";
exit(1);
} else {
$fh = new FileHandle;
open $fh, "<$txtfile" or die "Cannot open $txtfile ($!)\n";
}
} else {
$fh = STDIN;
}
if ($dbfile) {
$outfh = new FileHandle;
open $outfh, ">$dbfile" or die "Cannot write to $dbfile\n\n";
} else {
$outfh = STDOUT;
}
# wrap article if requested
#
$encoding = 'ISO-8859-1' unless ($encoding);
if ($doctype eq 'XML') {
print "Adding XML DOCTYPE and article tags." if ($verbose);
$buf = '<?xml version="1.0" encoding="' . $encoding . '" standalone="no"?>' . "\n";
$buf .= '<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"' . "\n";
$buf .= ' "http://docbook.org/xml/4.1.2/docbookx.dtd"';
$buf .= "\[\]\>\n";
$buf .= "\n";
$buf .= '<article>' . "\n";
} elsif ($doctype eq 'SGML') {
print "Adding SGML DOCTYPE and article tags." if ($verbose);
$buf = '<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook V4.1//EN">' . "\n";
$buf .= '<article>' . "\n";
}
# read in the text file
#
while ($originalline = <$fh>) {
ProcessLine($foo, $originalline);
print $outfh "$buf";
$buf = '';
}
ProcessEnd();
# wrap article if requested
#
if ($doctype) {
$buf .= '</article>' . "\n";
}
print $outfh "$buf";
$buf = '';
close $fh;
close $outfh;
}
sub ProcessLine {
($foo, $originalline) = @_;
$line = $originalline;
$linenumber++;
&trimline;
# blank lines
unless ($line) {
unless ($noparadepth) {
&closenonsect;
return;
}
}
# capitalize hints that can be entered in lowercase
#
$line =~ s/^q:/Q:/;
$line =~ s/^a:/A:/;
# encode entities
#
# while ($line =~ //) {
# }
# decode_entities($line);
# encode_entities($line);
# inline docbook
#
# parse all links, internal and external
#
while ($line =~ /\[\[/) {
unless ($line =~ /\]\]/) {
$buf .= "ERROR unterminated '[[' tag on line $linenumber.\n";
}
# separate link url from link name
#
$link = $line;
$link=~ s/\n//g;
$link =~ s/.*?\[\[//;
$link =~ s/\]\].*?$//;
if ($link =~ /\|/) {
$linkname = $link;
$link =~ s/\|.+$//;
$linkname =~ s/^\S+\|//;
} else {
$linkname = $link;
}
# kill quotes inside links, they mess us up because
# we have to wrap this string with quotes.
# perhaps it should be encoding the entire URL?
#
$link =~ s/'/%27/g;
# namespaces are handled differently
#
print "$link\n" if ($verbose);
if ($link =~ /^http:\/\//) {
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
} elsif ($link =~ /^link:/) {
$link =~ s/^link://;
$linkname =~ s/^link://;
$line =~ s/\[\[.*?\]\]/<xref linkend='$link' endterm='$link'\>\<\/xref\>/;
} elsif ($link =~ /^mailto:/) {
$linkname =~ s/^mailto://;
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
} elsif ($link =~ /^wiki:/) {
$linkname =~ s/^wiki://;
$link =~ s/^wiki:/http:\/\/www\.wikipedia\.com\/wiki\.phtml\?title=/;
$link =~ s/\ /+/;
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
} elsif ($link =~ /^ldp:/) {
$linkname =~ s/^ldp://;
$link =~ s/^ldp://;
if ($nonet) {
$line =~ s/\[\[.*?\]\]/<citetitle>$link<\/citetitle>/;
} else {
$tempfile = "/tmp/wt2db-" . $rand;
$cmd = "wget -q http://db.linuxdoc.org/cgi-pub/ldp-xml.pl?name=$link -O $tempfile";
print "$cmd\n" if ($verbose > 1);
$return = system("$cmd");
unless ($return) {
open(URL, "$tempfile") || die "wt2db: cannot open temporary file ($!)\n\n";
$link = '';
while ($url_line = <URL>) {
$url_line =~ s/\n//;
if ($url_line =~ /identifier/) {
$link .= $url_line;
}
}
close(URL);
unlink $tempfile;
}
$link =~ s/^.*?<identifier>//;
$link =~ s/<\/identifier>.*?$//;
if ($link eq '') {
$linkname = "ERROR: LDP namespace resolution failure on $linkname";
}
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
}
} elsif ($link =~ /^file:/) {
$linkname =~ s/^file://;
$line =~ s/\[\[.*?\]\]/<filename>$linkname<\/filename>/;
} elsif ($link =~ /^dir:/) {
# FIXME: need to check attribute on filename element
#
$linkname =~ s/^dir://;
$line =~ s/\[\[.*?\]\]/<filename type='directory'>$linkname<\/filename>/;
} else {
$line =~ s/\[\[.*?\]\]/<filename>$linkname<\/filename>/;
}
}
# emphasis
#
while ($line =~ /'''.*'''/) {
$line =~ s/'''/<emphasis role='bold'>/;
$line =~ s/'''/<\/emphasis>/;
}
# this block defines DocBook structures that won't be broken up with
# paragraphs when we hit empty lines:
#
# <para>
# <sect1>
# <sect2>
# <sect3>
# <programlisting>
# <literallayout>
# forget about being in nopara state if we're no longer in one
#
if ($noparadepth == 0) {
$noparatag = "";
}
# start a new nopara section
#
if ((($line =~ /^<para>/) or
($line =~ /^<sect/) or
($line =~ /^<screen>/) or
($line =~ /^<screen>/) or
($line =~ /^<blockquote>/) or
($line =~ /^<literallayout>/) or
($line =~ /^<articleinfo>/) or
($line =~ /^<programlisting>/)) and
($noparadepth == 0)) {
&closepara;
$noparatag = $line;
$noparatag =~ s/^.*?<//;
$noparatag =~ s/>.*?$//;
$noparaline = $linenumber;
# screen sections don't embed para tags, but are wrapped in them
#
if ($line =~ /^<screen>/) {
unless ($para) {
$buf .= "<para>";
$para = 1;
}
}
}
# count noparadepth
#
if ($noparatag ne '') {
$temp = $line;
while ($temp =~ /<$noparatag>/) {
$temp =~ s/<?$noparatag>//;
$noparadepth ++;
}
while ($temp =~ /<\/$noparatag>/) {
$temp =~ s/<?\/$noparatag>//;
$noparadepth --;
if ($noparadepth == 0) {
$noparaline = 0;
}
}
# runon protection
#
if (($noparaline) and ($linenumber >= ($noparaline + 100))) {
$buf .= "ERROR: runon block starting on line $noparaline\n";
return;
}
# recover original line -- no whitespace modifiers
# allow nonencoded text in unparsed lines, when in a literal block
#
$line = $originalline;
chomp($line);
if ($line =~ /^<$noparatag>/ ) {
$starttag = "<$noparatag>";
} else {
$starttag = '';
}
if ($line =~ /<\/$noparatag>/ ) {
$endtag = "<\/$noparatag>";
} else {
$endtag = '';
}
$line =~ s/<$noparatag>//;
$line =~ s/<\/$noparatag>//;
if (($noparatag eq 'screen') or
($noparatag eq 'literallayout') or
($noparatag eq 'programlisting')) {
encode_entities($line);
}
$line = "$starttag$line$endtag";
# sect3
#
} elsif ($line =~ /^===/) {
&close3;
&splittitle;
if ($id eq '') {
$line = "<sect3><title>$title</title>";
} else {
$line = "<sect3 id='$id'><title id='$id-title'>$title</title>";
}
$level3 = 1;
# sect2
#
} elsif ($line =~ /^==/) {
&close2;
&splittitle;
if ($id eq '') {
$line = "<sect2><title>$title</title>";
} else {
$line = "<sect2 id='$id'><title id='$id-title'>$title</title>";
}
$level2 = 1;
# sect1
#
} elsif ($line =~ /^=/) {
&close1;
&splittitle;
if ($id eq '') {
$line = "<sect1><title>$title</title>";
} else {
$line = "<sect1 id='$id'><title id='$id-title'>$title</title>";
}
$level1 = 1;
# orderedlist
#
} elsif ($line =~ /^#/) {
&closeitemizedlist;
if ($orderedlist == 0) {
$buf .= "<orderedlist>\n";
$orderedlist = 1;
}
&closelistitem;
$line =~ s/^#//;
&trimline;
$line =~ s/^/<listitem><para>/;
$listitem = 1;
$para = 1;
} elsif ($line =~ /^\/#/) {
$line =~ s/^\/#//;
&trimline;
&closeorderedlist;
# itemizedlist
#
} elsif ($line =~ /^\*/) {
&closeorderedlist;
if ($itemizedlist == 0) {
$buf .= "<itemizedlist>\n";
$itemizedlist = 1;
}
&closelistitem;
$line =~ s/^\*//;
&trimline;
$line =~ s/^/<listitem><para>/;
$listitem = 1;
$para = 1;
} elsif ($line =~ /\/\*/) {
$line =~ s/^\/\*//;
&trimline;
&closeitemizedlist;
# question
#
} elsif ($line =~ /^Q:/) {
&closelists;
&closeqandaentry;
$line =~ s/^Q://;
&trimline;
&splittitle;
if ($id eq '') {
$line = "<question><para>$title</para></question>";
} else {
$line = "<question id='$id'><para>$title</para></question>";
}
unless ($qandaentry) {
$line = "<qandaentry>\n" . $line;
$qandaentry = 1;
}
if ($qandaset == 0) {
$line = "<qandaset defaultlabel='qanda'>\n". $line;
$qandaset = 1;
}
# answer
#
} elsif ($line =~ /^A:/) {
$line =~ s/^A://;
&trimline;
&closeanswer;
$line = "<answer><para>" . $line;
$answer = 1;
$para = 1;
} elsif ($line =~ /^\s*----\s*$/) {
$line = '';
# para
#
} else {
if (($para == 0) and ($noparatag eq '')) {
$line = "<para>" . $line;
$para = 1;
} else {
$line .= " ";
}
}
$buf .= "$line\n";
}
sub ProcessEnd {
# close nesting
#
&close1;
if ($noparadepth > 0) {
$buf .= "ERROR tag $noparatag on line $noparaline unterminated.\n";
}
}
sub Buffer {
return $buf;
}
sub close1 {
&close2;
if ($level1) {
$buf .= "</sect1>\n";
$level1 = 0;
}
}
sub close2 {
&close3;
if ($level2) {
$buf .= "</sect2>\n";
$level2 = 0;
}
}
sub close3 {
&closeorderedlist;
&closeitemizedlist;
&closepara;
&closeqandaset;
if ($level3) {
$buf .= "</sect3>\n";
$level3 = 0;
}
}
sub closenonsect {
&closepara;
# &closeorderedlist;
# &closeitemizedlist;
}
sub closelistitem {
&closepara;
if ($listitem) {
$buf .= "</listitem>\n";
$listitem = 0;
}
}
sub closeorderedlist {
&closepara;
&closelistitem;
if ($orderedlist) {
$buf .= "</orderedlist>\n";
$orderedlist = 0;
}
}
sub closeitemizedlist {
&closepara;
&closelistitem;
if ($itemizedlist) {
$buf .= "</itemizedlist>\n";
$itemizedlist = 0;
}
}
sub closelists {
&closeitemizedlist;
&closeorderedlist;
}
sub closeanswer {
&closepara;
if ($answer) {
$buf .= "</answer>\n";
$answer = 0;
}
}
sub closeqandaentry {
&closeanswer;
if ($qandaentry) {
$buf .= "</qandaentry>\n";
$qandaentry = 0;
}
}
sub closeqandaset {
&closeqandaentry;
if ($qandaset) {
$buf .= "</qandaset>\n";
$qandaset = 0;
}
}
sub closepara {
if ($para) {
$buf .= "</para>\n";
$para = 0;
}
}
sub trimline {
$line =~ s/\s+$//;
$line =~ s/^\s+//;
}
sub splittitle {
$line =~ s/^=+//;
$line =~ s/=+$//;
$title = $line;
if ($line =~ /\|/) {
$title =~ s/\|.+//;
$id = $line;
$id =~ s/^.+\|//;
} else {
$id = &anchorfix($title);
}
$title =~ s/\s+$//;
$title =~ s/^\s+//;
$id =~ s/\s+$//;
$id =~ s/^\s+//;
}
sub anchorfix {
my $anchor = $_[0];
$anchor = lc(&trim($anchor));
$anchor = decode_entities($anchor);
$anchor =~ s/-/-dash-/g;
$anchor =~ s/&/-and-/g;
$anchor =~ s/;//g;
$anchor = encode_entities($anchor);
$anchor =~ s/&(\w)grave/\1/g;
$anchor =~ s/&(\w)acute/\1/g;
$anchor =~ s/&(\w)circ/\1/g;
$anchor =~ s/&(\w)uml/\1/g;
$anchor =~ s/&(\w)tilde/\1/g;
$anchor =~ s/&(\w)cedil/\1/g;
$anchor =~ s/&/-and-/g;
$anchor =~ s/;//g;
$anchor =~ s/\//-slash-/g;
$anchor =~ s/\\/-bslash-/g;
$anchor =~ s/\s+/-/g;
$anchor =~ s/'//g;
$anchor =~ s/`//g;
$anchor =~ s/,/-comma-/g;
$anchor =~ s/\./-dot-/g;
$anchor =~ s/!/-bang-/g;
$anchor =~ s/\?/-question-/g;
$anchor =~ s/\+/-plus-/g;
$anchor =~ s/\*/-x-/g;
$anchor =~ s/\(/-op-/g;
$anchor =~ s/\)/-cp-/g;
$anchor =~ s/\@/-at-/g;
$anchor =~ s/dcm_at/-at-/gi;
$anchor =~ s/\^/-hat-/g;
$anchor =~ s/=/-eq-/g;
$anchor =~ s/\$/S/;
$anchor =~ s/~/-tilde-/g;
$anchor =~ s/0/-zero-/g;
$anchor =~ s/1/-one-/g;
$anchor =~ s/2/-two-/g;
$anchor =~ s/3/-three-/g;
$anchor =~ s/4/-four-/g;
$anchor =~ s/5/-five-/g;
$anchor =~ s/6/-six-/g;
$anchor =~ s/7/-seven-/g;
$anchor =~ s/8/-eight-/g;
$anchor =~ s/9/-nine-/g;
$anchor =~ s/\|/-pipe-/g;
$anchor =~ s/\[/-lsqb-/g;
$anchor =~ s/\]/-rsqb-/g;
$anchor =~ s/^-+//;
$anchor =~ s/-+$//;
$anchor =~ s/--/-/g; # get rid of double, initial and trailing hyphens
return &trim($anchor);
}
sub trim {
my $temp = $_[0];
$temp =~ s/^\s+//g;
$temp =~ s/\s+$//g;
return $temp;
}
1;