mirror of https://github.com/tLDP/LDP
665 lines
13 KiB
Perl
Executable File
665 lines
13 KiB
Perl
Executable File
#!/usr/bin/perl
|
|
#
|
|
# Converts WikiText files into docbook.
|
|
#
|
|
package Wt2Db;
|
|
|
|
use File::Basename;
|
|
use HTML::Entities;
|
|
use FileHandle;
|
|
use Exporter;
|
|
|
|
@ISA = qw(Exporter);
|
|
@EXPORT = qw(
|
|
new
|
|
ProcessFile
|
|
ProcessLine
|
|
ProcessEnd
|
|
Buffer
|
|
Reset
|
|
);
|
|
|
|
&Reset;
|
|
|
|
# Call this before rerunning ProcessLine to clear state.
|
|
#
|
|
sub Reset {
|
|
$level1 = 0;
|
|
$level2 = 0;
|
|
$level3 = 0;
|
|
$orderedlist = 0;
|
|
$listitem = 0;
|
|
$itemizedlist = 0;
|
|
$para = 0;
|
|
$qandaset = 0;
|
|
$qandaentry = 0;
|
|
$answer = 0;
|
|
|
|
# These are passed in by the caller
|
|
#
|
|
$txtfile = '';
|
|
$dbfile = '';
|
|
$verbose = 0;
|
|
$doctype = 0;
|
|
$nonet = 0;
|
|
|
|
# These maintain state
|
|
#
|
|
$line = '';
|
|
$linenumber = 0;
|
|
$id = '';
|
|
$title = '';
|
|
$buf = '';
|
|
|
|
$noparatag = 0;
|
|
$noparadepth = 0;
|
|
$noparaline = 0;
|
|
}
|
|
|
|
|
|
# -----------------------------------------------------------
|
|
|
|
sub new {
|
|
my $that = shift;
|
|
my $class = ref($that) || $that;
|
|
my $self = {};
|
|
bless $self, $class;
|
|
return $self;
|
|
}
|
|
|
|
sub ProcessFile {
|
|
($self, $txtfile, $dbfile, $verbose, $doctype, $nonet, $encoding) = @_;
|
|
|
|
# Read from STDIN if no input file given
|
|
#
|
|
if ($txtfile) {
|
|
if( !(-r $txtfile) ) {
|
|
print "wt2db: ERROR cannot read $f ($!)\n\n";
|
|
exit(1);
|
|
} else {
|
|
$fh = new FileHandle;
|
|
open $fh, "<$txtfile" or die "Cannot open $txtfile ($!)\n";
|
|
}
|
|
} else {
|
|
$fh = STDIN;
|
|
}
|
|
|
|
if ($dbfile) {
|
|
$outfh = new FileHandle;
|
|
open $outfh, ">$dbfile" or die "Cannot write to $dbfile\n\n";
|
|
} else {
|
|
$outfh = STDOUT;
|
|
}
|
|
|
|
# wrap article if requested
|
|
#
|
|
$encoding = 'ISO-8859-1' unless ($encoding);
|
|
if ($doctype eq 'XML') {
|
|
print "Adding XML DOCTYPE and article tags." if ($verbose);
|
|
$buf = '<?xml version="1.0" encoding="' . $encoding . '" standalone="no"?>' . "\n";
|
|
$buf .= '<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"' . "\n";
|
|
$buf .= ' "http://docbook.org/xml/4.1.2/docbookx.dtd"';
|
|
$buf .= "\[\]\>\n";
|
|
$buf .= "\n";
|
|
$buf .= '<article>' . "\n";
|
|
} elsif ($doctype eq 'SGML') {
|
|
print "Adding SGML DOCTYPE and article tags." if ($verbose);
|
|
$buf = '<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook V4.1//EN">' . "\n";
|
|
$buf .= '<article>' . "\n";
|
|
}
|
|
|
|
# read in the text file
|
|
#
|
|
while ($originalline = <$fh>) {
|
|
ProcessLine($foo, $originalline);
|
|
print $outfh "$buf";
|
|
$buf = '';
|
|
}
|
|
|
|
ProcessEnd();
|
|
|
|
# wrap article if requested
|
|
#
|
|
if ($doctype) {
|
|
$buf .= '</article>' . "\n";
|
|
}
|
|
|
|
print $outfh "$buf";
|
|
$buf = '';
|
|
close $fh;
|
|
close $outfh;
|
|
}
|
|
|
|
sub ProcessLine {
|
|
($foo, $originalline) = @_;
|
|
|
|
$line = $originalline;
|
|
$linenumber++;
|
|
|
|
&trimline;
|
|
|
|
# blank lines
|
|
unless ($line) {
|
|
unless ($noparadepth) {
|
|
&closenonsect;
|
|
return;
|
|
}
|
|
}
|
|
|
|
# capitalize hints that can be entered in lowercase
|
|
#
|
|
$line =~ s/^q:/Q:/;
|
|
$line =~ s/^a:/A:/;
|
|
|
|
# encode entities
|
|
#
|
|
# while ($line =~ //) {
|
|
# }
|
|
# decode_entities($line);
|
|
# encode_entities($line);
|
|
|
|
# inline docbook
|
|
#
|
|
# parse all links, internal and external
|
|
#
|
|
while ($line =~ /\[\[/) {
|
|
unless ($line =~ /\]\]/) {
|
|
$buf .= "ERROR unterminated '[[' tag on line $linenumber.\n";
|
|
}
|
|
|
|
# separate link url from link name
|
|
#
|
|
$link = $line;
|
|
$link=~ s/\n//g;
|
|
$link =~ s/.*?\[\[//;
|
|
$link =~ s/\]\].*?$//;
|
|
if ($link =~ /\|/) {
|
|
$linkname = $link;
|
|
$link =~ s/\|.+$//;
|
|
$linkname =~ s/^\S+\|//;
|
|
} else {
|
|
$linkname = $link;
|
|
}
|
|
|
|
# kill quotes inside links, they mess us up because
|
|
# we have to wrap this string with quotes.
|
|
# perhaps it should be encoding the entire URL?
|
|
#
|
|
$link =~ s/'/%27/g;
|
|
|
|
# namespaces are handled differently
|
|
#
|
|
print "$link\n" if ($verbose);
|
|
|
|
if ($link =~ /^http:\/\//) {
|
|
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
|
|
} elsif ($link =~ /^link:/) {
|
|
$link =~ s/^link://;
|
|
$linkname =~ s/^link://;
|
|
$line =~ s/\[\[.*?\]\]/<xref linkend='$link' endterm='$link'\>\<\/xref\>/;
|
|
} elsif ($link =~ /^mailto:/) {
|
|
$linkname =~ s/^mailto://;
|
|
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
|
|
} elsif ($link =~ /^wiki:/) {
|
|
$linkname =~ s/^wiki://;
|
|
$link =~ s/^wiki:/http:\/\/www\.wikipedia\.com\/wiki\.phtml\?title=/;
|
|
$link =~ s/\ /+/;
|
|
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
|
|
} elsif ($link =~ /^ldp:/) {
|
|
$linkname =~ s/^ldp://;
|
|
$link =~ s/^ldp://;
|
|
if ($nonet) {
|
|
$line =~ s/\[\[.*?\]\]/<citetitle>$link<\/citetitle>/;
|
|
} else {
|
|
$tempfile = "/tmp/wt2db-" . $rand;
|
|
$cmd = "wget -q http://db.linuxdoc.org/cgi-pub/ldp-xml.pl?name=$link -O $tempfile";
|
|
print "$cmd\n" if ($verbose > 1);
|
|
$return = system("$cmd");
|
|
unless ($return) {
|
|
open(URL, "$tempfile") || die "wt2db: cannot open temporary file ($!)\n\n";
|
|
$link = '';
|
|
while ($url_line = <URL>) {
|
|
$url_line =~ s/\n//;
|
|
if ($url_line =~ /identifier/) {
|
|
$link .= $url_line;
|
|
}
|
|
}
|
|
close(URL);
|
|
unlink $tempfile;
|
|
}
|
|
$link =~ s/^.*?<identifier>//;
|
|
$link =~ s/<\/identifier>.*?$//;
|
|
if ($link eq '') {
|
|
$linkname = "ERROR: LDP namespace resolution failure on $linkname";
|
|
}
|
|
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
|
|
}
|
|
} elsif ($link =~ /^file:/) {
|
|
$linkname =~ s/^file://;
|
|
$line =~ s/\[\[.*?\]\]/<filename>$linkname<\/filename>/;
|
|
} elsif ($link =~ /^dir:/) {
|
|
|
|
# FIXME: need to check attribute on filename element
|
|
#
|
|
$linkname =~ s/^dir://;
|
|
$line =~ s/\[\[.*?\]\]/<filename type='directory'>$linkname<\/filename>/;
|
|
} else {
|
|
$line =~ s/\[\[.*?\]\]/<filename>$linkname<\/filename>/;
|
|
}
|
|
}
|
|
|
|
# emphasis
|
|
#
|
|
while ($line =~ /'''.*'''/) {
|
|
$line =~ s/'''/<emphasis role='bold'>/;
|
|
$line =~ s/'''/<\/emphasis>/;
|
|
}
|
|
|
|
# this block defines DocBook structures that won't be broken up with
|
|
# paragraphs when we hit empty lines:
|
|
#
|
|
# <para>
|
|
# <sect1>
|
|
# <sect2>
|
|
# <sect3>
|
|
# <programlisting>
|
|
# <literallayout>
|
|
|
|
# forget about being in nopara state if we're no longer in one
|
|
#
|
|
if ($noparadepth == 0) {
|
|
$noparatag = "";
|
|
}
|
|
|
|
# start a new nopara section
|
|
#
|
|
if ((($line =~ /^<para>/) or
|
|
($line =~ /^<sect/) or
|
|
($line =~ /^<screen>/) or
|
|
($line =~ /^<screen>/) or
|
|
($line =~ /^<blockquote>/) or
|
|
($line =~ /^<literallayout>/) or
|
|
($line =~ /^<articleinfo>/) or
|
|
($line =~ /^<programlisting>/)) and
|
|
($noparadepth == 0)) {
|
|
&closepara;
|
|
$noparatag = $line;
|
|
$noparatag =~ s/^.*?<//;
|
|
$noparatag =~ s/>.*?$//;
|
|
$noparaline = $linenumber;
|
|
|
|
# screen sections don't embed para tags, but are wrapped in them
|
|
#
|
|
if ($line =~ /^<screen>/) {
|
|
unless ($para) {
|
|
$buf .= "<para>";
|
|
$para = 1;
|
|
}
|
|
}
|
|
}
|
|
|
|
# count noparadepth
|
|
#
|
|
if ($noparatag ne '') {
|
|
$temp = $line;
|
|
while ($temp =~ /<$noparatag>/) {
|
|
$temp =~ s/<?$noparatag>//;
|
|
$noparadepth ++;
|
|
}
|
|
while ($temp =~ /<\/$noparatag>/) {
|
|
$temp =~ s/<?\/$noparatag>//;
|
|
$noparadepth --;
|
|
if ($noparadepth == 0) {
|
|
$noparaline = 0;
|
|
}
|
|
}
|
|
|
|
# runon protection
|
|
#
|
|
if (($noparaline) and ($linenumber >= ($noparaline + 100))) {
|
|
$buf .= "ERROR: runon block starting on line $noparaline\n";
|
|
return;
|
|
}
|
|
|
|
# recover original line -- no whitespace modifiers
|
|
# allow nonencoded text in unparsed lines, when in a literal block
|
|
#
|
|
$line = $originalline;
|
|
chomp($line);
|
|
if ($line =~ /^<$noparatag>/ ) {
|
|
$starttag = "<$noparatag>";
|
|
} else {
|
|
$starttag = '';
|
|
}
|
|
if ($line =~ /<\/$noparatag>/ ) {
|
|
$endtag = "<\/$noparatag>";
|
|
} else {
|
|
$endtag = '';
|
|
}
|
|
|
|
$line =~ s/<$noparatag>//;
|
|
$line =~ s/<\/$noparatag>//;
|
|
if (($noparatag eq 'screen') or
|
|
($noparatag eq 'literallayout') or
|
|
($noparatag eq 'programlisting')) {
|
|
encode_entities($line);
|
|
}
|
|
$line = "$starttag$line$endtag";
|
|
|
|
# sect3
|
|
#
|
|
} elsif ($line =~ /^===/) {
|
|
&close3;
|
|
&splittitle;
|
|
if ($id eq '') {
|
|
$line = "<sect3><title>$title</title>";
|
|
} else {
|
|
$line = "<sect3 id='$id'><title id='$id-title'>$title</title>";
|
|
}
|
|
$level3 = 1;
|
|
|
|
# sect2
|
|
#
|
|
} elsif ($line =~ /^==/) {
|
|
&close2;
|
|
&splittitle;
|
|
if ($id eq '') {
|
|
$line = "<sect2><title>$title</title>";
|
|
} else {
|
|
$line = "<sect2 id='$id'><title id='$id-title'>$title</title>";
|
|
}
|
|
$level2 = 1;
|
|
|
|
# sect1
|
|
#
|
|
} elsif ($line =~ /^=/) {
|
|
&close1;
|
|
&splittitle;
|
|
if ($id eq '') {
|
|
$line = "<sect1><title>$title</title>";
|
|
} else {
|
|
$line = "<sect1 id='$id'><title id='$id-title'>$title</title>";
|
|
}
|
|
$level1 = 1;
|
|
|
|
# orderedlist
|
|
#
|
|
} elsif ($line =~ /^#/) {
|
|
&closeitemizedlist;
|
|
if ($orderedlist == 0) {
|
|
$buf .= "<orderedlist>\n";
|
|
$orderedlist = 1;
|
|
}
|
|
&closelistitem;
|
|
$line =~ s/^#//;
|
|
&trimline;
|
|
$line =~ s/^/<listitem><para>/;
|
|
$listitem = 1;
|
|
$para = 1;
|
|
} elsif ($line =~ /^\/#/) {
|
|
$line =~ s/^\/#//;
|
|
&trimline;
|
|
&closeorderedlist;
|
|
|
|
# itemizedlist
|
|
#
|
|
} elsif ($line =~ /^\*/) {
|
|
&closeorderedlist;
|
|
if ($itemizedlist == 0) {
|
|
$buf .= "<itemizedlist>\n";
|
|
$itemizedlist = 1;
|
|
}
|
|
&closelistitem;
|
|
$line =~ s/^\*//;
|
|
&trimline;
|
|
$line =~ s/^/<listitem><para>/;
|
|
$listitem = 1;
|
|
$para = 1;
|
|
} elsif ($line =~ /\/\*/) {
|
|
$line =~ s/^\/\*//;
|
|
&trimline;
|
|
&closeitemizedlist;
|
|
|
|
# question
|
|
#
|
|
} elsif ($line =~ /^Q:/) {
|
|
&closelists;
|
|
&closeqandaentry;
|
|
$line =~ s/^Q://;
|
|
&trimline;
|
|
&splittitle;
|
|
if ($id eq '') {
|
|
$line = "<question><para>$title</para></question>";
|
|
} else {
|
|
$line = "<question id='$id'><para>$title</para></question>";
|
|
}
|
|
unless ($qandaentry) {
|
|
$line = "<qandaentry>\n" . $line;
|
|
$qandaentry = 1;
|
|
}
|
|
if ($qandaset == 0) {
|
|
$line = "<qandaset defaultlabel='qanda'>\n". $line;
|
|
$qandaset = 1;
|
|
}
|
|
|
|
# answer
|
|
#
|
|
} elsif ($line =~ /^A:/) {
|
|
$line =~ s/^A://;
|
|
&trimline;
|
|
&closeanswer;
|
|
$line = "<answer><para>" . $line;
|
|
$answer = 1;
|
|
$para = 1;
|
|
|
|
} elsif ($line =~ /^\s*----\s*$/) {
|
|
$line = '';
|
|
|
|
# para
|
|
#
|
|
} else {
|
|
if (($para == 0) and ($noparatag eq '')) {
|
|
$line = "<para>" . $line;
|
|
$para = 1;
|
|
} else {
|
|
$line .= " ";
|
|
}
|
|
}
|
|
|
|
$buf .= "$line\n";
|
|
}
|
|
|
|
sub ProcessEnd {
|
|
# close nesting
|
|
#
|
|
&close1;
|
|
|
|
if ($noparadepth > 0) {
|
|
$buf .= "ERROR tag $noparatag on line $noparaline unterminated.\n";
|
|
}
|
|
}
|
|
|
|
sub Buffer {
|
|
return $buf;
|
|
}
|
|
|
|
sub close1 {
|
|
&close2;
|
|
if ($level1) {
|
|
$buf .= "</sect1>\n";
|
|
$level1 = 0;
|
|
}
|
|
}
|
|
|
|
sub close2 {
|
|
&close3;
|
|
if ($level2) {
|
|
$buf .= "</sect2>\n";
|
|
$level2 = 0;
|
|
}
|
|
}
|
|
|
|
sub close3 {
|
|
&closeorderedlist;
|
|
&closeitemizedlist;
|
|
&closepara;
|
|
&closeqandaset;
|
|
if ($level3) {
|
|
$buf .= "</sect3>\n";
|
|
$level3 = 0;
|
|
}
|
|
}
|
|
|
|
sub closenonsect {
|
|
&closepara;
|
|
# &closeorderedlist;
|
|
# &closeitemizedlist;
|
|
}
|
|
|
|
sub closelistitem {
|
|
&closepara;
|
|
if ($listitem) {
|
|
$buf .= "</listitem>\n";
|
|
$listitem = 0;
|
|
}
|
|
}
|
|
|
|
sub closeorderedlist {
|
|
&closepara;
|
|
&closelistitem;
|
|
if ($orderedlist) {
|
|
$buf .= "</orderedlist>\n";
|
|
$orderedlist = 0;
|
|
}
|
|
}
|
|
|
|
sub closeitemizedlist {
|
|
&closepara;
|
|
&closelistitem;
|
|
if ($itemizedlist) {
|
|
$buf .= "</itemizedlist>\n";
|
|
$itemizedlist = 0;
|
|
}
|
|
}
|
|
|
|
sub closelists {
|
|
&closeitemizedlist;
|
|
&closeorderedlist;
|
|
}
|
|
|
|
sub closeanswer {
|
|
&closepara;
|
|
if ($answer) {
|
|
$buf .= "</answer>\n";
|
|
$answer = 0;
|
|
}
|
|
}
|
|
|
|
sub closeqandaentry {
|
|
&closeanswer;
|
|
if ($qandaentry) {
|
|
$buf .= "</qandaentry>\n";
|
|
$qandaentry = 0;
|
|
}
|
|
}
|
|
|
|
sub closeqandaset {
|
|
&closeqandaentry;
|
|
if ($qandaset) {
|
|
$buf .= "</qandaset>\n";
|
|
$qandaset = 0;
|
|
}
|
|
}
|
|
|
|
sub closepara {
|
|
if ($para) {
|
|
$buf .= "</para>\n";
|
|
$para = 0;
|
|
}
|
|
}
|
|
|
|
sub trimline {
|
|
$line =~ s/\s+$//;
|
|
$line =~ s/^\s+//;
|
|
}
|
|
|
|
sub splittitle {
|
|
$line =~ s/^=+//;
|
|
$line =~ s/=+$//;
|
|
$title = $line;
|
|
if ($line =~ /\|/) {
|
|
$title =~ s/\|.+//;
|
|
$id = $line;
|
|
$id =~ s/^.+\|//;
|
|
} else {
|
|
$id = &anchorfix($title);
|
|
}
|
|
$title =~ s/\s+$//;
|
|
$title =~ s/^\s+//;
|
|
$id =~ s/\s+$//;
|
|
$id =~ s/^\s+//;
|
|
}
|
|
|
|
sub anchorfix {
|
|
my $anchor = $_[0];
|
|
$anchor = lc(&trim($anchor));
|
|
$anchor = decode_entities($anchor);
|
|
$anchor =~ s/-/-dash-/g;
|
|
$anchor =~ s/&/-and-/g;
|
|
$anchor =~ s/;//g;
|
|
$anchor = encode_entities($anchor);
|
|
$anchor =~ s/&(\w)grave/\1/g;
|
|
$anchor =~ s/&(\w)acute/\1/g;
|
|
$anchor =~ s/&(\w)circ/\1/g;
|
|
$anchor =~ s/&(\w)uml/\1/g;
|
|
$anchor =~ s/&(\w)tilde/\1/g;
|
|
$anchor =~ s/&(\w)cedil/\1/g;
|
|
$anchor =~ s/&/-and-/g;
|
|
$anchor =~ s/;//g;
|
|
$anchor =~ s/\//-slash-/g;
|
|
$anchor =~ s/\\/-bslash-/g;
|
|
$anchor =~ s/\s+/-/g;
|
|
$anchor =~ s/'//g;
|
|
$anchor =~ s/`//g;
|
|
$anchor =~ s/,/-comma-/g;
|
|
$anchor =~ s/\./-dot-/g;
|
|
$anchor =~ s/!/-bang-/g;
|
|
$anchor =~ s/\?/-question-/g;
|
|
$anchor =~ s/\+/-plus-/g;
|
|
$anchor =~ s/\*/-x-/g;
|
|
$anchor =~ s/\(/-op-/g;
|
|
$anchor =~ s/\)/-cp-/g;
|
|
$anchor =~ s/\@/-at-/g;
|
|
$anchor =~ s/dcm_at/-at-/gi;
|
|
$anchor =~ s/\^/-hat-/g;
|
|
$anchor =~ s/=/-eq-/g;
|
|
$anchor =~ s/\$/S/;
|
|
$anchor =~ s/~/-tilde-/g;
|
|
$anchor =~ s/0/-zero-/g;
|
|
$anchor =~ s/1/-one-/g;
|
|
$anchor =~ s/2/-two-/g;
|
|
$anchor =~ s/3/-three-/g;
|
|
$anchor =~ s/4/-four-/g;
|
|
$anchor =~ s/5/-five-/g;
|
|
$anchor =~ s/6/-six-/g;
|
|
$anchor =~ s/7/-seven-/g;
|
|
$anchor =~ s/8/-eight-/g;
|
|
$anchor =~ s/9/-nine-/g;
|
|
$anchor =~ s/\|/-pipe-/g;
|
|
$anchor =~ s/\[/-lsqb-/g;
|
|
$anchor =~ s/\]/-rsqb-/g;
|
|
$anchor =~ s/^-+//;
|
|
$anchor =~ s/-+$//;
|
|
$anchor =~ s/--/-/g; # get rid of double, initial and trailing hyphens
|
|
return &trim($anchor);
|
|
}
|
|
|
|
sub trim {
|
|
my $temp = $_[0];
|
|
|
|
$temp =~ s/^\s+//g;
|
|
$temp =~ s/\s+$//g;
|
|
return $temp;
|
|
}
|
|
1;
|