moved from a straight script to a Perl module with a wrapper script.

This is so the module can be used by Lampadas directly without needing
to shell out to the OS.
This commit is contained in:
david 2002-04-06 13:13:21 +00:00
parent 1d11ea1682
commit 698c2baf70
5 changed files with 771 additions and 495 deletions

26
LDP/wt2db/Makefile.PL Normal file
View File

@ -0,0 +1,26 @@
#!/usr/bin/perl -sw
##
## Makefile for the wt2db Perl module.
##
## Copyright (c) 2001, 2002, David Merrill. All rights reserved.
## This code is free software; you can redistribute it and/or modify
## it under the same terms as Perl itself.
##
use ExtUtils::MakeMaker;
WriteMakefile(
NAME => 'Wt2Db',
AUTHOR => 'David Merrill <david@lupercalia.net>',
ABSTRACT => 'Converts WikiText documents into DocBook XML.',
NOECHO => '',
VERSION_FROM => 'lib/Wt2Db.pm',
EXE_FILES => 'wt2db',
PREREQ_PM => {
'File::Basename' => 0,
'HTML::Entities' => 0,
'FileHandle' => 0,
},
);

145
LDP/wt2db/doc/wt2db.sgml Normal file
View File

@ -0,0 +1,145 @@
<!doctype refentry PUBLIC "-//OASIS//DTD DocBook V4//EN" [
<!-- Process this file with docbook-to-man to generate an nroff manual
page: `docbook-to-man manpage.sgml > manpage.1'. You may view
the manual page with: `docbook-to-man manpage.sgml | nroff -man |
less'. A typical entry in a Makefile or Makefile.am is:
manpage.1: manpage.sgml
docbook-to-man $< > $@
The docbook-to-man binary is found in the docbook-to-man package.
Please remember that if you create the nroff version in one of the
debian/rules file targets (such as build), you will need to include
docbook-to-man in your Build-Depends control field.
-->
<!-- Fill in your name for FIRSTNAME and SURNAME. -->
<!ENTITY dhfirstname "<firstname>David</firstname>">
<!ENTITY dhsurname "<surname>Merrill</surname>">
<!-- Please adjust the date whenever revising the manpage. -->
<!ENTITY dhdate "<date>March 28, 2002</date>">
<!-- SECTION should be 1-8, maybe w/ subsection other parameters are
allowed: see man(7), man(1). -->
<!ENTITY dhsection "<manvolnum>1</manvolnum>">
<!ENTITY dhemail "<email>david@lupercalia.net</email>">
<!ENTITY dhusername "David C. Merrill">
<!ENTITY dhucpackage "<refentrytitle>WT2DB</refentrytitle>">
<!ENTITY dhpackage "wt2db">
<!ENTITY debian "<productname>Debian</productname>">
<!ENTITY gnu "<acronym>GNU</acronym>">
]>
<refentry>
<refentryinfo>
<address>
&dhemail;
</address>
<author>
&dhfirstname;
&dhsurname;
</author>
<copyright>
<year>2001</year>
<holder>&dhusername;</holder>
</copyright>
&dhdate;
</refentryinfo>
<refmeta>
&dhucpackage;
&dhsection;
</refmeta>
<refnamediv>
<refname>&dhpackage;</refname>
<refpurpose>utility to convert WikiText documents into DocBook XML</refpurpose>
</refnamediv>
<refsynopsisdiv>
<cmdsynopsis>
<command>&dhpackage;</command>
<arg><option>-o </option><replaceable>filename</replaceable></arg>
<arg><option>-v</option></arg>
<arg><option>-h</option></arg>
</cmdsynopsis>
</refsynopsisdiv>
<refsect1>
<title>DESCRIPTION</title>
<para>This manual page documents briefly the
<command>&dhpackage;</command> command.</para>
<para><command>&dhpackage;</command> is a program that converts a text
file in a format commonly used by WikiWikiWebs into DocBook SGML.</para>
</refsect1>
<refsect1>
<title>OPTIONS</title>
<para>These programs follow the usual GNU command line syntax,
with long options starting with two dashes (`-'). A summary of
options is included below. For a complete description, see the
<application>Info</application> files.</para>
<variablelist>
<varlistentry>
<varlistentry>
<term><option>-o</option>
<option>--output-to</option>
</term>
<listitem>
<para>Write to the specified file. The default is to write to standard output.</para>
</listitem>
</varlistentry>
<term><option>-v</option>
<option>--verbose</option>
</term>
<listitem>
<para>Show diagnostic output.</para>
</listitem>
</varlistentry>
<varlistentry>
<term><option>-h</option>
<option>--help</option>
</term>
<listitem>
<para>Show summary of options.</para>
</listitem>
</varlistentry>
</variablelist>
</refsect1>
<refsect1>
<title>AUTHOR</title>
<para>This manual page was written by &dhusername; &dhemail;.
Permission is
granted to copy, distribute and/or modify this document under
the terms of the <acronym>GNU</acronym> Free Documentation
License, Version 1.1 or any later version published by the Free
Software Foundation; with no Invariant Sections, no Front-Cover
Texts and no Back-Cover Texts.</para>
</refsect1>
</refentry>
<!-- Keep this comment at the end of the file
Local variables:
mode: sgml
sgml-omittag:t
sgml-shorttag:t
sgml-minimize-attributes:nil
sgml-always-quote-attributes:t
sgml-indent-step:2
sgml-indent-data:t
sgml-parent-document:nil
sgml-default-dtd-file:nil
sgml-exposed-tags:nil
sgml-local-catalogs:nil
sgml-local-ecat-files:nil
End:
-->

573
LDP/wt2db/lib/Wt2Db.pm Executable file
View File

@ -0,0 +1,573 @@
#!/usr/bin/perl
#
# Converts WikiText files into docbook.
#
package Wt2Db;
$VERSION = '0.2';
use File::Basename;
use HTML::Entities;
use FileHandle;
use Exporter;
@ISA = qw(Exporter);
@EXPORT = qw(
new,
ProcessFile,
ProcessLine,
ProcessEnd,
Buffer,
Reset,
Usage
);
# These keep track of which constructs we're in the middle of
#
$level1 = 0;
$level2 = 0;
$level3 = 0;
$orderedlist = 0;
$listitem = 0;
$itemizedlist = 0;
$para = 0;
$qandaset = 0;
$qandaentry = 0;
$answer = 0;
# These are passed in by the caller
#
$txtfile = '';
$dbfile = '';
$verbose = 0;
# These maintain state
#
$line = '';
$linenumber = 0;
$id = '';
$title = '';
$buf = '';
$noparatag = 0;
$noparadepth = 0;
$noparaline = 0;
# -----------------------------------------------------------
sub new {
my $that = shift;
my $class = ref($that) || $that;
my $self = {};
bless $self, $class;
return $self;
}
sub ProcessFile {
($self, $txtfile, $dbfile, $verbose) = @_;
# Read from STDIN if no input file given
#
if ($txtfile) {
if( !(-r $txtfile) ) {
print "txt2db: ERROR cannot read $f ($!)\n\n";
&usage(1);
} else {
$fh = new FileHandle;
open $fh, "<$txtfile" or die "Cannot open $txtfile ($!)\n";
}
} else {
$fh = STDIN;
}
if ($dbfile) {
$outfh = new FileHandle;
open $outfh, ">$dbfile" or die "Cannot write to $dbfile\n\n";
} else {
$outfh = STDOUT;
}
# read in the text file
#
while ($originalline = <$fh>) {
ProcessLine($foo, $originalline);
print $outfh "$buf";
$buf = '';
}
ProcessEnd();
print $outfh "$buf";
$buf = '';
close $fh;
close $outfh;
}
sub ProcessLine {
($foo, $originalline) = @_;
$line = $originalline;
$linenumber++;
&trimline;
# blank lines
unless ($line) {
unless ($noparadepth) {
&closenonsect;
return;
}
}
# capitalize hints that can be entered in lowercase
#
$line =~ s/^q:/Q:/;
$line =~ s/^a:/A:/;
# encode entities
#
# while ($line =~ //) {
# }
# decode_entities($line);
# encode_entities($line);
# inline docbook
#
# ulink
#
while ($line =~ /\[\[/) {
unless ($line =~ /\]\]/) {
$buf .= "ERROR unterminated '[[' tag on line $linenumber.\n";
}
# separate link url from link name
#
$link = $line;
$link=~ s/\n//g;
$link =~ s/.*?\[\[//;
$link =~ s/\]\].*?$//;
if ($link =~ /\|/) {
$linkname = $link;
$link =~ s/\|.+$//;
$linkname =~ s/^\S+\|//;
} else {
$linkname = $link;
}
# kill quotes, they mess us up
#
$link =~ s/'/%27/g;
# namespaces are handled differently
#
print "$link\n" if ($verbose);
if ($link =~ /^http:/) {
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
} elsif ($link =~ /^mailto:/) {
$linkname =~ s/^mailto://;
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
} elsif ($link =~ /^wiki:/) {
$linkname =~ s/^wiki://;
$link =~ s/^wiki:/http:\/\/www\.wikipedia\.com\/wiki\.phtml\?title=/;
$link =~ s/\ /+/;
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
} elsif ($link =~ /^ldp:/) {
$linkname =~ s/^ldp://;
$link =~ s/^ldp://;
$tempfile = "/tmp/txt2db-" . $rand;
$cmd = "wget -q http://db.linuxdoc.org/cgi-pub/ldp-xml.pl?name=$link -O $tempfile";
system("$cmd");
open(URL, "$tempfile") || die "txt2db: cannot open temporary file ($!)\n\n";
$link = "";
while ($url_line = <URL>) {
$url_line =~ s/\n//;
if ($url_line =~ /identifier/) {
$link .= $url_line;
}
}
close(URL);
unlink $tempfile;
$link =~ s/^.*?<identifier>//;
$link =~ s/<\/identifier>.*?$//;
if ($link eq '') {
$linkname = "ERROR: LDP namespace resolution failure on $linkname";
}
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
} elsif ($link =~ /^file:/) {
$linkname =~ s/^file://;
$line =~ s/\[\[.*?\]\]/<filename>$linkname<\/filename>/;
} else {
$line =~ s/\[\[.*?\]\]/<filename>$linkname<\/filename>/;
}
}
# emphasis
#
while ($line =~ /'''.*'''/) {
$line =~ s/'''/<emphasis role='bold'>/;
$line =~ s/'''/<\/emphasis>/;
}
# this block defines DocBook structures that won't be broken up with
# paragraphs when we hit empty lines:
#
# <para>
# <sect1>
# <sect2>
# <sect3>
# <programlisting>
# <literallayout>
# forget about nopara
if ($noparadepth == 0) {
$noparatag = "";
}
# start a new nopara section
#
if ((($line =~ /^<para>/) or
($line =~ /^<sect/) or
($line =~ /^<screen>/) or
($line =~ /^<literallayout>/) or
($line =~ /^<programlisting>/)) and
($noparadepth == 0)) {
&closepara;
$noparatag = $line;
$noparatag =~ s/^.*?<//;
$noparatag =~ s/>.*?$//;
$noparaline = $linenumber;
if ($line =~ /^<screen>/) {
unless ($para) {
$line = "<para>" . $line;
$para = 1;
}
}
}
# count noparadepth
#
if ($noparatag ne '') {
$temp = $line;
while ($temp =~ /<$noparatag>/) {
$temp =~ s/<?$noparatag>//;
$noparadepth ++;
}
while ($temp =~ /<\/$noparatag>/) {
$temp =~ s/<?\/$noparatag>//;
$noparadepth --;
if ($noparadepth == 0) {
$noparaline = 0;
}
}
# runon protection
#
if (($noparaline) and ($linenumber >= ($noparaline + 100))) {
$buf .= "ERROR: runon block starting on line $noparaline\n";
return;
}
# recover original line -- no whitespace modifiers
#
$line = $originalline;
# sect3
#
} elsif ($line =~ /^===/) {
&close3;
&splittitle;
if ($id eq '') {
$line = "<sect3><title>$title</title>";
} else {
$line = "<sect3 id='$id'><title id='$id-title'>$title</title>";
}
$level3 = 1;
# sect2
#
} elsif ($line =~ /^==/) {
&close2;
&splittitle;
if ($id eq '') {
$line = "<sect2><title>$title</title>";
} else {
$line = "<sect2 id='$id'><title id='$id-title'>$title</title>";
}
$level2 = 1;
# sect1
#
} elsif ($line =~ /^=/) {
&close1;
&splittitle;
if ($id eq '') {
$line = "<sect1><title>$title</title>";
} else {
$line = "<sect1 id='$id'><title id='$id-title'>$title</title>";
}
$level1 = 1;
# orderedlist
#
} elsif ($line =~ /^#/) {
&closeitemizedlist;
if ($orderedlist == 0) {
$buf .= "<orderedlist>\n";
$orderedlist = 1;
}
&closelistitem;
$line =~ s/^#//;
&trimline;
$line =~ s/^/<listitem><para>/;
$listitem = 1;
$para = 1;
} elsif ($line =~ /^\/#/) {
$line =~ s/^\/#//;
&trimline;
&closeorderedlist;
# itemizedlist
#
} elsif ($line =~ /^\*/) {
&closeorderedlist;
if ($itemizedlist == 0) {
$buf .= "<itemizedlist>\n";
$itemizedlist = 1;
}
&closelistitem;
$line =~ s/^\*//;
&trimline;
$line =~ s/^/<listitem><para>/;
$listitem = 1;
$para = 1;
} elsif ($line =~ /\/\*/) {
$line =~ s/^\/\*//;
&trimline;
&closeitemizedlist;
# question
#
} elsif ($line =~ /^Q:/) {
&closelists;
&closeqandaentry;
$line =~ s/^Q://;
&trimline;
&splittitle;
if ($id eq '') {
$line = "<question><para>" . $title . "</para></question>";
} else {
$line = "<question id='$id'><para>" . $title . "</para></question>";
}
unless ($qandaentry) {
$line = "<qandaentry>\n" . $line;
$qandaentry = 1;
}
if ($qandaset == 0) {
$line = "<qandaset defaultlabel='qanda'>\n". $line;
$qandaset = 1;
}
# answer
#
} elsif ($line =~ /^A:/) {
$line =~ s/^A://;
&trimline;
&closeanswer;
$line = "<answer><para>" . $line;
$answer = 1;
$para = 1;
} elsif ($line =~ /^\s*----\s*$/) {
$line = '';
# para
#
} else {
if (($para == 0) and ($noparatag eq '')) {
$line = "<para>" . $line;
$para = 1;
} else {
$line .= " ";
}
}
$buf .= "$line\n";
}
sub ProcessEnd {
# close nesting
#
&close1;
if ($noparadepth > 0) {
$buf .= "ERROR tag $noparatag on line $noparaline unterminated.\n";
}
}
sub Buffer {
return $buf;
}
# Basically a cut-and-paste of the original declarations,
# to make sure all variables are completely cleared.
#
# Call this before rerunning ProcessLine to clear state.
#
sub Reset {
$level1 = 0;
$level2 = 0;
$level3 = 0;
$orderedlist = 0;
$listitem = 0;
$itemizedlist = 0;
$para = 0;
$qandaset = 0;
$qandaentry = 0;
$answer = 0;
# These are passed in by the caller
#
$txtfile = '';
$dbfile = '';
$verbose = 0;
# These maintain state
#
$line = '';
$linenumber = 0;
$id = '';
$title = '';
$buf = '';
$noparatag = 0;
$noparadepth = 0;
$noparaline = 0;
}
sub close1 {
&close2;
if ($level1) {
$buf .= "</sect1>\n";
$level1 = 0;
}
}
sub close2 {
&close3;
if ($level2) {
$buf .= "</sect2>\n";
$level2 = 0;
}
}
sub close3 {
&closeorderedlist;
&closeitemizedlist;
&closepara;
&closeqandaset;
if ($level3) {
$buf .= "</sect3>\n";
$level3 = 0;
}
}
sub closenonsect {
&closepara;
# &closeorderedlist;
# &closeitemizedlist;
}
sub closelistitem {
&closepara;
if ($listitem) {
$buf .= "</listitem>\n";
$listitem = 0;
}
}
sub closeorderedlist {
&closepara;
&closelistitem;
if ($orderedlist) {
$buf .= "</orderedlist>\n";
$orderedlist = 0;
}
}
sub closeitemizedlist {
&closepara;
&closelistitem;
if ($itemizedlist) {
$buf .= "</itemizedlist>\n";
$itemizedlist = 0;
}
}
sub closelists {
&closeitemizedlist;
&closeorderedlist;
}
sub closeanswer {
&closepara;
if ($answer) {
$buf .= "</answer>\n";
$answer = 0;
}
}
sub closeqandaentry {
&closeanswer;
if ($qandaentry) {
$buf .= "</qandaentry>\n";
$qandaentry = 0;
}
}
sub closeqandaset {
&closeqandaentry;
if ($qandaset) {
$buf .= "</qandaset>\n";
$qandaset = 0;
}
}
sub closepara {
if ($para) {
$buf .= "</para>\n";
$para = 0;
}
}
sub trimline {
$line =~ s/\s+$//;
$line =~ s/^\s+//;
}
sub splittitle {
$line =~ s/^=+//;
$line =~ s/=+$//;
$title = $line;
$id = "";
if ($line =~ /\|/) {
$title =~ s/\|.+//;
$id = $line;
$id =~ s/^.+\|//;
}
$title =~ s/\s+$//;
$title =~ s/^\s+//;
$id =~ s/\s+$//;
$id =~ s/^\s+//;
}
sub Usage {
my $error = shift;
print "Usage: wt2db [OPTIONS] {FILE]\n";
print "\n";
print "Options:\n";
print "-o, --output-to write to the specified file.\n";
print "-v, --verbose show diagnostic output.\n";
print "-h, --help show this usage message.\n";
exit($error);
}
1;

19
LDP/wt2db/test.pl Executable file
View File

@ -0,0 +1,19 @@
#!/usr/bin/perl
#
use Wt2Db;
$WT = new Wt2Db;
$buffer = "foo bar
baz
";
$outbuf = '';
foreach $line (split /\n/, $buffer) {
$WT->ProcessLine($line);
}
$WT->ProcessEnd();
$outbuf = $WT->Buffer();
print $outbuf;

View File

@ -2,37 +2,13 @@
#
#Converts txt files into docbook.
#
# Requirements:
#
# If you use the "ldp:" namespace, you must have wget installed.
# Wget is used to request an xml record from the LDP # database,
# http://db.linuxdoc.org.
#
use Wt2Db;
$WT2DB = new Wt2Db;
use File::Basename;
use HTML::Entities;
my($txtfile, $dbfile) = '';
#These keep track of which constructs we're in the middle of
my($level1,
$level2,
$level3,
$orderedlist,
$listitem,
$itemizedlist,
$para,
$qandaset,
$qandaentry,
$answer);
my($line);
my($id, $title);
my($verbose);
my($error);
$error = 0;
my $txtfile = '';
my $dbfile = '';
my $verbose = 0;
my $error = 0;
# read in cmd-line arguments
#
@ -42,7 +18,7 @@ while (1) {
$dbfile = $ARGV[0];
shift(@ARGV);
} elsif($ARGV[0] eq "-h" or $ARGV[0] eq "--help") {
&usage;
$WT2DB->Usage();
} elsif($ARGV[0] eq "-v" or $ARGV[0] eq "--verbose") {
$verbose++;
shift(@ARGV);
@ -56,467 +32,4 @@ while (1) {
}
}
# abort if no input file given
#
if($txtfile eq '') {
$error = 1;
&usage();
} elsif( !(-r $txtfile) ) {
print "txt2db: ERROR cannot read $f ($!)\n\n";
$error = 1;
&usage();
}
unless ($dbfile) {
($basename, $path, $ext) = fileparse($txtfile);
$dbfile = $basename;
$dbfile =~ s/\..*?$/\.sgml/;
}
$buf = '';
&proc_txt($txtfile);
open(DB, "> $dbfile") || die "txt2db: cannot write to $dbfile ($!)\n";
print DB $buf, "\n";
close(DB);
exit(0);
# -----------------------------------------------------------
sub proc_txt {
my($f) = @_;
my($linenumber);
$linenumber = 0;
my ($noparatag,
$noparadepth);
$noparadepth = 0;
$noparaline = 0;
# read in the text file
#
open(TXT, "$f") || die "txt2db: cannot open $f ($!)\n";
while ($originalline = <TXT>) {
$line = $originalline;
$linenumber++;
&trimline;
# blank lines
if ($line eq '') {
if ($noparadepth == 0) {
&closenonsect;
next;
}
}
# capitalize hints that can be entered in lowercase
#
$line =~ s/^q:/Q:/;
$line =~ s/^a:/A:/;
# encode entities
#
# while ($line =~ //) {
# }
# decode_entities($line);
encode_entities($line);
# inline docbook
#
# ulink
#
while ($line =~ /\[\[/) {
unless ($line =~ /\]\]/) {
$buf .= "ERROR unterminated '[[' tag on line $linenumber.\n";
}
# separate link url from link name
#
$link = $line;
$link=~ s/\n//g;
$link =~ s/.*?\[\[//;
$link =~ s/\]\].*?$//;
if ($link =~ /\|/) {
$linkname = $link;
$link =~ s/\|.+$//;
$linkname =~ s/^\S+\|//;
} else {
$linkname = $link;
}
# kill quotes, they mess us up
#
$link =~ s/'/%27/g;
# namespaces are handled differently
#
print "$link\n" if ($verbose);
if ($link =~ /^http:/) {
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
} elsif ($link =~ /^mailto:/) {
$linkname =~ s/^mailto://;
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
} elsif ($link =~ /^wiki:/) {
$linkname =~ s/^wiki://;
$link =~ s/^wiki:/http:\/\/www\.wikipedia\.com\/wiki\.phtml\?title=/;
$link =~ s/\ /+/;
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
} elsif ($link =~ /^ldp:/) {
$linkname =~ s/^ldp://;
$link =~ s/^ldp://;
$tempfile = "/tmp/txt2db-" . $rand;
$cmd = "wget -q http://db.linuxdoc.org/cgi-pub/ldp-xml.pl?name=$link -O $tempfile";
system("$cmd");
open(URL, "$tempfile") || die "txt2db: cannot open temporary file ($!)\n";
$link = "";
while ($url_line = <URL>) {
$url_line =~ s/\n//;
if ($url_line =~ /identifier/) {
$link .= $url_line;
}
}
close(URL);
unlink $tempfile;
$link =~ s/^.*?<identifier>//;
$link =~ s/<\/identifier>.*?$//;
if ($link eq '') {
$linkname = "ERROR: LDP namespace resolution failure on $linkname";
}
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
} elsif ($link =~ /^file:/) {
$linkname =~ s/^file://;
$line =~ s/\[\[.*?\]\]/<filename>$linkname<\/filename>/;
} else {
$line =~ s/\[\[.*?\]\]/<filename>$linkname<\/filename>/;
}
}
# emphasis
#
while ($line =~ /'''.*'''/) {
$line =~ s/'''/<emphasis role='bold'>/;
$line =~ s/'''/<\/emphasis>/;
}
# this block defines DocBook structures that won't be broken up with
# paragraphs when we hit empty lines:
#
# <para>
# <sect1>
# <sect2>
# <sect3>
# <programlisting>
# <literallayout>
# forget about nopara
if ($noparadepth == 0) {
$noparatag = "";
}
# start a new nopara section
#
if ((($line =~ /^<para>/) or
($line =~ /^<sect/) or
($line =~ /^<screen>/) or
($line =~ /^<literallayout>/) or
($line =~ /^<programlisting>/)) and
($noparadepth == 0)) {
&closepara;
$noparatag = $line;
$noparatag =~ s/^.*?<//;
$noparatag =~ s/>.*?$//;
$noparaline = $linenumber;
if ($line =~ /^<screen>/) {
unless ($para) {
$line = "<para>" . $line;
$para = 1;
}
}
}
# count noparadepth
#
if ($noparatag ne '') {
$temp = $line;
while ($temp =~ /<$noparatag>/) {
$temp =~ s/<?$noparatag>//;
$noparadepth ++;
}
while ($temp =~ /<\/$noparatag>/) {
$temp =~ s/<?\/$noparatag>//;
$noparadepth --;
if ($noparadepth == 0) {
$noparaline == 0;
}
}
# runon protection
#
if ($linenumber >= ($noparaline + 100)) {
$buf .= "ERROR: runon block starting on line $noparaline\n";
last;
}
# recover original line -- no whitespace modifiers
#
$line = $originalline;
# sect3
#
} elsif ($line =~ /^===/) {
&close3;
&splittitle;
if ($id eq '') {
$line = "<sect3><title>$title</title>\n";
} else {
$line = "<sect3 id='$id'><title id='$id-title'>$title</title>\n";
}
$level3 = 1;
# sect2
#
} elsif ($line =~ /^==/) {
&close2;
&splittitle;
if ($id eq '') {
$line = "<sect2><title>$title</title>\n";
} else {
$line = "<sect2 id='$id'><title id='$id-title'>$title</title>\n";
}
$level2 = 1;
# sect1
#
} elsif ($line =~ /^=/) {
&close1;
&splittitle;
if ($id eq '') {
$line = "<sect1><title>$title</title>\n";
} else {
$line = "<sect1 id='$id'><title id='$id-title'>$title</title>\n";
}
$level1 = 1;
# orderedlist
#
} elsif ($line =~ /^#/) {
&closeitemizedlist;
if ($orderedlist == 0) {
$buf .= "\n<orderedlist>\n";
$orderedlist = 1;
}
&closelistitem;
$line =~ s/^#//;
&trimline;
$line =~ s/^/<listitem><para>/;
$listitem = 1;
$para = 1;
} elsif ($line =~ /^\/#/) {
$line =~ s/^\/#//;
&trimline;
&closeorderedlist;
# itemizedlist
#
} elsif ($line =~ /^\*/) {
&closeorderedlist;
if ($itemizedlist == 0) {
$buf .= "\n<itemizedlist>\n";
$itemizedlist = 1;
}
&closelistitem;
$line =~ s/^\*//;
&trimline;
$line =~ s/^/<listitem><para>/;
$listitem = 1;
$para = 1;
} elsif ($line =~ /\/\*/) {
$line =~ s/^\/\*//;
&trimline;
&closeitemizedlist;
# question
#
} elsif ($line =~ /^Q:/) {
&closelists;
&closeqandaentry;
$line =~ s/^Q://;
&trimline;
&splittitle;
if ($id eq '') {
$line = "<question><para>" . $title . "</para></question>\n";
} else {
$line = "<question id='$id'><para>" . $title . "</para></question>\n";
}
unless ($qandaentry == 1) {
$line = "<qandaentry>\n" . $line;
$qandaentry = 1;
}
if ($qandaset == 0) {
$line = "<qandaset defaultlabel='qanda'>\n". $line;
$qandaset = 1;
}
# answer
#
} elsif ($line =~ /^A:/) {
$line =~ s/^A://;
&trimline;
&closeanswer;
$line = "<answer><para>" . $line;
$answer = 1;
$para = 1;
} elsif ($line =~ /^\s*----\s*$/) {
$line = '';
# para
#
} else {
if (($para == 0) and ($noparatag eq '')) {
$line = "<para>" . $line;
$para = 1;
} else {
$line .= " ";
}
}
$buf .= "$line ";
}
# close nesting
#
&close1;
if ($noparadepth > 0) {
$buf .= "ERROR tag $noparatag on line $noparaline unterminated.\n";
}
}
sub close1 {
&close2;
if ($level1 == 1) {
$buf .= "</sect1>\n";
$level1 = 0;
}
}
sub close2 {
&close3;
if ($level2 == 1) {
$buf .= "</sect2>\n";
$level2 = 0;
}
}
sub close3 {
&closeorderedlist;
&closeitemizedlist;
&closepara;
&closeqandaset;
if ($level3 == 1) {
$buf .= "</sect3>\n";
$level3 = 0;
}
}
sub closenonsect {
&closepara;
# &closeorderedlist;
# &closeitemizedlist;
}
sub closelistitem {
&closepara;
if ($listitem == 1 ) {
$buf .= "</listitem>\n";
$listitem = 0;
}
}
sub closeorderedlist {
&closepara;
&closelistitem;
if ($orderedlist == 1 ) {
$buf .= "</orderedlist>\n";
$orderedlist = 0;
}
}
sub closeitemizedlist {
&closepara;
&closelistitem;
if ($itemizedlist == 1 ) {
$buf .= "</itemizedlist>\n";
$itemizedlist = 0;
}
}
sub closelists {
&closeitemizedlist;
&closeorderedlist;
}
sub closeanswer {
&closepara;
if ($answer == 1) {
$buf .= "</answer>\n";
$answer = 0;
}
}
sub closeqandaentry {
&closeanswer;
if ($qandaentry == 1) {
$buf .= "</qandaentry>\n";
$qandaentry = 0;
}
}
sub closeqandaset {
&closeqandaentry;
if ($qandaset == 1) {
$buf .= "</qandaset>\n";
$qandaset = 0;
}
}
sub closepara {
if ($para == 1) {
$buf .= "</para>\n";
$para = 0;
}
}
sub trimline {
$line =~ s/\s+$//;
$line =~ s/^\s+//;
}
sub splittitle {
$line =~ s/^=+//;
$line =~ s/=+$//;
$title = $line;
$id = "";
if ($line =~ /\|/) {
$title =~ s/\|.+//;
$id = $line;
$id =~ s/^.+\|//;
}
$title =~ s/\s+$//;
$title =~ s/^\s+//;
$id =~ s/\s+$//;
$id =~ s/^\s+//;
}
sub usage {
print "Usage: txt2db [-v] [-h|-o <sgml file>] <text file>\n";
print "-o, --output-to write to the specified file.\n";
print "-v, --verbose show diagnostic output.\n";
print "-h, --help show this usage message.\n";
exit($error);
}
$WT2DB->ProcessFile($txtfile, $dbfile, $verbose);