added texi2db, texinfo to docbook converter nearing initial release

This commit is contained in:
david 2002-02-02 04:55:52 +00:00
parent abd7408625
commit 1872402c7f
5 changed files with 1509 additions and 665 deletions

View File

@ -28,8 +28,9 @@ scrollserver/ python web application server front end to scrollkeeper
stylesheets/ xsl stylesheets for xml -> html conversion
www/ www.scrollserver.org website
test/ to test your cvs without disturbing things, use this
txt2db/ utility to convert text files into docbook
texi2db/ utility to convert Texinfo files into docbook
users/ individual users' areas
wt2db/ utility to convert WikiText files into docbook
www/ websites
db./ ldp database website
cgi-bin/ perl scripts for the ldp database

1507
LDP/texi2db/texi2db Executable file

File diff suppressed because it is too large Load Diff

View File

@ -1,69 +0,0 @@
This is a utility to convert text files in a specific format into valid
DocBook. Just pass it the input filename on the commmand line and you'll
get a .sgml file out. It won't be a complete valid document, as it will
have no header information or dtd specification. It's just a DocBook
fragment, not a complete document.
The following constructs are currently supported. If you need support for
an addition construct, write discuss@linuxdoc.org if you're subscribed,
or feedback@linuxdoc.org if you're not.
Or just add it in the cvs. :-)
Foo <para>Foo</para>
=Title= <sect1><title>Title</title>
</sect1>
=Title|id= <sect1 id='id'><title>Title</title>
</sect1>
works for other sect levels as well, and many other
tags. It is either the "id" value, or the "title"
value, depending on the semantics of the particular
tag. Usage should be obvious in context.
==Title== <sect2><title>Title</title>
</sect2>
===Title=== <sect3><title>Title</title>
</sect3>
#Foo <orderedlist>
#Bar <listitem><para>Foo</para></listitem>
#Baz <listitem><para>Bar</para></listitem>
/# <listitem><para>Baz</para></listitem>
</orderedlist>
*Foo <simplelist>
*Bar <listitem><para>Foo</para></listitem>
*Baz <listitem><para>Bar</para></listitem>
/* <listitem><para>Baz</para></listitem>
</simplelist>
[[http://foo.org]] <ulink url='http://foo.org'>
<citetitle>http://foo.org</citetitle>
</ulink>
[[http://foo.org Foo]] <ulink url='http://foo.org'>
<citetitle>Foo</citetitle>
</ulink>
[http://foo.org|Foo]] You can also delimit with the pipe character "|".
This works on any of these [[]] tags.
[[file:Foo]] <filename>Foo</filename>
'''Foo''' <emphasis>Foo</emphasis>
A few DocBook structures do not have <para> tags wrapped around them. They
are <para> itself (duh!), <sect?> and <programlisting>. If you insert anything
using these tags, no <para> tags will be wrapped around it or inserted into it.
So if you want fine control over your <para> tags, insert them yourself.
These tags include:
<programlisting/>
<screen/>

View File

@ -1,72 +0,0 @@
=Introduction|intro-to-the-program=
The following list should be rendered as a qandaset:
Q: Why?|why-id
A: Why
not?
Multiple questions and multiple answers:
Q: Why?
A: Why not?
A: Why not2?
Q: Why?
A: Why not?
Simple List
*item
*item
*item
/*
This tests arbitrary DocBook. It should be passed right on to the output file
with no changes. It can be nested arbitrarily deep.
<informaltable>test
<foo>test some more
</foo>
<informaltable>This is the second level!
</informaltable>
</informaltable>
This document is from the [[http://www.linuxdoc.org Linux Documentation Project]].
Numbered List
This is an '''important''' [file].
#item
#item
#item
/#
Another to make sure the numbers restart at one.
#item
#item
#item
/#
=Bar=
Just another section.
==Level 2|level2==
===Level 3|level3===
=Conclusion|conclusion=
All previous sections should be properly closed.
=test again=
[[ldp:INFO-SHEET]]
[[ldp:Distributions-HOWTO]]

View File

@ -1,523 +0,0 @@
#!/usr/bin/perl
#
#Converts txt files into docbook.
#
# Requirements:
#
# If you use the "ldp:" namespace, you must have wget installed.
# Wget is used to request an xml record from the LDP # database,
# http://db.linuxdoc.org.
#
use File::Basename;
use HTML::Entities;
my($txtfile, $dbfile) = '';
#These keep track of which constructs we're in the middle of
my($level1,
$level2,
$level3,
$orderedlist,
$listitem,
$itemizedlist,
$para,
$qandaset,
$qandaentry,
$answer);
my($line);
my($id, $title);
my($verbose);
my($error);
$error = 0;
# read in cmd-line arguments
#
while (1) {
if($ARGV[0] eq "-o" or $ARGV[0] eq "--output-to") {
shift(@ARGV);
$dbfile = $ARGV[0];
shift(@ARGV);
} elsif($ARGV[0] eq "-h" or $ARGV[0] eq "--help") {
&usage;
} elsif($ARGV[0] eq "-v" or $ARGV[0] eq "--verbose") {
$verbose++;
shift(@ARGV);
} else {
$txtfile = $ARGV[0];
shift(@ARGV);
}
if ($ARGV[0] eq '') {
last;
}
}
# abort if no input file given
#
if($txtfile eq '') {
print "txt2db: ERROR text file not specified.\n\n";
$error = 1;
&usage();
} elsif( !(-r $txtfile) ) {
print "txt2db: ERROR cannot read $f ($!)\n\n";
$error = 1;
&usage();
}
unless ($dbfile) {
($basename, $path, $ext) = fileparse($txtfile);
$dbfile = $basename;
$dbfile =~ s/\..*?$/\.sgml/;
}
$buf = '';
&proc_txt($txtfile);
open(DB, "> $dbfile") || die "txt2db: cannot write to $dbfile ($!)\n";
print DB $buf, "\n";
close(DB);
exit(0);
# -----------------------------------------------------------
sub proc_txt {
my($f) = @_;
my($linenumber);
$linenumber = 0;
my ($noparatag,
$noparadepth);
$noparadepth = 0;
$noparaline = 0;
# read in the text file
#
open(TXT, "$f") || die "txt2db: cannot open $f ($!)\n";
while ($originalline = <TXT>) {
$line = $originalline;
$linenumber++;
&trimline;
# blank lines
if ($line eq '') {
if ($noparadepth == 0) {
&closenonsect;
next;
}
}
# capitalize hints that can be entered in lowercase
#
$line =~ s/^q:/Q:/;
$line =~ s/^a:/A:/;
# encode entities
#
# while ($line =~ //) {
# }
# decode_entities($line);
encode_entities($line);
# inline docbook
#
# ulink
#
while ($line =~ /\[\[/) {
unless ($line =~ /\]\]/) {
$buf .= "ERROR unterminated '[[' tag on line $linenumber.\n";
}
# separate link url from link name
#
$link = $line;
$link=~ s/\n//g;
$link =~ s/.*?\[\[//;
$link =~ s/\]\].*?$//;
if ($link =~ /\|/) {
$linkname = $link;
$link =~ s/\|.+$//;
$linkname =~ s/^\S+\|//;
} else {
$linkname = $link;
}
# kill quotes, they mess us up
#
$link =~ s/'/%27/g;
# namespaces are handled differently
#
print "$link\n" if ($verbose);
if ($link =~ /^http:/) {
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
} elsif ($link =~ /^mailto:/) {
$linkname =~ s/^mailto://;
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
} elsif ($link =~ /^wiki:/) {
$linkname =~ s/^wiki://;
$link =~ s/^wiki:/http:\/\/www\.wikipedia\.com\/wiki\.phtml\?title=/;
$link =~ s/\ /+/;
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
} elsif ($link =~ /^ldp:/) {
$linkname =~ s/^ldp://;
$link =~ s/^ldp://;
$tempfile = "/tmp/txt2db-" . $rand;
$cmd = "wget -q http://db.linuxdoc.org/cgi-pub/ldp-xml.pl?name=$link -O $tempfile";
system("$cmd");
open(URL, "$tempfile") || die "txt2db: cannot open temporary file ($!)\n";
$link = "";
while ($url_line = <URL>) {
$url_line =~ s/\n//;
if ($url_line =~ /identifier/) {
$link .= $url_line;
}
}
close(URL);
unlink $tempfile;
$link =~ s/^.*?<identifier>//;
$link =~ s/<\/identifier>.*?$//;
if ($link eq '') {
$linkname = "ERROR: LDP namespace resolution failure on $linkname";
}
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
} elsif ($link =~ /^file:/) {
$linkname =~ s/^file://;
$line =~ s/\[\[.*?\]\]/<filename>$linkname<\/filename>/;
} else {
$line =~ s/\[\[.*?\]\]/<filename>$linkname<\/filename>/;
}
}
# emphasis
#
while ($line =~ /'''.*'''/) {
$line =~ s/'''/<emphasis role='bold'>/;
$line =~ s/'''/<\/emphasis>/;
}
# this block defines DocBook structures that won't be broken up with
# paragraphs when we hit empty lines:
#
# <para>
# <sect1>
# <sect2>
# <sect3>
# <programlisting>
# <literallayout>
# forget about nopara
if ($noparadepth == 0) {
$noparatag = "";
}
# start a new nopara section
#
if ((($line =~ /^<para>/) or
($line =~ /^<sect/) or
($line =~ /^<screen>/) or
($line =~ /^<literallayout>/) or
($line =~ /^<programlisting>/)) and
($noparadepth == 0)) {
&closepara;
$noparatag = $line;
$noparatag =~ s/^.*?<//;
$noparatag =~ s/>.*?$//;
$noparaline = $linenumber;
if ($line =~ /^<screen>/) {
unless ($para) {
$line = "<para>" . $line;
$para = 1;
}
}
}
# count noparadepth
#
if ($noparatag ne '') {
$temp = $line;
while ($temp =~ /<$noparatag>/) {
$temp =~ s/<?$noparatag>//;
$noparadepth ++;
}
while ($temp =~ /<\/$noparatag>/) {
$temp =~ s/<?\/$noparatag>//;
$noparadepth --;
if ($noparadepth == 0) {
$noparaline == 0;
}
}
# runon protection
#
if ($linenumber >= ($noparaline + 100)) {
$buf .= "ERROR: runon block starting on line $noparaline\n";
last;
}
# recover original line -- no whitespace modifiers
#
$line = $originalline;
# sect3
#
} elsif ($line =~ /^===/) {
&close3;
&splittitle;
if ($id eq '') {
$line = "<sect3><title>$title</title>\n";
} else {
$line = "<sect3 id='$id'><title id='$id-title'>$title</title>\n";
}
$level3 = 1;
# sect2
#
} elsif ($line =~ /^==/) {
&close2;
&splittitle;
if ($id eq '') {
$line = "<sect2><title>$title</title>\n";
} else {
$line = "<sect2 id='$id'><title id='$id-title'>$title</title>\n";
}
$level2 = 1;
# sect1
#
} elsif ($line =~ /^=/) {
&close1;
&splittitle;
if ($id eq '') {
$line = "<sect1><title>$title</title>\n";
} else {
$line = "<sect1 id='$id'><title id='$id-title'>$title</title>\n";
}
$level1 = 1;
# orderedlist
#
} elsif ($line =~ /^#/) {
&closeitemizedlist;
if ($orderedlist == 0) {
$buf .= "\n<orderedlist>\n";
$orderedlist = 1;
}
&closelistitem;
$line =~ s/^#//;
&trimline;
$line =~ s/^/<listitem><para>/;
$listitem = 1;
$para = 1;
} elsif ($line =~ /^\/#/) {
$line =~ s/^\/#//;
&trimline;
&closeorderedlist;
# itemizedlist
#
} elsif ($line =~ /^\*/) {
&closeorderedlist;
if ($itemizedlist == 0) {
$buf .= "\n<itemizedlist>\n";
$itemizedlist = 1;
}
&closelistitem;
$line =~ s/^\*//;
&trimline;
$line =~ s/^/<listitem><para>/;
$listitem = 1;
$para = 1;
} elsif ($line =~ /\/\*/) {
$line =~ s/^\/\*//;
&trimline;
&closeitemizedlist;
# question
#
} elsif ($line =~ /^Q:/) {
&closelists;
&closeqandaentry;
$line =~ s/^Q://;
&trimline;
&splittitle;
if ($id eq '') {
$line = "<question><para>" . $title . "</para></question>\n";
} else {
$line = "<question id='$id'><para>" . $title . "</para></question>\n";
}
unless ($qandaentry == 1) {
$line = "<qandaentry>\n" . $line;
$qandaentry = 1;
}
if ($qandaset == 0) {
$line = "<qandaset defaultlabel='qanda'>\n". $line;
$qandaset = 1;
}
# answer
#
} elsif ($line =~ /^A:/) {
$line =~ s/^A://;
&trimline;
&closeanswer;
$line = "<answer><para>" . $line;
$answer = 1;
$para = 1;
} elsif ($line =~ /^\s*----\s*$/) {
$line = '';
# para
#
} else {
if (($para == 0) and ($noparatag eq '')) {
$line = "<para>" . $line;
$para = 1;
} else {
$line .= " ";
}
}
$buf .= "$line ";
}
# close nesting
#
&close1;
if ($noparadepth > 0) {
$buf .= "ERROR tag $noparatag on line $noparaline unterminated.\n";
}
}
sub close1 {
&close2;
if ($level1 == 1) {
$buf .= "</sect1>\n";
$level1 = 0;
}
}
sub close2 {
&close3;
if ($level2 == 1) {
$buf .= "</sect2>\n";
$level2 = 0;
}
}
sub close3 {
&closeorderedlist;
&closeitemizedlist;
&closepara;
&closeqandaset;
if ($level3 == 1) {
$buf .= "</sect3>\n";
$level3 = 0;
}
}
sub closenonsect {
&closepara;
# &closeorderedlist;
# &closeitemizedlist;
}
sub closelistitem {
&closepara;
if ($listitem == 1 ) {
$buf .= "</listitem>\n";
$listitem = 0;
}
}
sub closeorderedlist {
&closepara;
&closelistitem;
if ($orderedlist == 1 ) {
$buf .= "</orderedlist>\n";
$orderedlist = 0;
}
}
sub closeitemizedlist {
&closepara;
&closelistitem;
if ($itemizedlist == 1 ) {
$buf .= "</itemizedlist>\n";
$itemizedlist = 0;
}
}
sub closelists {
&closeitemizedlist;
&closeorderedlist;
}
sub closeanswer {
&closepara;
if ($answer == 1) {
$buf .= "</answer>\n";
$answer = 0;
}
}
sub closeqandaentry {
&closeanswer;
if ($qandaentry == 1) {
$buf .= "</qandaentry>\n";
$qandaentry = 0;
}
}
sub closeqandaset {
&closeqandaentry;
if ($qandaset == 1) {
$buf .= "</qandaset>\n";
$qandaset = 0;
}
}
sub closepara {
if ($para == 1) {
$buf .= "</para>\n";
$para = 0;
}
}
sub trimline {
$line =~ s/\s+$//;
$line =~ s/^\s+//;
}
sub splittitle {
$line =~ s/^=+//;
$line =~ s/=+$//;
$title = $line;
$id = "";
if ($line =~ /\|/) {
$title =~ s/\|.+//;
$id = $line;
$id =~ s/^.+\|//;
}
$title =~ s/\s+$//;
$title =~ s/^\s+//;
$id =~ s/\s+$//;
$id =~ s/^\s+//;
}
sub usage {
print "Usage: txt2db [-v] [-h|-o <sgml file>] <text file>\n";
print "-o, --output-to write to the specified file.\n";
print "-v, --verbose show diagnostic output.\n";
print "-h, --help show this usage message.\n";
exit($error);
}