added texi2db, texinfo to docbook converter nearing initial release

2002-02-02 04:55:52 +00:00 · 2002-02-02 04:55:52 +00:00 · 1872402c7f
parent abd7408625
commit 1872402c7f
5 changed files with 1509 additions and 665 deletions
--- a/LDP/README
+++ b/LDP/README
@ -28,8 +28,9 @@ scrollserver/	python web application server front end to scrollkeeper
  stylesheets/	xsl stylesheets for xml -> html conversion
  www/		www.scrollserver.org website
 test/		to test your cvs without disturbing things, use this
-txt2db/		utility to convert text files into docbook
+texi2db/	utility to convert Texinfo files into docbook
 users/		individual users' areas
 wt2db/		utility to convert WikiText files into docbook
 www/		websites
  db./		ldp database website
    cgi-bin/	perl scripts for the ldp database
--- a/LDP/texi2db/texi2db
+++ b/LDP/texi2db/texi2db
--- a/LDP/txt2db/README
+++ b/LDP/txt2db/README
@ -1,69 +0,0 @@
 This is a utility to convert text files in a specific format into valid
 DocBook. Just pass it the input filename on the command line and you'll
 get a .sgml file out. It won't be a complete valid document, as it will
 have no header information or dtd specification. It's just a DocBook
 fragment, not a complete document.
 The following constructs are currently supported. If you need support for
 an additional construct, write discuss@linuxdoc.org if you're subscribed,
 or feedback@linuxdoc.org if you're not.
 Or just add it in the cvs. :-)
 Foo			<para>Foo</para>
 =Title=			<sect1><title>Title</title>
 			</sect1>
 =Title|id=		<sect1 id='id'><title>Title</title>
 			</sect1>
 			works for other sect levels as well, and many other
 			tags. It is either the "id" value, or the "title"
 			value, depending on the semantics of the particular
 			tag. Usage should be obvious in context.
 ==Title==		<sect2><title>Title</title>
 			</sect2>
 ===Title===		<sect3><title>Title</title>
 			</sect3>
 #Foo			<orderedlist>
 #Bar			<listitem><para>Foo</para></listitem>
 #Baz			<listitem><para>Bar</para></listitem>
 /#			<listitem><para>Baz</para></listitem>
 			</orderedlist>
 *Foo			<itemizedlist>
 *Bar			<listitem><para>Foo</para></listitem>
 *Baz			<listitem><para>Bar</para></listitem>
 /*			<listitem><para>Baz</para></listitem>
 			</itemizedlist>
 [[http://foo.org]]	<ulink url='http://foo.org'>
 			  <citetitle>http://foo.org</citetitle>
 			</ulink>
 [[http://foo.org Foo]]	<ulink url='http://foo.org'>
 			  <citetitle>Foo</citetitle>
 			</ulink>
 [[http://foo.org|Foo]]	You can also delimit with the pipe character "|".
 			This works on any of these [[]] tags.
 [[file:Foo]]		<filename>Foo</filename>
 '''Foo'''		<emphasis role='bold'>Foo</emphasis>
 A few DocBook structures do not have <para> tags wrapped around them. They
 are <para> itself (duh!), <sect?> and <programlisting>. If you insert anything
 using these tags, no <para> tags will be wrapped around it or inserted into it.
 So if you want fine control over your <para> tags, insert them yourself.
 These tags include:
 <programlisting/>
 <screen/>
--- a/LDP/txt2db/sample.txt
+++ b/LDP/txt2db/sample.txt
@ -1,72 +0,0 @@
 =Introduction|intro-to-the-program=
 The following list should be rendered as a qandaset:
 Q: Why?|why-id
 A: Why
 not?
 Multiple questions and multiple answers:
 Q: Why?
 A: Why not?
 A: Why not2?
 Q: Why?
 A: Why not?
 Simple List
 *item
 *item
 *item
 /*
 This tests arbitrary DocBook. It should be passed right on to the output file
 with no changes. It can be nested arbitrarily deep.
 <informaltable>test
 <foo>test some more
 </foo>
 <informaltable>This is the second level!
 </informaltable>
 </informaltable>
 This document is from the [[http://www.linuxdoc.org Linux Documentation Project]].
 Numbered List
 This is an '''important''' [file].
 #item
 #item
 #item
 /#
 Another to make sure the numbers restart at one.
 #item
 #item
 #item
 /#
 =Bar=
 Just another section.
 ==Level 2|level2==
 ===Level 3|level3===
 =Conclusion|conclusion=
 All previous sections should be properly closed.
 =test again=
 [[ldp:INFO-SHEET]]
 [[ldp:Distributions-HOWTO]]
--- a/LDP/txt2db/txt2db.pl
+++ b/LDP/txt2db/txt2db.pl
@ -1,523 +0,0 @@
 #!/usr/bin/perl
 #
 #Converts txt files into docbook.
 #
 # Requirements:
 # 
 # If you use the "ldp:" namespace, you must have wget installed.
 # Wget is used to request an xml record from the LDP database,
 # http://db.linuxdoc.org.
 # 
 use File::Basename;
 use HTML::Entities;
 # Input and output file names. The output name is derived from the input
 # name further down when -o is not given.
 my($txtfile, $dbfile) = ('', '');
 #These keep track of which constructs we're in the middle of
 my($level1,
   $level2,
   $level3,
   $orderedlist,
   $listitem,
   $itemizedlist,
   $para,
   $qandaset,
   $qandaentry,
   $answer);
 # Current input line; shared with trimline and splittitle.
 my($line);
 # Results of the most recent splittitle call.
 my($id, $title);
 my($verbose);
 # Exit status that usage() hands to exit().
 my($error);
 $error = 0;
 # read in cmd-line arguments
 #
 # Every argument is consumed exactly once; the last non-option argument
 # wins as the input file.
 while (@ARGV) {
 	my $arg = shift(@ARGV);
 	if($arg eq "-o" or $arg eq "--output-to") {
 		# the value is the next argument (undef when -o comes last, in
 		# which case the default name below is used)
 		$dbfile = shift(@ARGV);
 	} elsif($arg eq "-h" or $arg eq "--help") {
 		usage();
 	} elsif($arg eq "-v" or $arg eq "--verbose") {
 		$verbose++;
 	} else {
 		$txtfile = $arg;
 	}
 }
 # abort if no input file given
 #
 if($txtfile eq '') {
 	print "txt2db: ERROR text file not specified.\n\n";
 	$error = 1;
 	usage();
 } elsif( !(-r $txtfile) ) {
 	# name the file that was actually tested
 	print "txt2db: ERROR cannot read $txtfile ($!)\n\n";
 	$error = 1;
 	usage();
 }
 unless ($dbfile) {
 	my($basename) = fileparse($txtfile);
 	$dbfile = $basename;
 	# Swap everything from the first dot on for .sgml. A name without a
 	# dot gets .sgml appended, so the output can never end up being the
 	# input file itself.
 	$dbfile =~ s/\..*?$/\.sgml/ or $dbfile .= '.sgml';
 }
 # Output accumulator; proc_txt and the close* subs append to it.
 $buf = '';
 proc_txt($txtfile);
 open(my $db, '>', $dbfile) || die "txt2db: cannot write to $dbfile ($!)\n";
 print $db $buf, "\n";
 # buffered write errors only surface when the handle is closed
 close($db) || die "txt2db: cannot write to $dbfile ($!)\n";
 exit(0);
 # -----------------------------------------------------------
 # proc_txt($f)
 #
 # Reads the text file $f line by line, rewrites the markup found on each
 # line as DocBook and appends the result to the global $buf. Which
 # constructs are currently open is tracked in the file-level flags
 # ($para, $level1, $orderedlist, ...) that the close* subs test and reset.
 #
 # Dies when $f, or the temporary file used for an "ldp:" lookup, cannot be
 # opened. Markup problems do not abort the run: an "ERROR ..." line is
 # appended to $buf instead.
 sub proc_txt {
 	my($f) = @_;
 	my($linenumber);
 	$linenumber = 0;
 	my ($noparatag,
 	    $noparadepth,
 	    $noparaline);
 	$noparadepth = 0;
 	$noparaline = 0;
 	my($originalline, $link, $linkname, $temp);
 	# read in the text file
 	#
 	open(my $txt, '<', $f) || die "txt2db: cannot open $f ($!)\n";
 	while ($originalline = <$txt>) {
 		$line = $originalline;
 		$linenumber++;
 		trimline();
 		# blank lines
 		if ($line eq '') {
 			if ($noparadepth == 0) {
 				closenonsect();
 				next;
 			}
 		}
 		# capitalize hints that can be entered in lowercase
 		#
 		$line =~ s/^q:/Q:/;
 		$line =~ s/^a:/A:/;
 		# encode entities
 		#
 		# NOTE(review): with no second argument encode_entities presumably
 		# also encodes '<', '>' and the single quote, which would stop the
 		# ''' and ^<para>/<sect/... patterns below from ever matching --
 		# confirm against the installed HTML::Entities.
 		encode_entities($line);
 		# inline docbook
 		#
 		# ulink
 		#
 		while ($line =~ /\[\[/) {
 			# Every branch below rewrites the first "[[...]]" pair. Without
 			# such a pair the line would never change, so report the
 			# problem once and leave the loop instead of spinning forever.
 			unless ($line =~ /\[\[.*?\]\]/) {
 				$buf .= "ERROR unterminated '[[' tag on line $linenumber.\n";
 				last;
 			}
 			# separate link url from link name
 			#
 			$link = $line;
 			$link =~ s/\n//g;
 			$link =~ s/.*?\[\[//;
 			$link =~ s/\]\].*?$//;
 			if ($link =~ /\|/) {
 				$linkname = $link;
 				$link =~ s/\|.+$//;
 				$linkname =~ s/^\S+\|//;
 			} else {
 				$linkname = $link;
 			}
 			# kill quotes, they mess us up
 			#
 			$link =~ s/'/%27/g;
 			# namespaces are handled differently
 			#
 			print "$link\n" if ($verbose);
 			if ($link =~ /^http:/) {
 				$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
 			} elsif ($link =~ /^mailto:/) {
 				$linkname =~ s/^mailto://;
 				$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
 			} elsif ($link =~ /^wiki:/) {
 				$linkname =~ s/^wiki://;
 				$link =~ s/^wiki:/http:\/\/www\.wikipedia\.com\/wiki\.phtml\?title=/;
 				# every space in the title becomes '+', not only the first
 				$link =~ s/\ /+/g;
 				$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
 			} elsif ($link =~ /^ldp:/) {
 				$linkname =~ s/^ldp://;
 				$link =~ s/^ldp://;
 				# Let File::Temp pick an unused name; wget then overwrites
 				# the empty file it created.
 				require File::Temp;
 				my($tmpfh, $tempfile) = File::Temp::tempfile('txt2db-XXXXXX', TMPDIR => 1);
 				close($tmpfh);
 				# List form: no shell sees the '?' in the URL or anything
 				# that came from the input line.
 				system('wget', '-q', "http://db.linuxdoc.org/cgi-pub/ldp-xml.pl?name=$link", '-O', $tempfile);
 				open(my $url, '<', $tempfile) || die "txt2db: cannot open temporary file ($!)\n";
 				$link = "";
 				while (my $url_line = <$url>) {
 					$url_line =~ s/\n//;
 					if ($url_line =~ /identifier/) {
 						$link .= $url_line;
 					}
 				}
 				close($url);
 				unlink $tempfile;
 				$link =~ s/^.*?<identifier>//;
 				$link =~ s/<\/identifier>.*?$//;
 				if ($link eq '') {
 					$linkname = "ERROR: LDP namespace resolution failure on $linkname";
 				}
 				$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
 			} elsif ($link =~ /^file:/) {
 				$linkname =~ s/^file://;
 				$line =~ s/\[\[.*?\]\]/<filename>$linkname<\/filename>/;
 			} else {
 				$line =~ s/\[\[.*?\]\]/<filename>$linkname<\/filename>/;
 			}
 		}
 		# emphasis
 		#
 		while ($line =~ /'''.*'''/) {
 			$line =~ s/'''/<emphasis role='bold'>/;
 			$line =~ s/'''/<\/emphasis>/;
 		}
 		# this block defines DocBook structures that won't be broken up with
 		# paragraphs when we hit empty lines:
 		#
 		#	<para>
 		#	<sect1>
 		#	<sect2>
 		#	<sect3>
 		#	<screen>
 		#	<programlisting>
 		#	<literallayout>
 		# forget about nopara
 		if ($noparadepth == 0) {
 			$noparatag = "";
 		}
 		# start a new nopara section
 		#
 		if ((($line =~ /^<para>/) or
 		     ($line =~ /^<sect/) or
 		     ($line =~ /^<screen>/) or
 		     ($line =~ /^<literallayout>/) or
 		     ($line =~ /^<programlisting>/)) and
 		    ($noparadepth == 0)) {
 			closepara();
 			# remember the tag (text between the first '<' and '>')
 			$noparatag = $line;
 			$noparatag =~ s/^.*?<//;
 			$noparatag =~ s/>.*?$//;
 			$noparaline = $linenumber;
 			if ($line =~ /^<screen>/) {
 				unless ($para) {
 					$line = "<para>" . $line;
 					$para = 1;
 				}
 			}
 		}
 		# count noparadepth
 		#
 		if ($noparatag ne '') {
 			$temp = $line;
 			while ($temp =~ /<$noparatag>/) {
 				$temp =~ s/<?$noparatag>//;
 				$noparadepth ++;
 			}
 			# $noparaline is deliberately left alone when the depth drops
 			# back to zero: the run-on check below still reads it on this
 			# same line, and it is set again when the next block starts.
 			while ($temp =~ /<\/$noparatag>/) {
 				$temp =~ s/<?\/$noparatag>//;
 				$noparadepth --;
 			}
 			# runon protection
 			#
 			if ($linenumber >= ($noparaline + 100)) {
 				$buf .= "ERROR: runon block starting on line $noparaline\n";
 				last;
 			}
 			# recover original line -- no whitespace modifiers
 			#
 			$line = $originalline;
 		# sect3
 		#
 		} elsif ($line =~ /^===/) {
 			close3();
 			splittitle();
 			if ($id eq '') {
 				$line = "<sect3><title>$title</title>\n";
 			} else {
 				$line = "<sect3 id='$id'><title id='$id-title'>$title</title>\n";
 			}
 			$level3 = 1;
 		# sect2
 		#
 		} elsif ($line =~ /^==/) {
 			close2();
 			splittitle();
 			if ($id eq '') {
 				$line = "<sect2><title>$title</title>\n";
 			} else {
 				$line = "<sect2 id='$id'><title id='$id-title'>$title</title>\n";
 			}
 			$level2 = 1;
 		# sect1
 		#
 		} elsif ($line =~ /^=/) {
 			close1();
 			splittitle();
 			if ($id eq '') {
 				$line = "<sect1><title>$title</title>\n";
 			} else {
 				$line = "<sect1 id='$id'><title id='$id-title'>$title</title>\n";
 			}
 			$level1 = 1;
 		# orderedlist
 		#
 		} elsif ($line =~ /^#/) {
 			closeitemizedlist();
 			if ($orderedlist == 0) {
 				$buf .= "\n<orderedlist>\n";
 				$orderedlist = 1;
 			}
 			closelistitem();
 			$line =~ s/^#//;
 			trimline();
 			$line =~ s/^/<listitem><para>/;
 			$listitem = 1;
 			$para = 1;
 		} elsif ($line =~ /^\/#/) {
 			$line =~ s/^\/#//;
 			trimline();
 			closeorderedlist();
 		# itemizedlist
 		#
 		} elsif ($line =~ /^\*/) {
 			closeorderedlist();
 			if ($itemizedlist == 0) {
 				$buf .= "\n<itemizedlist>\n";
 				$itemizedlist = 1;
 			}
 			closelistitem();
 			$line =~ s/^\*//;
 			trimline();
 			$line =~ s/^/<listitem><para>/;
 			$listitem = 1;
 			$para = 1;
 		# anchored like the "/#" terminator above, so a "/*" in the middle
 		# of ordinary text does not end the list
 		} elsif ($line =~ /^\/\*/) {
 			$line =~ s/^\/\*//;
 			trimline();
 			closeitemizedlist();
 		# question
 		#
 		} elsif ($line =~ /^Q:/) {
 			closelists();
 			closeqandaentry();
 			$line =~ s/^Q://;
 			trimline();
 			splittitle();
 			if ($id eq '') {
 				$line = "<question><para>" . $title . "</para></question>\n";
 			} else {
 				$line = "<question id='$id'><para>" . $title . "</para></question>\n";
 			}
 			unless ($qandaentry == 1) {
 				$line = "<qandaentry>\n" . $line;
 				$qandaentry = 1;
 			}
 			if ($qandaset == 0) {
 				$line = "<qandaset defaultlabel='qanda'>\n". $line;
 				$qandaset = 1;
 			}
 		# answer
 		#
 		} elsif ($line =~ /^A:/) {
 			$line =~ s/^A://;
 			trimline();
 			closeanswer();
 			$line = "<answer><para>" . $line;
 			$answer = 1;
 			$para = 1;
 		# a line holding only "----" is dropped
 		} elsif ($line =~ /^\s*----\s*$/) {
 			$line = '';
 		# para
 		#
 		} else {
 			if (($para == 0) and ($noparatag eq '')) {
 				$line = "<para>" . $line;
 				$para = 1;
 			} else {
 				$line .= " ";
 			}
 		}
 		$buf .= "$line ";
 	}
 	close($txt);
 	# close nesting
 	#
 	close1();
 	if ($noparadepth > 0) {
 		$buf .= "ERROR tag $noparatag on line $noparaline unterminated.\n";
 	}
 }
 # Close everything down to and including sect2, then emit </sect1> if a
 # sect1 is currently open.
 sub close1 {
 	close2();
 	return unless $level1 == 1;
 	$level1 = 0;
 	$buf .= "</sect1>\n";
 }
 # Close everything down to and including sect3, then emit </sect2> if a
 # sect2 is currently open.
 sub close2 {
 	close3();
 	return unless $level2 == 1;
 	$level2 = 0;
 	$buf .= "</sect2>\n";
 }
 # Close any open lists, paragraph and qandaset (in that order), then emit
 # </sect3> if a sect3 is currently open.
 sub close3 {
 	closeorderedlist();
 	closeitemizedlist();
 	closepara();
 	closeqandaset();
 	return unless $level3 == 1;
 	$level3 = 0;
 	$buf .= "</sect3>\n";
 }
 # Called for a blank input line: only the open paragraph is closed.
 # Ordered and itemized lists are left open here.
 sub closenonsect {
 	closepara();
 }
 # Close the open paragraph, then emit </listitem> if a list item is open.
 sub closelistitem {
 	closepara();
 	return unless $listitem == 1;
 	$listitem = 0;
 	$buf .= "</listitem>\n";
 }
 # Close the open paragraph and list item, then emit </orderedlist> if an
 # ordered list is open.
 sub closeorderedlist {
 	closepara();
 	closelistitem();
 	return unless $orderedlist == 1;
 	$orderedlist = 0;
 	$buf .= "</orderedlist>\n";
 }
 # Close the open paragraph and list item, then emit </itemizedlist> if an
 # itemized list is open.
 sub closeitemizedlist {
 	closepara();
 	closelistitem();
 	return unless $itemizedlist == 1;
 	$itemizedlist = 0;
 	$buf .= "</itemizedlist>\n";
 }
 # Close whichever list is open: the itemized list is handled first, then
 # the ordered list.
 sub closelists {
 	$_->() for (\&closeitemizedlist, \&closeorderedlist);
 }
 # Close the open paragraph, then emit </answer> if an answer is open.
 sub closeanswer {
 	closepara();
 	return unless $answer == 1;
 	$answer = 0;
 	$buf .= "</answer>\n";
 }
 # Close the open answer, then emit </qandaentry> if an entry is open.
 sub closeqandaentry {
 	closeanswer();
 	return unless $qandaentry == 1;
 	$qandaentry = 0;
 	$buf .= "</qandaentry>\n";
 }
 # Close the open qandaentry, then emit </qandaset> if a set is open.
 sub closeqandaset {
 	closeqandaentry();
 	return unless $qandaset == 1;
 	$qandaset = 0;
 	$buf .= "</qandaset>\n";
 }
 # Emit </para> and clear the $para flag, but only when a paragraph is
 # currently open.
 sub closepara {
 	return unless $para == 1;
 	$para = 0;
 	$buf .= "</para>\n";
 }
 # Strip leading and trailing whitespace from the shared $line in place.
 sub trimline {
 	for ($line) {
 		s/^\s+//;
 		s/\s+$//;
 	}
 }
 # Split the shared $line into $title and $id.
 #
 # Leading and trailing runs of '=' are removed from $line first. When the
 # remainder contains a '|', $title is the text before the first '|' and
 # $id the text after the last one; otherwise $title is the whole
 # remainder and $id is empty. Both results are whitespace-trimmed.
 sub splittitle {
 	$line =~ s/^=+//;
 	$line =~ s/=+$//;
 	$title = $line;
 	$id = "";
 	if ($line =~ /\|/) {
 		# '.*' rather than '.+': an empty side ("Title|" or "|id") must
 		# not leave the pipe itself behind in the title or the id
 		$title =~ s/\|.*//;
 		$id = $line;
 		$id =~ s/^.*\|//;
 	}
 	for ($title, $id) {
 		s/\s+$//;
 		s/^\s+//;
 	}
 }
 # Print the command-line summary and terminate. The exit status is
 # whatever $error holds at the time of the call.
 sub usage {
 	print join('',
 		"Usage: txt2db [-v] [-h|-o <sgml file>] <text file>\n",
 		"-o, --output-to    write to the specified file.\n",
 		"-v, --verbose      show diagnostic output.\n",
 		"-h, --help         show this usage message.\n");
 	exit($error);
 }