mirror of https://github.com/tLDP/LDP
added encoding option
This commit is contained in:
parent
199e47d087
commit
041e777c89
|
@ -13,3 +13,16 @@ CVS Switched -V and -v, now -v is version, -V is verbose.
|
||||||
Changed -a, --article to -x, --xml and -s, --sgml to
|
Changed -a, --article to -x, --xml and -s, --sgml to
|
||||||
let you pick your own doctype.
|
let you pick your own doctype.
|
||||||
|
|
||||||
|
Added -n, --nonet parameter to disable network
|
||||||
|
lookups.
|
||||||
|
|
||||||
|
Fixed bug in screen blocks, para tags were being
|
||||||
|
closed when not open.
|
||||||
|
|
||||||
|
Deleted [ for <filename>, added [[file: namespace.
|
||||||
|
|
||||||
|
[[ is now for internal links and becomes <xref>.
|
||||||
|
|
||||||
|
Encoding of literal block tags into entities.
|
||||||
|
|
||||||
|
Added -e, --encoding to specify encoding.
|
||||||
|
|
|
@ -16,7 +16,7 @@ to build the package, and run:
|
||||||
make test
|
make test
|
||||||
|
|
||||||
to convert a small, arbitrary bit of WikiText. If the output looks
|
to convert a small, arbitrary bit of WikiText. If the output looks
|
||||||
like DocBook, go ahead and run, as root:
|
like DocBook, go ahead and run:
|
||||||
|
|
||||||
make install
|
make install
|
||||||
|
|
||||||
|
|
|
@ -20,7 +20,7 @@ WriteMakefile(
|
||||||
PREREQ_PM => {
|
PREREQ_PM => {
|
||||||
'File::Basename' => 0,
|
'File::Basename' => 0,
|
||||||
'HTML::Entities' => 0,
|
'HTML::Entities' => 0,
|
||||||
'FileHandle' => 0,
|
'FileHandle' => 0,
|
||||||
},
|
},
|
||||||
MAN1PODS => {
|
MAN1PODS => {
|
||||||
"doc/wt2db.pod" => 'blib/man1/wt2db.1',
|
"doc/wt2db.pod" => 'blib/man1/wt2db.1',
|
||||||
|
|
|
@ -1,75 +1,11 @@
|
||||||
This is wt2db version 0.1.
|
|
||||||
|
|
||||||
wt2db is a utility to convert text files in WikiText format into
|
wt2db is a utility to convert text files in WikiText format into
|
||||||
DocBook. It generates a DocBook fragment, not valid DocBook.
|
DocBook. It generates a DocBook fragment, not valid DocBook.
|
||||||
|
|
||||||
|
See the manpage for more information and WikiText help.
|
||||||
|
|
||||||
|
|
||||||
Reporting Bugs
|
Reporting Bugs
|
||||||
--------------
|
--------------
|
||||||
|
|
||||||
Bugs should be reported at sourceforge.net/projects/linuxdoc. Select
|
Bugs should be reported at sourceforge.net/projects/linuxdoc. Select
|
||||||
'wt2db' in the 'Category' field.
|
'wt2db' in the 'Category' field.
|
||||||
|
|
||||||
|
|
||||||
WikiText Tags
|
|
||||||
-------------
|
|
||||||
|
|
||||||
The following constructs are currently supported.
|
|
||||||
|
|
||||||
Foo <para>Foo</para>
|
|
||||||
|
|
||||||
=Title= <sect1><title>Title</title>
|
|
||||||
</sect1>
|
|
||||||
|
|
||||||
=Title|id= <sect1 id='id'><title>Title</title>
|
|
||||||
</sect1>
|
|
||||||
|
|
||||||
works for other sect levels as well, and many other
|
|
||||||
tags. It is either the "id" value, or the "title"
|
|
||||||
value, depending on the semantics of the particular
|
|
||||||
tag. Usage should be obvious in context.
|
|
||||||
|
|
||||||
==Title== <sect2><title>Title</title>
|
|
||||||
</sect2>
|
|
||||||
|
|
||||||
===Title=== <sect3><title>Title</title>
|
|
||||||
</sect3>
|
|
||||||
|
|
||||||
|
|
||||||
#Foo <orderedlist>
|
|
||||||
#Bar <listitem><para>Foo</para></listitem>
|
|
||||||
#Baz <listitem><para>Bar</para></listitem>
|
|
||||||
/# <listitem><para>Baz</para></listitem>
|
|
||||||
</orderedlist>
|
|
||||||
|
|
||||||
*Foo <simplelist>
|
|
||||||
*Bar <listitem><para>Foo</para></listitem>
|
|
||||||
*Baz <listitem><para>Bar</para></listitem>
|
|
||||||
/* <listitem><para>Baz</para></listitem>
|
|
||||||
</simplelist>
|
|
||||||
|
|
||||||
[[http://foo.org]] <ulink url='http://foo.org'>
|
|
||||||
<citetitle>http://foo.org</citetitle>
|
|
||||||
</ulink>
|
|
||||||
|
|
||||||
[[http://foo.org Foo]] <ulink url='http://foo.org'>
|
|
||||||
<citetitle>Foo</citetitle>
|
|
||||||
</ulink>
|
|
||||||
|
|
||||||
[http://foo.org|Foo]] You can also delimit with the pipe character "|".
|
|
||||||
This works on any of these [[]] tags.
|
|
||||||
|
|
||||||
[[file:Foo]] <filename>Foo</filename>
|
|
||||||
|
|
||||||
'''Foo''' <emphasis>Foo</emphasis>
|
|
||||||
|
|
||||||
A few DocBook structures do not have <para> tags wrapped around them. They
|
|
||||||
are <para> itself (duh!), <sect?> and <programlisting>. If you insert anything
|
|
||||||
using these tags, no <para> tags will be wrapped around it or inserted into it.
|
|
||||||
So if you want fine control over your <para> tags, insert them yourself.
|
|
||||||
|
|
||||||
These tags include:
|
|
||||||
|
|
||||||
<programlisting/>
|
|
||||||
<screen/>
|
|
||||||
|
|
||||||
|
|
|
@ -1,34 +1,33 @@
|
||||||
=head1 NAME
|
=head1 NAME
|
||||||
|
|
||||||
B<wt2db> - utility to convert WikiText documents into DocBook XML/SGML.
|
B<wt2db> - converts WikiText documents into DocBook XML/SGML.
|
||||||
|
|
||||||
|
|
||||||
=head1 SYNOPSIS
|
=head1 SYNOPSIS
|
||||||
|
|
||||||
B<wt2db> [I<OPTION>] [I<FILE>]
|
B<wt2db> [I<OPTION>] [I<FILE>]
|
||||||
|
|
||||||
|
|
||||||
=head1 DESCRIPTION
|
=head1 DESCRIPTION
|
||||||
|
|
||||||
B<wt2db> converts a text file in a special format similar to that used
|
B<wt2db> converts a text file in a special format similar to that used
|
||||||
in WikiWikiWebs into DocBook XML/SGML.
|
in WikiWikiWebs into DocBook XML/SGML.
|
||||||
|
The DocBook it writes out by default is only
|
||||||
The DocBook it writes out is only
|
a fragment, but it will write a complete document upon request.
|
||||||
a fragment, not a complete document, because it has no DOCTYPE declaration.
|
|
||||||
And due to the source format, there is no meta-data, such as in an
|
|
||||||
<articleinfo> structure.
|
|
||||||
|
|
||||||
As part of a larger publishing or document processing system, it is
|
|
||||||
expected that later processing will supply these elements.
|
|
||||||
|
|
||||||
By default it reads from STDIN and writes to STDOUT. However, if given a
|
By default it reads from STDIN and writes to STDOUT. However, if given a
|
||||||
filename, it will read that file, and an output filename can also be
|
filename, it will read that file, and an output filename can also be
|
||||||
specified as a command-line option.
|
specified as a command-line option.
|
||||||
|
|
||||||
|
|
||||||
=head1 OPTIONS
|
=head1 OPTIONS
|
||||||
|
|
||||||
B<-x>, B<--xml> add XML DOCTYPE and article tags.
|
B<-x>, B<--xml> add XML DOCTYPE and article tags.
|
||||||
|
|
||||||
B<-s>, B<--sgml> add SGML DOCTYPE and article tags.
|
B<-s>, B<--sgml> add SGML DOCTYPE and article tags.
|
||||||
|
|
||||||
|
B<-n>, B<--nonet> do not access the network.
|
||||||
|
|
||||||
B<-o>, B<--output-to> I<filename> write to the specified file.
|
B<-o>, B<--output-to> I<filename> write to the specified file.
|
||||||
|
|
||||||
B<-v>, B<--verbose> show diagnostic output.
|
B<-v>, B<--verbose> show diagnostic output.
|
||||||
|
@ -37,6 +36,126 @@ B<-V>, B<--version> show program version.
|
||||||
|
|
||||||
B<-h>, B<--help> show a usage message.
|
B<-h>, B<--help> show a usage message.
|
||||||
|
|
||||||
|
|
||||||
|
=head1 NOTES
|
||||||
|
|
||||||
|
B<wt2db> was developed to provide an easier way to write
|
||||||
|
DocBook documentation.
|
||||||
|
|
||||||
|
WikiText is based on the form of text that is used in a
|
||||||
|
WikiWikiWeb. It provides very simple and easy to remember
|
||||||
|
tags so you can write a Wiki article without learning HTML.
|
||||||
|
B<wt2db> was originally written to convert Wikipedia articles
|
||||||
|
into DocBook.
|
||||||
|
|
||||||
|
The Wiki format has been enhanced in several ways to make it
|
||||||
|
more powerful for authors. Support has been added for Wiki tags
|
||||||
|
that don't exist in any real Wiki, by giving common DocBook
|
||||||
|
elements their own Wiki tags. Support has also been added
|
||||||
|
for including DocBook elements right in the source file.
|
||||||
|
|
||||||
|
This means WikiText is a merging of DocBook into a plain text file.
|
||||||
|
In its simplest form, it is plain text. A plain text
|
||||||
|
document can be processed by B<wt2db> and converted into
|
||||||
|
DocBook.
|
||||||
|
Or, a complete and valid DocBook document can be processed,
|
||||||
|
and will pass through the B<wt2db> filters and come out
|
||||||
|
unchanged.
|
||||||
|
Virtually any combination of DocBook with plain text will work,
|
||||||
|
with the additional Wiki style tags to make things even easier
|
||||||
|
for authors.
|
||||||
|
It puts all of the semantics of DocBook
|
||||||
|
at your disposal, while being as easy to write as a Wiki page.
|
||||||
|
|
||||||
|
|
||||||
|
=head1 WIKITEXT
|
||||||
|
|
||||||
|
These are the tags which are supported in this release of
|
||||||
|
B<wt2db>. All DocBook tags are also supported. If you encounter
|
||||||
|
any valid DocBook that is not handled correctly, please file
|
||||||
|
a bug report.
|
||||||
|
|
||||||
|
Foo <para>Foo</para>
|
||||||
|
|
||||||
|
=Title= <sect1>
|
||||||
|
<title>Title</title>
|
||||||
|
</sect1>
|
||||||
|
|
||||||
|
=Title|id= <sect1 id='id'>
|
||||||
|
<title>Title</title>
|
||||||
|
</sect1>
|
||||||
|
|
||||||
|
The id attribute, delimited with a pipe character,
|
||||||
|
works for other sect levels as well, and many other
|
||||||
|
tags. In some cases it is not an id value, but the
|
||||||
|
title, depending on the semantics of the particular
|
||||||
|
tag. Usage should be obvious in context.
|
||||||
|
|
||||||
|
==Title== <sect2>
|
||||||
|
<title>Title</title>
|
||||||
|
</sect2>
|
||||||
|
|
||||||
|
===Title=== <sect3>
|
||||||
|
<title>Title</title>
|
||||||
|
</sect3>
|
||||||
|
|
||||||
|
#Foo <orderedlist>
|
||||||
|
#Bar <listitem>
|
||||||
|
#Baz <para>Foo</para>
|
||||||
|
/# </listitem>
|
||||||
|
<listitem>
|
||||||
|
<para>Bar</para>
|
||||||
|
</listitem>
|
||||||
|
<listitem>
|
||||||
|
<para>Baz</para>
|
||||||
|
</listitem>
|
||||||
|
</orderedlist>
|
||||||
|
|
||||||
|
*Foo <simplelist>
|
||||||
|
*Bar <listitem>
|
||||||
|
*Baz <para>Foo</para>
|
||||||
|
/* </listitem>
|
||||||
|
<listitem>
|
||||||
|
<para>Bar</para>
|
||||||
|
</listitem>
|
||||||
|
<listitem>
|
||||||
|
<para>Baz</para>
|
||||||
|
</listitem>
|
||||||
|
</simplelist>
|
||||||
|
|
||||||
|
[[foo]] <xref linkend='foo' endterm='foo'/>
|
||||||
|
[[link:Foo]]
|
||||||
|
|
||||||
|
[[file:/dev/foo]] <filename>/dev/foo</filename>
|
||||||
|
|
||||||
|
|
||||||
|
[[http://foo.org]] <ulink url='http://foo.org'>
|
||||||
|
<citetitle>http://foo.org</citetitle>
|
||||||
|
</ulink>
|
||||||
|
|
||||||
|
[[http://foo.org Foo]] <ulink url='http://foo.org'>
|
||||||
|
[[http://foo.org|Foo]] <citetitle>Foo</citetitle>
|
||||||
|
</ulink>
|
||||||
|
|
||||||
|
'''Foo''' <emphasis>Foo</emphasis>
|
||||||
|
|
||||||
|
A few DocBook structures will not have <para> tags wrapped around them. They
|
||||||
|
are <para> itself, <sect?> and <programlisting>. If you insert anything
|
||||||
|
using these tags, no <para> tags will be wrapped around it or inserted into it.
|
||||||
|
So if you want fine control over your <para> tags, insert them yourself.
|
||||||
|
|
||||||
|
The <screen> element will be wrapped with <para> tags, but no internal
|
||||||
|
paragraph breaks will be generated.
|
||||||
|
|
||||||
|
|
||||||
|
=head1 RESTRICTIONS
|
||||||
|
|
||||||
|
Currently only a single form of WikiText is supported, which is very
|
||||||
|
similar to that used by the Wikipedia (see http://www.wikipedia.com).
|
||||||
|
A future release will be configurable to support additional styles of
|
||||||
|
WikiText.
|
||||||
|
|
||||||
|
|
||||||
=head1 BUGS
|
=head1 BUGS
|
||||||
|
|
||||||
Bugs are tracked in the SourceForge project page at:
|
Bugs are tracked in the SourceForge project page at:
|
||||||
|
@ -46,25 +165,13 @@ http://www.sourceforge.net/projects/linuxdoc
|
||||||
If you report a bug in B<wt2db>, specify wt2db as the category so it will
|
If you report a bug in B<wt2db>, specify wt2db as the category so it will
|
||||||
be routed to the appropriate person.
|
be routed to the appropriate person.
|
||||||
|
|
||||||
=head1 RESTRICTIONS
|
|
||||||
|
|
||||||
Currently only a single form of WikiText is supported, which is very
|
|
||||||
similar to that used by the Wikipedia (see http://www.wikipedia.com).
|
|
||||||
A future release will be configurable to support additional styles of
|
|
||||||
WikiText.
|
|
||||||
|
|
||||||
=head1 NOTES
|
|
||||||
|
|
||||||
B<wt2db> was developed as a project of the Linux Documentation Project
|
|
||||||
to create an easier way of writing DocBook documentation. While it is
|
|
||||||
useful on its own, it is part of Lampadas, the LDP's document
|
|
||||||
production system.
|
|
||||||
|
|
||||||
=head1 SEE ALSO
|
=head1 SEE ALSO
|
||||||
|
|
||||||
See the home page of the Linux Documentation Project,
|
See the home page of the Linux Documentation Project,
|
||||||
http://www.tldp.org for updates and more information.
|
http://www.tldp.org for updates and more information.
|
||||||
|
|
||||||
|
|
||||||
=head1 AUTHOR
|
=head1 AUTHOR
|
||||||
|
|
||||||
This man page was written by David C. Merrill <david@lupercalia.net>.
|
This man page was written by David C. Merrill <david@lupercalia.net>.
|
||||||
|
|
|
@ -19,36 +19,42 @@ use Exporter;
|
||||||
Reset
|
Reset
|
||||||
);
|
);
|
||||||
|
|
||||||
# These keep track of which constructs we're in the middle of
|
&Reset;
|
||||||
#
|
|
||||||
$level1 = 0;
|
|
||||||
$level2 = 0;
|
|
||||||
$level3 = 0;
|
|
||||||
$orderedlist = 0;
|
|
||||||
$listitem = 0;
|
|
||||||
$itemizedlist = 0;
|
|
||||||
$para = 0;
|
|
||||||
$qandaset = 0;
|
|
||||||
$qandaentry = 0;
|
|
||||||
$answer = 0;
|
|
||||||
|
|
||||||
# These are passed in by the caller
|
# Call this before rerunning ProcessLine to clear state.
|
||||||
#
|
#
|
||||||
$txtfile = '';
|
sub Reset {
|
||||||
$dbfile = '';
|
$level1 = 0;
|
||||||
$verbose = 0;
|
$level2 = 0;
|
||||||
|
$level3 = 0;
|
||||||
|
$orderedlist = 0;
|
||||||
|
$listitem = 0;
|
||||||
|
$itemizedlist = 0;
|
||||||
|
$para = 0;
|
||||||
|
$qandaset = 0;
|
||||||
|
$qandaentry = 0;
|
||||||
|
$answer = 0;
|
||||||
|
|
||||||
# These maintain state
|
# These are passed in by the caller
|
||||||
#
|
#
|
||||||
$line = '';
|
$txtfile = '';
|
||||||
$linenumber = 0;
|
$dbfile = '';
|
||||||
$id = '';
|
$verbose = 0;
|
||||||
$title = '';
|
$doctype = 0;
|
||||||
$buf = '';
|
$nonet = 0;
|
||||||
|
|
||||||
$noparatag = 0;
|
# These maintain state
|
||||||
$noparadepth = 0;
|
#
|
||||||
$noparaline = 0;
|
$line = '';
|
||||||
|
$linenumber = 0;
|
||||||
|
$id = '';
|
||||||
|
$title = '';
|
||||||
|
$buf = '';
|
||||||
|
|
||||||
|
$noparatag = 0;
|
||||||
|
$noparadepth = 0;
|
||||||
|
$noparaline = 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
# -----------------------------------------------------------
|
# -----------------------------------------------------------
|
||||||
|
@ -62,7 +68,7 @@ sub new {
|
||||||
}
|
}
|
||||||
|
|
||||||
sub ProcessFile {
|
sub ProcessFile {
|
||||||
($self, $txtfile, $dbfile, $verbose, $doctype) = @_;
|
($self, $txtfile, $dbfile, $verbose, $doctype, $nonet, $encoding) = @_;
|
||||||
|
|
||||||
# Read from STDIN if no input file given
|
# Read from STDIN if no input file given
|
||||||
#
|
#
|
||||||
|
@ -87,9 +93,10 @@ sub ProcessFile {
|
||||||
|
|
||||||
# wrap article if requested
|
# wrap article if requested
|
||||||
#
|
#
|
||||||
|
$encoding = 'ISO-8859-1' unless ($encoding);
|
||||||
if ($doctype eq 'XML') {
|
if ($doctype eq 'XML') {
|
||||||
print "Adding XML DOCTYPE and article tags." if ($verbose);
|
print "Adding XML DOCTYPE and article tags." if ($verbose);
|
||||||
$buf = '<?xml version="1.0" standalone="no"?>' . "\n";
|
$buf = '<?xml version="1.0" encoding="' . $encoding . '" standalone="no"?>' . "\n";
|
||||||
$buf .= '<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"' . "\n";
|
$buf .= '<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"' . "\n";
|
||||||
$buf .= ' "http://docbook.org/xml/4.1.2/docbookx.dtd"';
|
$buf .= ' "http://docbook.org/xml/4.1.2/docbookx.dtd"';
|
||||||
$buf .= "\[\]\>\n";
|
$buf .= "\[\]\>\n";
|
||||||
|
@ -153,8 +160,8 @@ sub ProcessLine {
|
||||||
|
|
||||||
# inline docbook
|
# inline docbook
|
||||||
#
|
#
|
||||||
# ulink
|
# parse all links, internal and external
|
||||||
#
|
#
|
||||||
while ($line =~ /\[\[/) {
|
while ($line =~ /\[\[/) {
|
||||||
unless ($line =~ /\]\]/) {
|
unless ($line =~ /\]\]/) {
|
||||||
$buf .= "ERROR unterminated '[[' tag on line $linenumber.\n";
|
$buf .= "ERROR unterminated '[[' tag on line $linenumber.\n";
|
||||||
|
@ -174,15 +181,22 @@ sub ProcessLine {
|
||||||
$linkname = $link;
|
$linkname = $link;
|
||||||
}
|
}
|
||||||
|
|
||||||
# kill quotes, they mess us up
|
# kill quotes inside links, they mess us up because
|
||||||
|
# we have to wrap this string with quotes.
|
||||||
|
# perhaps it should be encoding the entire URL?
|
||||||
#
|
#
|
||||||
$link =~ s/'/%27/g;
|
$link =~ s/'/%27/g;
|
||||||
|
|
||||||
# namespaces are handled differently
|
# namespaces are handled differently
|
||||||
#
|
#
|
||||||
print "$link\n" if ($verbose);
|
print "$link\n" if ($verbose);
|
||||||
if ($link =~ /^http:/) {
|
|
||||||
|
if ($link =~ /^http:\/\//) {
|
||||||
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
|
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
|
||||||
|
} elsif ($link =~ /^link:/) {
|
||||||
|
$link =~ s/^link://;
|
||||||
|
$linkname =~ s/^link://;
|
||||||
|
$line =~ s/\[\[.*?\]\]/<xref linkend='$link' endterm='$link'\>\<\/xref\>/;
|
||||||
} elsif ($link =~ /^mailto:/) {
|
} elsif ($link =~ /^mailto:/) {
|
||||||
$linkname =~ s/^mailto://;
|
$linkname =~ s/^mailto://;
|
||||||
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
|
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
|
||||||
|
@ -194,28 +208,41 @@ sub ProcessLine {
|
||||||
} elsif ($link =~ /^ldp:/) {
|
} elsif ($link =~ /^ldp:/) {
|
||||||
$linkname =~ s/^ldp://;
|
$linkname =~ s/^ldp://;
|
||||||
$link =~ s/^ldp://;
|
$link =~ s/^ldp://;
|
||||||
$tempfile = "/tmp/wt2db-" . $rand;
|
if ($nonet) {
|
||||||
$cmd = "wget -q http://db.linuxdoc.org/cgi-pub/ldp-xml.pl?name=$link -O $tempfile";
|
$line =~ s/\[\[.*?\]\]/<citetitle>$link<\/citetitle>/;
|
||||||
system("$cmd");
|
} else {
|
||||||
open(URL, "$tempfile") || die "wt2db: cannot open temporary file ($!)\n\n";
|
$tempfile = "/tmp/wt2db-" . $rand;
|
||||||
$link = "";
|
$cmd = "wget -q http://db.linuxdoc.org/cgi-pub/ldp-xml.pl?name=$link -O $tempfile";
|
||||||
while ($url_line = <URL>) {
|
print "$cmd\n" if ($verbose > 1);
|
||||||
$url_line =~ s/\n//;
|
$return = system("$cmd");
|
||||||
if ($url_line =~ /identifier/) {
|
unless ($return) {
|
||||||
$link .= $url_line;
|
open(URL, "$tempfile") || die "wt2db: cannot open temporary file ($!)\n\n";
|
||||||
|
$link = '';
|
||||||
|
while ($url_line = <URL>) {
|
||||||
|
$url_line =~ s/\n//;
|
||||||
|
if ($url_line =~ /identifier/) {
|
||||||
|
$link .= $url_line;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
close(URL);
|
||||||
|
unlink $tempfile;
|
||||||
}
|
}
|
||||||
|
$link =~ s/^.*?<identifier>//;
|
||||||
|
$link =~ s/<\/identifier>.*?$//;
|
||||||
|
if ($link eq '') {
|
||||||
|
$linkname = "ERROR: LDP namespace resolution failure on $linkname";
|
||||||
|
}
|
||||||
|
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
|
||||||
}
|
}
|
||||||
close(URL);
|
|
||||||
unlink $tempfile;
|
|
||||||
$link =~ s/^.*?<identifier>//;
|
|
||||||
$link =~ s/<\/identifier>.*?$//;
|
|
||||||
if ($link eq '') {
|
|
||||||
$linkname = "ERROR: LDP namespace resolution failure on $linkname";
|
|
||||||
}
|
|
||||||
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
|
|
||||||
} elsif ($link =~ /^file:/) {
|
} elsif ($link =~ /^file:/) {
|
||||||
$linkname =~ s/^file://;
|
$linkname =~ s/^file://;
|
||||||
$line =~ s/\[\[.*?\]\]/<filename>$linkname<\/filename>/;
|
$line =~ s/\[\[.*?\]\]/<filename>$linkname<\/filename>/;
|
||||||
|
} elsif ($link =~ /^dir:/) {
|
||||||
|
|
||||||
|
# FIXME: need to check attribute on filename element
|
||||||
|
#
|
||||||
|
$linkname =~ s/^dir://;
|
||||||
|
$line =~ s/\[\[.*?\]\]/<filename type='directory'>$linkname<\/filename>/;
|
||||||
} else {
|
} else {
|
||||||
$line =~ s/\[\[.*?\]\]/<filename>$linkname<\/filename>/;
|
$line =~ s/\[\[.*?\]\]/<filename>$linkname<\/filename>/;
|
||||||
}
|
}
|
||||||
|
@ -238,7 +265,8 @@ sub ProcessLine {
|
||||||
# <programlisting>
|
# <programlisting>
|
||||||
# <literallayout>
|
# <literallayout>
|
||||||
|
|
||||||
# forget about nopara
|
# forget about being in nopara state if we're no longer in one
|
||||||
|
#
|
||||||
if ($noparadepth == 0) {
|
if ($noparadepth == 0) {
|
||||||
$noparatag = "";
|
$noparatag = "";
|
||||||
}
|
}
|
||||||
|
@ -248,6 +276,8 @@ sub ProcessLine {
|
||||||
if ((($line =~ /^<para>/) or
|
if ((($line =~ /^<para>/) or
|
||||||
($line =~ /^<sect/) or
|
($line =~ /^<sect/) or
|
||||||
($line =~ /^<screen>/) or
|
($line =~ /^<screen>/) or
|
||||||
|
($line =~ /^<screen>/) or
|
||||||
|
($line =~ /^<blockquote>/) or
|
||||||
($line =~ /^<literallayout>/) or
|
($line =~ /^<literallayout>/) or
|
||||||
($line =~ /^<articleinfo>/) or
|
($line =~ /^<articleinfo>/) or
|
||||||
($line =~ /^<programlisting>/)) and
|
($line =~ /^<programlisting>/)) and
|
||||||
|
@ -257,9 +287,12 @@ sub ProcessLine {
|
||||||
$noparatag =~ s/^.*?<//;
|
$noparatag =~ s/^.*?<//;
|
||||||
$noparatag =~ s/>.*?$//;
|
$noparatag =~ s/>.*?$//;
|
||||||
$noparaline = $linenumber;
|
$noparaline = $linenumber;
|
||||||
|
|
||||||
|
# screen sections don't embed para tags, but are wrapped in them
|
||||||
|
#
|
||||||
if ($line =~ /^<screen>/) {
|
if ($line =~ /^<screen>/) {
|
||||||
unless ($para) {
|
unless ($para) {
|
||||||
$line = "<para>" . $line;
|
$buf .= "<para>";
|
||||||
$para = 1;
|
$para = 1;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -289,9 +322,29 @@ sub ProcessLine {
|
||||||
}
|
}
|
||||||
|
|
||||||
# recover original line -- no whitespace modifiers
|
# recover original line -- no whitespace modifiers
|
||||||
|
# allow nonencoded text in unparsed lines, when in a literal block
|
||||||
#
|
#
|
||||||
$line = $originalline;
|
$line = $originalline;
|
||||||
chomp($line);
|
chomp($line);
|
||||||
|
if ($line =~ /^<$noparatag>/ ) {
|
||||||
|
$starttag = "<$noparatag>";
|
||||||
|
} else {
|
||||||
|
$starttag = '';
|
||||||
|
}
|
||||||
|
if ($line =~ /<\/$noparatag>/ ) {
|
||||||
|
$endtag = "<\/$noparatag>";
|
||||||
|
} else {
|
||||||
|
$endtag = '';
|
||||||
|
}
|
||||||
|
|
||||||
|
$line =~ s/<$noparatag>//;
|
||||||
|
$line =~ s/<\/$noparatag>//;
|
||||||
|
if (($noparatag eq 'screen') or
|
||||||
|
($noparatag eq 'literallayout') or
|
||||||
|
($noparatag eq 'programlisting')) {
|
||||||
|
encode_entities($line);
|
||||||
|
}
|
||||||
|
$line = "$starttag$line$endtag";
|
||||||
|
|
||||||
# sect3
|
# sect3
|
||||||
#
|
#
|
||||||
|
@ -376,9 +429,9 @@ sub ProcessLine {
|
||||||
&trimline;
|
&trimline;
|
||||||
&splittitle;
|
&splittitle;
|
||||||
if ($id eq '') {
|
if ($id eq '') {
|
||||||
$line = "<question><para>" . $title . "</para></question>";
|
$line = "<question><para>$title</para></question>";
|
||||||
} else {
|
} else {
|
||||||
$line = "<question id='$id'><para>" . $title . "</para></question>";
|
$line = "<question id='$id'><para>$title</para></question>";
|
||||||
}
|
}
|
||||||
unless ($qandaentry) {
|
unless ($qandaentry) {
|
||||||
$line = "<qandaentry>\n" . $line;
|
$line = "<qandaentry>\n" . $line;
|
||||||
|
@ -430,42 +483,6 @@ sub Buffer {
|
||||||
return $buf;
|
return $buf;
|
||||||
}
|
}
|
||||||
|
|
||||||
# Basically a cut-and-paste of the original declarations,
|
|
||||||
# to make sure all variables are completely cleared.
|
|
||||||
#
|
|
||||||
# Call this before rerunning ProcessLine to clear state.
|
|
||||||
#
|
|
||||||
sub Reset {
|
|
||||||
$level1 = 0;
|
|
||||||
$level2 = 0;
|
|
||||||
$level3 = 0;
|
|
||||||
$orderedlist = 0;
|
|
||||||
$listitem = 0;
|
|
||||||
$itemizedlist = 0;
|
|
||||||
$para = 0;
|
|
||||||
$qandaset = 0;
|
|
||||||
$qandaentry = 0;
|
|
||||||
$answer = 0;
|
|
||||||
|
|
||||||
# These are passed in by the caller
|
|
||||||
#
|
|
||||||
$txtfile = '';
|
|
||||||
$dbfile = '';
|
|
||||||
$verbose = 0;
|
|
||||||
|
|
||||||
# These maintain state
|
|
||||||
#
|
|
||||||
$line = '';
|
|
||||||
$linenumber = 0;
|
|
||||||
$id = '';
|
|
||||||
$title = '';
|
|
||||||
$buf = '';
|
|
||||||
|
|
||||||
$noparatag = 0;
|
|
||||||
$noparadepth = 0;
|
|
||||||
$noparaline = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
sub close1 {
|
sub close1 {
|
||||||
&close2;
|
&close2;
|
||||||
if ($level1) {
|
if ($level1) {
|
||||||
|
@ -570,15 +587,78 @@ sub splittitle {
|
||||||
$line =~ s/^=+//;
|
$line =~ s/^=+//;
|
||||||
$line =~ s/=+$//;
|
$line =~ s/=+$//;
|
||||||
$title = $line;
|
$title = $line;
|
||||||
$id = "";
|
|
||||||
if ($line =~ /\|/) {
|
if ($line =~ /\|/) {
|
||||||
$title =~ s/\|.+//;
|
$title =~ s/\|.+//;
|
||||||
$id = $line;
|
$id = $line;
|
||||||
$id =~ s/^.+\|//;
|
$id =~ s/^.+\|//;
|
||||||
|
} else {
|
||||||
|
$id = &anchorfix($title);
|
||||||
}
|
}
|
||||||
$title =~ s/\s+$//;
|
$title =~ s/\s+$//;
|
||||||
$title =~ s/^\s+//;
|
$title =~ s/^\s+//;
|
||||||
$id =~ s/\s+$//;
|
$id =~ s/\s+$//;
|
||||||
$id =~ s/^\s+//;
|
$id =~ s/^\s+//;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Derive an id/anchor string from a section title.
#
# Takes one argument, the title text, and returns the generated anchor.
# The title is lower-cased and trimmed, accented letters are reduced to
# their base letter, and punctuation and digits are spelled out as
# hyphen-delimited words (e.g. "," becomes "-comma-", "1" becomes "-one-").
# Whitespace runs become a single hyphen, hyphen runs are collapsed, and
# leading/trailing hyphens are removed.
#
# Relies on decode_entities/encode_entities (HTML::Entities) and on the
# trim() helper defined elsewhere in this file.
sub anchorfix {
    my ($anchor) = @_;

    $anchor = lc(trim($anchor));

    # Work on literal characters first, so that a real "&" or ";" in the
    # title is handled before encode_entities() introduces its own.
    $anchor = decode_entities($anchor);
    $anchor =~ s/-/-dash-/g;
    $anchor =~ s/&/-and-/g;
    $anchor =~ s/;//g;

    # Re-encode so accented letters appear as named entities (&eacute;),
    # then keep only the base letter.  The trailing ";" is removed below.
    $anchor = encode_entities($anchor);
    for my $accent (qw(grave acute circ uml tilde cedil)) {
        $anchor =~ s/&(\w)$accent/$1/g;
    }

    # Whatever entities are left are flattened the same way as a literal "&".
    $anchor =~ s/&/-and-/g;
    $anchor =~ s/;//g;

    # Punctuation is spelled out; the order of these substitutions matters.
    $anchor =~ s/\//-slash-/g;
    $anchor =~ s/\\/-bslash-/g;
    $anchor =~ s/\s+/-/g;
    $anchor =~ s/'//g;
    $anchor =~ s/`//g;
    $anchor =~ s/,/-comma-/g;
    $anchor =~ s/\./-dot-/g;
    $anchor =~ s/!/-bang-/g;
    $anchor =~ s/\?/-question-/g;
    $anchor =~ s/\+/-plus-/g;
    $anchor =~ s/\*/-x-/g;
    $anchor =~ s/\(/-op-/g;
    $anchor =~ s/\)/-cp-/g;
    $anchor =~ s/\@/-at-/g;
    $anchor =~ s/dcm_at/-at-/gi;
    $anchor =~ s/\^/-hat-/g;
    $anchor =~ s/=/-eq-/g;
    $anchor =~ s/\$/S/g;            # every "$", not just the first one
    $anchor =~ s/~/-tilde-/g;

    # Digits are spelled out as words.
    my @digit_name = qw(zero one two three four five six seven eight nine);
    $anchor =~ s/([0-9])/-$digit_name[$1]-/g;

    $anchor =~ s/\|/-pipe-/g;
    $anchor =~ s/\[/-lsqb-/g;
    $anchor =~ s/\]/-rsqb-/g;

    # Collapse hyphen runs of any length, then drop initial and trailing
    # hyphens.
    $anchor =~ s/-{2,}/-/g;
    $anchor =~ s/^-+//;
    $anchor =~ s/-+$//;

    return trim($anchor);
}
|
||||||
|
|
||||||
|
# Return a copy of the first argument with leading and trailing
# whitespace removed.  The caller's value is left untouched.
sub trim {
    my ($text) = @_;

    for ($text) {
        s/^\s+//;
        s/\s+$//;
    }

    return $text;
}
|
||||||
1;
|
1;
|
||||||
|
|
|
@ -5,7 +5,26 @@ $WT = new Wt2Db;
|
||||||
|
|
||||||
$buffer = "foo bar
|
$buffer = "foo bar
|
||||||
|
|
||||||
baz
|
=Section|section=
|
||||||
|
|
||||||
|
paragraph
|
||||||
|
|
||||||
|
==Subsection|subsection==
|
||||||
|
|
||||||
|
paragraph
|
||||||
|
|
||||||
|
|
||||||
|
=Namespaces=
|
||||||
|
|
||||||
|
==MailTo==
|
||||||
|
|
||||||
|
[[mailto:david@lupercalia.net]]
|
||||||
|
[[mailto:david@lupercalia.net|David Merrill]]
|
||||||
|
|
||||||
|
==HTTP==
|
||||||
|
|
||||||
|
[[http://www.tldp.org]]
|
||||||
|
[[http://www.tldp.org|The Linux Documentation Project]]
|
||||||
|
|
||||||
";
|
";
|
||||||
|
|
||||||
|
|
|
@ -10,6 +10,8 @@ $WT2DB = new Wt2Db;
|
||||||
my $txtfile = '';
|
my $txtfile = '';
|
||||||
my $dbfile = '';
|
my $dbfile = '';
|
||||||
my $doctype = '';
|
my $doctype = '';
|
||||||
|
my $nonet = 0;
|
||||||
|
my $encoding = 'ISO-8859-1';
|
||||||
my $verbose = 0;
|
my $verbose = 0;
|
||||||
my $error = 0;
|
my $error = 0;
|
||||||
|
|
||||||
|
@ -22,6 +24,13 @@ while (1) {
|
||||||
} elsif($ARGV[0] eq "-x" or $ARGV[0] eq "--xml") {
|
} elsif($ARGV[0] eq "-x" or $ARGV[0] eq "--xml") {
|
||||||
$doctype = 'XML';
|
$doctype = 'XML';
|
||||||
shift(@ARGV);
|
shift(@ARGV);
|
||||||
|
} elsif($ARGV[0] eq "-e" or $ARGV[0] eq "--encoding") {
|
||||||
|
shift(@ARGV);
|
||||||
|
$encoding = $ARGV[0];
|
||||||
|
shift(@ARGV);
|
||||||
|
} elsif($ARGV[0] eq "-n" or $ARGV[0] eq "--nonet") {
|
||||||
|
$nonet = 1;
|
||||||
|
shift(@ARGV);
|
||||||
} elsif($ARGV[0] eq "-o" or $ARGV[0] eq "--output-to") {
|
} elsif($ARGV[0] eq "-o" or $ARGV[0] eq "--output-to") {
|
||||||
shift(@ARGV);
|
shift(@ARGV);
|
||||||
$dbfile = $ARGV[0];
|
$dbfile = $ARGV[0];
|
||||||
|
@ -44,7 +53,7 @@ while (1) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
$WT2DB->ProcessFile($txtfile, $dbfile, $verbose, $doctype);
|
$WT2DB->ProcessFile($txtfile, $dbfile, $verbose, $doctype, $nonet, $encoding);
|
||||||
|
|
||||||
sub version {
|
sub version {
|
||||||
print "wt2db version $VERSION\n";
|
print "wt2db version $VERSION\n";
|
||||||
|
@ -53,7 +62,7 @@ sub version {
|
||||||
print "Converts a WikiText file into DocBook XML/SGML.\n";
|
print "Converts a WikiText file into DocBook XML/SGML.\n";
|
||||||
print "\n";
|
print "\n";
|
||||||
print "This is free software; see the source for copying conditions. There is no\n";
|
print "This is free software; see the source for copying conditions. There is no\n";
|
||||||
print "warranty; not even for merchantability or fitness for a particular purpose.\n";
|
print "warranty; not even for merchantability or fitness for a particular purpose.\n\n";
|
||||||
}
|
}
|
||||||
|
|
||||||
sub usage {
|
sub usage {
|
||||||
|
@ -64,9 +73,11 @@ sub usage {
|
||||||
print "Options:\n";
|
print "Options:\n";
|
||||||
print "-s, --SGML add XML DOCTYPE and article tags.\n";
|
print "-s, --SGML add XML DOCTYPE and article tags.\n";
|
||||||
print "-x, --XML add SGML DOCTYPE and article tags.\n";
|
print "-x, --XML add SGML DOCTYPE and article tags.\n";
|
||||||
|
print "-e, --encoding specify character encoding.\n";
|
||||||
|
print "-n, --nonet do not look up documents on the net.\n";
|
||||||
print "-o, --output-to write to the specified file.\n";
|
print "-o, --output-to write to the specified file.\n";
|
||||||
print "-v, --verbose show diagnostic output.\n";
|
print "-V, --verbose show diagnostic output.\n";
|
||||||
print "-V, --version show program version.\n";
|
print "-v, --version show program version.\n";
|
||||||
print "-h, --help show this usage message.\n";
|
print "-h, --help show this usage message.\n";
|
||||||
exit($error);
|
exit($error);
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in New Issue