mirror of https://github.com/tLDP/LDP
added encoding option
This commit is contained in:
parent
199e47d087
commit
041e777c89
|
@ -13,3 +13,16 @@ CVS Switched -V and -v, now -v is version, -V is verbose.
|
|||
Changed -a, --article to -x, --xml and -s, --sgml to
|
||||
let you pick your own doctype.
|
||||
|
||||
Added -n, --nonet parameter to disable network
|
||||
lookups.
|
||||
|
||||
Fixed bug in screen blocks, para tags were being
|
||||
closed when not open.
|
||||
|
||||
Deleted [ for <filename>, added [[file: namespace.
|
||||
|
||||
[[ is now for internal links and becomes <xref>.
|
||||
|
||||
Encoding of literal block tags into entities.
|
||||
|
||||
Added -e, --encoding to specify encoding.
|
||||
|
|
|
@ -16,7 +16,7 @@ to build the package, and run:
|
|||
make test
|
||||
|
||||
to convert a small, arbitrary bit of WikiText. If the output looks
|
||||
like DocBook, go ahead and run, as root:
|
||||
like DocBook, go ahead and run:
|
||||
|
||||
make install
|
||||
|
||||
|
|
|
@ -20,7 +20,7 @@ WriteMakefile(
|
|||
PREREQ_PM => {
|
||||
'File::Basename' => 0,
|
||||
'HTML::Entities' => 0,
|
||||
'FileHandle' => 0,
|
||||
'FileHandle' => 0,
|
||||
},
|
||||
MAN1PODS => {
|
||||
"doc/wt2db.pod" => 'blib/man1/wt2db.1',
|
||||
|
|
|
@ -1,75 +1,11 @@
|
|||
This is wt2db version 0.1.
|
||||
|
||||
wt2db is a utility to convert text files in WikiText format into
|
||||
DocBook. It generates a DocBook fragment, not valid DocBook.
|
||||
|
||||
See the manpage for more information and WikiText help.
|
||||
|
||||
|
||||
Reporting Bugs
|
||||
--------------
|
||||
|
||||
Bugs should be reported at sourceforge.net/projects/linuxdoc. Select
|
||||
'wt2db' in the 'Category' field.
|
||||
|
||||
|
||||
WikiText Tags
|
||||
-------------
|
||||
|
||||
The following constructs are currently supported.
|
||||
|
||||
Foo <para>Foo</para>
|
||||
|
||||
=Title= <sect1><title>Title</title>
|
||||
</sect1>
|
||||
|
||||
=Title|id= <sect1 id='id'><title>Title</title>
|
||||
</sect1>
|
||||
|
||||
works for other sect levels as well, and many other
|
||||
tags. It is either the "id" value, or the "title"
|
||||
value, depending on the semantics of the particular
|
||||
tag. Usage should be obvious in context.
|
||||
|
||||
==Title== <sect2><title>Title</title>
|
||||
</sect2>
|
||||
|
||||
===Title=== <sect3><title>Title</title>
|
||||
</sect3>
|
||||
|
||||
|
||||
#Foo <orderedlist>
|
||||
#Bar <listitem><para>Foo</para></listitem>
|
||||
#Baz <listitem><para>Bar</para></listitem>
|
||||
/# <listitem><para>Baz</para></listitem>
|
||||
</orderedlist>
|
||||
|
||||
*Foo <simplelist>
|
||||
*Bar <listitem><para>Foo</para></listitem>
|
||||
*Baz <listitem><para>Bar</para></listitem>
|
||||
/* <listitem><para>Baz</para></listitem>
|
||||
</simplelist>
|
||||
|
||||
[[http://foo.org]] <ulink url='http://foo.org'>
|
||||
<citetitle>http://foo.org</citetitle>
|
||||
</ulink>
|
||||
|
||||
[[http://foo.org Foo]] <ulink url='http://foo.org'>
|
||||
<citetitle>Foo</citetitle>
|
||||
</ulink>
|
||||
|
||||
[[http://foo.org|Foo]] You can also delimit with the pipe character "|".
|
||||
This works on any of these [[]] tags.
|
||||
|
||||
[[file:Foo]] <filename>Foo</filename>
|
||||
|
||||
'''Foo''' <emphasis>Foo</emphasis>
|
||||
|
||||
A few DocBook structures do not have <para> tags wrapped around them. They
|
||||
are <para> itself (duh!), <sect?> and <programlisting>. If you insert anything
|
||||
using these tags, no <para> tags will be wrapped around it or inserted into it.
|
||||
So if you want fine control over your <para> tags, insert them yourself.
|
||||
|
||||
These tags include:
|
||||
|
||||
<programlisting/>
|
||||
<screen/>
|
||||
|
||||
|
|
|
@ -1,34 +1,33 @@
|
|||
=head1 NAME
|
||||
|
||||
B<wt2db> - utility to convert WikiText documents into DocBook XML/SGML.
|
||||
B<wt2db> - converts WikiText documents into DocBook XML/SGML.
|
||||
|
||||
|
||||
=head1 SYNOPSIS
|
||||
|
||||
B<wt2db> [I<OPTION>] [I<FILE>]
|
||||
|
||||
|
||||
=head1 DESCRIPTION
|
||||
|
||||
B<wt2db> converts a text file in a special format similar to that used
|
||||
in WikiWikiWebs into DocBook XML/SGML.
|
||||
|
||||
The DocBook it writes out is only
|
||||
a fragment, not a complete document, because it has no DOCTYPE declaration.
|
||||
And due to the source format, there is no meta-data, such as in an
|
||||
<articleinfo> structure.
|
||||
|
||||
As part of a larger publishing or document processing system, it is
|
||||
expected that later processing will supply these elements.
|
||||
The DocBook it writes out by default is only
|
||||
a fragment, but it will write a complete document upon request.
|
||||
|
||||
By default it reads from STDIN and writes to STDOUT. However, if given a
|
||||
filename, it will read that file, and an output filename can also be
|
||||
specified as a command-line option.
|
||||
|
||||
|
||||
=head1 OPTIONS
|
||||
|
||||
B<-x>, B<--xml> add XML DOCTYPE and article tags.
|
||||
|
||||
B<-s>, B<--sgml> add SGML DOCTYPE and article tags.
|
||||
|
||||
B<-n>, B<--nonet> do not access the network.
|
||||
|
||||
B<-o>, B<--output-to> I<filename> write to the specified file.
|
||||
|
||||
B<-v>, B<--verbose> show diagnostic output.
|
||||
|
@ -37,6 +36,126 @@ B<-V>, B<--version> show program version.
|
|||
|
||||
B<-h>, B<--help> show a usage message.
|
||||
|
||||
|
||||
=head1 NOTES
|
||||
|
||||
B<wt2db> was developed to provide an easier way to write
|
||||
DocBook documentation.
|
||||
|
||||
WikiText is based on the form of text that is used in a
|
||||
WikiWikiWeb. It provides very simple and easy to remember
|
||||
tags so you can write a Wiki article without learning HTML.
|
||||
B<wt2db> was originally written to convert Wikipedia articles
|
||||
into DocBook.
|
||||
|
||||
The Wiki format has been enhanced in several ways to make it
|
||||
more powerful for authors. Support has been added for Wiki tags
|
||||
that don't exist in any real Wiki, by giving common DocBook
|
||||
elements their own Wiki tags. Support has also been added
|
||||
for including DocBook elements right in the source file.
|
||||
|
||||
This means WikiText is a merging of DocBook into a plain text file.
|
||||
In its simplest form, it is plain text. A plain text
|
||||
document can be processed by B<wt2db> and converted into
|
||||
DocBook.
|
||||
Or, a complete and valid DocBook document can be processed,
|
||||
and will pass through the B<wt2db> filters and come out
|
||||
unchanged.
|
||||
Virtually any combination of DocBook with plain text will work,
|
||||
with the additional Wiki style tags to make things even easier
|
||||
for authors.
|
||||
It puts all of the semantics of DocBook
|
||||
at your disposal, while being as easy to write as a Wiki page.
|
||||
|
||||
|
||||
=head1 WIKITEXT
|
||||
|
||||
These are the tags which are supported in this release of
|
||||
B<wt2db>. All DocBook tags are also supported. If you encounter
|
||||
any valid DocBook that is not handled correctly, please file
|
||||
a bug report.
|
||||
|
||||
Foo <para>Foo</para>
|
||||
|
||||
=Title= <sect1>
|
||||
<title>Title</title>
|
||||
</sect1>
|
||||
|
||||
=Title|id= <sect1 id='id'>
|
||||
<title>Title</title>
|
||||
</sect1>
|
||||
|
||||
The id attribute, delimited with a pipe character,
|
||||
works for other sect levels as well, and many other
|
||||
tags. In some cases it is not an id value, but the
|
||||
title, depending on the semantics of the particular
|
||||
tag. Usage should be obvious in context.
|
||||
|
||||
==Title== <sect2>
|
||||
<title>Title</title>
|
||||
</sect2>
|
||||
|
||||
===Title=== <sect3>
|
||||
<title>Title</title>
|
||||
</sect3>
|
||||
|
||||
#Foo <orderedlist>
|
||||
#Bar <listitem>
|
||||
#Baz <para>Foo</para>
|
||||
/# </listitem>
|
||||
<listitem>
|
||||
<para>Bar</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Baz</para>
|
||||
</listitem>
|
||||
</orderedlist>
|
||||
|
||||
*Foo <simplelist>
|
||||
*Bar <listitem>
|
||||
*Baz <para>Foo</para>
|
||||
/* </listitem>
|
||||
<listitem>
|
||||
<para>Bar</para>
|
||||
</listitem>
|
||||
<listitem>
|
||||
<para>Baz</para>
|
||||
</listitem>
|
||||
</simplelist>
|
||||
|
||||
[[foo]] <xref linkend='foo' endterm='foo'/>
|
||||
[[link:Foo]]
|
||||
|
||||
[[file:/dev/foo]] <filename>/dev/foo</filename>
|
||||
|
||||
|
||||
[[http://foo.org]] <ulink url='http://foo.org'>
|
||||
<citetitle>http://foo.org</citetitle>
|
||||
</ulink>
|
||||
|
||||
[[http://foo.org Foo]] <ulink url='http://foo.org'>
|
||||
[[http://foo.org|Foo]] <citetitle>Foo</citetitle>
|
||||
</ulink>
|
||||
|
||||
'''Foo''' <emphasis>Foo</emphasis>
|
||||
|
||||
A few DocBook structures will not have <para> tags wrapped around them. They
|
||||
are <para> itself, <sect?> and <programlisting>. If you insert anything
|
||||
using these tags, no <para> tags will be wrapped around it or inserted into it.
|
||||
So if you want fine control over your <para> tags, insert them yourself.
|
||||
|
||||
The <screen> element will be wrapped with <para> tags, but no internal
|
||||
paragraph breaks will be generated.
|
||||
|
||||
|
||||
=head1 RESTRICTIONS
|
||||
|
||||
Currently only a single form of WikiText is supported, which is very
|
||||
similar to that used by the Wikipedia (see http://www.wikipedia.com).
|
||||
A future release will be configurable to support additional styles of
|
||||
WikiText.
|
||||
|
||||
|
||||
=head1 BUGS
|
||||
|
||||
Bugs are tracked in the SourceForge project page at:
|
||||
|
@ -46,25 +165,13 @@ http://www.sourceforge.net/projects/linuxdoc
|
|||
If you report a bug in B<wt2db>, specify wt2db as the category so it will
|
||||
be routed to the appropriate person.
|
||||
|
||||
=head1 RESTRICTIONS
|
||||
|
||||
Currently only a single form of WikiText is supported, which is very
|
||||
similar to that used by the Wikipedia (see http://www.wikipedia.com).
|
||||
A future release will be configurable to support additional styles of
|
||||
WikiText.
|
||||
|
||||
=head1 NOTES
|
||||
|
||||
B<wt2db> was developed as a project of the Linux Documentation Project
|
||||
to create an easier way of writing DocBook documentation. While it is
|
||||
useful on its own, it is part of Lampadas, the LDP's document
|
||||
production system.
|
||||
|
||||
=head1 SEE ALSO
|
||||
|
||||
See the home page of the Linux Documentation Project,
|
||||
http://www.tldp.org for updates and more information.
|
||||
|
||||
|
||||
=head1 AUTHOR
|
||||
|
||||
This man page was written by David C. Merrill <david@lupercalia.net>.
|
||||
|
|
|
@ -19,36 +19,42 @@ use Exporter;
|
|||
Reset
|
||||
);
|
||||
|
||||
# These keep track of which constructs we're in the middle of
|
||||
#
|
||||
$level1 = 0;
|
||||
$level2 = 0;
|
||||
$level3 = 0;
|
||||
$orderedlist = 0;
|
||||
$listitem = 0;
|
||||
$itemizedlist = 0;
|
||||
$para = 0;
|
||||
$qandaset = 0;
|
||||
$qandaentry = 0;
|
||||
$answer = 0;
|
||||
&Reset;
|
||||
|
||||
# These are passed in by the caller
|
||||
#
|
||||
$txtfile = '';
|
||||
$dbfile = '';
|
||||
$verbose = 0;
|
||||
# Call this before rerunning ProcessLine to clear state.
|
||||
#
|
||||
# Put every package-level variable used by the converter back to its
# starting value.  Takes no arguments.
sub Reset {

    # Flags that record which constructs are currently open.
    $level1 = $level2 = $level3 = 0;
    $orderedlist = $listitem = $itemizedlist = 0;
    $para = 0;
    $qandaset = $qandaentry = $answer = 0;

    # Values that are passed in by the caller.
    $txtfile = $dbfile = '';
    $verbose = $doctype = $nonet = 0;

    # Working state for the line currently being processed.
    $line = $id = $title = $buf = '';
    $linenumber = 0;

    # Bookkeeping for the "nopara" state.
    $noparatag = 0;
    $noparadepth = 0;
    $noparaline = 0;
}
|
||||
|
||||
|
||||
# -----------------------------------------------------------
|
||||
|
@ -62,7 +68,7 @@ sub new {
|
|||
}
|
||||
|
||||
sub ProcessFile {
|
||||
($self, $txtfile, $dbfile, $verbose, $doctype) = @_;
|
||||
($self, $txtfile, $dbfile, $verbose, $doctype, $nonet, $encoding) = @_;
|
||||
|
||||
# Read from STDIN if no input file given
|
||||
#
|
||||
|
@ -87,9 +93,10 @@ sub ProcessFile {
|
|||
|
||||
# wrap article if requested
|
||||
#
|
||||
$encoding = 'ISO-8859-1' unless ($encoding);
|
||||
if ($doctype eq 'XML') {
|
||||
print "Adding XML DOCTYPE and article tags." if ($verbose);
|
||||
$buf = '<?xml version="1.0" standalone="no"?>' . "\n";
|
||||
$buf = '<?xml version="1.0" encoding="' . $encoding . '" standalone="no"?>' . "\n";
|
||||
$buf .= '<!DOCTYPE article PUBLIC "-//OASIS//DTD DocBook XML V4.1.2//EN"' . "\n";
|
||||
$buf .= ' "http://docbook.org/xml/4.1.2/docbookx.dtd"';
|
||||
$buf .= "\[\]\>\n";
|
||||
|
@ -153,8 +160,8 @@ sub ProcessLine {
|
|||
|
||||
# inline docbook
|
||||
#
|
||||
# ulink
|
||||
#
|
||||
# parse all links, internal and external
|
||||
#
|
||||
while ($line =~ /\[\[/) {
|
||||
unless ($line =~ /\]\]/) {
|
||||
$buf .= "ERROR unterminated '[[' tag on line $linenumber.\n";
|
||||
|
@ -174,15 +181,22 @@ sub ProcessLine {
|
|||
$linkname = $link;
|
||||
}
|
||||
|
||||
# kill quotes, they mess us up
|
||||
# kill quotes inside links, they mess us up because
|
||||
# we have to wrap this string with quotes.
|
||||
# perhaps it should be encoding the entire URL?
|
||||
#
|
||||
$link =~ s/'/%27/g;
|
||||
|
||||
# namespaces are handled differently
|
||||
#
|
||||
print "$link\n" if ($verbose);
|
||||
if ($link =~ /^http:/) {
|
||||
|
||||
if ($link =~ /^http:\/\//) {
|
||||
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
|
||||
} elsif ($link =~ /^link:/) {
|
||||
$link =~ s/^link://;
|
||||
$linkname =~ s/^link://;
|
||||
$line =~ s/\[\[.*?\]\]/<xref linkend='$link' endterm='$link'\>\<\/xref\>/;
|
||||
} elsif ($link =~ /^mailto:/) {
|
||||
$linkname =~ s/^mailto://;
|
||||
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
|
||||
|
@ -194,28 +208,41 @@ sub ProcessLine {
|
|||
} elsif ($link =~ /^ldp:/) {
|
||||
$linkname =~ s/^ldp://;
|
||||
$link =~ s/^ldp://;
|
||||
$tempfile = "/tmp/wt2db-" . $rand;
|
||||
$cmd = "wget -q http://db.linuxdoc.org/cgi-pub/ldp-xml.pl?name=$link -O $tempfile";
|
||||
system("$cmd");
|
||||
open(URL, "$tempfile") || die "wt2db: cannot open temporary file ($!)\n\n";
|
||||
$link = "";
|
||||
while ($url_line = <URL>) {
|
||||
$url_line =~ s/\n//;
|
||||
if ($url_line =~ /identifier/) {
|
||||
$link .= $url_line;
|
||||
if ($nonet) {
|
||||
$line =~ s/\[\[.*?\]\]/<citetitle>$link<\/citetitle>/;
|
||||
} else {
|
||||
$tempfile = "/tmp/wt2db-" . $rand;
|
||||
$cmd = "wget -q http://db.linuxdoc.org/cgi-pub/ldp-xml.pl?name=$link -O $tempfile";
|
||||
print "$cmd\n" if ($verbose > 1);
|
||||
$return = system("$cmd");
|
||||
unless ($return) {
|
||||
open(URL, "$tempfile") || die "wt2db: cannot open temporary file ($!)\n\n";
|
||||
$link = '';
|
||||
while ($url_line = <URL>) {
|
||||
$url_line =~ s/\n//;
|
||||
if ($url_line =~ /identifier/) {
|
||||
$link .= $url_line;
|
||||
}
|
||||
}
|
||||
close(URL);
|
||||
unlink $tempfile;
|
||||
}
|
||||
$link =~ s/^.*?<identifier>//;
|
||||
$link =~ s/<\/identifier>.*?$//;
|
||||
if ($link eq '') {
|
||||
$linkname = "ERROR: LDP namespace resolution failure on $linkname";
|
||||
}
|
||||
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
|
||||
}
|
||||
close(URL);
|
||||
unlink $tempfile;
|
||||
$link =~ s/^.*?<identifier>//;
|
||||
$link =~ s/<\/identifier>.*?$//;
|
||||
if ($link eq '') {
|
||||
$linkname = "ERROR: LDP namespace resolution failure on $linkname";
|
||||
}
|
||||
$line =~ s/\[\[.*?\]\]/<ulink url='$link'><citetitle>$linkname<\/citetitle><\/ulink>/;
|
||||
} elsif ($link =~ /^file:/) {
|
||||
$linkname =~ s/^file://;
|
||||
$line =~ s/\[\[.*?\]\]/<filename>$linkname<\/filename>/;
|
||||
} elsif ($link =~ /^dir:/) {
|
||||
|
||||
# FIXME: need to check attribute on filename element
|
||||
#
|
||||
$linkname =~ s/^dir://;
|
||||
$line =~ s/\[\[.*?\]\]/<filename type='directory'>$linkname<\/filename>/;
|
||||
} else {
|
||||
$line =~ s/\[\[.*?\]\]/<filename>$linkname<\/filename>/;
|
||||
}
|
||||
|
@ -238,7 +265,8 @@ sub ProcessLine {
|
|||
# <programlisting>
|
||||
# <literallayout>
|
||||
|
||||
# forget about nopara
|
||||
# forget about being in nopara state if we're no longer in one
|
||||
#
|
||||
if ($noparadepth == 0) {
|
||||
$noparatag = "";
|
||||
}
|
||||
|
@ -248,6 +276,8 @@ sub ProcessLine {
|
|||
if ((($line =~ /^<para>/) or
|
||||
($line =~ /^<sect/) or
|
||||
($line =~ /^<screen>/) or
|
||||
($line =~ /^<screen>/) or
|
||||
($line =~ /^<blockquote>/) or
|
||||
($line =~ /^<literallayout>/) or
|
||||
($line =~ /^<articleinfo>/) or
|
||||
($line =~ /^<programlisting>/)) and
|
||||
|
@ -257,9 +287,12 @@ sub ProcessLine {
|
|||
$noparatag =~ s/^.*?<//;
|
||||
$noparatag =~ s/>.*?$//;
|
||||
$noparaline = $linenumber;
|
||||
|
||||
# screen sections don't embed para tags, but are wrapped in them
|
||||
#
|
||||
if ($line =~ /^<screen>/) {
|
||||
unless ($para) {
|
||||
$line = "<para>" . $line;
|
||||
$buf .= "<para>";
|
||||
$para = 1;
|
||||
}
|
||||
}
|
||||
|
@ -289,9 +322,29 @@ sub ProcessLine {
|
|||
}
|
||||
|
||||
# recover original line -- no whitespace modifiers
|
||||
# allow nonencoded text in unparsed lines, when in a literal block
|
||||
#
|
||||
$line = $originalline;
|
||||
chomp($line);
|
||||
if ($line =~ /^<$noparatag>/ ) {
|
||||
$starttag = "<$noparatag>";
|
||||
} else {
|
||||
$starttag = '';
|
||||
}
|
||||
if ($line =~ /<\/$noparatag>/ ) {
|
||||
$endtag = "<\/$noparatag>";
|
||||
} else {
|
||||
$endtag = '';
|
||||
}
|
||||
|
||||
$line =~ s/<$noparatag>//;
|
||||
$line =~ s/<\/$noparatag>//;
|
||||
if (($noparatag eq 'screen') or
|
||||
($noparatag eq 'literallayout') or
|
||||
($noparatag eq 'programlisting')) {
|
||||
encode_entities($line);
|
||||
}
|
||||
$line = "$starttag$line$endtag";
|
||||
|
||||
# sect3
|
||||
#
|
||||
|
@ -376,9 +429,9 @@ sub ProcessLine {
|
|||
&trimline;
|
||||
&splittitle;
|
||||
if ($id eq '') {
|
||||
$line = "<question><para>" . $title . "</para></question>";
|
||||
$line = "<question><para>$title</para></question>";
|
||||
} else {
|
||||
$line = "<question id='$id'><para>" . $title . "</para></question>";
|
||||
$line = "<question id='$id'><para>$title</para></question>";
|
||||
}
|
||||
unless ($qandaentry) {
|
||||
$line = "<qandaentry>\n" . $line;
|
||||
|
@ -430,42 +483,6 @@ sub Buffer {
|
|||
return $buf;
|
||||
}
|
||||
|
||||
# Basically a cut-and-paste of the original declarations,
|
||||
# to make sure all variables are completely cleared.
|
||||
#
|
||||
# Call this before rerunning ProcessLine to clear state.
|
||||
#
|
||||
# Clear all converter state held in package-level variables.
# Takes no arguments.
sub Reset {

    # Open-construct flags.
    foreach my $flag (\$level1, \$level2, \$level3,
                      \$orderedlist, \$listitem, \$itemizedlist,
                      \$para, \$qandaset, \$qandaentry, \$answer) {
        $$flag = 0;
    }

    # These are passed in by the caller.
    $txtfile = '';
    $dbfile = '';
    $verbose = 0;

    # These maintain state while lines are processed.
    foreach my $text (\$line, \$id, \$title, \$buf) {
        $$text = '';
    }
    $linenumber = 0;

    # "nopara" bookkeeping.
    $noparatag = 0;
    $noparadepth = 0;
    $noparaline = 0;
}
|
||||
|
||||
sub close1 {
|
||||
&close2;
|
||||
if ($level1) {
|
||||
|
@ -570,15 +587,78 @@ sub splittitle {
|
|||
$line =~ s/^=+//;
|
||||
$line =~ s/=+$//;
|
||||
$title = $line;
|
||||
$id = "";
|
||||
if ($line =~ /\|/) {
|
||||
$title =~ s/\|.+//;
|
||||
$id = $line;
|
||||
$id =~ s/^.+\|//;
|
||||
} else {
|
||||
$id = &anchorfix($title);
|
||||
}
|
||||
$title =~ s/\s+$//;
|
||||
$title =~ s/^\s+//;
|
||||
$id =~ s/\s+$//;
|
||||
$id =~ s/^\s+//;
|
||||
}
|
||||
|
||||
# Derive an id-style anchor from a title string.
#
# Takes the title as its only argument and returns a new string; the
# argument itself is not modified.  The text is trimmed and lowercased,
# entities are decoded, accented-letter entities are reduced to their base
# letter, punctuation and digits are spelled out as hyphen-delimited words,
# and whitespace becomes hyphens.  Hyphens at either end are removed and
# runs of hyphens are collapsed to a single hyphen.
sub anchorfix {
    my $anchor = $_[0];

    $anchor = lc(&trim($anchor));
    $anchor = decode_entities($anchor);

    # Spell out literal hyphens and ampersands and drop semicolons before
    # re-encoding, so any '&' or ';' seen after this point was produced by
    # encode_entities().
    $anchor =~ s/-/-dash-/g;
    $anchor =~ s/&/-and-/g;
    $anchor =~ s/;//g;
    $anchor = encode_entities($anchor);

    # Reduce accented-letter entities (such as "&eacute") to the bare
    # letter.  The capture is referenced as $1; \1 is only meant for use
    # inside the pattern itself.
    foreach my $accent (qw(grave acute circ uml tilde cedil)) {
        $anchor =~ s/&(\w)$accent/$1/g;
    }

    # Whatever is left of the entities gets the same treatment as before.
    $anchor =~ s/&/-and-/g;
    $anchor =~ s/;//g;

    # Spell out or drop punctuation.
    $anchor =~ s/\//-slash-/g;
    $anchor =~ s/\\/-bslash-/g;
    $anchor =~ s/\s+/-/g;
    $anchor =~ s/'//g;
    $anchor =~ s/`//g;
    $anchor =~ s/,/-comma-/g;
    $anchor =~ s/\./-dot-/g;
    $anchor =~ s/!/-bang-/g;
    $anchor =~ s/\?/-question-/g;
    $anchor =~ s/\+/-plus-/g;
    $anchor =~ s/\*/-x-/g;
    $anchor =~ s/\(/-op-/g;
    $anchor =~ s/\)/-cp-/g;
    $anchor =~ s/\@/-at-/g;
    $anchor =~ s/dcm_at/-at-/gi;
    $anchor =~ s/\^/-hat-/g;
    $anchor =~ s/=/-eq-/g;
    $anchor =~ s/\$/S/g;        # every '$', not just the first one
    $anchor =~ s/~/-tilde-/g;

    # Spell out ASCII digits.
    my @digit_word = qw(zero one two three four five six seven eight nine);
    $anchor =~ s/([0-9])/-$digit_word[$1]-/g;

    $anchor =~ s/\|/-pipe-/g;
    $anchor =~ s/\[/-lsqb-/g;
    $anchor =~ s/\]/-rsqb-/g;

    # Get rid of initial, trailing and repeated hyphens.  Collapsing whole
    # runs (rather than pairs) makes sure no "--" survives.
    $anchor =~ s/^-+//;
    $anchor =~ s/-+$//;
    $anchor =~ s/-{2,}/-/g;

    return &trim($anchor);
}
|
||||
|
||||
# Return a copy of the first argument with leading and trailing
# whitespace removed.  The caller's value is left untouched.
sub trim {
    my ($text) = @_;

    for ($text) {
        s/^\s+//g;
        s/\s+$//g;
    }
    return $text;
}
|
||||
1;
|
||||
|
|
|
@ -5,7 +5,26 @@ $WT = new Wt2Db;
|
|||
|
||||
$buffer = "foo bar
|
||||
|
||||
baz
|
||||
=Section|section=
|
||||
|
||||
paragraph
|
||||
|
||||
==Subsection|subsection==
|
||||
|
||||
paragraph
|
||||
|
||||
|
||||
=Namespaces=
|
||||
|
||||
==MailTo==
|
||||
|
||||
[[mailto:david@lupercalia.net]]
|
||||
[[mailto:david@lupercalia.net|David Merrill]]
|
||||
|
||||
==HTTP==
|
||||
|
||||
[[http://www.tldp.org]]
|
||||
[[http://www.tldp.org|The Linux Documentation Project]]
|
||||
|
||||
";
|
||||
|
||||
|
|
|
@ -10,6 +10,8 @@ $WT2DB = new Wt2Db;
|
|||
my $txtfile = '';
|
||||
my $dbfile = '';
|
||||
my $doctype = '';
|
||||
my $nonet = 0;
|
||||
my $encoding = 'ISO-8859-1';
|
||||
my $verbose = 0;
|
||||
my $error = 0;
|
||||
|
||||
|
@ -22,6 +24,13 @@ while (1) {
|
|||
} elsif($ARGV[0] eq "-x" or $ARGV[0] eq "--xml") {
|
||||
$doctype = 'XML';
|
||||
shift(@ARGV);
|
||||
} elsif($ARGV[0] eq "-e" or $ARGV[0] eq "--encoding") {
|
||||
shift(@ARGV);
|
||||
$encoding = $ARGV[0];
|
||||
shift(@ARGV);
|
||||
} elsif($ARGV[0] eq "-n" or $ARGV[0] eq "--nonet") {
|
||||
$nonet = 1;
|
||||
shift(@ARGV);
|
||||
} elsif($ARGV[0] eq "-o" or $ARGV[0] eq "--output-to") {
|
||||
shift(@ARGV);
|
||||
$dbfile = $ARGV[0];
|
||||
|
@ -44,7 +53,7 @@ while (1) {
|
|||
}
|
||||
}
|
||||
|
||||
$WT2DB->ProcessFile($txtfile, $dbfile, $verbose, $doctype);
|
||||
$WT2DB->ProcessFile($txtfile, $dbfile, $verbose, $doctype, $nonet, $encoding);
|
||||
|
||||
sub version {
|
||||
print "wt2db version $VERSION\n";
|
||||
|
@ -53,7 +62,7 @@ sub version {
|
|||
print "Converts a WikiText file into DocBook XML/SGML.\n";
|
||||
print "\n";
|
||||
print "This is free software; see the source for copying conditions. There is no\n";
|
||||
print "warranty; not even for merchantability or fitness for a particular purpose.\n";
|
||||
print "warranty; not even for merchantability or fitness for a particular purpose.\n\n";
|
||||
}
|
||||
|
||||
sub usage {
|
||||
|
@ -64,9 +73,11 @@ sub usage {
|
|||
print "Options:\n";
|
||||
print "-s, --SGML add XML DOCTYPE and article tags.\n";
|
||||
print "-x, --XML add SGML DOCTYPE and article tags.\n";
|
||||
print "-e, --encoding specify character encoding.\n";
|
||||
print "-n, --nonet do not look up documents on the net.\n";
|
||||
print "-o, --output-to write to the specified file.\n";
|
||||
print "-v, --verbose show diagnostic output.\n";
|
||||
print "-V, --version show program version.\n";
|
||||
print "-V, --verbose show diagnostic output.\n";
|
||||
print "-v, --version show program version.\n";
|
||||
print "-h, --help show this usage message.\n";
|
||||
exit($error);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue