LDP/LDP/builder/ldp_print/ldp_print

#!/usr/bin/perl -w
#
# usage: ldp_print [options] <single_file.html>
#
# Creates a PDF variant of a single-file HTML representation of a
# DocBook SGML (or XML) instance. This simple wrapper assumes that
# the file was created using {open}jade in a manner similar to:
#
#	jade -t sgml -i html -V nochunks -d $style $fname > $fname.html
#
# Give this script the filename as an argument. It will then parse
# the file into 'title.html' and 'body.html' and send each to
# htmldoc (as the corresponding title page and body of the document).
#
# OPTIONS:
#  --postscript        Generate Postscript (by default, doesn't)
#  --nopdf             Don't generate PDF (by default, generates PDF)
#  --size X            Set output page size (default "universal")
#                      X can be "A4", "Letter", or "WIDTHxLENGTHunits".
#                      where units can be in, mm, or cm.
#
# CAVEATS:
#
# Assumes perl is in /usr/bin; adjust if necessary
#
# You may need to specify where the htmldoc executable resides.
# The script assumes it's within your $PATH.
#
# If you want Postscript as an output variant, uncomment the
# appropriate lines (see below).
#
# Relies on output from a DocBook instance created via DSSSL/{open}jade!
#
# Cleans up (removes) the intermediate files it creates (but not the
# PDF or Postscript files, obviously!)
#
# Works silently; PDF (PostScript) will be created in the same directory
# as was specified for the input (single-file HTML) file.
#
# Provided without warranty or support!
#
#	<gferg (at) metalab.unc.edu> / Ferg
#       <dan.scott (at) acm.org> / Dan Scott
#       <dwheeler (at) dwheeler.com) / David A. Wheeler
#
# Licensed under the GNU GPL version 2.
#
# ChangeLog:
#     16Oct2000 - 0.1   - initial entry <gferg (at) sgi.com>
#     03Apr2001 - 0.2   - fix for <preface>
#     05Jul2001 - 0.3   - fix for <tt> and -f
#     12Oct2001 - 0.4   - fix for sections; loop thru both files (body/title)
#     27Nov2001 - 0.5   - fixed bug in determining where doc-index lies
#     18Jan2002 - 0.5.1 - entity fix (822*)
#     02Apr2002 - 0.6   - misc fixes (bibliography/appendix, etc).
#     04Apr2002 - 0.7   - fix for newer DSSSL
#     27May2002 - 0.8   - Merged library and driver, greatly simplifying
#                         installation, and added options to driver.
#     09Aug2002 - 0.9   - Some minor clean-up
#

use strict;

sub fix_print_html {

   my($in,$out,$ttl) = @_;

   open(IN_FILE, "< $in") || do {
        print "fix_print_html: cannot open $in: $!\n";
        return 0;
   };

   my($buf, $ttl_buf) = '';
   my($indx) = -1;
   my($is_article) = 1;
   while(<IN_FILE>) {

         if( $indx == 1 ) {

             # ignore everything until we see the chapter or sect
             #
             if( $_ =~ /CLASS="CHAP/i || $_ =~ /CLASS="PREF/i
                 ||
                 $_ =~ /CLASS="SECT/i )  {

                 $buf .= $_;
                 $indx++;

             } else {
                 next;
             }

         } elsif( $indx == 0 ) {

             # write out the title page file
             #
             if( $_ =~ /CLASS="TOC"/ ) {

                 $ttl_buf .= "></DIV>\n</BODY>\n</HTML>\n";
                 $ttl_buf =~ s/<\/H1\n/<\/H1\n><P><BR><BR\n/ms;
                 $ttl_buf =~ s/<HR><\/DIV\n><HR>/<HR><\/DIV\n>/ms;
                 &fix_html(\$ttl_buf, 1);

                 open(TOC_FILE, "> $ttl") || do {
                      print "fix_print_html: cannot open $ttl: $!\n";
                      close(IN_FILE);
                      return 0;
                 };
                 print TOC_FILE $ttl_buf;
                 close(TOC_FILE);
                 $ttl_buf = '';
                 $indx++;

             } else {
                $ttl_buf .= $_;
             }

         } elsif( $indx < 0 ) {

             if( $_ =~ /CLASS="BOOK"/i ) {
                 $is_article = 0;
             }

             # up to this point, both buffers get the line
             #
             if( $_ =~ /CLASS="TITLEPAGE"/ ) {

                 $ttl_buf .= $_ . ">\n<P>\n<BR><BR><BR><BR>\n<\/P\n";
                 $indx++;

             } else {
                 $buf .= $_;
                 $ttl_buf .= $_;
             }

         } else {

             $buf .= $_;
         }
   }
   close(IN_FILE);


   # fix body file
   #
   open(OUT_FILE, "> $out") || do {
        print "fix_print_html: cannot open $out: $!\n";
        return 0;
   };

   &fix_html(\$buf, $is_article);

   print OUT_FILE $buf;
   close(OUT_FILE);


   return 1;
}


sub fix_html {

   my($buf, $is_article) = @_;
   my($indx) = -1;


   # make corrections and write out the file
   #

   $$buf =~ s/(\n><LI\n)><P\n(.*?)<\/P\n>/$1$2\n/gms;
   $$buf =~ s/(\n><LI\n><DIV\nCLASS="FORMALPARA"\n)><P\n(.*?)<\/P\n>/$1$2\n/gms;
   $$buf =~ s/(\n><LI\nSTYLE="[^\"]+"\n)><P\n(.*?)<\/P\n>/$1$2\n/gms;
   if( $is_article == 0 ) {
       $$buf =~
         s/(\nCLASS="SECT[TION\d]+"\n>)<H1\n(.*?)<\/H1/$1<H0\n$2<\/H0/gims;
       $$buf =~
         s/(\nCLASS="SECT[TION\d]+"\n><HR>)<H1\n(.*?)<\/H1/$1<H0\n$2<\/H0/gims;
   }
   $$buf =~ s/<H1(\nCLASS="INDEXDIV"\n)(.*?)<\/H1/<H2$1$2<\/H2/gims;
   if( ($indx = rindex($$buf, "<H1\n><A\nNAME=\"DOC-INDEX\"")) > -1 ) {
       $$buf = substr($$buf, 0, $indx);
       $$buf .= "\n<\/BODY>\n<\/HTML>\n\n";
   } elsif( ($indx = rindex($$buf, "<H1\n><A\nNAME=\"doc-index\"")) > -1 ) {
       $$buf = substr($$buf, 0, $indx);
       $$buf .= "\n<\/BODY>\n<\/HTML>\n\n";
   }

   $$buf =~ s/\&\#13;//g;
   $$buf =~ s/\&\#60;/\&lt;/g;
   $$buf =~ s/\&\#62;/\&gt;/g;
   $$buf =~ s/\&\#8211;/\-/g;
   $$buf =~ s/\&\#8220;/\"/g;
   $$buf =~ s/\&\#8221;/\"/g;
   $$buf =~ s/WIDTH=\"\d\"//g;
   $$buf =~ s/><[\/]*TBODY//g;
   $$buf =~ s/><[\/]*THEAD//g;
   $$buf =~ s/TYPE=\"1\"\n//gim;

   $$buf =~ s/<P\nCLASS="LITERALLAYOUT"(.*?)<\/P/<P CLASS="LITERALLAYOUT"><FONT FACE=\"courier\"$1<\/FONT><\/P/gms;

   my($cnt, $j) = 0;

   if( $$buf !~ /<H1/ ) {

       # for newer docbook styles, set h2 to h1, etc.
       #
       for($cnt=2; $cnt < 7; $cnt++ ) {
           $j = $cnt - 1;
           $$buf =~ s/<H${cnt}/<H${j}/g;
           $$buf =~ s/<\/H${cnt}/<\/H${j}/g;
       }

   } elsif( $is_article == 0 ) {

       # decrement the headers by 1 and then re-set the
       # chapter level only to H1...
       #
       for($cnt=5; $cnt >= 0; $cnt--) {
           $j = $cnt + 1;
           if( $cnt == 0 ) {
               $j = 2;
           }
           $$buf =~ s/<H${cnt}/<H${j}/g;
           $$buf =~ s/<\/H${cnt}/<\/H${j}/g;
       }

       my(@l) = split(/\n/, $$buf);
       for( $cnt=0; $cnt < (@l + 0); $cnt++ ) {

            if( $j == 1 ) {
                if( $l[$cnt] =~ /<DIV/ ) {
                    $j = 0;
                    next;
                }
                $l[$cnt] =~ s/<H2/<H1/g;
                $l[$cnt] =~ s/<\/H2/<\/H1/g;
            }

            if( $l[$cnt] =~ /^CLASS=\"CHAP/i
                ||
                $l[$cnt] =~ /^NAME=\"BIBL/i
                ||
                $l[$cnt] =~ /^CLASS=\"APPENDIX/i
                ||
                $l[$cnt] =~ /^CLASS=\"GLOSSARY/i
                ||
                $l[$cnt] =~ /^CLASS=\"PREF/i
                ||
                $l[$cnt] =~ /^CLASS=\"TITLE/i ) {
                $j = 1;
            }
       }

       $$buf = join("\n", @l);

   }
   $$buf =~ s/><DIV\nCLASS="\w+"\n//gms;
   $$buf =~ s/><\/DIV\n//gms;

   $buf =~ s/<SPAN\n[^>]*?>//gms;
   $buf =~ s/<\/SPAN\n>//gms;

   $$buf =~ s/(><LI\n)><P\n(.*?)<\/P\n>(<\/LI\n)/$1$2$3/gms;

   return;
}


########### MAIN DRIVER ##############

# Default values for options:
my($generate_pdf) = 1;
my($generate_ps) = 0;
my($pagesize) = "universal";
my($pth) = '';


# Process options.
my($arg);
while (($#ARGV >= 0) && ($ARGV[0] =~ m/^-/)) {
  $arg = shift;
  if ($arg eq "--") {last;}
  elsif ($arg eq "--postscript") {$generate_ps = 1;}
  elsif ($arg eq "--nopostscript") {$generate_ps = 0;}
  elsif ($arg eq "--pdf") {$generate_pdf = 1;}
  elsif ($arg eq "--nopdf") {$generate_pdf = 0;}
  elsif ($arg eq "--size") {$pagesize = shift;}
  elsif ($arg eq "--toolroot") {$pth = shift; $pth .= "/";}
  else {die "\nldp_print: unknown option $arg\n";}
}

if( $ARGV[0] eq '' || !(-r $ARGV[0]) ) {
    die "\nusage: ldp_print [options] <single_file.html>\n\n";
}

# Compute htmldoc options.
my($htmldoc_options) = "--size ${pagesize} --firstpage p1 --footer c.1";


# Now get filename and check it.  Don't allow whitespace, since a
# filename with whitespace will cause trouble.
# NOTE: If the filename can be controlled by an untrusted user,
# the filename (and options!) need to be filtered further to forbid
# metacharacters, control characters, etc. as well.

my($filename) = $ARGV[0];

if ($filename =~ m/ /) {
  die "\nldp_print: filenames cannot contain spaces.\n";
}

if ($filename =~ m/[\t\n]/) {
  die "\nldp_print: filenames cannot contain whitespace.\n";
}

my($fname_wo_ext) = $filename;
$fname_wo_ext =~ s/\.[\w]+$//;


# create new files from single HTML file to use for print
#
&fix_print_html($filename, 'body.html', 'title.html');


if ($generate_pdf) {
   my($pdf_cmd) = "${pth}htmldoc ${htmldoc_options} -t pdf -f ${fname_wo_ext}.pdf " .
                  "--titlefile title.html body.html ";
   system($pdf_cmd);
   die "\nldp_print: could not create ${fname_wo_ext}.pdf ($!)\n" if ($?);
};

if ($generate_ps) {
   my($ps_cmd) = "${pth}htmldoc ${htmldoc_options} -t ps -f ${fname_wo_ext}.ps " .
                 "--titlefile title.html body.html";
   system($ps_cmd);
   die "\nldp_print: could not create ${fname_wo_ext}.ps ($!)\n" if ($?);
};

# cleanup
#
system("rm -f body.html title.html");

exit(0);