old-www/LDP/LGNET/bin/bibelot

1592 lines
40 KiB
Perl
Executable File

#!/usr/bin/perl
# bibelot.pl
$|++;
my $VERSION = "0.94";
my $URL="http://sourceforge.net/projects/bibelot";
# Format ASCII text, esp. Project Gutenberg (http://www.promo.net/pg) etexts,
# into a PalmDoc PDB file.
#
#
#
# Copyright (C) 2000,2001 John Fulmer <jfulmer@appin.org>
#
# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License
# as published by the Free Software Foundation; either version 2
# of the License, or (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# A full copy of the GNU Public License may be found at:
#
# http://www.gnu.org/copyleft/gpl.html
#
#
#
# This program was written using documentation and structures borrowed
# from Paul J. Lucas' 'txt2pdbdoc' (http://www.best.com/~pjl/software.html)
# and documentation from the Pyrite website
# (http://www.pyrite.org/etext/format.html). Also, 'pdbdump' was invaluable in
# troubleshooting format problems.
#
# Some of the header structures were borrowed, but the programming is my fault.
# If it breaks, you keep both pieces. But let me know. I'm especially interested
# in formatting problems, and trying to track down all the different cases I
# can.
#
# Oh, and what is a 'bibelot'?
# See http://www.dictionary.com/cgi-bin/dict.pl?term=bibelot or your nearest
# dictionary.
#
#
# jf
#
#
# Version History:
#
# .01 Initial (ugly) version
#
# .02 -Partial re-write, made more modular
# -Added option to turn compression off (-c)
# -Added verbose (-v)
# -Added option to set document name (-l)
# -Added option to disable document formatting (-f)
# -Added usage message (-h)
# -Improved compression slightly
#
#
# .03 -Added 'Project Gutenberg' (-g) mode, which sets
# the beginning of actual text as a bookmark.
# -Now adds two NULL characters between the PDB record
# headers and the first record (Record 0).
# -Initial bookmark support.
#
# .04 10/4/00 -Various cleanups
# -Added filename sanity checks, and read/write checking
# -Improved text formatting efficiency.
# -Fixed bug that didn't collapse whitespace correctly
# -Fixed off-by-one bug if forcing line lengths.
# -Fixed incorrect use of 'pack' that required
# 'no strict'.
# -Gutenberg mode now sets chapter bookmarks, if able.
# -Added dynamic bookmark support (-b).
# Place text to bookmark
# in between angle brackets (<>). The script will
# search for the first instance of the text, and
# create a bookmark using the text as the bookmark
# name. One bookmark per line, please.
#
# For instance, let's see you wanted a bookmark
# at text that says "Fit The First". At the bottom
# of the origional text file, on a blank line,
# place a "<Fit The First>". The script will
# Bookmark the first instance of "Fit The First"
# in the document, and erase the "<Fit The First>"
# at the bottom of the file. The text is case
# sensitive.
#
# -Adjusted 'smart' format function
#
# .5 10/6/00 -Work around (bug in Perl5?) where the :^ascii:
# regex class was matching "[", and stripping it
# from text.
# -changed version number to match Freshmeat announcement
# (whoops)
# -removed spurious 's' option from getopts()
#
# .6 -re-added option to turn off 'smart' format mode. (-s)
# (Found out what that spurious 's' was for)
# -added code to rejoin words split by hyphens at eol.
#
# .7 12/21/00 -automagically grab title (if not specified with -t)
# from file in 'Project Gutenberg' mode. Often
# (but not always) the title is specified on the
# first line of a text from Project Gutenberg.
# Grab it, truncate (if necessary),plunk it into
# the DOC title field.
# -added -d option to turn off hypen correction
# -modified help option and added to opening comments.
# -verbose now echo's detected title.
# -a few code cleanups.
#
# .8 1/4/00 -match more title entries from Project Gutenberg
# -now hosted at Sourceforge, and development versions
# in CVS.
# -more (minor)tweaks to the smart formatting, to help
# with badly formatted text with short lines early on.
# -fixed bug that didn't strip out non-ascii chars.
# Yes, Virginia, octal DOESN'T stop at 255.....
#
# .9 1/9/00 -Code cleanups.
# -Strips control characters from title text.
# -More sanity checks on output filename. If infile or
# outfile are NULL, treat them as stdin/stdout.
# -You can now use '-' to specify stdin or stdout
# -Better compression. Thanks to Antaeus Feldspar, the
# compression algorithm is more efficient. It also makes
# bibelot a bit slower (6 seconds vs 4.5 seconds on an
# average book file on my system). The efficiencies only
# add up to %1-2 better compression, but that's 4-10k for
# many books, which can add up.
# -Compression error debugger, also courtesy Antaeus
# Feldspar. Turn on by the $error_check global variable.
# -New switch, 'o', to seed smart formatting offset. The
# smaller the number, the better (maybe) the formatting,
# but more badly chopped lines. Default is 20.
# -Handle another different title for PG mode.
#
# .91 -Minor code change, Palm desktop for Windows demands
# a timestamp in the PDB header. I faked up one
# (0x11111111) for now. In the process, I also learned
# that ActiveState Perl build 623 doesn't work with
# with bibelot, something to do with a difference in
# string handling. ActiveState's problem, if you ask me.
# I would be interested if bibelot works on anything else
# besides Linux, though... I DO know that nsperl 5.004
# for dos works fine.
#
# .92 2/26/01 -More minor changes for DOS and Windows versions of perl
# now it actually works. Uses binmode() for output if
# DOS/Win32 platform. (Are you happy now, Kyle?!?)
# -Added check for common DOS and Win32 versions of
# perl, currently only looks for ActiveState's Perl
# for Win32, others probably work.
# -Disabled filename sanity checks for Win32 platforms.
# -Accidentally left the compression error checking on.
# Should be MUCH faster now.
#
#
# .93 4/02/01 -Condensed the title match regex to one line.
# -Fixed problem with spaces in title with '-t'
#
# .94 5/18/01 -Added 8-bit support. This removes the check for high
# byte control characters, so don't blame me if your
# Palm blows up. :)
#
#
#
# Pragma goes HERE
#
# 'Use strict' so that we have to declare variables. Not a bad practice.
#
use strict;
#
# Global Variables go HERE
#
my $total_len = 0; # Total length of uncompressed text
my $buff = ""; # Temporary buffer space
my $header = ""; # PDB headers to preappend
my $is_compr = 1; # '0' = no, '1' = yes
my $is_verbose = 0; # If set, output debug info.
my $dont_format = 0; # Don't format the text
my $infile = "-"; # file to read, or STDIN (-)
my $outfile = ">-"; # file to write to, or STDOUT (>-)
my $line_len = 0; # If set, force linefeeds at $line_len
my $pdb_name = "PalmDoc Document"; # Name of PalmDoc file
my $col_position = 0; # Global column position for format
my @block_size; # Compressed size of all text blocks
my $avg_line_num = 0; # The next three are for use in
my $avg = 0; # format_text()'s formatting logic.
my $avg_total = 0;
my $is_pg = 0; # 'Project Gutenberg' mode. Adds
# A bookmark autoscan tag to the end of
# the text to indicate the start of
# the real text.
my $pg_pos = 0;
my $bookmark_buff = ""; # Temporary buffer for bookmark
my $bookmark_num = 0; # Total number of bookmarks
my $is_bookmark = 0; # Switch for bookmark mode
my $is_smart = 1; # Switch to turn off 'smart' format
my $title_set = 0; # Is title name set?
my $pg_title; # Title for PG mode
my $pg_title_set = 0; # Found $pg_title
my $is_hyphen_off = 0; # Switch to turn off hypen correction
my $sformat_offset = 20; # Smart format offset
my $error_check = 0; # Compr. error checker
my $is_evil = 0; # Check for Microsoft OS's
#################################################################
# #
# Main program #
# #
#################################################################
#
# Process 'getopts' and return global variables
#
proc_opts() || die "Arg! Confusing command options (should never happen!)\n";
#
# Read text from input source into buffer. Yes, all of it. And format it.
#
$buff = read_text() ||
die "Arg! Error in reading text (should never happen!)\n";
#
# Create optional bookmarks
#
if ($is_pg || $is_bookmark) { $bookmark_buff = find_bookmarks($buff); }
#
# Compress, if necessary.
#
if ($is_compr) { $buff = compr_text($buff); }
#
# Generate PDB headers and record 0, and pre-append them to the buffer.
#
$buff = pdb_header() . $buff;
#
# Write optional bookmarks
#
if ($is_pg || $is_bookmark) { $buff .= $bookmark_buff; }
#
# Write text out
#
write_text($buff)||
die "Arg! Error in writing file (should never happen!)\n"; ;
# Done. Wasn't that easy.
#################################################################
# #
# Get and process command line options #
# #
#################################################################
sub proc_opts {
#
# Local Variables
#
my $num_args;
#
# Turn off 'strict' for getopts().
#
no strict;
#
# getopts() is your friend
#
use Getopt::Std qw(getopt getopts);
getopts('l:vdht:cfgbso:') || die "Invalid Argument\n";
#
# Force line length?
#
if ($opt_l) { # Not empty
unless ($opt_l =~ /\D/) { # And only contains digits
$line_len = int($opt_l);
} else { # is alpha or otherwise
die "Invalid line length.\n";
}
}
#
# Help text
#
if ( $opt_h ) {
print "\nusage: $0 [OPTIONS] <infile> <outfile>\n\n" .
"Formats text to PalmDoc format.\n" .
"$URL\n" .
"Version $VERSION\n\n" .
"options:\n" .
"\t-h\t\tthis message\n" .
"\t-c\t\tturn file compression OFF\n" .
"\t-v\t\tverbose\n" .
"\t-t \"title\"\tdocument title\n" .
"\t-f\t\tdon't format text\n" .
"\t-l<n>\t\tforce line width to <n> bytes\n" .
"\t-g\t\tEnable 'Project Gutenberg' mode\n" .
"\t-b\t\tEnable Dynamic Bookmark mode\n" .
"\t-d\t\tTurn off hyphen correction\n" .
"\t-s\t\tTurn off 'smart' format\n" .
"\t-o<n>\t\tOffset for 'smart' format (default '20')\n\n" .
"Use '-' or omit filenames to indicate STDIN or STDOUT.\n\n";
exit 0;
}
#
# Set document name
#
if ( $opt_t ) {
$opt_t =~ s/[\000-\011\013-\037\177-\377]//g; #strip control chars
$opt_t =~ s/\s+/ /g;
if ( (length $opt_t) > 31 ) {
$pdb_name = substr($opt_t,0,28) . "...";
$title_set = 1;
} else {
$pdb_name = $opt_t;
$title_set = 1;
}
}
#
# Set offset for 'smart' filtering. The larger the number, the more formatted
# text it may miss (due to the shorter length), but you will get fewer false
# positives due to short lines.
#
if ($opt_o) { # Not empty
unless (($opt_o =~ /\D/) || # And only contains digits
(int($opt_o) > 65)) { # Offsets greater than 65 are worthless
$sformat_offset = int($opt_o);
} else { # is alpha or otherwise
die "Invalid offset.\n";
}
}
if ($opt_v) { $is_verbose = 1; } # Maximum Verbosity!
if ($opt_c) { $is_compr = 0; } # Turn off compression?
if ($opt_f) { $dont_format = 1; } # Don't format text
if ($opt_g) { $is_pg = 1; } # Project Gutenberg mode
if ($opt_b) { $is_bookmark = 1; } # Bookmark mode
if ($opt_s) { $is_smart = 0; } # Turn off 'smart' format
if ($opt_d) { $is_hypen_off = 1; } # Turn off hyphen correction
#
# Turn back on strict
#
use strict;
#
# Check for the 'Evil' OS...or OS/2 or whatever...
#
if ($^O =~ /MSWin32|dos|os2/i) { $is_evil = 1 }
#
# Everything left should be file names, or an error
$num_args = @ARGV;
#
# use filenames or STDIN/STDOUT?
#
if ($num_args == 0) { # No args?
$is_verbose = 0; # Turn off verbosity
# defaults are good for STDIN/STDOUT
}elsif ($num_args == 1) { # 1 arg? Must be for infile
$infile = sanitize($ARGV[0], "input");
$is_verbose = 0; # Turn off verbosity
}elsif ($num_args == 2){ # 2 args? Must be both infile/outfile
$infile = sanitize($ARGV[0], "input");
$outfile = sanitize($ARGV[1], "output");
}else { # More? Error and die!
die "Too many filename arguments on command line.\n";
}
#
# Return 'success' code
#
return(1);
}
#################################################################
# #
# Read text from input into buffer. #
# #
#################################################################
sub read_text {
#
# Local Vars HERE
#
my $in; # Buffer to store text in.
open (IN, "$infile") || die "Can't open $infile: $!\n";
while (<IN>) {
#
# Format and add each line to $in
#
if ($dont_format) { # Don't format text
$in .= $_;
} else {
$in .= format_text($_);
}
}
close (IN);
#
# Set $total_len for header generation
#
$total_len = length $in;
return ($in);
}
#################################################################
# #
# Write text out to file. #
# #
#################################################################
sub write_text {
open (OUT, ">$outfile") || die "Can't open $outfile: $!\n";
if ($is_evil) { binmode(OUT) } # Make MS OS's happy
print OUT $_[0]; # Output the file
close (OUT);
return (1);
}
#################################################################
# #
# Format text to a more PalmDoc reader #
# friendly format. #
# #
#################################################################
sub format_text {
#
# Local Vars HERE
#
my $line_buff = ""; # Temorary buffer to format text in
my @line;
my $x;
my $y;
my $testchar;
my $newx = "";
#
# Function to take a line of text (in $_[0]), strip out extra
# linefeeds and such and, if necessary, add linefeeds to give
# max -l # chars per line. Must also maintain a global (col_position)
# to make sure that when this function is reentered,
# we know on what column position we left off last time.
#
#
# Grab title from text, first one that matches, wins.
#
if (($is_pg) && !($pg_title_set)){
$pg_title = $_[0];
if (
$pg_title =~ s/.+?Project Gutenber(g|g's) Etext( | of) (.+?)(by|,|,by|\*|\.).+/$3/i
)
{
chop $pg_title;
if ( (length $pg_title) > 31 ) {
$pg_title = substr($pg_title,0,28) . "...";
}
$pg_title_set = 1;
}
}
#
# Assign input string to @line, remove ending newlines, split by whitespace
#
chomp;
@line = split(/\s+/, $_[0]);
#
# Attempt at some formatting logic. If average line size is somewhat over 80,
# we can safely assume that the file is not formatted, and any linefeeds we
# find should stay right where they are, since they are probably formatting.
#
# If we find the average size is ~ 80 or under, but the linefeed comes somewhat
# under the average size, we will guess the linefeed stays.
#
#
if (length($_[0]) > 30) { # Ignore short lines
$avg_total += length($_[0]);
$avg = $avg_total / ++$avg_line_num;
}
#
# Check each word, strip any whitespace characters, and insert
# a newline before the word if it would cross the $line_len boundary.
#
# Then add the word to the output string.
#
# Note that some text may be mangled, if it depends on hard returns for
# formatting, or double spaces.
#
foreach $x (@line) {
if ($x) {
if ($is_smart) {
$x =~ s/\s+?|[\000-\011\013-\037]//g;
}
# Ixnay spaces, control chars
# tab/space formatted text will
# certainly break.
#
# If forcing to a specific line length, check to see if adding the word
# and space will overflow the specified line length. If so, add newline first
# and reset the col_position counter.
#
if ( $line_len &&
(((length $x) + $col_position + 1) > $line_len) ) {
$line_buff .= "\n";
$col_position = 0;
}
#
# Add word + space to output buffer, then increment the column position
#
$line_buff .= $x . " ";
$col_position += (length $x) + 1;
}
}
unless ($is_hyphen_off) {
$x = length $line_buff;
$line_buff =~ s/-\s+\Z//; # fix hypen separated words at
# the end of lines.
$col_position += $x - (length $line_buff); # Adjust for hypen removal
}
#
# If the output string contains no words, assume a double spaced line
# otherwise, replace the final newline.
#
if ( $avg > 85 ) {
$line_buff .= "\n"; # Preserve linefeeds if
$col_position = 0; # file appears to
} # already be stripped.
if (($line_buff eq "") && ($col_position != 0)) { # Double space
$line_buff = "\n\n";
$col_position = 0;
} elsif ($line_buff eq "") { # Single space
$line_buff = "\n";
$col_position = 0;
#
# This is some VooDoo that seems to work well. So far.
#
# What it does is this: Using average line size information at the top of this
# function, it assumes that lines that are less than the average -
# $sformat_offset AND if the current column position (where the linefeed would
# go) is less than the average - $sformat_offset , it assumes that it is a
# formatted line, and inserts the linefeed. The further into the file it goes,
# the more accurate it should be.
#
# I can imagine all kinds of places where this would break horribly,
# but it would break anyway without this bit's help.
#
} elsif ( ((length $line_buff) <= $avg - $sformat_offset ) &&
($col_position <= $avg - $sformat_offset ) &&
($avg < 85) &&
($is_smart) ) { # Assume formatted text
$line_buff .= "\n";
$col_position = 0;
}
return ($line_buff);
}
#################################################################
# #
# Generate the PDB headers and Record 0 #
# #
#################################################################
sub pdb_header {
#
# Local Vars HERE!
#
#
# Some constants
#
my $COUNT_BITS = 3;
my $DISP_BITS = 11;
my $DOC_CREATOR = "REAd";
my $DOC_TYPE = "TEXt";
my $RECORD_SIZE_MAX = 4096; # 4k record size
my $dmDBNameLength = 32; # 32 chars + 1 null
my $pdb_rec_offset; # PDB record offset
my $header_buff = ""; # Temporary buffer to build the headers in.
my $x;
my $y;
#
# PDB header
#
# We're going to set some variables and then use 'pack' to put them into a
# buffer.
#
# Here's the format in C (Dword = 4 bytes, Word = 2 bytes)
#
#typedef struct { /* 78 bytes total */
# char name[ dmDBNameLength ];
# Word attributes;
# Word version;
# DWord create_time;
# DWord modify_time;
# DWord backup_time;
# DWord modificationNumber;
# DWord appInfoID;
# DWord sortInfoID;
# char type[4];
# char creator[4];
# DWord id_seed;
# DWord nextRecordList;
# Word numRecords;
#} pdb_header;
my $pdb_header_size = 78;
my $pdb_attributes = 0;
my $pdb_version = 0;
my $pdb_create_time = 0x11111111; # Palm Desktop demands
my $pdb_modify_time = 0x11111111; # a timestamp.
my $pdb_backup_time = 0;
my $pdb_modificationNumber;
my $pdb_appInfoID = 0;
my $pdb_sortInfoID = 0;
my $pdb_type = $DOC_TYPE;
my $pdb_creator = $DOC_CREATOR;
my $pdb_id_seed = 0;
my $pdb_id_nextRecordList = 0;
my $pdb_numRecords = (int ($total_len / 4096)) + 2; # +1 for record 0
# +1 for fractional part
if ($is_pg || $is_bookmark) { $pdb_numRecords += $bookmark_num; }
#
# Pack that header!
#
#
# Set $pdb_name to detected name, unless forced using -t.
#
if ( !($title_set) && ($is_pg) && ($pg_title_set)) {
$pdb_name = $pg_title;
}
if ($is_verbose) {
print "Document Title: $pdb_name\n";
}
my $pdb_header = pack("a32nnNNNNNNa4a4NNn",$pdb_name,$pdb_attributes,
$pdb_version,$pdb_create_time,
$pdb_modify_time,$pdb_backup_time,
$pdb_modificationNumber,$pdb_appInfoID,
$pdb_sortInfoID,$pdb_type,$pdb_creator,
$pdb_id_seed,$pdb_id_nextRecordList,
$pdb_numRecords);
#
# Sanity check
#
if ( (length $pdb_header) != 78) { die "pdb_header malformed\n"; }
#
# Create the PalmDoc header
#
#
# Here's the format in C
#
# struct doc_record0 { /* 16 bytes total */
# Word version; /* 1 = plain text, 2 = compressed text */
# Word reserved1;
# DWord doc_size; /* uncompressed size in bytes */
# Word num_recs; /* not counting itself */
# Word rec_size; /* in bytes: usually 4096 (4K) */
# DWord reserved2;
# };
my $doc_header_size = 16;
my $doc_version = $is_compr + 1; # Compression on by default
my $reserved1 = 0;
my $doc_doc_size = $total_len;
my $doc_rec_size = 4096;
my $doc_num_recs = (int ($total_len / 4096)) + 1;
my $doc_reserved2 = 0;
#
# Pack Record 0
#
my $doc_header = pack("nnNnnN",$doc_version,$reserved1,$doc_doc_size,
$doc_num_recs,$doc_rec_size,$doc_reserved2);
#
# Sanity check!
#
if ( (length $doc_header) != 16) { die "doc_header malformed\n"; }
#
# Template for the PDB record headers
#
# Docs are REAL fuzzy on this.
#
#
# Format in C
#
#struct pdb_rec_header { /* 8 bytes total */
# DWord offset;
# struct {
# int delete : 1;
# int dirty : 1;
# int busy : 1;
# int secret : 1;
# int category : 4;
# } attributes;
# char uniqueID[3];
#}
my $pdb_rec_header_size = 8;
my $pdb_rec_attributes = 0x40; # We'll fake this, 0x40 = 'dirty'
my $pdb_rec_uniqueID = 0x3D0; # Simple increment
#
# Since we need to so a bunch of these, we'll use this as a template
#
my $pdb_rec_header_template = "Nccn";
#
# Generate and write headers
#
#
# PDB record headers are generated and placed at the head of the file.
# The number of headers required is Total_File_Bytes / 4096 + 1
# The +1 being for the fractional part left over.
#
# Someone could have documented this better. :)
#
# For the record, the file format is:
#
# PDB Header (78 bytes)
# PDB Record Headers (8 bytes)
# . . .
# . . . Repeat N + B + 1 times, where N is # of 4096K blocks
# . . . The +1 is for record 0 (DOC header)
# . . . B = # of bookmarks
# (DB Records)
# 0x0 0x0 Two NULLS
# Record 0 (PalmDoc Header)
# Text
# . . .
# . . .
# . . .
# Optional Bookmark records
# . . .
# EOF
#
#
$pdb_rec_offset = $pdb_header_size +
(($pdb_numRecords)* $pdb_rec_header_size) + 2;
#
# Write PDB header, and PDB rec header for record 0
#
$header_buff = $pdb_header . pack($pdb_rec_header_template,
$pdb_rec_offset, $pdb_rec_attributes,
"a",$pdb_rec_uniqueID );
$pdb_rec_offset += $doc_header_size; # Add offset for doc_header
if ($is_pg || $is_bookmark) { $pdb_numRecords -= $bookmark_num;}
for ($x = 0; $x < $pdb_numRecords - 1; $x++) {
# -1 for rec 0 header added above
#
# If we aren't compressing, every other block besides 0 is guarenteed to be
# $RECORD_SIZE_MAX
#
if (! $is_compr && $x > 0 )
{ $block_size[$x] = $RECORD_SIZE_MAX; }
$pdb_rec_offset += $block_size[$x];
++$pdb_rec_uniqueID;
$header_buff .= pack($pdb_rec_header_template,$pdb_rec_offset,
$pdb_rec_attributes,"a",$pdb_rec_uniqueID);
}
#
# Write optional bookmark pdb headers
#
if (($is_pg || $is_bookmark) && $bookmark_num) {
if ($is_compr){ # Find the end of the text
$pdb_rec_offset += $block_size[$x];
} else {
$pdb_rec_offset += $total_len % 4096;
}
for ($y = 0; $y < $bookmark_num; $y++) {
$pdb_rec_uniqueID += 10;
$header_buff .= pack($pdb_rec_header_template,$pdb_rec_offset,
$pdb_rec_attributes,"a",$pdb_rec_uniqueID);
$pdb_rec_offset += 20; # Bookmarks are 20 bytes.
}
}
#
# Write 2 NULLS
#
$header_buff .= 0x00 . 0x00;
# Write Record 0
$header_buff .= $doc_header;
return ($header_buff);
}
#################################################################
# #
# Compress the text #
# #
#################################################################
sub compr_text {
#
#
# Compresses text with the PalmDoc compression scheme.
#
# Requires:
# $_[0], which contains the entire text to be compressed.
#
# Returns: $compr_buff, which contains the compressed text.
# global @block_size, Array that contains the length of each
# compressed block.
# 'scalar(@block_size)' should be = to $pdb_numRecords
#
# Local Vars HERE!
#
my $total_compr_size = 0; # Final compressed text size
my $compr_buff = ""; # Temporary output buffer
my $numrecords = (int($total_len / 4096) +1); # Number of blocks to compress.
my $x;
my $y;
my $block_offset;
my $block; # Contains the current 4096 byte block of text
my $block_len; # Length of current block
my $index; # Current scan position in block
my $byte; # Char at index (for space + char compression)
my $byte2; # Char at index+1
my $test; # Potentially compressible text for
# LZ77 compression.
my $frag_size; # Current size of above
my $frag_size2; # Spare for lazy byte compression
my $test2; # spare for above
my $test3; # second spare
my $pos; # Position (in $block) of reference text
# for $test
# to compress against.
my $pos2; # spare for above
my $pos3; # second spare
my $back; # $index - pos
my $mask; # Bitwise mask to do LZ77 'magic'
my $compr_ratio; # Compression ratio
my $done;
my $comp_block_offset = 0; # The $compr_buff index
# block begins.
my $FRAG_MAX = 10; # Max LZ77 fragment size
my $FRAG_MIN = 3; # Min LZ77 fragment size
my $LAZY_BYTE_FRAG = $FRAG_MAX + $FRAG_MIN - 1;
$block_size[0] = 0; # Record 0 is already written and
# is not compressed.
for ($x = 1; $x <= $numrecords; $x++) {
$block_offset = ($x - 1) * 4096;
$block = substr($_[0],$block_offset, 4096);
if ($x >= $numrecords) { # Last block
$block = substr($block,0,($total_len % 4096));
}
$block_len = length($block);
#
# Tricky PalmDoc compression scheme. Here's the overview:
#
# Given a compressed stream, read a byte.
# The byte will lie in the following zones:
# 0 represents itself
# 1...8 type A command; read the next n bytes
# 9...7F represents itself
# 80..BF type B command; read one more byte
# C0..FF type C command; represent "space + char"
#
#
# Sooo. If we just write ASCII text, it will fall within 9..7F or 0 (NULL).
# No worries.
#
# If we write 1...8, the next n bytes will be taken as verbatim. This is
# used to mask high byte characters, like accents. I'm not a-using them
# at this point. High byte characters get stripped in the text processing
# function.
#
# If we write C0..FF, it will be treated as a space + character.
# Write the space, then xOR 0x80, should work.
#
# 80..BF is tricky. A 16 bit number is written:
# Throw away offset bits to copy (+3)
# 0 0|0 0 0 0 0 0 0 0 0 0 0|0 0 0
#
# So. To encode we keep an index of where we currently are in the file,
# and constantly check 3-10 char fragments from $index+frag_size against
# the text in $index - 2047 of a 4096 byte block, which contains the
# uncompressed text.
#
# If we find a match, we generate the above gobblygook, (that is, place the
# offset into a packed INT (2 bytes), shift it 3 places, then place the number
# of bits to copy from the offset in the lower three bits of the INT) place
# it in the compressed buffer, increment the index accordingly (# of bits
# compressed), and go from there.
# Whee.
#
$index = 0;
#
# Compression loop
#
while ( $index < $block_len ) {
#
# Type 'A', Escape high bytes
#
$byte = substr($block,$index,1); # Char at $index
if ($byte =~ /[\200-\377]/) { # is high bit set?
$y = 1; # found at least one!
#
# Loop to find out how many concurrent high bit characters, max 8
#
while ( (substr($block,$index + ($y + 1),1) =~
/[\200-\377]/) &&
($y < 8) ) {
++$y; # If found, increment counter
}
$compr_buff .= chr($y); # Write escape code
$compr_buff .= substr($block,$index,$y); # Write text
$index += $y; # Increment the index
} else { # Real compression routines
#
# Type 'B', simple LZ77 compression
#
$frag_size = $FRAG_MIN; # We don't care about anything less
$test = substr($block,$index,$frag_size); # pull the current fragment
$pos = rindex($block, $test, $index - 1); # check against the buffer
#
# There's a sliding window of 2047 bytes that we can pull reference
# characters from.
#
if ( ($pos > 0) &&
($index - $pos <= 2047) && # Inside our 2047 byte window
( $index < $block_len - $frag_size) ) {
# # Found a match!
# looking for bigger fragments
#
for ($y = 4; $y <= $FRAG_MAX; $y++ ) {
++$frag_size ;
$test2 = substr($block,$index,$frag_size);
$pos2 = rindex($block, $test2, $index - 1);
if (($pos2 > 0) &&
($index - $pos2 <= 2047) &&
($index < $block_len - $frag_size) ) {
# found a match!
$pos = $pos2;
$test = $test2;
} else { # no match, go back
--$frag_size;
last;
}
}
# Sanity check
if ($frag_size > $FRAG_MAX)
{ die "frag_size too big!!!: $frag_size\n"; }
#
# Now look for an even better match starting at the next position.
# This is known as 'lazy matching'.
#
# NOTE: Why is ($STD_FRAG_MAX + $STD_FRAG_MIN - 1) so magic?
# Let's pretend that we are currently at index 1001, looking for matches.
# The longest match we can find for the text starting at 1001 has a length of 3.
# If the longest match we can find for the text starting at 1002 has a length of
# 10, then obviously we get better compression by sending the byte at 1001 out
# as a literal and encoding the match found at 1002. But if the longest match
# for the text starting at 1002 has a length of 12 ($STD_FRAG_MAX + $STD_FRAG_MIN - 1,
# for the PalmDoc spec) then we can encode the match we find for the text at 1001
# and *still* have a match of length 10 for the text starting at 1004.
$frag_size2 = $frag_size + 2;
$test2 = substr($block,$index + 1, $frag_size2);
$pos2 = rindex($block, $test2, $index - 1);
if (($pos2 > 0) &&
($index - $pos2 <= 2047) &&
($index < $block_len - $frag_size2) ) {
# found a match
for ($y = $frag_size2;$y <= $LAZY_BYTE_FRAG;
$y++ ) { # Look for more
++$frag_size2;
$test2 = substr($block,$index + 1, $frag_size2);
$pos2 = rindex($block, $test2, $index - 1);
if (($pos2 > 0) &&
($index - $pos2 <= 2047) &&
($index < $block_len - $frag_size2) ) {
# found a match!
} else { # no match, go back
--$frag_size2;
last;
}
}
if ($frag_size2 < $LAZY_BYTE_FRAG) {
#
# Lazy byte found; write byte to output and abort compression round
#
$pos = 0;
$compr_buff .= substr($block,$index,1);
++$index;
}
}
if ($pos > 0) { # Did we abort the compression?
#
# Figure out how far to reach back into the buffer, and create OR mask
# that sets the high bit and indicates how big the compressed fragment is.
#
$back = $index - $pos;
$mask = 0x8000 | int($frag_size - 3);
#
# This line does all the magic; munge and add to output buffer
#
$compr_buff .= pack("n",int($back << 3) | $mask);
$index += $frag_size;
}
} else {
#
# Type 'C', Space + Char compress
#
$byte = substr($block,$index,1); # Char at $index
$byte2 = substr($block,$index + 1,1); # next char as well
if ( ($byte eq " ") &&
($byte2 =~ /[\100-\176]/ ) &&
($index <= $block_len - 1)) {
# Got a space + char
# Set the high bit
# and add to output
# buffer.
$compr_buff .= pack("c", ord ($byte2) | 0x80 );
$index += 2; # Compressed 2 bytes
} else {
$compr_buff .= $byte; # No compression
++$index;
}
}
}
}
#
# Check for errors in the compression routine then move the counter that
# identifies where the compressed representation of the most recently handled
# block starts. Turn on by setting $error_check to '1'
#
if ($error_check) {
check_comp($block, substr($compr_buff, $comp_block_offset));
$comp_block_offset = length($compr_buff);
}
if ( $is_verbose ) {
$| = 1; # Flush output buffers
$done = int(($x / ((length $_[0]) / 4096)) * 100);
if ($done > 100) {$done = 100;}
print "\rBlock: $x\tComplete: $done%";
}
#
# Calculate compressed block sizes, and the total compressed size of the file
#
$block_size[$x] = (length ($compr_buff)) - $total_compr_size;
$total_compr_size = length ($compr_buff);
if ( $is_verbose ) {
$done = int(($block_size[$x] / $block_len) * 100);
print "\tCompressed: $done%";
}
$| = 0; # Flush buffers off
}
#
# And one linefeed for Ra....
#
if ($is_verbose) { print "\n"; }
#
# Print some useless information
#
if ($is_verbose ) {
$compr_ratio = ($total_compr_size / $total_len) * 100 ;
print "Original Size: $total_len\tCompressed Size: $total_compr_size\t";
printf ("Reduced: %.2f%\n", $compr_ratio);
}
return ($compr_buff);
}
#################################################################
# #
# Generate Bookmark Headers #
# #
#################################################################
sub bookmark_rec {
#
# For now, we are only going to find the end of Gutenberg Project "Fine Print"
# text and set it as a bookmark.
#
#
# Local Vars HERE
#
#my $book_pg = "*END*THE SMALL PRINT!";
my $book_pg = $_[1];
my $book_name = "Bookmark $bookmark_num"; # Default bookmark name
if ($_[2]) { $book_name = $_[2];} # Bookmark name was passed to function.
my $book_pos = $_[3]; # Offset from start of text to place bm
my $book_header_size = 20; # Size of Bookmark header
my $book_buff = ""; # Output buffer
unless ($book_pos) { # If bookmark position not passed
$book_pos = (index($_[0],$book_pg)) + 1; # Index starts at 0, DOC readers 1
}
#
# Make sure the bookmark name is 15 chars or less
#
if (length $book_name > 15) {$book_name = substr($book_name,0,12) . "...";}
if ($book_pos > 0) {
$book_buff = pack("a16N",$book_name,$book_pos);
++$bookmark_num;
return ($book_buff);
} else {
return (""); # No bookmark
}
}
#################################################################
# #
# Sanitize filename entries #
# #
#################################################################
sub sanitize {
#
# Do various checks on filename entries. Strip control characters, substitute
# underscores for most forms of punctuation.
#
# Recieves filename or path + filename to process, whether is it a input file
# or output file, and returns the sanitized version.
#
#
#
# Local vars HERE
#
chomp; # Just to be safe;
my $filename = $_ = $_[0];
my $io = $_[1];
my $junk;
my $path = $filename;
#
# If input file, all we care about is that the file exists, is a text file
# and readable. For the output file, we want to sanitize the filename,
# and make sure the destination directory is writable.
#
if ($is_evil) { return ($_) } # MS OS. Ack! Game over! No sanity for you!
if ($io =~ /in/i) { # Input file
if ($filename && $filename ne "-" ) { # and not null or "-"
unless ( -e $filename && -r $filename )
{ die "Input file IO error: $filename $!\n";}
} else { # is null
$_ = "-"; # stdin
}
} elsif ($io =~ /out/i) { # Output file
if ($filename) { # and not null
$junk = eval "tr#\-/.a-zA-Z0-9#_#cs";
if (m#/#) { # contains a path.
$path =~ s#^(.*/).*#$1#; # Strip filename from path
unless (-w $path)
{ die "Output file IO error: Output directory unwritable\n";}
}
unless ( (!(-e $filename)) || -w $filename ) # Not exist or writable
{ die "Output file IO error: Output file unwritable\n";}
} else { # is null
$_ = ">-"; # stdout
}
} else { # Shouldn't get here.
die "Error in sanitize function\n";
}
return ($_);
}
#################################################################
# #
# Find Bookmarks #
# #
#################################################################
sub find_bookmarks {
my $pg_bookmark = "*END*THE SMALL PRINT!";
my $pg_bookmark_name = "Text Begins";
my $bookmark_rec = "";
if ($is_pg) {
#
# Set 'start of text' bookmark
#
$bookmark_rec .= bookmark_rec($_[0],$pg_bookmark,$pg_bookmark_name);
#
# Find and set chapter bookmarks
#
while ($_[0] =~ /\n((?:chapter|chaptre).*?)\s*?\n/gi ) {
if ($is_verbose) {
print "Bookmark: $1\t\tOffset: " . pos($_[0]) . "\n";
}
$bookmark_rec .= bookmark_rec($_[0],"$1","$1",pos($_[0]) -
length($1));
}
}
if ($is_bookmark) {
while ($_[0] =~ /\n<(.+?)>/g ) {
if ($is_verbose) {
print "Bookmark: $1\n";
}
$bookmark_rec .= bookmark_rec($_[0],"$1","$1");
}
$_[0] =~ s/\n<(.+?)>//g;
}
return ($bookmark_rec);
}
#################################################################
# #
# Compression Error Checking #
# #
#################################################################
sub check_comp ($$) {
#
# Compares the original block to one that's been compressed and decompressed
# and reports any places where they differ.
#
# Requires:
# $original_block, the formatted block that was originally sent
# to be compressed. Passed to the subroutine as a parameter
#
# $comp_block, the compressed version of the block
# Passed to the subroutine as a parameter
#
# Returns: Nothing. Output from this routine goes to standard output.
#
#
#
#
#
# Local Vars HERE!
#
my $original_block = $_[0];
my $comp_block = $_[1];
my $roundtrip_block = ""; # buffer for decompressed text.
my $comp_index = 0; # index for start of next element in $comp_block
my $element; # element read from the compressed data stream
my $bytes_added = 0; # the number of bytes added to the output
my $pair_var; # integer used to hold the two-byte packed pair.
my $offset; # used if B compression is encountered.
my $length; # used if B compression is encountered.
my $i; # simple loop variable
while ($comp_index < length($comp_block)) {
$element = substr($comp_block, $comp_index, 1);
#
# decompress the next element:
#
if ((ord($element) == 0x00) || # Literal byte range
((ord($element) >= 0x09) &&
(ord($element) <= 0x7F))) {
#
# output the literal byte.
#
$roundtrip_block .= $element;
$bytes_added = 1;
$comp_index += 1;
} elsif ((ord($element) >= 0x01) && # 'A' (escaped) code range
(ord($element) <= 0x08)) {
#
# Copy next $element bytes literally. (shouldn't happen at this point)
#
$roundtrip_block .= substr($comp_block, $comp_index + 1, ord($element));
$bytes_added = ord($element);
$comp_index += (1 + $bytes_added);
} elsif ((ord($element) >= 0x80) && # 'B' (LZ77) code range
(ord($element) <= 0xBF)) {
#
# read the next byte and copy the offset, length pair if it's a B code.
#
$pair_var = ((ord($element)) << 8) +
ord(substr($comp_block, ($comp_index + 1), 1));
$offset = ($pair_var >> 3) & 0x7FF;
$length = ($pair_var & 0x07) + 3;
#
# sanity checks
#
if (($offset <= 0) or ($offset > 2047)) { # out of window error
die "offset is " . $offset . " at index " .
(length($roundtrip_block)). "!!!\n";
}
if (($length < 3) or ($length > 10)) { # too few/too many
die "length is " . $length . " at index " . # bytes to copy error
(length($roundtrip_block)) . "!!!\n";
}
if ((length($roundtrip_block) - $offset) < 0) { # read before start
# of block error
die "offset " . $offset . " goes beyond beginning of block!!!\n"; }
#
# This last one would really be better if a meaningful representation of
# *where* in the file/block the offensive offset occurs could be included.
#
for ($i = 1; $i <= $length; $i++) {
$roundtrip_block .= substr($roundtrip_block,
(length($roundtrip_block) - $offset), 1);
}
$bytes_added = $length;
$comp_index += 2;
} elsif ((ord($element) >= 0xC0) && # 'C' (space + char) code range
(ord($element) <= 0xFF)) {
#
# output the space + character
#
$roundtrip_block .= " ";
$roundtrip_block .= chr(ord($element) & 0x7F);
$bytes_added = 2;
$comp_index += 1;
}
} # end while
if ( $roundtrip_block ne $original_block) {
die "Compressed text does not match original\n";
}
} # end of check_comp