1592 lines
40 KiB
Perl
Executable File
1592 lines
40 KiB
Perl
Executable File
#!/usr/bin/perl
|
|
# bibelot.pl
|
|
$|++;
|
|
|
|
my $VERSION = "0.94";
|
|
my $URL="http://sourceforge.net/projects/bibelot";
|
|
|
|
# Format ASCII text, esp. Project Gutenberg (http://www.promo.net/pg) etexts,
|
|
# into a PalmDoc PDB file.
|
|
#
|
|
#
|
|
#
|
|
# Copyright (C) 2000,2001 John Fulmer <jfulmer@appin.org>
|
|
#
|
|
# This program is free software; you can redistribute it and/or
|
|
# modify it under the terms of the GNU General Public License
|
|
# as published by the Free Software Foundation; either version 2
|
|
# of the License, or (at your option) any later version.
|
|
#
|
|
# This program is distributed in the hope that it will be useful,
|
|
# but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|
# GNU General Public License for more details.
|
|
#
|
|
# A full copy of the GNU Public License may be found at:
|
|
#
|
|
# http://www.gnu.org/copyleft/gpl.html
|
|
#
|
|
#
|
|
#
|
|
# This program was written using documentation and structures borrowed
|
|
# from Paul J. Lucas' 'txt2pdbdoc' (http://www.best.com/~pjl/software.html)
|
|
# and documentation from the Pyrite website
|
|
# (http://www.pyrite.org/etext/format.html). Also, 'pdbdump' was invaluable in
|
|
# troubleshooting format problems.
|
|
#
|
|
# Some of the header structures were borrowed, but the programming is my fault.
|
|
# If it breaks, you keep both pieces. But let me know. I'm especially interested
|
|
# in formatting problems, and trying to track down all the different cases I
|
|
# can.
|
|
#
|
|
# Oh, and what is a 'bibelot'?
|
|
# See http://www.dictionary.com/cgi-bin/dict.pl?term=bibelot or your nearest
|
|
# dictionary.
|
|
#
|
|
#
|
|
# jf
|
|
#
|
|
#
|
|
# Version History:
|
|
#
|
|
# .01 Initial (ugly) version
|
|
#
|
|
# .02 -Partial re-write, made more modular
|
|
# -Added option to turn compression off (-c)
|
|
# -Added verbose (-v)
|
|
# -Added option to set document name (-l)
|
|
# -Added option to disable document formatting (-f)
|
|
# -Added usage message (-h)
|
|
# -Improved compression slightly
|
|
#
|
|
#
|
|
# .03 -Added 'Project Gutenberg' (-g) mode, which sets
|
|
# the beginning of actual text as a bookmark.
|
|
# -Now adds two NULL characters between the PDB record
|
|
# headers and the first record (Record 0).
|
|
# -Initial bookmark support.
|
|
#
|
|
# .04 10/4/00 -Various cleanups
|
|
# -Added filename sanity checks, and read/write checking
|
|
# -Improved text formatting efficiency.
|
|
# -Fixed bug that didn't collapse whitespace correctly
|
|
# -Fixed off-by-one bug if forcing line lengths.
|
|
# -Fixed incorrect use of 'pack' that required
|
|
# 'no strict'.
|
|
# -Gutenberg mode now sets chapter bookmarks, if able.
|
|
# -Added dynamic bookmark support (-b).
|
|
# Place text to bookmark
|
|
# in between angle brackets (<>). The script will
|
|
# search for the first instance of the text, and
|
|
# create a bookmark using the text as the bookmark
|
|
# name. One bookmark per line, please.
|
|
#
|
|
# For instance, let's see you wanted a bookmark
|
|
# at text that says "Fit The First". At the bottom
|
|
# of the origional text file, on a blank line,
|
|
# place a "<Fit The First>". The script will
|
|
# Bookmark the first instance of "Fit The First"
|
|
# in the document, and erase the "<Fit The First>"
|
|
# at the bottom of the file. The text is case
|
|
# sensitive.
|
|
#
|
|
# -Adjusted 'smart' format function
|
|
#
|
|
# .5 10/6/00 -Work around (bug in Perl5?) where the :^ascii:
|
|
# regex class was matching "[", and stripping it
|
|
# from text.
|
|
# -changed version number to match Freshmeat announcement
|
|
# (whoops)
|
|
# -removed spurious 's' option from getopts()
|
|
#
|
|
# .6 -re-added option to turn off 'smart' format mode. (-s)
|
|
# (Found out what that spurious 's' was for)
|
|
# -added code to rejoin words split by hyphens at eol.
|
|
#
|
|
# .7 12/21/00 -automagically grab title (if not specified with -t)
|
|
# from file in 'Project Gutenberg' mode. Often
|
|
# (but not always) the title is specified on the
|
|
# first line of a text from Project Gutenberg.
|
|
# Grab it, truncate (if necessary),plunk it into
|
|
# the DOC title field.
|
|
# -added -d option to turn off hypen correction
|
|
# -modified help option and added to opening comments.
|
|
# -verbose now echo's detected title.
|
|
# -a few code cleanups.
|
|
#
|
|
# .8 1/4/00 -match more title entries from Project Gutenberg
|
|
# -now hosted at Sourceforge, and development versions
|
|
# in CVS.
|
|
# -more (minor)tweaks to the smart formatting, to help
|
|
# with badly formatted text with short lines early on.
|
|
# -fixed bug that didn't strip out non-ascii chars.
|
|
# Yes, Virginia, octal DOESN'T stop at 255.....
|
|
#
|
|
# .9 1/9/00 -Code cleanups.
|
|
# -Strips control characters from title text.
|
|
# -More sanity checks on output filename. If infile or
|
|
# outfile are NULL, treat them as stdin/stdout.
|
|
# -You can now use '-' to specify stdin or stdout
|
|
# -Better compression. Thanks to Antaeus Feldspar, the
|
|
# compression algorithm is more efficient. It also makes
|
|
# bibelot a bit slower (6 seconds vs 4.5 seconds on an
|
|
# average book file on my system). The efficiencies only
|
|
# add up to %1-2 better compression, but that's 4-10k for
|
|
# many books, which can add up.
|
|
# -Compression error debugger, also courtesy Antaeus
|
|
# Feldspar. Turn on by the $error_check global variable.
|
|
# -New switch, 'o', to seed smart formatting offset. The
|
|
# smaller the number, the better (maybe) the formatting,
|
|
# but more badly chopped lines. Default is 20.
|
|
# -Handle another different title for PG mode.
|
|
#
|
|
# .91 -Minor code change, Palm desktop for Windows demands
|
|
# a timestamp in the PDB header. I faked up one
|
|
# (0x11111111) for now. In the process, I also learned
|
|
# that ActiveState Perl build 623 doesn't work with
|
|
# with bibelot, something to do with a difference in
|
|
# string handling. ActiveState's problem, if you ask me.
|
|
# I would be interested if bibelot works on anything else
|
|
# besides Linux, though... I DO know that nsperl 5.004
|
|
# for dos works fine.
|
|
#
|
|
# .92 2/26/01 -More minor changes for DOS and Windows versions of perl
|
|
# now it actually works. Uses binmode() for output if
|
|
# DOS/Win32 platform. (Are you happy now, Kyle?!?)
|
|
# -Added check for common DOS and Win32 versions of
|
|
# perl, currently only looks for ActiveState's Perl
|
|
# for Win32, others probably work.
|
|
# -Disabled filename sanity checks for Win32 platforms.
|
|
# -Accidentally left the compression error checking on.
|
|
# Should be MUCH faster now.
|
|
#
|
|
#
|
|
# .93 4/02/01 -Condensed the title match regex to one line.
|
|
# -Fixed problem with spaces in title with '-t'
|
|
#
|
|
# .94 5/18/01 -Added 8-bit support. This removes the check for high
|
|
# byte control characters, so don't blame me if your
|
|
# Palm blows up. :)
|
|
#
|
|
#
|
|
#
|
|
# Pragma goes HERE
|
|
#
|
|
# 'Use strict' so that we have to declare variables. Not a bad practice.
|
|
#
|
|
|
|
use strict;
|
|
|
|
|
|
#
|
|
# Global Variables go HERE
|
|
#
|
|
|
|
my $total_len = 0; # Total length of uncompressed text
|
|
my $buff = ""; # Temporary buffer space
|
|
my $header = ""; # PDB headers to preappend
|
|
my $is_compr = 1; # '0' = no, '1' = yes
|
|
my $is_verbose = 0; # If set, output debug info.
|
|
my $dont_format = 0; # Don't format the text
|
|
my $infile = "-"; # file to read, or STDIN (-)
|
|
my $outfile = ">-"; # file to write to, or STDOUT (>-)
|
|
my $line_len = 0; # If set, force linefeeds at $line_len
|
|
my $pdb_name = "PalmDoc Document"; # Name of PalmDoc file
|
|
my $col_position = 0; # Global column position for format
|
|
my @block_size; # Compressed size of all text blocks
|
|
my $avg_line_num = 0; # The next three are for use in
|
|
my $avg = 0; # format_text()'s formatting logic.
|
|
my $avg_total = 0;
|
|
my $is_pg = 0; # 'Project Gutenberg' mode. Adds
|
|
# A bookmark autoscan tag to the end of
|
|
# the text to indicate the start of
|
|
# the real text.
|
|
my $pg_pos = 0;
|
|
my $bookmark_buff = ""; # Temporary buffer for bookmark
|
|
my $bookmark_num = 0; # Total number of bookmarks
|
|
my $is_bookmark = 0; # Switch for bookmark mode
|
|
my $is_smart = 1; # Switch to turn off 'smart' format
|
|
my $title_set = 0; # Is title name set?
|
|
my $pg_title; # Title for PG mode
|
|
my $pg_title_set = 0; # Found $pg_title
|
|
my $is_hyphen_off = 0; # Switch to turn off hypen correction
|
|
my $sformat_offset = 20; # Smart format offset
|
|
my $error_check = 0; # Compr. error checker
|
|
my $is_evil = 0; # Check for Microsoft OS's
|
|
|
|
#################################################################
|
|
# #
|
|
# Main program #
|
|
# #
|
|
#################################################################
|
|
|
|
#
|
|
# Process 'getopts' and return global variables
|
|
#
|
|
|
|
proc_opts() || die "Arg! Confusing command options (should never happen!)\n";
|
|
|
|
#
|
|
# Read text from input source into buffer. Yes, all of it. And format it.
|
|
#
|
|
|
|
$buff = read_text() ||
|
|
die "Arg! Error in reading text (should never happen!)\n";
|
|
|
|
#
|
|
# Create optional bookmarks
|
|
#
|
|
|
|
if ($is_pg || $is_bookmark) { $bookmark_buff = find_bookmarks($buff); }
|
|
|
|
#
|
|
# Compress, if necessary.
|
|
#
|
|
|
|
if ($is_compr) { $buff = compr_text($buff); }
|
|
|
|
#
|
|
# Generate PDB headers and record 0, and pre-append them to the buffer.
|
|
#
|
|
|
|
$buff = pdb_header() . $buff;
|
|
|
|
#
|
|
# Write optional bookmarks
|
|
#
|
|
|
|
if ($is_pg || $is_bookmark) { $buff .= $bookmark_buff; }
|
|
|
|
#
|
|
# Write text out
|
|
#
|
|
|
|
write_text($buff)||
|
|
die "Arg! Error in writing file (should never happen!)\n"; ;
|
|
|
|
|
|
# Done. Wasn't that easy.
|
|
|
|
|
|
|
|
#################################################################
|
|
# #
|
|
# Get and process command line options #
|
|
# #
|
|
#################################################################
|
|
|
|
|
|
sub proc_opts {
|
|
|
|
#
|
|
# Local Variables
|
|
#
|
|
|
|
my $num_args;
|
|
|
|
#
|
|
# Turn off 'strict' for getopts().
|
|
#
|
|
|
|
no strict;
|
|
|
|
#
|
|
# getopts() is your friend
|
|
#
|
|
|
|
use Getopt::Std qw(getopt getopts);
|
|
getopts('l:vdht:cfgbso:') || die "Invalid Argument\n";
|
|
|
|
#
|
|
# Force line length?
|
|
#
|
|
|
|
if ($opt_l) { # Not empty
|
|
unless ($opt_l =~ /\D/) { # And only contains digits
|
|
$line_len = int($opt_l);
|
|
} else { # is alpha or otherwise
|
|
die "Invalid line length.\n";
|
|
}
|
|
}
|
|
|
|
#
|
|
# Help text
|
|
#
|
|
|
|
if ( $opt_h ) {
|
|
|
|
print "\nusage: $0 [OPTIONS] <infile> <outfile>\n\n" .
|
|
"Formats text to PalmDoc format.\n" .
|
|
"$URL\n" .
|
|
"Version $VERSION\n\n" .
|
|
"options:\n" .
|
|
"\t-h\t\tthis message\n" .
|
|
"\t-c\t\tturn file compression OFF\n" .
|
|
"\t-v\t\tverbose\n" .
|
|
"\t-t \"title\"\tdocument title\n" .
|
|
"\t-f\t\tdon't format text\n" .
|
|
"\t-l<n>\t\tforce line width to <n> bytes\n" .
|
|
"\t-g\t\tEnable 'Project Gutenberg' mode\n" .
|
|
"\t-b\t\tEnable Dynamic Bookmark mode\n" .
|
|
"\t-d\t\tTurn off hyphen correction\n" .
|
|
"\t-s\t\tTurn off 'smart' format\n" .
|
|
"\t-o<n>\t\tOffset for 'smart' format (default '20')\n\n" .
|
|
"Use '-' or omit filenames to indicate STDIN or STDOUT.\n\n";
|
|
exit 0;
|
|
}
|
|
|
|
#
|
|
# Set document name
|
|
#
|
|
|
|
if ( $opt_t ) {
|
|
|
|
$opt_t =~ s/[\000-\011\013-\037\177-\377]//g; #strip control chars
|
|
$opt_t =~ s/\s+/ /g;
|
|
|
|
if ( (length $opt_t) > 31 ) {
|
|
$pdb_name = substr($opt_t,0,28) . "...";
|
|
$title_set = 1;
|
|
} else {
|
|
$pdb_name = $opt_t;
|
|
$title_set = 1;
|
|
}
|
|
}
|
|
|
|
#
|
|
# Set offset for 'smart' filtering. The larger the number, the more formatted
|
|
# text it may miss (due to the shorter length), but you will get fewer false
|
|
# positives due to short lines.
|
|
#
|
|
|
|
if ($opt_o) { # Not empty
|
|
unless (($opt_o =~ /\D/) || # And only contains digits
|
|
(int($opt_o) > 65)) { # Offsets greater than 65 are worthless
|
|
$sformat_offset = int($opt_o);
|
|
} else { # is alpha or otherwise
|
|
die "Invalid offset.\n";
|
|
}
|
|
}
|
|
|
|
if ($opt_v) { $is_verbose = 1; } # Maximum Verbosity!
|
|
if ($opt_c) { $is_compr = 0; } # Turn off compression?
|
|
if ($opt_f) { $dont_format = 1; } # Don't format text
|
|
if ($opt_g) { $is_pg = 1; } # Project Gutenberg mode
|
|
if ($opt_b) { $is_bookmark = 1; } # Bookmark mode
|
|
if ($opt_s) { $is_smart = 0; } # Turn off 'smart' format
|
|
if ($opt_d) { $is_hypen_off = 1; } # Turn off hyphen correction
|
|
|
|
#
|
|
# Turn back on strict
|
|
#
|
|
|
|
use strict;
|
|
|
|
#
|
|
# Check for the 'Evil' OS...or OS/2 or whatever...
|
|
#
|
|
|
|
if ($^O =~ /MSWin32|dos|os2/i) { $is_evil = 1 }
|
|
|
|
|
|
#
|
|
# Everything left should be file names, or an error
|
|
|
|
$num_args = @ARGV;
|
|
|
|
#
|
|
# use filenames or STDIN/STDOUT?
|
|
#
|
|
|
|
if ($num_args == 0) { # No args?
|
|
$is_verbose = 0; # Turn off verbosity
|
|
# defaults are good for STDIN/STDOUT
|
|
|
|
}elsif ($num_args == 1) { # 1 arg? Must be for infile
|
|
$infile = sanitize($ARGV[0], "input");
|
|
$is_verbose = 0; # Turn off verbosity
|
|
|
|
}elsif ($num_args == 2){ # 2 args? Must be both infile/outfile
|
|
$infile = sanitize($ARGV[0], "input");
|
|
$outfile = sanitize($ARGV[1], "output");
|
|
|
|
}else { # More? Error and die!
|
|
die "Too many filename arguments on command line.\n";
|
|
}
|
|
|
|
|
|
#
|
|
# Return 'success' code
|
|
#
|
|
|
|
return(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
#################################################################
|
|
# #
|
|
# Read text from input into buffer. #
|
|
# #
|
|
#################################################################
|
|
|
|
|
|
sub read_text {
|
|
|
|
#
|
|
# Local Vars HERE
|
|
#
|
|
|
|
my $in; # Buffer to store text in.
|
|
|
|
|
|
open (IN, "$infile") || die "Can't open $infile: $!\n";
|
|
while (<IN>) {
|
|
|
|
#
|
|
# Format and add each line to $in
|
|
#
|
|
|
|
if ($dont_format) { # Don't format text
|
|
$in .= $_;
|
|
} else {
|
|
$in .= format_text($_);
|
|
}
|
|
}
|
|
|
|
close (IN);
|
|
|
|
|
|
#
|
|
# Set $total_len for header generation
|
|
#
|
|
|
|
$total_len = length $in;
|
|
|
|
return ($in);
|
|
|
|
}
|
|
|
|
|
|
#################################################################
|
|
# #
|
|
# Write text out to file. #
|
|
# #
|
|
#################################################################
|
|
|
|
sub write_text {
|
|
|
|
open (OUT, ">$outfile") || die "Can't open $outfile: $!\n";
|
|
|
|
if ($is_evil) { binmode(OUT) } # Make MS OS's happy
|
|
|
|
print OUT $_[0]; # Output the file
|
|
close (OUT);
|
|
return (1);
|
|
|
|
}
|
|
|
|
|
|
#################################################################
|
|
# #
|
|
# Format text to a more PalmDoc reader #
|
|
# friendly format. #
|
|
# #
|
|
#################################################################
|
|
|
|
sub format_text {
|
|
|
|
#
|
|
# Local Vars HERE
|
|
#
|
|
|
|
|
|
my $line_buff = ""; # Temorary buffer to format text in
|
|
my @line;
|
|
my $x;
|
|
my $y;
|
|
my $testchar;
|
|
my $newx = "";
|
|
|
|
#
|
|
# Function to take a line of text (in $_[0]), strip out extra
|
|
# linefeeds and such and, if necessary, add linefeeds to give
|
|
# max -l # chars per line. Must also maintain a global (col_position)
|
|
# to make sure that when this function is reentered,
|
|
# we know on what column position we left off last time.
|
|
#
|
|
|
|
|
|
|
|
#
|
|
# Grab title from text, first one that matches, wins.
|
|
#
|
|
|
|
if (($is_pg) && !($pg_title_set)){
|
|
$pg_title = $_[0];
|
|
if (
|
|
$pg_title =~ s/.+?Project Gutenber(g|g's) Etext( | of) (.+?)(by|,|,by|\*|\.).+/$3/i
|
|
)
|
|
|
|
{
|
|
chop $pg_title;
|
|
if ( (length $pg_title) > 31 ) {
|
|
$pg_title = substr($pg_title,0,28) . "...";
|
|
}
|
|
$pg_title_set = 1;
|
|
}
|
|
}
|
|
|
|
#
|
|
# Assign input string to @line, remove ending newlines, split by whitespace
|
|
#
|
|
|
|
chomp;
|
|
@line = split(/\s+/, $_[0]);
|
|
|
|
#
|
|
# Attempt at some formatting logic. If average line size is somewhat over 80,
|
|
# we can safely assume that the file is not formatted, and any linefeeds we
|
|
# find should stay right where they are, since they are probably formatting.
|
|
#
|
|
# If we find the average size is ~ 80 or under, but the linefeed comes somewhat
|
|
# under the average size, we will guess the linefeed stays.
|
|
#
|
|
#
|
|
|
|
if (length($_[0]) > 30) { # Ignore short lines
|
|
$avg_total += length($_[0]);
|
|
$avg = $avg_total / ++$avg_line_num;
|
|
}
|
|
|
|
|
|
#
|
|
# Check each word, strip any whitespace characters, and insert
|
|
# a newline before the word if it would cross the $line_len boundary.
|
|
#
|
|
# Then add the word to the output string.
|
|
#
|
|
# Note that some text may be mangled, if it depends on hard returns for
|
|
# formatting, or double spaces.
|
|
#
|
|
|
|
foreach $x (@line) {
|
|
|
|
if ($x) {
|
|
if ($is_smart) {
|
|
$x =~ s/\s+?|[\000-\011\013-\037]//g;
|
|
}
|
|
# Ixnay spaces, control chars
|
|
# tab/space formatted text will
|
|
# certainly break.
|
|
|
|
#
|
|
# If forcing to a specific line length, check to see if adding the word
|
|
# and space will overflow the specified line length. If so, add newline first
|
|
# and reset the col_position counter.
|
|
#
|
|
|
|
if ( $line_len &&
|
|
(((length $x) + $col_position + 1) > $line_len) ) {
|
|
$line_buff .= "\n";
|
|
$col_position = 0;
|
|
}
|
|
|
|
#
|
|
# Add word + space to output buffer, then increment the column position
|
|
#
|
|
|
|
$line_buff .= $x . " ";
|
|
$col_position += (length $x) + 1;
|
|
|
|
}
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
unless ($is_hyphen_off) {
|
|
$x = length $line_buff;
|
|
$line_buff =~ s/-\s+\Z//; # fix hypen separated words at
|
|
# the end of lines.
|
|
$col_position += $x - (length $line_buff); # Adjust for hypen removal
|
|
}
|
|
|
|
#
|
|
# If the output string contains no words, assume a double spaced line
|
|
# otherwise, replace the final newline.
|
|
#
|
|
|
|
if ( $avg > 85 ) {
|
|
$line_buff .= "\n"; # Preserve linefeeds if
|
|
$col_position = 0; # file appears to
|
|
} # already be stripped.
|
|
|
|
if (($line_buff eq "") && ($col_position != 0)) { # Double space
|
|
$line_buff = "\n\n";
|
|
$col_position = 0;
|
|
|
|
} elsif ($line_buff eq "") { # Single space
|
|
$line_buff = "\n";
|
|
$col_position = 0;
|
|
|
|
#
|
|
# This is some VooDoo that seems to work well. So far.
|
|
#
|
|
# What it does is this: Using average line size information at the top of this
|
|
# function, it assumes that lines that are less than the average -
|
|
# $sformat_offset AND if the current column position (where the linefeed would
|
|
# go) is less than the average - $sformat_offset , it assumes that it is a
|
|
# formatted line, and inserts the linefeed. The further into the file it goes,
|
|
# the more accurate it should be.
|
|
#
|
|
# I can imagine all kinds of places where this would break horribly,
|
|
# but it would break anyway without this bit's help.
|
|
#
|
|
|
|
} elsif ( ((length $line_buff) <= $avg - $sformat_offset ) &&
|
|
($col_position <= $avg - $sformat_offset ) &&
|
|
($avg < 85) &&
|
|
($is_smart) ) { # Assume formatted text
|
|
$line_buff .= "\n";
|
|
$col_position = 0;
|
|
}
|
|
|
|
|
|
return ($line_buff);
|
|
|
|
}
|
|
|
|
|
|
#################################################################
|
|
# #
|
|
# Generate the PDB headers and Record 0 #
|
|
# #
|
|
#################################################################
|
|
|
|
|
|
sub pdb_header {
|
|
|
|
#
|
|
# Local Vars HERE!
|
|
#
|
|
|
|
#
|
|
# Some constants
|
|
#
|
|
|
|
|
|
my $COUNT_BITS = 3;
|
|
my $DISP_BITS = 11;
|
|
my $DOC_CREATOR = "REAd";
|
|
my $DOC_TYPE = "TEXt";
|
|
my $RECORD_SIZE_MAX = 4096; # 4k record size
|
|
my $dmDBNameLength = 32; # 32 chars + 1 null
|
|
|
|
my $pdb_rec_offset; # PDB record offset
|
|
my $header_buff = ""; # Temporary buffer to build the headers in.
|
|
my $x;
|
|
my $y;
|
|
|
|
#
|
|
# PDB header
|
|
#
|
|
# We're going to set some variables and then use 'pack' to put them into a
|
|
# buffer.
|
|
#
|
|
# Here's the format in C (Dword = 4 bytes, Word = 2 bytes)
|
|
#
|
|
#typedef struct { /* 78 bytes total */
|
|
# char name[ dmDBNameLength ];
|
|
# Word attributes;
|
|
# Word version;
|
|
# DWord create_time;
|
|
# DWord modify_time;
|
|
# DWord backup_time;
|
|
# DWord modificationNumber;
|
|
# DWord appInfoID;
|
|
# DWord sortInfoID;
|
|
# char type[4];
|
|
# char creator[4];
|
|
# DWord id_seed;
|
|
# DWord nextRecordList;
|
|
# Word numRecords;
|
|
#} pdb_header;
|
|
|
|
my $pdb_header_size = 78;
|
|
my $pdb_attributes = 0;
|
|
my $pdb_version = 0;
|
|
my $pdb_create_time = 0x11111111; # Palm Desktop demands
|
|
my $pdb_modify_time = 0x11111111; # a timestamp.
|
|
my $pdb_backup_time = 0;
|
|
my $pdb_modificationNumber;
|
|
my $pdb_appInfoID = 0;
|
|
my $pdb_sortInfoID = 0;
|
|
my $pdb_type = $DOC_TYPE;
|
|
my $pdb_creator = $DOC_CREATOR;
|
|
my $pdb_id_seed = 0;
|
|
my $pdb_id_nextRecordList = 0;
|
|
my $pdb_numRecords = (int ($total_len / 4096)) + 2; # +1 for record 0
|
|
# +1 for fractional part
|
|
if ($is_pg || $is_bookmark) { $pdb_numRecords += $bookmark_num; }
|
|
|
|
#
|
|
# Pack that header!
|
|
#
|
|
|
|
#
|
|
# Set $pdb_name to detected name, unless forced using -t.
|
|
#
|
|
|
|
if ( !($title_set) && ($is_pg) && ($pg_title_set)) {
|
|
$pdb_name = $pg_title;
|
|
|
|
}
|
|
|
|
if ($is_verbose) {
|
|
print "Document Title: $pdb_name\n";
|
|
}
|
|
|
|
|
|
my $pdb_header = pack("a32nnNNNNNNa4a4NNn",$pdb_name,$pdb_attributes,
|
|
$pdb_version,$pdb_create_time,
|
|
$pdb_modify_time,$pdb_backup_time,
|
|
$pdb_modificationNumber,$pdb_appInfoID,
|
|
$pdb_sortInfoID,$pdb_type,$pdb_creator,
|
|
$pdb_id_seed,$pdb_id_nextRecordList,
|
|
$pdb_numRecords);
|
|
|
|
|
|
#
|
|
# Sanity check
|
|
#
|
|
|
|
if ( (length $pdb_header) != 78) { die "pdb_header malformed\n"; }
|
|
|
|
#
|
|
# Create the PalmDoc header
|
|
#
|
|
#
|
|
# Here's the format in C
|
|
#
|
|
# struct doc_record0 { /* 16 bytes total */
|
|
# Word version; /* 1 = plain text, 2 = compressed text */
|
|
# Word reserved1;
|
|
# DWord doc_size; /* uncompressed size in bytes */
|
|
# Word num_recs; /* not counting itself */
|
|
# Word rec_size; /* in bytes: usually 4096 (4K) */
|
|
# DWord reserved2;
|
|
# };
|
|
|
|
|
|
|
|
my $doc_header_size = 16;
|
|
my $doc_version = $is_compr + 1; # Compression on by default
|
|
my $reserved1 = 0;
|
|
my $doc_doc_size = $total_len;
|
|
my $doc_rec_size = 4096;
|
|
my $doc_num_recs = (int ($total_len / 4096)) + 1;
|
|
my $doc_reserved2 = 0;
|
|
|
|
#
|
|
# Pack Record 0
|
|
#
|
|
|
|
|
|
my $doc_header = pack("nnNnnN",$doc_version,$reserved1,$doc_doc_size,
|
|
$doc_num_recs,$doc_rec_size,$doc_reserved2);
|
|
|
|
|
|
#
|
|
# Sanity check!
|
|
#
|
|
|
|
if ( (length $doc_header) != 16) { die "doc_header malformed\n"; }
|
|
|
|
#
|
|
# Template for the PDB record headers
|
|
#
|
|
# Docs are REAL fuzzy on this.
|
|
#
|
|
#
|
|
# Format in C
|
|
#
|
|
#struct pdb_rec_header { /* 8 bytes total */
|
|
# DWord offset;
|
|
# struct {
|
|
# int delete : 1;
|
|
# int dirty : 1;
|
|
# int busy : 1;
|
|
# int secret : 1;
|
|
# int category : 4;
|
|
# } attributes;
|
|
# char uniqueID[3];
|
|
#}
|
|
|
|
my $pdb_rec_header_size = 8;
|
|
my $pdb_rec_attributes = 0x40; # We'll fake this, 0x40 = 'dirty'
|
|
my $pdb_rec_uniqueID = 0x3D0; # Simple increment
|
|
|
|
#
|
|
# Since we need to so a bunch of these, we'll use this as a template
|
|
#
|
|
|
|
my $pdb_rec_header_template = "Nccn";
|
|
|
|
|
|
#
|
|
# Generate and write headers
|
|
#
|
|
#
|
|
# PDB record headers are generated and placed at the head of the file.
|
|
# The number of headers required is Total_File_Bytes / 4096 + 1
|
|
# The +1 being for the fractional part left over.
|
|
#
|
|
# Someone could have documented this better. :)
|
|
#
|
|
# For the record, the file format is:
|
|
#
|
|
# PDB Header (78 bytes)
|
|
# PDB Record Headers (8 bytes)
|
|
# . . .
|
|
# . . . Repeat N + B + 1 times, where N is # of 4096K blocks
|
|
# . . . The +1 is for record 0 (DOC header)
|
|
# . . . B = # of bookmarks
|
|
# (DB Records)
|
|
# 0x0 0x0 Two NULLS
|
|
# Record 0 (PalmDoc Header)
|
|
# Text
|
|
# . . .
|
|
# . . .
|
|
# . . .
|
|
# Optional Bookmark records
|
|
# . . .
|
|
# EOF
|
|
#
|
|
#
|
|
|
|
$pdb_rec_offset = $pdb_header_size +
|
|
(($pdb_numRecords)* $pdb_rec_header_size) + 2;
|
|
|
|
#
|
|
# Write PDB header, and PDB rec header for record 0
|
|
#
|
|
|
|
$header_buff = $pdb_header . pack($pdb_rec_header_template,
|
|
$pdb_rec_offset, $pdb_rec_attributes,
|
|
"a",$pdb_rec_uniqueID );
|
|
$pdb_rec_offset += $doc_header_size; # Add offset for doc_header
|
|
|
|
if ($is_pg || $is_bookmark) { $pdb_numRecords -= $bookmark_num;}
|
|
|
|
for ($x = 0; $x < $pdb_numRecords - 1; $x++) {
|
|
# -1 for rec 0 header added above
|
|
|
|
#
|
|
# If we aren't compressing, every other block besides 0 is guarenteed to be
|
|
# $RECORD_SIZE_MAX
|
|
#
|
|
if (! $is_compr && $x > 0 )
|
|
{ $block_size[$x] = $RECORD_SIZE_MAX; }
|
|
|
|
$pdb_rec_offset += $block_size[$x];
|
|
++$pdb_rec_uniqueID;
|
|
$header_buff .= pack($pdb_rec_header_template,$pdb_rec_offset,
|
|
$pdb_rec_attributes,"a",$pdb_rec_uniqueID);
|
|
}
|
|
|
|
#
|
|
# Write optional bookmark pdb headers
|
|
#
|
|
|
|
if (($is_pg || $is_bookmark) && $bookmark_num) {
|
|
|
|
if ($is_compr){ # Find the end of the text
|
|
$pdb_rec_offset += $block_size[$x];
|
|
} else {
|
|
$pdb_rec_offset += $total_len % 4096;
|
|
}
|
|
for ($y = 0; $y < $bookmark_num; $y++) {
|
|
|
|
$pdb_rec_uniqueID += 10;
|
|
$header_buff .= pack($pdb_rec_header_template,$pdb_rec_offset,
|
|
$pdb_rec_attributes,"a",$pdb_rec_uniqueID);
|
|
$pdb_rec_offset += 20; # Bookmarks are 20 bytes.
|
|
}
|
|
}
|
|
|
|
#
|
|
# Write 2 NULLS
|
|
#
|
|
|
|
$header_buff .= 0x00 . 0x00;
|
|
|
|
# Write Record 0
|
|
|
|
$header_buff .= $doc_header;
|
|
|
|
|
|
|
|
return ($header_buff);
|
|
|
|
|
|
}
|
|
|
|
|
|
#################################################################
|
|
# #
|
|
# Compress the text #
|
|
# #
|
|
#################################################################
|
|
|
|
sub compr_text {
|
|
|
|
|
|
#
|
|
#
|
|
# Compresses text with the PalmDoc compression scheme.
|
|
#
|
|
# Requires:
|
|
# $_[0], which contains the entire text to be compressed.
|
|
#
|
|
# Returns: $compr_buff, which contains the compressed text.
|
|
# global @block_size, Array that contains the length of each
|
|
# compressed block.
|
|
# 'scalar(@block_size)' should be = to $pdb_numRecords
|
|
|
|
#
|
|
# Local Vars HERE!
|
|
#
|
|
|
|
my $total_compr_size = 0; # Final compressed text size
|
|
my $compr_buff = ""; # Temporary output buffer
|
|
my $numrecords = (int($total_len / 4096) +1); # Number of blocks to compress.
|
|
my $x;
|
|
my $y;
|
|
my $block_offset;
|
|
my $block; # Contains the current 4096 byte block of text
|
|
my $block_len; # Length of current block
|
|
my $index; # Current scan position in block
|
|
my $byte; # Char at index (for space + char compression)
|
|
my $byte2; # Char at index+1
|
|
my $test; # Potentially compressible text for
|
|
# LZ77 compression.
|
|
|
|
my $frag_size; # Current size of above
|
|
my $frag_size2; # Spare for lazy byte compression
|
|
my $test2; # spare for above
|
|
my $test3; # second spare
|
|
my $pos; # Position (in $block) of reference text
|
|
# for $test
|
|
# to compress against.
|
|
|
|
my $pos2; # spare for above
|
|
my $pos3; # second spare
|
|
my $back; # $index - pos
|
|
my $mask; # Bitwise mask to do LZ77 'magic'
|
|
my $compr_ratio; # Compression ratio
|
|
my $done;
|
|
my $comp_block_offset = 0; # The $compr_buff index
|
|
# block begins.
|
|
my $FRAG_MAX = 10; # Max LZ77 fragment size
|
|
my $FRAG_MIN = 3; # Min LZ77 fragment size
|
|
my $LAZY_BYTE_FRAG = $FRAG_MAX + $FRAG_MIN - 1;
|
|
|
|
|
|
$block_size[0] = 0; # Record 0 is already written and
|
|
# is not compressed.
|
|
|
|
|
|
for ($x = 1; $x <= $numrecords; $x++) {
|
|
|
|
$block_offset = ($x - 1) * 4096;
|
|
$block = substr($_[0],$block_offset, 4096);
|
|
if ($x >= $numrecords) { # Last block
|
|
$block = substr($block,0,($total_len % 4096));
|
|
|
|
}
|
|
|
|
$block_len = length($block);
|
|
|
|
#
|
|
# Tricky PalmDoc compression scheme. Here's the overview:
|
|
#
|
|
# Given a compressed stream, read a byte.
|
|
# The byte will lie in the following zones:
|
|
# 0 represents itself
|
|
# 1...8 type A command; read the next n bytes
|
|
# 9...7F represents itself
|
|
# 80..BF type B command; read one more byte
|
|
# C0..FF type C command; represent "space + char"
|
|
#
|
|
#
|
|
# Sooo. If we just write ASCII text, it will fall within 9..7F or 0 (NULL).
|
|
# No worries.
|
|
#
|
|
# If we write 1...8, the next n bytes will be taken as verbatim. This is
|
|
# used to mask high byte characters, like accents. I'm not a-using them
|
|
# at this point. High byte characters get stripped in the text processing
|
|
# function.
|
|
#
|
|
# If we write C0..FF, it will be treated as a space + character.
|
|
# Write the space, then xOR 0x80, should work.
|
|
#
|
|
# 80..BF is tricky. A 16 bit number is written:
|
|
# Throw away offset bits to copy (+3)
|
|
# 0 0|0 0 0 0 0 0 0 0 0 0 0|0 0 0
|
|
#
|
|
# So. To encode we keep an index of where we currently are in the file,
|
|
# and constantly check 3-10 char fragments from $index+frag_size against
|
|
# the text in $index - 2047 of a 4096 byte block, which contains the
|
|
# uncompressed text.
|
|
#
|
|
# If we find a match, we generate the above gobblygook, (that is, place the
|
|
# offset into a packed INT (2 bytes), shift it 3 places, then place the number
|
|
# of bits to copy from the offset in the lower three bits of the INT) place
|
|
# it in the compressed buffer, increment the index accordingly (# of bits
|
|
# compressed), and go from there.
|
|
# Whee.
|
|
#
|
|
|
|
$index = 0;
|
|
|
|
#
|
|
# Compression loop
|
|
#
|
|
|
|
|
|
while ( $index < $block_len ) {
|
|
|
|
|
|
|
|
#
|
|
# Type 'A', Escape high bytes
|
|
#
|
|
$byte = substr($block,$index,1); # Char at $index
|
|
if ($byte =~ /[\200-\377]/) { # is high bit set?
|
|
|
|
$y = 1; # found at least one!
|
|
|
|
#
|
|
# Loop to find out how many concurrent high bit characters, max 8
|
|
#
|
|
while ( (substr($block,$index + ($y + 1),1) =~
|
|
/[\200-\377]/) &&
|
|
($y < 8) ) {
|
|
|
|
++$y; # If found, increment counter
|
|
|
|
}
|
|
|
|
$compr_buff .= chr($y); # Write escape code
|
|
$compr_buff .= substr($block,$index,$y); # Write text
|
|
$index += $y; # Increment the index
|
|
|
|
} else { # Real compression routines
|
|
|
|
#
|
|
# Type 'B', simple LZ77 compression
|
|
#
|
|
$frag_size = $FRAG_MIN; # We don't care about anything less
|
|
|
|
$test = substr($block,$index,$frag_size); # pull the current fragment
|
|
$pos = rindex($block, $test, $index - 1); # check against the buffer
|
|
|
|
|
|
#
|
|
# There's a sliding window of 2047 bytes that we can pull reference
|
|
# characters from.
|
|
#
|
|
|
|
if ( ($pos > 0) &&
|
|
($index - $pos <= 2047) && # Inside our 2047 byte window
|
|
( $index < $block_len - $frag_size) ) {
|
|
|
|
# # Found a match!
|
|
# looking for bigger fragments
|
|
#
|
|
for ($y = 4; $y <= $FRAG_MAX; $y++ ) {
|
|
++$frag_size ;
|
|
$test2 = substr($block,$index,$frag_size);
|
|
$pos2 = rindex($block, $test2, $index - 1);
|
|
if (($pos2 > 0) &&
|
|
($index - $pos2 <= 2047) &&
|
|
($index < $block_len - $frag_size) ) {
|
|
# found a match!
|
|
$pos = $pos2;
|
|
$test = $test2;
|
|
} else { # no match, go back
|
|
--$frag_size;
|
|
last;
|
|
|
|
}
|
|
|
|
}
|
|
# Sanity check
|
|
if ($frag_size > $FRAG_MAX)
|
|
{ die "frag_size too big!!!: $frag_size\n"; }
|
|
|
|
|
|
#
|
|
# Now look for an even better match starting at the next position.
|
|
# This is known as 'lazy matching'.
|
|
#
|
|
|
|
|
|
# NOTE: Why is ($STD_FRAG_MAX + $STD_FRAG_MIN - 1) so magic?
|
|
# Let's pretend that we are currently at index 1001, looking for matches.
|
|
# The longest match we can find for the text starting at 1001 has a length of 3.
|
|
# If the longest match we can find for the text starting at 1002 has a length of
|
|
# 10, then obviously we get better compression by sending the byte at 1001 out
|
|
# as a literal and encoding the match found at 1002. But if the longest match
|
|
# for the text starting at 1002 has a length of 12 ($STD_FRAG_MAX + $STD_FRAG_MIN - 1,
|
|
# for the PalmDoc spec) then we can encode the match we find for the text at 1001
|
|
# and *still* have a match of length 10 for the text starting at 1004.
|
|
|
|
|
|
$frag_size2 = $frag_size + 2;
|
|
$test2 = substr($block,$index + 1, $frag_size2);
|
|
$pos2 = rindex($block, $test2, $index - 1);
|
|
if (($pos2 > 0) &&
|
|
($index - $pos2 <= 2047) &&
|
|
($index < $block_len - $frag_size2) ) {
|
|
# found a match
|
|
|
|
for ($y = $frag_size2;$y <= $LAZY_BYTE_FRAG;
|
|
$y++ ) { # Look for more
|
|
++$frag_size2;
|
|
$test2 = substr($block,$index + 1, $frag_size2);
|
|
$pos2 = rindex($block, $test2, $index - 1);
|
|
if (($pos2 > 0) &&
|
|
($index - $pos2 <= 2047) &&
|
|
($index < $block_len - $frag_size2) ) {
|
|
# found a match!
|
|
|
|
} else { # no match, go back
|
|
--$frag_size2;
|
|
last;
|
|
|
|
}
|
|
}
|
|
if ($frag_size2 < $LAZY_BYTE_FRAG) {
|
|
|
|
#
|
|
# Lazy byte found; write byte to output and abort compression round
|
|
#
|
|
$pos = 0;
|
|
$compr_buff .= substr($block,$index,1);
|
|
++$index;
|
|
}
|
|
}
|
|
|
|
if ($pos > 0) { # Did we abort the compression?
|
|
|
|
|
|
#
|
|
# Figure out how far to reach back into the buffer, and create OR mask
|
|
# that sets the high bit and indicates how big the compressed fragment is.
|
|
#
|
|
$back = $index - $pos;
|
|
$mask = 0x8000 | int($frag_size - 3);
|
|
|
|
#
|
|
# This line does all the magic; munge and add to output buffer
|
|
#
|
|
$compr_buff .= pack("n",int($back << 3) | $mask);
|
|
$index += $frag_size;
|
|
}
|
|
|
|
} else {
|
|
|
|
|
|
|
|
#
|
|
# Type 'C', Space + Char compress
|
|
#
|
|
$byte = substr($block,$index,1); # Char at $index
|
|
$byte2 = substr($block,$index + 1,1); # next char as well
|
|
if ( ($byte eq " ") &&
|
|
($byte2 =~ /[\100-\176]/ ) &&
|
|
($index <= $block_len - 1)) {
|
|
# Got a space + char
|
|
|
|
# Set the high bit
|
|
# and add to output
|
|
# buffer.
|
|
$compr_buff .= pack("c", ord ($byte2) | 0x80 );
|
|
$index += 2; # Compressed 2 bytes
|
|
|
|
} else {
|
|
$compr_buff .= $byte; # No compression
|
|
++$index;
|
|
}
|
|
}
|
|
}
|
|
}
|
|
|
|
|
|
#
|
|
# Check for errors in the compression routine then move the counter that
|
|
# identifies where the compressed representation of the most recently handled
|
|
# block starts. Turn on by setting $error_check to '1'
|
|
#
|
|
|
|
if ($error_check) {
|
|
check_comp($block, substr($compr_buff, $comp_block_offset));
|
|
$comp_block_offset = length($compr_buff);
|
|
|
|
}
|
|
|
|
if ( $is_verbose ) {
|
|
$| = 1; # Flush output buffers
|
|
$done = int(($x / ((length $_[0]) / 4096)) * 100);
|
|
if ($done > 100) {$done = 100;}
|
|
print "\rBlock: $x\tComplete: $done%";
|
|
}
|
|
|
|
#
|
|
# Calculate compressed block sizes, and the total compressed size of the file
|
|
#
|
|
|
|
$block_size[$x] = (length ($compr_buff)) - $total_compr_size;
|
|
$total_compr_size = length ($compr_buff);
|
|
|
|
if ( $is_verbose ) {
|
|
$done = int(($block_size[$x] / $block_len) * 100);
|
|
print "\tCompressed: $done%";
|
|
}
|
|
|
|
$| = 0; # Flush buffers off
|
|
|
|
}
|
|
|
|
#
|
|
# And one linefeed for Ra....
|
|
#
|
|
|
|
if ($is_verbose) { print "\n"; }
|
|
|
|
|
|
#
|
|
# Print some useless information
|
|
#
|
|
|
|
if ($is_verbose ) {
|
|
$compr_ratio = ($total_compr_size / $total_len) * 100 ;
|
|
print "Original Size: $total_len\tCompressed Size: $total_compr_size\t";
|
|
printf ("Reduced: %.2f%\n", $compr_ratio);
|
|
}
|
|
|
|
|
|
return ($compr_buff);
|
|
|
|
}
|
|
|
|
#################################################################
|
|
# #
|
|
# Generate Bookmark Headers #
|
|
# #
|
|
#################################################################
|
|
|
|
sub bookmark_rec {
|
|
|
|
|
|
#
|
|
# For now, we are only going to find the end of Gutenberg Project "Fine Print"
|
|
# text and set it as a bookmark.
|
|
#
|
|
|
|
#
|
|
# Local Vars HERE
|
|
#
|
|
|
|
#my $book_pg = "*END*THE SMALL PRINT!";
|
|
my $book_pg = $_[1];
|
|
my $book_name = "Bookmark $bookmark_num"; # Default bookmark name
|
|
|
|
if ($_[2]) { $book_name = $_[2];} # Bookmark name was passed to function.
|
|
|
|
my $book_pos = $_[3]; # Offset from start of text to place bm
|
|
my $book_header_size = 20; # Size of Bookmark header
|
|
my $book_buff = ""; # Output buffer
|
|
|
|
unless ($book_pos) { # If bookmark position not passed
|
|
$book_pos = (index($_[0],$book_pg)) + 1; # Index starts at 0, DOC readers 1
|
|
}
|
|
|
|
#
|
|
# Make sure the bookmark name is 15 chars or less
|
|
#
|
|
|
|
if (length $book_name > 15) {$book_name = substr($book_name,0,12) . "...";}
|
|
|
|
if ($book_pos > 0) {
|
|
$book_buff = pack("a16N",$book_name,$book_pos);
|
|
++$bookmark_num;
|
|
return ($book_buff);
|
|
} else {
|
|
return (""); # No bookmark
|
|
}
|
|
|
|
}
|
|
|
|
|
|
#################################################################
|
|
# #
|
|
# Sanitize filename entries #
|
|
# #
|
|
#################################################################
|
|
|
|
|
|
sub sanitize {
|
|
|
|
#
|
|
# Do various checks on filename entries. Strip control characters, substitute
|
|
# underscores for most forms of punctuation.
|
|
#
|
|
# Recieves filename or path + filename to process, whether is it a input file
|
|
# or output file, and returns the sanitized version.
|
|
#
|
|
#
|
|
|
|
#
|
|
# Local vars HERE
|
|
#
|
|
chomp; # Just to be safe;
|
|
|
|
my $filename = $_ = $_[0];
|
|
my $io = $_[1];
|
|
my $junk;
|
|
my $path = $filename;
|
|
|
|
#
|
|
# If input file, all we care about is that the file exists, is a text file
|
|
# and readable. For the output file, we want to sanitize the filename,
|
|
# and make sure the destination directory is writable.
|
|
#
|
|
|
|
if ($is_evil) { return ($_) } # MS OS. Ack! Game over! No sanity for you!
|
|
|
|
if ($io =~ /in/i) { # Input file
|
|
if ($filename && $filename ne "-" ) { # and not null or "-"
|
|
unless ( -e $filename && -r $filename )
|
|
{ die "Input file IO error: $filename $!\n";}
|
|
} else { # is null
|
|
$_ = "-"; # stdin
|
|
}
|
|
|
|
} elsif ($io =~ /out/i) { # Output file
|
|
if ($filename) { # and not null
|
|
$junk = eval "tr#\-/.a-zA-Z0-9#_#cs";
|
|
if (m#/#) { # contains a path.
|
|
$path =~ s#^(.*/).*#$1#; # Strip filename from path
|
|
unless (-w $path)
|
|
{ die "Output file IO error: Output directory unwritable\n";}
|
|
}
|
|
unless ( (!(-e $filename)) || -w $filename ) # Not exist or writable
|
|
{ die "Output file IO error: Output file unwritable\n";}
|
|
} else { # is null
|
|
$_ = ">-"; # stdout
|
|
}
|
|
} else { # Shouldn't get here.
|
|
die "Error in sanitize function\n";
|
|
}
|
|
|
|
|
|
return ($_);
|
|
|
|
}
|
|
|
|
#################################################################
|
|
# #
|
|
# Find Bookmarks #
|
|
# #
|
|
#################################################################
|
|
|
|
sub find_bookmarks {
|
|
|
|
my $pg_bookmark = "*END*THE SMALL PRINT!";
|
|
my $pg_bookmark_name = "Text Begins";
|
|
my $bookmark_rec = "";
|
|
|
|
if ($is_pg) {
|
|
|
|
#
|
|
# Set 'start of text' bookmark
|
|
#
|
|
$bookmark_rec .= bookmark_rec($_[0],$pg_bookmark,$pg_bookmark_name);
|
|
|
|
#
|
|
# Find and set chapter bookmarks
|
|
#
|
|
while ($_[0] =~ /\n((?:chapter|chaptre).*?)\s*?\n/gi ) {
|
|
|
|
if ($is_verbose) {
|
|
print "Bookmark: $1\t\tOffset: " . pos($_[0]) . "\n";
|
|
}
|
|
$bookmark_rec .= bookmark_rec($_[0],"$1","$1",pos($_[0]) -
|
|
length($1));
|
|
}
|
|
|
|
}
|
|
|
|
if ($is_bookmark) {
|
|
|
|
while ($_[0] =~ /\n<(.+?)>/g ) {
|
|
|
|
if ($is_verbose) {
|
|
print "Bookmark: $1\n";
|
|
}
|
|
$bookmark_rec .= bookmark_rec($_[0],"$1","$1");
|
|
}
|
|
$_[0] =~ s/\n<(.+?)>//g;
|
|
|
|
|
|
}
|
|
|
|
return ($bookmark_rec);
|
|
|
|
}
|
|
|
|
|
|
#################################################################
|
|
# #
|
|
# Compression Error Checking #
|
|
# #
|
|
#################################################################
|
|
|
|
|
|
sub check_comp ($$) {
|
|
|
|
#
|
|
# Compares the original block to one that's been compressed and decompressed
|
|
# and reports any places where they differ.
|
|
#
|
|
# Requires:
|
|
# $original_block, the formatted block that was originally sent
|
|
# to be compressed. Passed to the subroutine as a parameter
|
|
#
|
|
# $comp_block, the compressed version of the block
|
|
# Passed to the subroutine as a parameter
|
|
#
|
|
# Returns: Nothing. Output from this routine goes to standard output.
|
|
#
|
|
#
|
|
#
|
|
|
|
#
|
|
#
|
|
# Local Vars HERE!
|
|
#
|
|
|
|
my $original_block = $_[0];
|
|
my $comp_block = $_[1];
|
|
my $roundtrip_block = ""; # buffer for decompressed text.
|
|
my $comp_index = 0; # index for start of next element in $comp_block
|
|
my $element; # element read from the compressed data stream
|
|
my $bytes_added = 0; # the number of bytes added to the output
|
|
|
|
my $pair_var; # integer used to hold the two-byte packed pair.
|
|
my $offset; # used if B compression is encountered.
|
|
my $length; # used if B compression is encountered.
|
|
|
|
my $i; # simple loop variable
|
|
|
|
|
|
while ($comp_index < length($comp_block)) {
|
|
$element = substr($comp_block, $comp_index, 1);
|
|
|
|
#
|
|
# decompress the next element:
|
|
#
|
|
if ((ord($element) == 0x00) || # Literal byte range
|
|
((ord($element) >= 0x09) &&
|
|
(ord($element) <= 0x7F))) {
|
|
|
|
#
|
|
# output the literal byte.
|
|
#
|
|
|
|
$roundtrip_block .= $element;
|
|
$bytes_added = 1;
|
|
$comp_index += 1;
|
|
|
|
} elsif ((ord($element) >= 0x01) && # 'A' (escaped) code range
|
|
(ord($element) <= 0x08)) {
|
|
|
|
#
|
|
# Copy next $element bytes literally. (shouldn't happen at this point)
|
|
#
|
|
|
|
$roundtrip_block .= substr($comp_block, $comp_index + 1, ord($element));
|
|
$bytes_added = ord($element);
|
|
$comp_index += (1 + $bytes_added);
|
|
|
|
} elsif ((ord($element) >= 0x80) && # 'B' (LZ77) code range
|
|
(ord($element) <= 0xBF)) {
|
|
|
|
#
|
|
# read the next byte and copy the offset, length pair if it's a B code.
|
|
#
|
|
|
|
$pair_var = ((ord($element)) << 8) +
|
|
ord(substr($comp_block, ($comp_index + 1), 1));
|
|
$offset = ($pair_var >> 3) & 0x7FF;
|
|
$length = ($pair_var & 0x07) + 3;
|
|
|
|
#
|
|
# sanity checks
|
|
#
|
|
|
|
if (($offset <= 0) or ($offset > 2047)) { # out of window error
|
|
die "offset is " . $offset . " at index " .
|
|
(length($roundtrip_block)). "!!!\n";
|
|
}
|
|
|
|
if (($length < 3) or ($length > 10)) { # too few/too many
|
|
die "length is " . $length . " at index " . # bytes to copy error
|
|
(length($roundtrip_block)) . "!!!\n";
|
|
}
|
|
|
|
if ((length($roundtrip_block) - $offset) < 0) { # read before start
|
|
# of block error
|
|
|
|
die "offset " . $offset . " goes beyond beginning of block!!!\n"; }
|
|
#
|
|
# This last one would really be better if a meaningful representation of
|
|
# *where* in the file/block the offensive offset occurs could be included.
|
|
#
|
|
|
|
for ($i = 1; $i <= $length; $i++) {
|
|
$roundtrip_block .= substr($roundtrip_block,
|
|
(length($roundtrip_block) - $offset), 1);
|
|
}
|
|
|
|
$bytes_added = $length;
|
|
$comp_index += 2;
|
|
|
|
|
|
} elsif ((ord($element) >= 0xC0) && # 'C' (space + char) code range
|
|
(ord($element) <= 0xFF)) {
|
|
|
|
#
|
|
# output the space + character
|
|
#
|
|
|
|
$roundtrip_block .= " ";
|
|
$roundtrip_block .= chr(ord($element) & 0x7F);
|
|
$bytes_added = 2;
|
|
$comp_index += 1;
|
|
}
|
|
|
|
|
|
} # end while
|
|
|
|
if ( $roundtrip_block ne $original_block) {
|
|
die "Compressed text does not match original\n";
|
|
}
|
|
|
|
} # end of check_comp
|