old-www/LDP/LGNET/bin/lg-process-mailbag

#!/usr/bin/perl -w
# Created by Ben Okopnik on Wed Oct 25 18:09:09 EDT 2006
# Processes an mbox, creates the LG mailbag articles; run without args for full documentation
use strict;
$|++;
# use CGI::Carp qw/fatalsToBrowser warningsToBrowser/;
use CGI qw/:standard/;

############## USER_CONFIG SECTION ##########################
# The author[s] credited for the Mailbag, etc. The value is written to the
# "author:" line at the top of the output file; when it is empty, no "By ..."
# byline is added to the TWDT page.
my $author = "";

# Where to write the KnowledgeBase entries (built from the LG_ROOT
# environment variable)
my $tag_kb = "$ENV{LG_ROOT}/data/kb";

# Chop off thread on main page if it's longer than this number of characters
my $cutoff = 3500;

# The tags to use for delimiting various types of content: text between the
# 'raw' tags is passed through unprocessed, and text between the 'private'
# tags is replaced by an "elided" marker.
my $raw_start = '[RAW]';
my $raw_stop = '[/RAW]';
my $private_start = '[PRIVATE]';
my $private_stop = '[/PRIVATE]';

# Three-character subject prefixes (used with a ':' immediately after
# them), their related weights (i.e., their order in the main page), and
# their section headings. Threads without a known prefix get a weight of
# 100 (see 'weight', below) and therefore sort after all of these.
my %sections = (
	gaz	=>	[ 0, "Gazette Matters" ],
	sts	=>	[ 1, "Still Searching" ],
	tkb	=>	[ 2, "Talkbacks"       ],
	tct	=>	[ 3, "2-Cent Tips"     ],
	gem	=>	[ 4, "LG Gems"         ],
);

# Titles (HTML and heading) generated for specified output filenames. The
# default is "Mailbag". A title given on the command line takes precedence
# over this table.
my %titles = (
	'lg_talkback.html'		=>	"Talkback",
	'lg_talkback2.html'		=>	"Talkback2",
	'lg_talkback3.html'		=>	"Talkback3",
	'lg_talkback4.html'		=>	"Talkback4",
	'lg_tips.html'			=>	"2-Cent Tips",
	'lg_gems.html'			=>	"Selected Gems from Our Mailbag",
	'lg_laundrette.html'	=>	"The Linux Launderette",
	'lg_launderette.html'	=>	"The Linux Launderette",
);

############## USER_CONFIG SECTION ##########################

#####################################################
# This code is a really ugly hack, and I must have  #
# been smoking really cheap crack when I wrote it. 	#
# It is, however, useful.                           #
#####################################################

# Nifty little procedure: show the docs if the args are wrong. The list form
# of 'exec' hands the script path straight to perldoc, bypassing the shell.
exec "/usr/bin/perldoc", $0 unless @ARGV && -f $ARGV[0];
# Get the input filename
my $in_file = shift @ARGV;
die "'$in_file' does not exist!\n" unless -f $in_file;
# Ask file(1) what the input looks like. The list-form pipe open keeps odd
# characters in the filename away from the shell; '-b' drops the filename
# from the answer so that only the description is tested.
my $file_type = do {
	open my $file_cmd, '-|', '/usr/bin/file', '-b', '--', $in_file
		or die "Cannot run /usr/bin/file: $!\n";
	local $/;
	my $answer = <$file_cmd>;
	close $file_cmd;
	defined $answer ? $answer : "";
};
die "'$in_file' is not a valid mailbox!\n"
	unless $file_type =~ /mail text/;
# Get the output filename if one is given. (A 'my' declaration with a
# statement modifier has undefined behavior, hence the ternary.)
my $fname = $ARGV[0] ? shift @ARGV : undef;
# Fail if that filename doesn't have a ".html" extension
die "The filename must have a '.html' extension!\n"
	if $fname && $fname !~ /\.html$/;
# If an optional title is supplied, use it.
my $title = $ARGV[0] ? shift @ARGV : undef;

# If no output filename is given, make it the default one
$fname ||= "lg_mail.html";
if ( ! $title ){
	# If the output filename matches one of those defined in the %titles hash,
	# above, set the $title variable accordingly (it'll be used for the title
	# and the headings later.)
	$title = $titles{$fname} if exists $titles{$fname};
	# If no match exists, set the title to a default one.
	$title ||= "Mailbag";
}

# Names of the HTML character entities for the characters numbered 160
# through 255, listed in character-number order
my @ent = qw(
	nbsp iexcl cent pound curren yen brvbar sect uml copy ordf laquo not shy
	reg macr deg plusmn sup2 sup3 acute micro para middot cedil sup1 ordm raquo
	frac14 frac12 frac34 iquest Agrave Aacute Acirc Atilde Auml Aring AElig
	Ccedil Egrave Eacute Ecirc Euml Igrave Iacute Icirc Iuml ETH Ntilde Ograve
	Oacute Ocirc Otilde Ouml times Oslash Ugrave Uacute Ucirc Uuml Yacute THORN
	szlig agrave aacute acirc atilde auml aring aelig ccedil egrave eacute
	ecirc euml igrave iacute icirc iuml eth ntilde ograve oacute ocirc otilde
	ouml divide oslash ugrave uacute ucirc uuml yacute thorn yuml
);
# Walk the list once, pairing each name with its character to build the
# 'chr => entity' lookup table
my %entity;
for my $offset ( 0 .. 95 ){
	my $char = chr( 160 + $offset );
	$entity{$char} = "&" . $ent[$offset] . ";";
}
# The three characters that are special to HTML itself go in as well
$entity{'<'} = '&lt;';
$entity{'>'} = '&gt;';
$entity{'&'} = '&amp;';

# This sub does all the text processing. Individual email bodies are fed to
# it, and it processes them as described in the comments below.
#
# Takes a single argument - the text to convert (build() also runs the
# sender line through it) - and returns that text as an HTML fragment.
# If the start and stop tags of a PRIVATE or RAW pair do not occur the same
# number of times, an error message is printed and the whole program is
# terminated with 'exit'.
sub cleanup {
	my $body = shift;
	# Get rid of all preceding whitespace; replace all following whitespace with a single "\n"
	$body =~ s/\A\s*(.*?)\s*\Z/$1\n/s;

	# Remove anything that was explicitly marked as private
	# (count the start and stop tags first; they have to pair up)
	my $ps = () = $body =~ /(\Q$private_start\E)/g;
	my $pe = () = $body =~ /(\Q$private_stop\E)/g;
	unless ($ps == $pe){
		print "\nERROR: Unequal number of 'PRIVATE' tags in the mail body.\n\n";
		exit;
	}
	# Tags that sit on a line of their own lose the surrounding blanks and
	# the newline that follows them; a single newline is left in front
	$body =~ s/(?:^|\n)[ 	]*\Q$private_start\E[ 	]*\n/\n$private_start/g;
	$body =~ s/\n[ 	]*\Q$private_stop\E[ 	]*(?:\n|$)/\n$private_stop/g;
	# Swap each tagged stretch (tags included) for the "elided" marker
	$body =~ s/\Q$private_start\E.*?\Q$private_stop\E/ [[[Elided content]]] /gsm;

	# Same balance check and tag tidying for the RAW tags; the content
	# between them is kept
	my $rs = () = $body =~ /(\Q$raw_start\E)/g;
	my $re = () = $body =~ /(\Q$raw_stop\E)/g;
	unless ($rs == $re){
		print "\nERROR: Unequal number of 'RAW' tags in the mail body.\n\n";
		exit;
	}
	$body =~ s/(?:^|\n)[ 	]*\Q$raw_start\E[ 	]*\n/\n$raw_start/g;
	$body =~ s/\n[ 	]*\Q$raw_stop\E[ 	]*(?:\n|$)/\n$raw_stop/g;

	# Beginning of the biggest 's///' statement I've ever written :)))
	# It matches every stretch of text that lies outside the RAW sections
	# and replaces it with the result of the code below. The RAW tags are
	# part of the match, so they vanish from the output, while the text
	# between them is never matched and stays as it was.
	$body =~ s%(?:^|\Q$raw_stop\E)(.+?)(?:\Q$raw_start\E|$)%
		my $body1 = $1;

		# Dump the TAG footer
		$body1 =~ s#^\+-\+---+\+-\+\nYou've asked.*?mailman/listinfo/tag\n*##gsm;
		# Get rid of any whitespace preceding and following the body
		# NOTE(review): the pattern is not anchored and the capture is
		# non-greedy, so as written this only strips leading whitespace.
		$body1 =~ s/\s*(.*?)\s*/$1/s;
		# Convert all necessary characters to entities
		### WHOOPS... this breaks Unicode characters. Must rethink.
		# $body1 =~ s/([\240-\377<>&])/$entity{$1}/eg;
		$body1 =~ s/([<>&])/$entity{$1}/eg;
		# Get rid of the "--- next part ---" junk
		$body1 =~ s/-{5,} next part -{5,}$//gsmi;
		# Toss the 'Attachment was scrubbed' stanza added by Mailman
		$body1 =~ s/^A[^\n]+ was scrubbed\.\.\.\n.*?U[Rr][Ll]\s?:[^\n]+\n//gsm;
		# Convert the 'tweaked' "From"s at the beginning of the line so they
		# don't automatically become <pre>s
		$body1 =~ s/^(?:>|&gt;)From/ From/gsm;
		# Convert .sigs to <pre> - they usually use some kind of layout
		# (only a "-- " block that runs to the end of this stretch of text
		# is matched)
		# $body1 =~ s#^-- ?\n(([^\n]+\n){1,8})([^\n]*\n){0,5}$#(defined$1&&defined$3)?"<pre>-- \n$1</pre>$3":"$&"#egsm;
		$body1 =~ s#^(-- \n([^\n]+\n){0,9}([^\n]*\n?)?)\Z#(defined$1)?"<pre>$1\n</pre>":"$&"#sme;
		# Convert cited email headers to <pre>
		$body1 =~ s#^(((Date|From|To|Subject|Newsgroups|User-Agent|X-Rcpt-To|X-Country|X-UIDL|X-Bogosity|X-Mas|Message-ID|Reply-To|CC|Cc):[^\n]+\n)+)#<pre>\n$1</pre>#gsm;

		# Tricky bit here: since we're going to replace multiple newlines with
		# <p>s, and _don't_ want any of those in <pre> blocks, we're going to
		# insert a _space_ within any TAG-special "<pre>" blocks (i.e.,
		# anything delimited by "``/''" or "```/'''".
		# $flag is true while we are inside such a block.
		my ($flag, $tmp_body);
		for ( split /\n/, $body1 ){
			# Opening marker: three backticks select the 'code' class
			if (/^(```?)\s*$/ && ! $flag){
				$tmp_body .= length($1) == 3 ? "<pre class='code'>\n" : "<pre>\n";
				$flag = 1;
				next;
			}
			# Closing marker
			if (/^'''?\s*$/ && $flag){
				$tmp_body .= "</pre>\n";
				$flag = 0;
				next;
			}
			# An empty line inside the block becomes a line holding one space
			if ($flag && /^$/){
				$tmp_body .= " \n";
			}
			else {
				$tmp_body .= "$_\n";
			}
		}
		$body1 = $tmp_body if $tmp_body;

		 my($tmp1, $tmp2);
		# Convert all simple (single-string) _em_s and _e_m_s
		# (underscores inside the word are turned into spaces)
		# $body1 =~ s{(^|\s)_(\S+)_($|\s|\.|,)}{$1<em>$2</em>$3}gsm;
		$body1 =~ s{(^|\s)_(\S+)_($|\s|\.|,|;)}{($tmp1 = $2) =~ tr|_| |; "$1<em>$tmp1</em>$3"}egsm;
		# Convert all simple (single-string) *strong*s and *str*ongs*
		# (asterisks inside the word are turned into spaces)
		# $body1 =~ s{(^|\s)\*(\S+)\*($|\s|\.|,|;)}{$1<strong>$2</strong>$3}gsm;
		$body1 =~ s{(^|\s)\*(\S+)\*($|\s|\.|,|;)}{($tmp2 = $2) =~ tr|*| |; "$1<strong>$tmp2</strong>$3"}egsm;

		# Hotlink the URLs
		# (link text longer than 85 characters is shortened to its first and
		# last 40 characters with "[...]" in between; the href keeps the
		# full URL)
		my $tmp3;
		# $body1 =~ s#(?<!href=)['"]?(http://[-a-zA-Z0-9_/~&?=.]+)(['"\)\]]?)#<a href="$1">$1</a>$2#gsm;
		$body1 =~ s#(?<!href=)(['"]?)((?:https?|ftp)://[^'"\]\s\n]+[^'"\)\]\s\n,.&?-])#
			$tmp3 = length($2) > 85 ? substr($2, 0, 40) . "[...]" . substr($2, -40) : $2;
			"$1<a href='$2'>$tmp3</a>"#egsm;
		# LG addresses get special treatment (relative paths within LG)
		$body1 =~ s#(a href=["'])http://linuxgazette.net/([^'"\n]+)#$1../$2#g;

		# Replace text smilies with an image; preserve text as 'alt' tag
		$body1 =~ s#(:\)|:-\)|:&gt;|:-&gt;)#<img src="../gx/smile.png" alt="$1">#g;
		# Replace the frownies as well
		$body1 =~ s#(:\(|:-\(|:&lt;|:-&lt;)#<img src="../gx/frown.png" alt="$1">#g;

		# Find any runs of '>'s at line start (quoted email) and <pre> them
		$body1 =~ s#(^&gt;.*\n)+#\n<pre>\n$&</pre>\n#gm;

		# The value of this last expression is what replaces the match
		$body1;
	%seg;

	# Find any number of repeated newlines and enclose in </p> and <p>, and...
	$body =~ s#(\n{2,})#\n</p>$1<p>\n#gsm;
	# ...wrap the entire body in <p> and </p>.
	$body =~ s#.*#<p>\n$&\n</p>\n#s;

	# Insert editorial comments
	# (text between the '@#$' and '$#@' markers)
	$body =~ s|\@\#\$(.*?)\$\#\@|<p class="editorial">\n[[[ $1 ]]]\n</p>\n|gsm;

	# Clean up overzealous <p> markup
	$body =~ s#<p>\s*\n<pre>#<pre>#gsm;
	$body =~ s#</pre>\s*\n</p>#</pre>#gsm;

	# Special processing for René!
	# (several encoded spellings of the name are replaced with the entity form)
	$body =~ s/=\?iso-8859-15\?Q\?Ren=E9\?=|Ren=E9|Ren=C3=A9|Ren=C3=E9|RenÃ©|René|Ren\xe9/Ren&eacute;/gsm;

	# Return the much-massaged body
	$body;
}

# Create the 'misc/lg' subdir if it doesn't exist. The list form of 'system'
# runs mkdir without a shell; a failed child is reported through its wait
# status in $?, since $! is not set by a non-zero exit.
unless ( -d "misc/lg" ){
	system( "/bin/mkdir", "-p", "misc/lg" ) == 0
		or die "mkdir -p misc/lg failed (wait status $?)\n";
}

# Build up the data structure that contains the processed mailbox. These
# variables are shared between the mailbox-reading loop and build(); %files
# maps each thread's flattened subject to the list of its formatted posts.
my ($subj, $reflink, $iss, $link, $reftitle, $x, $fn, $str, $from, $date,
	$body, $last, $seen, %files);
# Formats the message currently held in the file-level variables ($subj,
# $from, $date, $body) and appends it to the thread stored in $files{$fn}.
# Takes no arguments and returns nothing useful; when it is done, the
# per-message variables are reset to empty strings.
sub build {
	# Clear the old content of the special Talkback header
	undef $reflink;
	# If it is a Talkback, parse the necessary info out of the subject
	# if ( $subj =~ m%^tkb:\s*Talkback:\s*((?:issue)?\d+)(/\S+)% ){
	if ( $subj =~ m%^tkb:\s*Talkback:\s*((?:issue)?\d+)(/[^#]+)% ){
		# Copy the captures right away; later matches would overwrite them
		my ( $issue_dir, $path ) = ( $1, $2 );
		$iss = $issue_dir;
		$link = "$issue_dir$path";
		# Use the LG-local path as the title unless the article itself
		# provides a better one. Setting it here also makes sure that a
		# title left over from an earlier Talkback is never reused.
		$reftitle = $path;
		# Read the article title from the article itself (SVN working copy)
		# if it exists.
		my $article = "$ENV{LG_ARTICLES}/$link";
		if ( -f $article ){
			open my $article_fh, '<', $article or die "$article: $!\n";
			while ( my $line = <$article_fh> ){
				# Non-greedy capture, so trailing whitespace is left out
				if ( $line =~ /^title:\s*(.*?)\s*$/ ){
					$reftitle = $1;
					last;
				}
			}
			close $article_fh;
		}
		# Create the Talkback header
		$reflink = qq@[ In reference to "<a href='../$link'>$reftitle</a>" in LG#$iss ]@;
	}

	# Remove the marker tags from the subject before using it in the output
	for ( keys %sections ){
		$subj =~ s/^$_:\s*//i;
	}
	# If we're at the top of the first post in the thread, append
	# the subject and the Talkback header (if it exists)
	unless ( exists $files{$fn} ){
		$str = h3("$subj")."\n";
		$str .= p(b($reflink)) if $reflink;
	}

	# Sender and date go into a paragraph of their own, followed by the
	# converted message body
	$str .= p("\n".b(cleanup($from)).br."\n".b($date)."\n")."\n\n".cleanup($body);
	# Add the formatted post as an element in an arrayref pointed
	# to by its (defanged) subject in the '%files' hash
	push @{$files{$fn}}, $str;
	# Clear out all the variables used in processing this post
	$str = $subj = $from = $date = $body = "";
}

# Email addresses of everybody who posted; checked against the LG roster later
my %gang;

# It's sort of odd to explicitly open a filename that's already in @ARGV...
# but if I wanted to do anything else, I'd have to do CLI switch
# processing, and Oh, Mother.
# (Three-argument open: the name is used literally, whatever it contains.)
open my $mbox_fh, '<', $in_file or die "$in_file: $!\n";

while ( <$mbox_fh> ){
	# Convert to Unix format by removing DOS carriage returns
	y/\r//d;
	# Header section: from the mbox "From " separator to the first blank line
	if ( /^From / .. /^$/ ){
		# Get all the participants' email addresses for later checking
		# against the LG roster
		$gang{$1}++ if /^From:.*?([a-zA-Z0-9_.+-]+\@[a-zA-Z0-9_.+-]+)/;

		# Once the body of the email has been built up, process it. This
		# happens on the first header line of the following message, while
		# the previous message's subject, sender and date are still set.
		build() if $body;

		chomp;
		# Collapse repeated whitespace
		y/ \t/ /s;
		# "Collect" any multi-line subjects into a single string. The
		# marker stays set while continuation lines keep coming, so a
		# subject folded over three or more lines is collected in full;
		# the empty line that ends the headers never counts as one.
		my $continues = $last && $last eq "subj" && length($_) && $_ !~ /^\S+:/;
		$subj .= $_ if $continues;

		$last = $continues ? "subj" : "";
		$seen = 1;
		# Normalize the 'From:' line to Mailbag standards
		if ( /^From:\s*(.*) (\S+)\@(\S+)\s*$/ ){
			$from = "$1 [$2 at $3]";
			$from =~ tr/<>"//d;
			$from =~ s/'(.*)'/$1/g;
		}
		# Ditto the subject
		if ( s/^Subject:\s*// ){
			s/\[TAG\]\s*//;
			s/(?:re:|fw:|fwd:|forw:|balasan:|aw:|[\(\[](?:re|fw|fwd|forw|balasan|aw)[\)\]])\s*//ig;
			$subj = $_;
			$last = "subj";
		}
		# Get the date
		$date = $1 if /^Date:\s*(.*?)\s*$/;
		next;
	}
	else {
		if ( $seen ){
			# Flip the flag
			$seen = 0;
			# Create a unique filename from the subject
			( $fn = $subj ) =~ y/A-Za-z0-9:/_/cs;
			$fn =~ s/^_*(.*?)_*$/$1/;
		}
		# Collect the non-header lines
		$body .= $_;
	}
}
close $mbox_fh;

# Add last post
build();

############### Output section ####################
# Build the header / define the title
# Returns the opening part of a thread page as one string: the doctype,
# the <head> section titled with the first argument, the logo, Tux and
# motto markup, the opening tag of the content <div>, and a "top" anchor.
sub lg_header {
	my ($page_title) = @_;

	# The hand-written head section (used in place of CGI's start_html)
	my @head = (
		"<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.0 Transitional//EN'\n",
		"		 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'>\n",
		"<html xmlns='http://www.w3.org/1999/xhtml' lang='utf-8' xml:lang='utf-8'>\n",
		"<head>\n",
		"<title>$page_title</title>\n",
		"<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />\n",
		"<link rel='stylesheet' type='text/css' href='../../../lg.css' />\n",
		"</head>\n",
		"<body>\n",
	);

	# The page banner: linked logo, Tux, and the motto
	my $logo = a({href=>"../../../"},img({src=>"../../../gx/2003/newlogo-blank-200-gold2.jpg",id=>"logo",alt=>"Linux Gazette"}));
	my $tux = img({src=>"../../../gx/tux_86x95_indexed.png",id=>"tux",alt=>"Tux"});
	my $motto = p({id=>"fun"},"...making Linux just a little more fun!");

	# Target of the "Top" links printed after each post
	my $top_anchor = a({name=>"top"},"");

	return join "", @head, $logo, $tux, $motto,
		"<div class='content articlecontent'>", $top_anchor;
}

# $issue is the current issue number; %seen records which section headings
# have already been printed.
my ( $issue, %seen );
# Get the current issue number from the first 'currentIssueNumber' line of
# lgconfig.py
{
	my $config = "$ENV{LG_LIBPYTHON}/lgconfig.py";
	open my $config_fh, '<', $config or die "lgconfig.py: $!\n";
	while ( my $line = <$config_fh> ){
		if ( $line =~ /^currentIssueNumber.*?(\d+)/ ){
			$issue = $1;
			last;
		}
	}
	close $config_fh;
}
# Every output file name below contains the issue number, so there is no
# point in going on without it.
die "lgconfig.py: no 'currentIssueNumber' setting found\n" unless defined $issue;

my $barename = $fname;
$barename =~ s/\.html$//;
# Create the "flat content" page for TWDT and the main output page. The
# three-argument form of 'open' treats the names literally, so nothing in
# a user-supplied file name can be taken for an open mode.
my $twdt_name = "TWDT.lg_answer$issue-$barename.html";
open Twdt, '>', $twdt_name or die "$twdt_name: $!\n";
open Fp, '>', $fname or die "$fname: $!\n";

# The main page starts with an "author:/title:" header block; the TWDT
# page gets the title as a heading.
print Fp "author: $author\ntitle: $title\n\n";
print Twdt h2($title), "\n";

# If the author is defined, get their name from the 'name:' line of their
# bio file and add a byline to the TWDT page
if ( $author ){
	my $author_file = "$ENV{LG_ROOT}/authors/$author";
	my $name;
	if ( open my $author_fh, '<', $author_file ){
		while ( my $line = <$author_fh> ){
			if ( $line =~ /^name:\s*(.*)$/ ){
				$name = $1;
				last;
			}
		}
		close $author_fh;
	}
	else {
		# Not fatal: the byline can still be printed without the bio
		warn "$author_file: $!\n";
	}
	# Without a readable 'name:' line, credit the author by ID rather
	# than printing an undefined value
	$name = $author unless defined $name;

	print Twdt p( "By ", a({href=>"../authors/$author.html"}, $name ) );
}

# Process the participants' bios and extract the names
if ( $title =~ /Mailbag/ ){
	my %bios;
	# Do they have an LG bio? Read the "key: value" header (everything up
	# to the first empty line) of every file in the authors directory.
	for my $bio_file ( glob "$ENV{LG_ROOT}/authors/*" ){
		open my $bio_fh, '<', $bio_file or die "$bio_file: $!\n";
		while ( my $line = <$bio_fh> ){
			chomp $line;
			last if $line =~ /^$/;
			# Split at the first separator only, so that a value which
			# itself contains a colon is kept whole
			my ( $k, $v ) = split /: ?/, $line, 2;
			if ( $v ){
				$v =~ s/\s*$//;
				$bios{$bio_file}{$k} = $v;
			}
		}
		close $bio_fh;
	}

	# Build list of LG participants: everybody whose 'privateEmail' bio
	# entry matches one of the addresses seen in the mailbox. Both lists
	# are walked in sorted order to keep the result the same on every run.
	my @took_part;
	for my $email ( sort keys %gang ){
		for my $bio_file ( sort keys %bios ){
			my $bio = $bios{$bio_file};
			push @took_part, $bio->{name}
				if $bio->{privateEmail} && $bio->{privateEmail} eq $email;
		}
	}

	# The same credits go to both the main page and the TWDT page
	my @credits = (
		h3( "This month's answers created by:" ),
		strong( "[ ", join( ", ", @took_part ), " ]" ),
		"\n", br, "...and you, our readers!", br,
		hr({size=>"3", width=>"50%", align=>"center"}),
	);
	for my $out ( \*Fp, \*Twdt ){
		print {$out} @credits;
	}
}

# Add editorial commentary if a "-head" file exists
if ( -f "$in_file-head" ){
	# Read the note once and keep it: a filehandle can only be read through
	# a single time, so looping over it once per output file would leave
	# the second file without the note.
	open my $note_fh, '<', "$in_file-head" or die "$in_file-head: $!\n";
	my @note = <$note_fh>;
	close $note_fh;

	# Heading, note, and rule go to both the main page and the TWDT page
	for my $out ( \*Fp, \*Twdt ){
		print {$out} h2("Editor's Note"), @note,
			hr({size=>"3", width=>"50%", align=>"center"});
	}
}

# Sorting weight of a thread key, used to order the categories on the page
# (see %sections at the top for the relative weights). A key that begins
# with a known three-character prefix and a ':' gets that section's weight;
# any other key gets 100.
sub weight {
	my ($key) = @_;

	# The prefix plus its colon take up the first four characters
	my $prefix = substr $key, 0, 4;
	return 100 unless $prefix =~ s/:$//;

	my $tag = lc $prefix;
	return $sections{$tag}->[0] if exists $sections{$tag};
	return 100;
}

# Create/overwrite the KnowledgeBase link file
open my $kb_fh, '>', "$tag_kb/$issue-$fname" or die "$issue-$fname: $!\n";

# Walk the threads in section order. Within a section the threads are
# ordered by name, which keeps the output identical from run to run (the
# order in which a hash returns its keys is not).
for my $key ( sort { weight($a) <=> weight($b) or $a cmp $b } keys %files ){
	my $sec = $key =~ /^(...):/ ? lc $1 : "";
	# Flatten the key to create the link name
	( my $lnk = $key ) =~ s/^...:_*//;
	$lnk = lc $lnk;
	$lnk =~ tr/a-z0-9_/_/sc;
	$lnk =~ s/^_*(.*?)_*(?:html)?_*$/$1/;

	# Print the section header just once. The "already printed" marker is
	# tested a single time and the heading then goes to both outputs;
	# testing it once per output would always skip the second one.
	my $heading;
	if ( exists $sections{ $sec } ){
		# Only print the section headers if they are part of the Mailbag
		# page
		$heading = $sections{ $sec }->[1]
			if $title eq "Mailbag" && ! $seen{ $sec }++;
	}
	elsif ( $fname eq "lg_mail.html" ){
		$heading = "Our Mailbag" unless $seen{mailbag}++;
	}
	if ( defined $heading ){
		for my $out ( \*Fp, \*Twdt ){
			print {$out} h1($heading), "\n", hr, "\n";
		}
	}

	# Extract the subject
	( my $subject = $files{ $key }->[0] ) =~ s#^.*?<h3>([^<]+)</h3>.*#$1#s;
	# Create unique anchor for thread
	print Fp "\n<!-- Thread anchor: $subject --><a name='$lnk'></a>\n";
	print Twdt "\n<!-- Thread anchor: $subject --><a name='$lnk'></a>\n";
	print {$kb_fh} "<span class='issuenum'>[ LG #$issue ]</span> <a href='../$issue/$fname#$lnk'>$subject</a><br>\n";

	my $first_post = $files{ $key }->[0];
	my $shortie;
	# Chop off the first post if it's too long: cut at the last "</p>" that
	# starts at or before the cutoff. If there is no such marker ('rindex'
	# returns -1), the post is printed whole instead of being cut at a
	# meaningless position.
	my $cut_at = length($first_post) > $cutoff
		? rindex( $first_post, "</p>", $cutoff )
		: -1;
	if ( $cut_at > 0 ){
		$shortie = substr $first_post, 0, $cut_at;
		print Fp $shortie, p("\n[ ... ]\n");
	}
	else {
		print Fp $first_post, "\n";
	}

	# The flat page gets every post of the thread
	for ( @{ $files{ $key } } ){
		print Twdt $_, br;
	}

	# A shortened first post or a thread with replies gets a page of its own
	if ( $shortie || @{ $files{ $key } } > 1 ){
		if (-f "misc/lg/$lnk.html"){
			print "\nERROR: while processing '$in_file', I ran across a thread named '$lnk' that was longer\n" .
					 "than the defined thread cutoff length ($cutoff characters.) This would normally result in\n" .
					 "the creation of a file called 'misc/lg/$lnk.html', but this file ALREADY EXISTS.\n\n" .
					 "Since there's no way for me to tell whether this is a result of an error (e.g., threads with\n" .
					 "conflicting names from different mailboxes) or an accidental second run of this program on a\n" .
					 "given mailbox, I have to give up and turn this over to a human.\n\n" .
					 "To resolve the first problem, I suggest locating and renaming the thread in one of the mailboxes.\n" .
					 "To resolve the second one, simply delete the 'misc/' directory and reprocess all the mailboxes.\n\n";
			exit;
		}
		my $tnum = @{ $files{ $key } };
		my $s = $tnum == 1 ? "" : "s";
		my $tlen = sprintf "%.2f", length("@{ $files{ $key } }") / 1024;
		print Fp p( b("[ ", a({name=>"mb-$lnk"},""),
			a({href=>"misc/lg/$lnk.html"},"Thread continues here ($tnum message$s/${tlen}kB)"), " ]" ) ), "\n";

		open my $thread_fh, '>', "misc/lg/$lnk.html" or die "misc/lg/$lnk.html: $!\n";
		print {$thread_fh} lg_header($subject);
		for ( @{ $files{ $key } } ){
			# Fix the LG relative links in the thread (the thread page
			# lives two directories further down than the main page)
			s#(a href=['"])(\.\./)#$1$2$2$2#gsm;

			print {$thread_fh} $_, br,
			a({href=>"#top"},"Top"), "&nbsp;" x 4,
			a({href=>"../../$fname#mb-$lnk"},"Back"),
			hr({width=>"50%", align=>"left"}), p(br);
		}

		print {$thread_fh} "</div>", end_html;
		close $thread_fh;
	}

	print Fp hr, "\n\n";
	print Twdt hr, "\n\n";

}

close Fp;
close $kb_fh;
close Twdt;

# Make the KB file accessible to everyone
chmod 0666, "$tag_kb/$issue-$fname"
	or warn "chmod $tag_kb/$issue-$fname: $!\n";

=head1 NAME

lg-process-mailbag - Processes the LG mailbox for publication

=head1 SYNOPSIS

lg-process-mailbag <mailbox_name> [output_filename.html] [title]

NOTE: The exact output filename is significant, since it is used by the
program to create the HTML title and the page heading if it matches a
pattern. The equivalence table currently looks like this (it may be
expanded later):

 Filename              Title/header
 ========              ============
 'lg_talkback.html'	=>	"Talkback",
 'lg_talkback2.html'	=>	"Talkback2",
 'lg_talkback3.html'	=>	"Talkback3",
 'lg_talkback4.html'	=>	"Talkback4",
 'lg_tips.html'		=>	"2-Cent Tips",
 'lg_gems.html'		=>	"Selected Gems from Our Mailbag",
 'lg_laundrette.html'	=>	"The Linux Launderette",
 'lg_launderette.html'	=>	"The Linux Launderette",

If no filename is specified, the output will be sent to 'lg_mail.html', and
the title/header will default to "Mailbag".  An optional title can also be
specified as the last argument (after the filename); if it contains anything
other than an unbroken string of alphanumeric characters, the entire title
should be quoted.

If a file with the same name as the input file plus a "-head" extension exists
in the current directory, the content of that file will be inserted as
editorial commentary below an "Editor's Note" header. The insert will be
positioned below the title and credits but above the processed content. The
content of the '-head' file should be HTML-formatted but should not contain an
HTML header or footer.

'lg-process-mailbag' also creates a file containing a list of links, one per
topic, for later insertion into the KnowledgeBase; the file name consists of
the current issue number followed by a "-" and the name of the current output
file. It is saved in a directory defined in the user-configurable section at
the top of the script ('$LG_ROOT/data/kb/' by default.)

A "TWDT.lg_answer<ISSUE_NUMBER>-<FILENAME>.html" is also created. These are
"flat" (non-threaded) representations of mailbag content, and are intended to
be concatenated (assuming there's more than one) into
"TWDT.lg_answer<ISSUE_NUMBER>.html", which should be copied into
$LG_ROOT/data/twdt, where it will be read by our publication scripts and
inserted into the TWDT during processing.  [NOTE: Currently, this process
isn't usable - the build scripts need to be hacked to ignore all the varieties
of "lg_mail.html" while building the issue. For now, simply ignore the
resulting "TWDT" file.]


=head1 DESCRIPTION

'lg-process-mailbag' is the LG Mailbag processing script, which also handles
'Gazette Matters', 'Still Searching' (unanswered questions), and other
sections. Its output is a page containing all the first posts for each thread
in TAG (those that are too long are chopped off at the last paragraph marker
before the character count defined in '$cutoff' at the top of the script)
which are followed by links to a file comprising the rest of the thread (if
any.) It's made to be as automated as possible in order to minimize
hand-hacking; since humans will be humans, however, some twiddling is usually
required. :)


=head1 USAGE

1) Open the TAG mail archive in Mutt and delete all the repeated and
"uninteresting" messages (spam, broken messages from 'bogus', etc.)

2) Tag (using 't' for single messages, or 'esc-t' for threads) all messages
that don't belong in the mailbag (e.g.  Talkbacks, Launderette, etc.) and
save them into distinctively-named mailboxes with the ';s' (tagged-save)
command. The 'l' (limit) key can also be very helpful in selecting the
messages that fit a pattern.

3) All messages with identical 'Subject:' lines will be grouped into
individual threads by the script; therefore, messages are added to or
removed from threads by changing their subject lines (but see the
"Tags" subsection in "EDITING", below.)

4) Quit Mutt and run the script to build the Mailbag page and the related
files in '$PWD/misc/lg/'.

5) Review the produced HTML file and fix any problems you discover by
editing the mailbox directly (don't edit individual messages from within
Mutt; this changes the position of the email within the file and thus its
position within the thread.) Please see the EDITING section, below, for
specific editing tips and tricks.

6) Take a look at the files linked from within the Mailbag page (the links
are at the bottom of each post that exceeds '$cutoff' in length) and fix
any really serious problems. The idea here is to pluck the low-hanging
fruit; as long as the content is easily readable, small discrepancies don't
matter.

7) When you're done, move the output file and the 'misc' subdirectory into
$LG_ARTICLES/[current_issue]/, then copy the original mbox to
$LG_ROOT/data/tag/<issue_num> (you'll need to create the 'issue_num'
directory) and check everything into the repository with 'svn ci'.

=head1 EDITING

=head2 Tags

Any email that is marked with a special tag will be placed in the section
specified by that tag. Tags consist of a three-letter abbreviation and a
colon at the start of the subject, as follows:

 gaz:  		Gazette Matters
 sts:  		Still Searching
 tkb:  		Talkbacks
 tct:  		2-Cent Tips
 gem:		LG Gems

(e.g. 'Subject: sts: Looking for moonbat weasels in Cleveland'.)

Tag case is not significant, and the tags will be removed from the subject
during processing. The sections in the generated Mailbag page are arranged
in the above order; any email which does not have a tag will go into the
main ("Mailbag") section, which comes last.

=head2 Formatting Problems

Most of the HTML conversion is done by the program, which is fairly smart.
However, as with any automated gadget, sometimes it's going to get things wrong.

=over 1

=item *
Unformatted code

The most common problem is code that gets all wrapped into one (or
more) lines instead of being nicely formatted. The reason is that it
wasn't properly delimited in the original email (i.e., with either a
pair or a trio of backticks and single quotes - see the Members FAQ,
http://linuxgazette.net/faq/members-faq.html#markup for exact details.)

FIX: Edit the original mail and insert the appropriate delimiters
before and after the code.

=item *
Collapsed lists

If someone has a neat little list of items in their mail, it is also
going to get wrapped. However, converting it to preformatted code
(essentially using <pre></pre> tags, as in the above example) is
usually inappropriate due to the font used in <pre>s.

FIX: Enter a blank line between the individual list items. They will
now become separate paragraphs, which looks just fine.

=item *
Huge amounts of whitespace in preformatted items

This is something like the opposite of the last problem; what is
happening is that in preformatted items other than the standard
backtick/single quote delimited code (e.g., quoted email which is
denoted by '>'s at the start of the line), multiple blank lines
are still converted to paragraphs. In general, this shouldn't happen -
the greatest majority of this is handled by the script - but there's
still a small chance of it.

FIX: Insert a single space at the beginning of each blank line within
the quoted material. The '</p><p>' pairs only get inserted into
successive runs of newlines (returns), and adding a space makes it not
match the search pattern anymore. _Voila,_ no more Antarctica
(wide-open expanses of white.)

=back

Most other layout problems can be cured by making the text preformatted -
i.e., by wrapping it in backtick/single quote sets.


=head1 SPECIFIC FEATURES

As mentioned earlier, the script is fairly smart about processing mail
text. Here are some of its built-in features; it's important to recognize
them and to know how they work, since the easiest way to fix problems is
often a matter of adjusting the content to follow the rules that it should
have followed in the first place.

=over 1

=item *
[PRIVATE]...[/PRIVATE] clause

Anything delimited with the above tags will be replaced with a
'[[[Elided content]]]' string. Not that TAG is a private list, but having
the option is nice.

=item *
[RAW]...[/RAW] clause

Anything delimited with these tags is "protected" from processing and will
not be modified by the script. Particularly useful for any HTML entities or
tags that you want to preserve as they stand (i.e., that you want to be used
as HTML tags rather than displayed as content.)

=item *
The TAG footer is automatically removed

This footer is the standard block of text that is appended to all TAG mail;
it is defined as starting with '+-+--- [...] ---+-+' and ending with
"http://lists.linuxgazette.net/mailman/listinfo/tag", with several lines of
text between the two.

=item *
Non-HTML-parseable characters are converted to entities

'<', '>', and '&' are converted to their HTML equivalents. (Conversion of
the characters numbered 160 to 255 is currently disabled in the script,
since it breaks Unicode text.)

=item *
Mailman's "Attachment was scrubbed" messages are removed

These are defined as the above phrase plus a URL where the attachment is
stored.

=item *
All signature blocks are converted to preformatted text

A signature block is a line consisting of two dashes, a space, and a
newline followed by any number of non-blank lines. If you see a signature
block getting wrapped into a one-line mess, just insert the '-- ' delimiter
immediately above it.

=item *
Cited email headers are preformatted

Cited email headers (i.e., a run of lines beginning with any of
"Date|From|To|Subject|Newsgroups|User-Agent|X-Rcpt-To|X-Country|X-UIDL|X-Bogosity|X-Mas|Message-ID|Reply-To|CC|Cc")
will be converted to formatted text. If they don't start at the beginning
of the line, they will not be converted - so formatting them is often a
matter of removing the preceding whitespace.

=item *
Quoted email text is preformatted

Text preceded by '>'s at the beginning of the line will have its layout
preserved.

=item *
All "TAG markup" will be appropriately converted

Any blocks of text preceded by two backticks and followed by two single
quotes will be turned into formatted text; three of each will mark it as
'<pre class="code"></pre>' (formatted in a nice colored box - preferred for
actual code examples.) Make sure that these markers start at the beginning
of the line, and that nothing else follows them on that line.

=item *
"Email enhancements" are appropriately applied

Single words in *asterisks* will be made bold; those in _underscores_ will
be made italic. Note that words in /slashes/ will be ignored although they
are commonly used in email: treating them specially would screw up file paths...

=item *
URLs are hotlinked

Anything beginning with 'http://', 'https://', or 'ftp://' is treated as a
URL and is automatically converted to a hotlink; link text longer than 85
characters is shortened in the middle. LG URLs are turned into hotlinks with
URLs pointing to the local file structure.

=item *
Smilies are replaced with images (and use the text as an 'alt' link); ditto
the 'frownies'.

':)', ':-)', ':>', and ':->' will point to 'smile.png'.
':(', ':-(', ':<', and ':-<' will point to 'frown.png'.

=item *
Blank lines delimit paragraphs

=item *
Editorial comments

If you wish to comment on something in the text, feel free: start with a
line consisting of '@#$', insert whatever you want to write, and finish it
with a line consisting of '$#@'. Your insert will become an editorial comment.

=back

=head1 AUTHOR

Ben Okopnik (ben@linuxgazette.net)

=cut