837 lines
30 KiB
Perl
Executable File
837 lines
30 KiB
Perl
Executable File
#!/usr/bin/perl -w
# Created by Ben Okopnik on Wed Oct 25 18:09:09 EDT 2006
#
# Processes an mbox and creates the LG mailbag articles; run without
# arguments for the full documentation (the POD at the end of this file).

use strict;

# Turn on autoflush for STDOUT
$| = 1;

# use CGI::Carp qw/fatalsToBrowser warningsToBrowser/;
use CGI qw/:standard/;

############## USER_CONFIG SECTION ##########################

# The author[s] credited for the Mailbag, etc.
my $author = '';

# Where to write the KnowledgeBase entries
my $tag_kb = $ENV{LG_ROOT} . '/data/kb';

# Chop off thread on main page if it's longer than this number of characters
my $cutoff = 3_500;

# The tags to use for delimiting various types of content
my ( $raw_start,     $raw_stop )     = ( '[RAW]',     '[/RAW]' );
my ( $private_start, $private_stop ) = ( '[PRIVATE]', '[/PRIVATE]' );

# Three-character subject prefixes (used with a ':' immediately after
# them) and their section headings, listed in the order in which the
# sections appear on the main page.
my @section_order = (
    [ gaz => 'Gazette Matters' ],
    [ sts => 'Still Searching' ],
    [ tkb => 'Talkbacks' ],
    [ tct => '2-Cent Tips' ],
    [ gem => 'LG Gems' ],
);

# prefix => [ weight, heading ]; the weight is simply the position in
# @section_order.
my %sections = map {
    ( $section_order[$_][0] => [ $_, $section_order[$_][1] ] )
} 0 .. $#section_order;

# Titles (HTML and heading) generated for specified output filenames. The
# default is "Mailbag".
my %titles = (
    'lg_talkback.html' => 'Talkback',
    ( map { ( "lg_talkback$_.html" => "Talkback$_" ) } 2 .. 4 ),
    'lg_tips.html' => '2-Cent Tips',
    'lg_gems.html' => 'Selected Gems from Our Mailbag',

    # Both spellings of the filename are accepted
    'lg_laundrette.html'  => 'The Linux Launderette',
    'lg_launderette.html' => 'The Linux Launderette',
);

############## USER_CONFIG SECTION ##########################
|
|
|
|
#####################################################
|
|
# This code is a really ugly hack, and I must have #
|
|
# been smoking really cheap crack when I wrote it. #
|
|
# It is, however, useful. #
|
|
#####################################################
|
|
|
|
# Command-line handling:  <mailbox> [output_filename.html] [title]
#
# Nifty little procedure: show the docs if the args are wrong. exec() is
# given a list so that the script path never passes through a shell.
unless ( @ARGV && -f $ARGV[0] ) {
    exec '/usr/bin/perldoc', $0;

    # Only reached if the exec itself failed
    die "Cannot run /usr/bin/perldoc: $!\n";
}

# Get the input filename (known to be an existing plain file at this point)
my $in_file = shift;

# Ask file(1) whether this looks like a mailbox. The list form of the pipe
# open bypasses the shell, so names containing spaces or shell
# metacharacters are handled correctly; '--' protects names that begin
# with a dash.
my $file_report = do {
    open my $file_cmd, '-|', '/usr/bin/file', '--', $in_file
        or die "Cannot run /usr/bin/file: $!\n";
    local $/;
    <$file_cmd>;
};
die "'$in_file' is not a valid mailbox!\n"
    unless defined $file_report && $file_report =~ /mail text/;

# Get the output filename if one is given. (A plain 'shift' yields undef
# when nothing is left; 'my $x = shift if COND' has undefined behaviour.)
my $fname = shift;

# Fail if that filename doesn't have a ".html" extension
die "The filename must have a '.html' extension!\n"
    if $fname && $fname !~ /\.html$/;

# If an optional title is supplied, use it.
my $title = shift;

# If no output filename is given, make it the default one
$fname ||= "lg_mail.html";

if ( !$title ) {
    # If the output filename matches one of those defined in the %titles
    # hash, above, set the $title variable accordingly (it'll be used for
    # the title and the headings later.)
    $title = $titles{$fname} if exists $titles{$fname};

    # If no match exists, set the title to a default one.
    $title ||= "Mailbag";
}
|
|
|
|
# Define all HTML entities (the named entities for code points 160 .. 255,
# in code-point order)
my @ent =
    qw/
    nbsp iexcl cent pound curren yen brvbar sect uml copy ordf laquo not shy
    reg macr deg plusmn sup2 sup3 acute micro para middot cedil sup1 ordm raquo
    frac14 frac12 frac34 iquest Agrave Aacute Acirc Atilde Auml Aring AElig
    Ccedil Egrave Eacute Ecirc Euml Igrave Iacute Icirc Iuml ETH Ntilde Ograve
    Oacute Ocirc Otilde Ouml times Oslash Ugrave Uacute Ucirc Uuml Yacute THORN
    szlig agrave aacute acirc atilde auml aring aelig ccedil egrave eacute
    ecirc euml igrave iacute icirc iuml eth ntilde ograve oacute ocirc otilde
    ouml divide oslash ugrave uacute ucirc uuml yacute thorn yuml
    /;

# Build the 'chr => entity' equivalence list using the above
my %entity;
$entity{chr $_} = "&" . $ent[$_ - 160] . ";" for 160 .. 255;

# Add the 'special' characters. These MUST map to the entity names: mapping
# each character to itself would make the escaping step a no-op and let raw
# markup from the mail through into the generated HTML.
@entity{ split //, "<>&" } = qw/&lt; &gt; &amp;/;

# Return true if $text contains as many $open tags as $close tags.
sub _tags_balanced {
    my ( $text, $open, $close ) = @_;
    my $n_open  = () = $text =~ /\Q$open\E/g;
    my $n_close = () = $text =~ /\Q$close\E/g;
    return $n_open == $n_close;
}

# Convert one stretch of mail text that lies outside any [RAW] section into
# HTML and return it. The order of the steps matters: '<', '>' and '&' are
# escaped early, so every later pattern that looks for '>' or '<' has to
# look for '&gt;' or '&lt;' instead.
sub _cleanup_segment {
    my ($text) = @_;

    # Dump the TAG footer
    $text =~ s#^\+-\+---+\+-\+\nYou've asked.*?mailman/listinfo/tag\n*##gsm;

    # Get rid of any whitespace preceding the text (trailing whitespace is
    # normalized by the line-by-line pass further down)
    $text =~ s/\A\s+//;

    # Convert all necessary characters to entities
    ### WHOOPS... converting 160-255 breaks Unicode characters. Must rethink.
    # $text =~ s/([\240-\377<>&])/$entity{$1}/eg;
    $text =~ s/([<>&])/$entity{$1}/g;

    # Get rid of the "--- next part ---" junk
    $text =~ s/-{5,} next part -{5,}$//gsmi;

    # Toss the 'Attachment was scrubbed' stanza added by Mailman
    $text =~ s/^A[^\n]+ was scrubbed\.\.\.\n.*?U[Rr][Ll]\s?:[^\n]+\n//gsm;

    # Convert the 'tweaked' "From"s at the beginning of the line so they
    # don't automatically become <pre>s (the '>' is an entity by now)
    $text =~ s/^&gt;From/ From/gm;

    # Convert .sigs to <pre> - they usually use some kind of layout. The
    # .sig has to run to the end of the text.
    $text =~ s#^(-- \n([^\n]+\n){0,9}([^\n]*\n?)?)\Z#<pre>$1\n</pre>#sm;

    # Convert cited email headers to <pre>
    $text =~ s#^(((Date|From|To|Subject|Newsgroups|User-Agent|X-Rcpt-To|X-Country|X-UIDL|X-Bogosity|X-Mas|Message-ID|Reply-To|CC|Cc):[^\n]+\n)+)#<pre>\n$1</pre>#gsm;

    # Tricky bit here: since we're going to replace multiple newlines with
    # <p>s, and _don't_ want any of those in <pre> blocks, we're going to
    # insert a _space_ within any TAG-special "<pre>" blocks (i.e.,
    # anything delimited by "``/''" or "```/'''".
    my ( $in_pre, $rebuilt );
    for my $line ( split /\n/, $text ) {
        if ( !$in_pre && $line =~ /^(```?)\s*$/ ) {
            $rebuilt .= length($1) == 3 ? "<pre class='code'>\n" : "<pre>\n";
            $in_pre = 1;
            next;
        }
        if ( $in_pre && $line =~ /^'''?\s*$/ ) {
            $rebuilt .= "</pre>\n";
            $in_pre = 0;
            next;
        }
        $rebuilt .= ( $in_pre && $line eq '' ) ? " \n" : "$line\n";
    }
    $text = $rebuilt if $rebuilt;

    # Convert all simple (single-string) _em_s and _e_m_s
    $text =~ s{(^|\s)_(\S+)_($|\s|\.|,|;)}{
        my $em = $2;
        $em =~ tr|_| |;
        "$1<em>$em</em>$3"}egsm;

    # Convert all simple (single-string) *strong*s and *str*ongs*
    $text =~ s{(^|\s)\*(\S+)\*($|\s|\.|,|;)}{
        my $strong = $2;
        $strong =~ tr|*| |;
        "$1<strong>$strong</strong>$3"}egsm;

    # Hotlink the URLs; very long ones are abbreviated in the link text
    $text =~ s#(?<!href=)(['"]?)((?:https?|ftp)://[^'"\]\s\n]+[^'"\)\]\s\n,.&?-])#
        my $shown = length($2) > 85 ? substr($2, 0, 40) . "[...]" . substr($2, -40) : $2;
        "$1<a href='$2'>$shown</a>"#egsm;

    # LG addresses get special treatment (relative paths within LG)
    $text =~ s#(a href=["'])http://linuxgazette.net/([^'"\n]+)#$1../$2#g;

    # Replace text smilies with an image; preserve text as 'alt' tag
    $text =~ s#(:\)|:-\)|:&gt;|:-&gt;)#<img src="../gx/smile.png" alt="$1">#g;

    # Replace the frownies as well
    $text =~ s#(:\(|:-\(|:&lt;|:-&lt;)#<img src="../gx/frown.png" alt="$1">#g;

    # Find any runs of '>'s at line start (quoted email) and <pre> them
    $text =~ s#((?:^&gt;.*\n)+)#\n<pre>\n$1</pre>\n#gm;

    return $text;
}

# This sub does all the text processing. Individual email bodies (and
# 'From' strings) are fed to it; it returns the text converted to HTML.
#
# Content between the PRIVATE tags is elided. Content between the RAW tags
# is passed through untouched and the RAW tags themselves are removed.
# Unbalanced tags are reported on STDOUT and end the run with a non-zero
# exit status.
sub cleanup {
    my $body = shift;

    # Get rid of all preceding whitespace; replace all following whitespace
    # with a single "\n"
    $body =~ s/\A\s*(.*?)\s*\Z/$1\n/s;

    # Remove anything that was explicitly marked as private
    unless ( _tags_balanced( $body, $private_start, $private_stop ) ) {
        print "\nERROR: Unequal number of 'PRIVATE' tags in the mail body.\n\n";
        exit 1;
    }
    # Pull tags that sit on a line of their own up against their content
    $body =~ s/(?:^|\n)[ ]*\Q$private_start\E[ ]*\n/\n$private_start/g;
    $body =~ s/\n[ ]*\Q$private_stop\E[ ]*(?:\n|$)/\n$private_stop/g;
    $body =~ s/\Q$private_start\E.*?\Q$private_stop\E/ [[[Elided content]]] /gsm;

    # The RAW check runs after the private content is gone, so RAW tags
    # inside elided text are not counted.
    unless ( _tags_balanced( $body, $raw_start, $raw_stop ) ) {
        print "\nERROR: Unequal number of 'RAW' tags in the mail body.\n\n";
        exit 1;
    }
    $body =~ s/(?:^|\n)[ ]*\Q$raw_start\E[ ]*\n/\n$raw_start/g;
    $body =~ s/\n[ ]*\Q$raw_stop\E[ ]*(?:\n|$)/\n$raw_stop/g;

    # Process every stretch of text outside the RAW sections. The RAW tags
    # are part of the match and are therefore removed by the substitution.
    $body =~ s{(?:^|\Q$raw_stop\E)(.+?)(?:\Q$raw_start\E|$)}{ _cleanup_segment($1) }seg;

    # Find any number of repeated newlines and enclose in </p> and <p>, and...
    $body =~ s#(\n{2,})#\n</p>$1<p>\n#gsm;

    # ...wrap the entire body in <p> and </p>.
    $body = "<p>\n$body\n</p>\n";

    # Insert editorial comments
    $body =~ s|\@\#\$(.*?)\$\#\@|<p class="editorial">\n[[[ $1 ]]]\n</p>\n|gsm;

    # Clean up overzealous <p> markup
    $body =~ s#<p>\s*\n<pre>#<pre>#gsm;
    $body =~ s#</pre>\s*\n</p>#</pre>#gsm;

    # Special processing for Rene (with an acute accent on the last 'e'):
    # the MIME/quoted-printable leftovers, the UTF-8 byte sequence and the
    # Latin-1 byte all become the HTML entity, which keeps this source file
    # plain ASCII.
    $body =~ s/=\?iso-8859-15\?Q\?Ren=E9\?=|Ren=E9|Ren=C3=A9|Ren=C3=E9|Ren\xc3\xa9|Ren\xe9/Ren&eacute;/gsm;

    # Return the much-massaged body
    return $body;
}
|
|
|
|
# Create the 'misc/lg' subdir if it doesn't exist. File::Path (core) is
# used instead of shelling out to mkdir(1): after a failed system() the
# reason is not in $!, so the old message could be misleading. mkpath()
# croaks on failure, and that message is passed on here.
unless ( -d "misc/lg" ) {
    require File::Path;
    eval { File::Path::mkpath("misc/lg"); 1 }
        or die "mkdir failed: $@";
}
|
|
|
|
# Build up the data structure that contains the processed mailbox. These
# variables are shared between the mailbox reader and build().
my ($subj, $reflink, $iss, $link, $reftitle, $x, $fn, $str, $from, $date,
    $body, $last, $seen, %files);

# Format the message currently held in $subj/$from/$date/$body and append it
# to the thread stored under $files{$fn}. The first message of a thread also
# gets the subject heading and, for Talkbacks, a link to the article being
# discussed. Takes no arguments; clears the per-message variables when done.
sub build {
    # Clear the old content of the special Talkback header
    undef $reflink;

    # If it is a Talkback, parse the necessary info out of the subject
    if ( $subj =~ m%^tkb:\s*Talkback:\s*((?:issue)?\d+)(/[^#]+)% ) {
        my ( $issue_part, $path_part ) = ( $1, $2 );
        $iss  = $issue_part;
        $link = "$issue_part$path_part";

        # Start from the path within the issue; this is what gets shown if
        # the article can't be read or has no 'title:' line. (Without this
        # reset, the title of a previous Talkback would be reused.)
        $reftitle = $path_part;

        # Read the article title from the article itself (SVN working copy)
        # if it exists.
        my $article = "$ENV{LG_ARTICLES}/$link";
        if ( -f $article ) {
            # Three-argument open: $link comes from a mail subject and must
            # never be interpreted as an open mode or a command.
            open my $art_fh, '<', $article or die "$article: $!\n";
            while ( $x = <$art_fh> ) {
                if ( $x =~ /^title:\s*(.*)\s*$/ ) {
                    $reftitle = $1;
                    last;
                }
            }
            close $art_fh;
        }

        # Create the Talkback header
        $reflink = qq@[ In reference to "<a href='../$link'>$reftitle</a>" in LG#$iss ]@;
    }

    # Remove the marker tags from the subject before using it in the output
    for my $tag ( keys %sections ) {
        $subj =~ s/^$tag:\s*//i;
    }

    # If we're at the top of the first post in the thread, append
    # the subject and the Talkback header (if it exists)
    unless ( exists $files{$fn} ) {
        $str = h3("$subj") . "\n";
        $str .= p( b($reflink) ) if $reflink;
    }

    $str .= p( "\n" . b( cleanup($from) ) . br() . "\n" . b($date) . "\n" )
        . "\n\n"
        . cleanup($body);

    # Add the formatted post as an element in an arrayref pointed
    # to by its (defanged) subject in the '%files' hash
    push @{ $files{$fn} }, $str;

    # Clear out all the variables used in processing this post
    $str = $subj = $from = $date = $body = "";
}
|
|
|
|
# Email addresses of everybody who posted (address => number of posts)
my %gang;

# It's sort of odd to explicitly open a filename that's already in @ARGV...
# but if I wanted to do anything else, I'd have to do CLI switch
# processing, and Oh, Mother.
open my $mbox, '<', $in_file or die "$in_file: $!\n";

while ( <$mbox> ) {
    # Convert to Unix format by removing DOS carriage returns
    y/\r//d;

    # Header lines: from the mbox "From " separator up to and including the
    # blank line that ends the header
    if ( /^From / .. /^$/ ) {
        # Get all the participants' email addresses for later checking
        # against the LG roster
        $gang{$1}++ if /^From:.*?([a-zA-Z0-9_.+-]+\@[a-zA-Z0-9_.+-]+)/;

        # Once the body of the (previous) email has been built up, process it
        build() if $body;

        chomp;

        # Collapse repeated whitespace
        y/ \t/ /s;

        # "Collect" any multi-line subjects into a single string. $last
        # stays set while continuation lines keep coming, so subjects folded
        # over three or more lines are collected in full; any other line
        # (including the blank one that ends the header) ends the subject.
        if ( $last && $last eq "subj" && length($_) && !/^\S+:/ ) {
            $subj .= $_;
        }
        else {
            $last = "";
        }
        $seen = 1;

        # Normalize the 'From:' line to Mailbag standards
        if ( /^From:\s*(.*) (\S+)\@(\S+)\s*$/ ) {
            $from = "$1 [$2 at $3]";
            $from =~ tr/<>"//d;
            $from =~ s/'(.*)'/$1/g;
        }

        # Ditto the subject
        if ( s/^Subject:\s*// ) {
            s/\[TAG\]\s*//;
            s/(?:re:|fw:|fwd:|forw:|balasan:|aw:|[\(\[](?:re|fw|fwd|forw|balasan|aw)[\)\]])\s*//ig;
            $subj = $_;
            $last = "subj";
        }

        # Get the date
        $date = $1 if /^Date:\s*(.*?)\s*$/;
        next;
    }
    else {
        # First body line after a header
        if ( $seen ) {
            # Flip the flag
            $seen = 0;

            # Create a unique filename from the subject
            ( $fn = $subj ) =~ y/A-Za-z0-9:/_/cs;
            $fn =~ s/^_*(.*?)_*$/$1/;
        }

        # Collect the non-header lines
        $body .= $_;
    }
}
close $mbox;

# Add last post
build();
|
|
|
|
############### Output section ####################

# Return the opening HTML of a thread page (doctype through the start of the
# content <div>, including the LG logo and the "top" anchor). The only
# argument is the page title.
sub lg_header {
    my $page_title = $_[0];

    # Formerly:
    # start_html(-title=>"$_[0]",-lang=>"utf-8",-style=>{src=>"../../../lg.css"}) .
    my $static_head = <<"END_HEAD";
<!DOCTYPE html PUBLIC '-//W3C//DTD XHTML 1.0 Transitional//EN'
 'http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd'>
<html xmlns='http://www.w3.org/1999/xhtml' lang='utf-8' xml:lang='utf-8'>
<head>
<title>$page_title</title>
<meta http-equiv='Content-Type' content='text/html; charset=utf-8' />
<link rel='stylesheet' type='text/css' href='../../../lg.css' />
</head>
<body>
END_HEAD

    my $logo = a( { href => "../../../" },
        img( { src => "../../../gx/2003/newlogo-blank-200-gold2.jpg", id => "logo", alt => "Linux Gazette" } ) );
    my $tux    = img( { src => "../../../gx/tux_86x95_indexed.png", id => "tux", alt => "Tux" } );
    my $slogan = p( { id => "fun" }, "...making Linux just a little more fun!" );
    my $anchor = a( { name => "top" }, "" );

    return $static_head . $logo . $tux . $slogan
        . "<div class='content articlecontent'>" . $anchor;
}
|
|
|
|
# $issue: current LG issue number; %seen: section headings already printed
my ( $issue, %seen );

# Get the current issue number
{
    open my $cfg_fh, '<', "$ENV{LG_LIBPYTHON}/lgconfig.py" or die "lgconfig.py: $!\n";
    while ( my $line = <$cfg_fh> ) {
        if ( $line =~ /^currentIssueNumber.*?(\d+)/ ) {
            $issue = $1;
            last;
        }
    }
    close $cfg_fh;
}
# Every output file name below contains the issue number
die "lgconfig.py: no 'currentIssueNumber' setting found\n" unless defined $issue;

my $barename = $fname;
$barename =~ s/\.html$//;

# Create the "flat content" page for TWDT
open Twdt, '>', "TWDT.lg_answer$issue-$barename.html" or die "TWDT.lg_answer$issue-$barename.html: $!\n";
open Fp, '>', $fname or die "$fname: $!\n";

print Fp "author: $author\ntitle: $title\n\n";
print Twdt h2($title), "\n";

# if the author is defined, get their name
if ( $author ) {
    my $name;
    my $author_file = "$ENV{LG_ROOT}/authors/$author";
    if ( open my $au_fh, '<', $author_file ) {
        while ( my $line = <$au_fh> ) {
            if ( $line =~ /^name:\s*(.*)$/ ) {
                $name = $1;
                last;
            }
        }
        close $au_fh;
    }
    else {
        warn "$author_file: $!\n";
    }
    # Fall back to the author ID rather than printing an empty link
    $name = $author unless defined $name;

    print Twdt p( "By ", a( { href => "../authors/$author.html" }, $name ) );
}

# Process the participants' bios and extract the names
if ( $title =~ /Mailbag/ ) {
    my %bios;

    # Do they have an LG bio? Read the "key: value" header of every file in
    # the authors directory; the header ends at the first blank line.
    for my $bio_file ( glob "$ENV{LG_ROOT}/authors/*" ) {
        open my $bio_fh, '<', $bio_file or die "$bio_file: $!\n";
        while ( my $line = <$bio_fh> ) {
            chomp $line;
            last if $line eq '';

            # Split on the first colon only, so that values which contain a
            # colon themselves are kept whole
            my ( $k, $v ) = split /: ?/, $line, 2;
            if ( $v ) {
                $v =~ s/\s*$//;
                $bios{$bio_file}{$k} = $v;
            }
        }
        close $bio_fh;
    }

    # Build list of LG participants: everybody whose 'privateEmail' matches
    # an address seen in the mailbox
    my @took_part;
    for my $email ( sort keys %gang ) {
        for my $bio ( sort keys %bios ) {
            my $entry = $bios{$bio};
            push @took_part, $entry->{name}
                if defined $entry->{name}
                && $entry->{privateEmail}
                && $entry->{privateEmail} eq $email;
        }
    }

    my $credits = join '',
        h3( "This month's answers created by:" ),
        strong( "[ ", join( ", ", @took_part ), " ]" ),
        "\n", br(), "...and you, our readers!", br(),
        hr( { size => "3", width => "50%", align => "center" } );

    print {$_} $credits for \*Fp, \*Twdt;
}

# Add editorial commentary if a "-head" file exists
if ( -f "$in_file-head" ) {
    # Read the note once and write it to both outputs. (Reading the handle
    # in two successive loops left nothing for the second output.)
    open my $note_fh, '<', "$in_file-head" or die "$in_file-head: $!\n";
    my $note = do { local $/; <$note_fh> };
    close $note_fh;
    $note = '' unless defined $note;

    my $rule = hr( { size => "3", width => "50%", align => "center" } );
    print {$_} h2("Editor's Note"), $note, $rule for \*Fp, \*Twdt;
}
|
|
|
|
# Define weighted sorting order for different categories (i.e., Mailbag is
# first, etc.) See %sections at the top for relative weights.
#
# Takes a thread key and returns the weight of its section: the number
# registered in %sections when the key starts with a known three-letter
# prefix and a colon, 100 for everything else.
sub weight {
    my $prefix = substr $_[0], 0, 4;

    # No colon in fourth position: not a tagged thread
    return 100 unless $prefix =~ s/:$//;

    my $tag = lc $prefix;
    return 100 unless exists $sections{$tag};
    return $sections{$tag}->[0];
}
|
|
|
|
# Create/overwrite the KnowledgeBase link file
open my $kb_fh, '>', "$tag_kb/$issue-$fname" or die "$issue-$fname: $!\n";

# Write every thread to the main page (Fp), the flat TWDT page (Twdt) and
# the KnowledgeBase list; threads that are long or have replies also get a
# page of their own under misc/lg/. Threads are ordered by section weight
# and, within a section, by key, so that repeated runs give the same page.
for my $key ( sort { weight($a) <=> weight($b) or $a cmp $b } keys %files ) {
    my $sec = $key =~ /^(...):/ ? lc $1 : "";

    # Flatten the key to create the link name
    ( my $lnk = $key ) =~ s/^...:_*//;
    $lnk = lc $lnk;
    $lnk =~ tr/a-z0-9_/_/sc;
    $lnk =~ s/^_*(.*?)_*(?:html)?_*$/$1/;

    # Print the section header just once. The "already printed?" test is
    # made a single time and its result used for both outputs; testing (and
    # incrementing) the counter once per output meant that only the first
    # output ever got the heading.
    my $heading;
    if ( exists $sections{$sec} ) {
        # Only print the section headers if they are part of the Mailbag
        # page
        $heading = $sections{$sec}->[1]
            if $title eq "Mailbag" && !$seen{$sec}++;
    }
    elsif ( $fname eq "lg_mail.html" ) {
        $heading = "Our Mailbag" unless $seen{mailbag}++;
    }
    if ( defined $heading ) {
        print {$_} h1($heading), "\n", hr(), "\n" for \*Fp, \*Twdt;
    }

    my $posts = $files{$key};

    # Extract the subject
    ( my $subject = $posts->[0] ) =~ s#^.*?<h3>([^<]+)</h3>.*#$1#s;

    # Create unique anchor for thread
    print Fp "\n<!-- Thread anchor: $subject --><a name='$lnk'></a>\n";
    print Twdt "\n<!-- Thread anchor: $subject --><a name='$lnk'></a>\n";
    print {$kb_fh} "<span class='issuenum'>[ LG #$issue ]</span> <a href='../$issue/$fname#$lnk'>$subject</a><br>\n";

    # Chop off the first post if it's too long: cut right after the last
    # paragraph end at or before $cutoff, so the kept text has its final
    # <p> closed. If there is no paragraph end that early, the post is
    # printed whole.
    my $shortie;
    if ( length( $posts->[0] ) > $cutoff ) {
        my $cut = rindex $posts->[0], "</p>", $cutoff;
        $shortie = substr $posts->[0], 0, $cut + length "</p>" if $cut >= 0;
    }
    if ( defined $shortie ) {
        print Fp $shortie, p("\n[ ... ]\n");
    }
    else {
        print Fp $posts->[0], "\n";
    }

    # The flat page always gets the complete thread
    print Twdt $_, br() for @$posts;

    if ( defined $shortie || @$posts > 1 ) {
        if ( -f "misc/lg/$lnk.html" ) {
            print "\nERROR: while processing '$in_file', I ran across a thread named '$lnk' that was longer\n" .
                "than the defined thread cutoff length ($cutoff characters.) This would normally result in\n" .
                "the creation of a file called 'misc/lg/$lnk.html', but this file ALREADY EXISTS.\n\n" .
                "Since there's no way for me to tell whether this is a result of an error (e.g., threads with\n" .
                "conflicting names from different mailboxes) or an accidental second run of this program on a\n" .
                "given mailbox, I have to give up and turn this over to a human.\n\n" .
                "To resolve the first problem, I suggest locating and renaming the thread in one of the mailboxes.\n" .
                "To resolve the second one, simply delete the 'misc/' directory and reprocess all the mailboxes.\n\n";

            # This is a failure: say so in the exit status
            exit 1;
        }

        my $tnum = @$posts;
        my $s    = $tnum == 1 ? "" : "s";
        my $tlen = sprintf "%.2f", length("@$posts") / 1024;
        print Fp p( b( "[ ", a( { name => "mb-$lnk" }, "" ),
            a( { href => "misc/lg/$lnk.html" }, "Thread continues here ($tnum message$s/${tlen}kB)" ), " ]" ) ), "\n";

        open my $th, '>', "misc/lg/$lnk.html" or die "misc/lg/$lnk.html: $!\n";
        print {$th} lg_header($subject);
        for my $post ( @$posts ) {
            # Fix the LG relative links in the thread: the thread page lives
            # two directories below the main page. ($post is an alias, so
            # the stored post is changed; nothing reads it after this.)
            $post =~ s#(a href=['"])(\.\./)#$1$2$2$2#gsm;

            # Ordinary spaces would collapse into one in HTML, so the gap
            # between the two links is made of non-breaking spaces.
            print {$th} $post, br(),
                a( { href => "#top" }, "Top" ), "&nbsp;" x 4,
                a( { href => "../../$fname#mb-$lnk" }, "Back" ),
                hr( { width => "50%", align => "left" } ), p( br() );
        }

        print {$th} "</div>", end_html();
        close $th or die "misc/lg/$lnk.html: $!\n";
    }

    print Fp hr(), "\n\n";
    print Twdt hr(), "\n\n";
}

# Buffered write errors only show up at close time, so check them
close Fp or die "$fname: $!\n";
close $kb_fh or die "$tag_kb/$issue-$fname: $!\n";
close Twdt or die "TWDT.lg_answer$issue-$barename.html: $!\n";

# Make the KB file accessible to everyone
chmod 0666, "$tag_kb/$issue-$fname";
|
|
|
|
=head1 NAME
|
|
|
|
lg-process-mailbag - Processes the LG mailbox for publication
|
|
|
|
=head1 SYNOPSIS
|
|
|
|
lg-process-mailbag <mailbox_name> [output_filename.html] [title]
|
|
|
|
NOTE: The exact output filename is significant, since it is used by the
|
|
program to create the HTML title and the page heading if it matches a
|
|
pattern. The equivalence table currently looks like this (it may be
|
|
expanded later):
|
|
|
|
    Filename                  Title/header
    ========                  ============
    'lg_talkback.html'    =>  "Talkback",
    'lg_talkback2.html'   =>  "Talkback2",
    'lg_talkback3.html'   =>  "Talkback3",
    'lg_talkback4.html'   =>  "Talkback4",
    'lg_tips.html'        =>  "2-Cent Tips",
    'lg_gems.html'        =>  "Selected Gems from Our Mailbag",
    'lg_laundrette.html'  =>  "The Linux Launderette",
    'lg_launderette.html' =>  "The Linux Launderette",
|
|
|
|
If no filename is specified, the output will be sent to 'lg_mail.html', and
|
|
the title/header will default to "Mailbag". An optional title can also be
|
|
specified as the last argument (after the filename); if it contains anything
|
|
other than an unbroken string of alphanumeric characters, the entire title
|
|
should be quoted.
|
|
|
|
If a file with the same name as the input file plus a "-head" extension exists
|
|
in the current directory, the content of that file will be inserted as
|
|
editorial commentary below an "Editor's Note" header. The insert will be
|
|
positioned below the title and credits but above the processed content. The
|
|
content of the '-head' file should be HTML-formatted but should not contain an
|
|
HTML header or footer.
|
|
|
|
'lg-process-mailbag' also creates a file containing a list of links, one per
topic, for later insertion into the KnowledgeBase; the file name consists of
the current issue number followed by a "-" and the name of the current output
file. It is saved in a directory defined in the user-configurable section at
the top of the script ('$LG_ROOT/data/kb/' by default.)
|
|
|
|
A "TWDT.lg_answer<ISSUE_NUMBER>-<FILENAME>.html" is also created. These are
"flat" (non-threaded) representations of mailbag content, and are intended to
be concatenated (assuming there's more than one) into
"TWDT.lg_answer<ISSUE_NUMBER>.html", which should be copied into
$LG_ROOT/data/twdt, where it will be read by our publication scripts and
inserted into the TWDT during processing. [NOTE: Currently, this process
isn't usable - the build scripts need to be hacked to ignore all the varieties
of "lg_mail.html" while building the issue. For now, simply ignore the
resulting "TWDT" file.]
|
|
|
|
|
|
=head1 DESCRIPTION
|
|
|
|
'lg-process-mailbag' is the LG Mailbag processing script, which also handles
|
|
'Gazette Matters', 'Still Searching' (unanswered questions), and other
|
|
sections. Its output is a page containing all the first posts for each thread
|
|
in TAG (those that are too long are chopped off at the last paragraph marker
|
|
before the character count defined in '$cutoff' at the top of the script)
|
|
which are followed by links to a file comprising the rest of the thread (if
|
|
any.) It's made to be as automated as possible in order to minimize
|
|
hand-hacking; since humans will be humans, however, some twiddling is usually
|
|
required. :)
|
|
|
|
|
|
=head1 USAGE
|
|
|
|
1) Open the TAG mail archive in Mutt and delete all the repeated and
|
|
"uninteresting" messages (spam, broken messages from 'bogus', etc.)
|
|
|
|
2) Tag (using 't' for single messages, or 'esc-t' for threads) all messages
|
|
that don't belong in the mailbag (e.g. Talkbacks, Launderette, etc.) and
|
|
save them into distinctively-named mailboxes with the ';s' (tagged-save)
|
|
command. The 'l' (limit) key can also be very helpful in selecting the
|
|
messages that fit a pattern.
|
|
|
|
3) All messages with identical 'Subject:' lines will be grouped into
individual threads by the script; therefore, messages are added to or
removed from threads by changing their subject lines (but see the
"Tags" subsection in "EDITING", below.)
|
|
|
|
4) Quit Mutt and run the script to build the Mailbag page and the related
|
|
files in '$PWD/misc/lg/'.
|
|
|
|
5) Review the produced HTML file and fix any problems you discover by
|
|
editing the mailbox directly (don't edit individual messages from within
|
|
Mutt; this changes the position of the email within the file and thus its
|
|
position within the thread.) Please see the EDITING section, below, for
|
|
specific editing tips and tricks.
|
|
|
|
6) Take a look at the files linked from within the Mailbag page (the links
|
|
are at the bottom of each post that exceeds '$cutoff' in length) and fix
|
|
any really serious problems. The idea here is to pluck the low-hanging
|
|
fruit; as long as the content is easily readable, small discrepancies don't
|
|
matter.
|
|
|
|
7) When you're done, move the output file and the 'misc' subdirectory into
|
|
$LG_ARTICLES/[current_issue]/, then copy the original mbox to
|
|
$LG_ROOT/data/tag/<issue_num> (you'll need to create the 'issue_num'
|
|
directory) and check everything into the repository with 'svn ci'.
|
|
|
|
=head1 EDITING
|
|
|
|
=head2 Tags
|
|
|
|
Any email that is marked with a special tag will be placed in the section
|
|
specified by that tag. Tags consist of a three-letter abbreviation and a
|
|
colon at the start of the subject, as follows:
|
|
|
|
    gaz: Gazette Matters
    sts: Still Searching
    tkb: Talkbacks
    tct: 2-Cent Tips
    gem: LG Gems
|
|
|
|
(e.g. 'Subject: sts: Looking for moonbat weasels in Cleveland'.)
|
|
|
|
Tag case is not significant, and the tags will be removed from the subject
|
|
during processing. The sections in the generated Mailbag page are arranged
|
|
in the above order; any email which does not have a tag will go into the
|
|
main ("Mailbag") section, which comes last.
|
|
|
|
=head2 Formatting Problems
|
|
|
|
Most of the HTML conversion is done by the program, which is fairly smart.
|
|
However, as any automated gadget, sometimes it's going to get things wrong.
|
|
|
|
=over 1
|
|
|
|
=item *
|
|
Unformatted code
|
|
|
|
The most common problem is code that gets all wrapped into one (or
|
|
more) lines instead of being nicely formatted. The reason is that it
|
|
wasn't properly delimited in the original email (i.e., with either a
|
|
pair or a trio of backticks and single quotes - see the Members FAQ,
|
|
http://linuxgazette.net/faq/members-faq.html#markup for exact details.)
|
|
|
|
FIX: Edit the original mail and insert the appropriate delimiters
|
|
before and after the code.
|
|
|
|
=item *
|
|
Collapsed lists
|
|
|
|
If someone has a neat little list of items in their mail, it is also
going to get wrapped. However, converting it to preformatted code
(essentially using <pre></pre> tags, as in the above example) is
usually inappropriate due to the font used in <pre>s.
|
|
|
|
FIX: Enter a blank line between the individual list items. They will
|
|
now become separate paragraphs, which looks just fine.
|
|
|
|
=item *
|
|
Huge amounts of whitespace in preformatted items
|
|
|
|
This is something like the opposite of the last problem; what is
|
|
happening is that in preformatted items other than the standard
|
|
backtick/single quote delimited code (e.g., quoted email which is
|
|
denoted by '>'s at the start of the line), multiple blank lines
|
|
are still converted to paragraphs. In general, this shouldn't happen -
|
|
the greatest majority of this is handled by the script - but there's
|
|
still a small chance of it.
|
|
|
|
FIX: Insert a single space at the beginning of each blank line within
|
|
the quoted material. The '</p><p>' pairs only get inserted into
|
|
successive runs of newlines (returns), and adding a space makes it not
|
|
match the search pattern anymore. _Voila,_ no more Antarctica
|
|
(wide-open expanses of white.)
|
|
|
|
=back
|
|
|
|
Most other layout problems can be cured by making the text preformatted -
|
|
i.e., by wrapping it in backtick/single quote sets.
|
|
|
|
|
|
=head1 SPECIFIC FEATURES
|
|
|
|
As mentioned earlier, the script is fairly smart about processing mail
|
|
text. Here are some of its built-in features; it's important to recognize
|
|
them and to know how they work, since the easiest way to fix problems is
|
|
often a matter of adjusting the content to follow the rules that it should
|
|
have followed in the first place.
|
|
|
|
=over 1
|
|
|
|
=item *
|
|
[PRIVATE]...[/PRIVATE] clause
|
|
|
|
Anything delimited with the above tags will be replaced with a
'[[[Elided content]]]' string. Not that TAG is a private list, but having
the option is nice.
|
|
|
|
=item *
|
|
[RAW]...[/RAW] clause
|
|
|
|
Anything delimited with these tags is "protected" from processing and will
|
|
not be modified by the script. Particularly useful for any HTML entities or
|
|
tags that you want to preserve as they stand (i.e., that you want to be used
|
|
as HTML tags rather than displayed as content.)
|
|
|
|
=item *
|
|
The TAG footer is automatically removed
|
|
|
|
This footer is the standard block of text that is appended to all TAG mail;
|
|
it is defined as starting with '+-+--- [...] ---+-+' and ending with
|
|
"http://lists.linuxgazette.net/mailman/listinfo/tag", with several lines of
|
|
text between the two.
|
|
|
|
=item *
|
|
Non-HTML-parseable characters are converted to entities
|
|
|
|
'<', '>', and '&' are converted to their HTML entity equivalents. (The
conversion of the characters from 160 to 255 is currently disabled in the
code, because it breaks Unicode characters.)
|
|
|
|
=item *
|
|
Mailman's "Attachment was scrubbed" messages are removed
|
|
|
|
These are defined as the above phrase plus a URL where the attachment is
|
|
stored.
|
|
|
|
=item *
|
|
All signature blocks are converted to preformatted text
|
|
|
|
A signature block is a line consisting of two dashes, a space, and a
|
|
newline followed by any number of non-blank lines. If you see a signature
|
|
block getting wrapped into a one-line mess, just insert the '-- ' delimiter
|
|
immediately above it.
|
|
|
|
=item *
|
|
Cited email headers are preformatted
|
|
|
|
Cited email headers (i.e., a run of lines beginning with any of
"Date|From|To|Subject|Newsgroups|User-Agent|X-Rcpt-To|X-Country|X-UIDL|X-Bogosity|X-Mas|Message-ID|Reply-To|CC|Cc"
followed by a colon) will be converted to formatted text. If they don't
start at the beginning of the line, they will not be converted - so
formatting them is often a matter of removing the preceding whitespace.
|
|
|
|
=item *
|
|
Quoted email text is preformatted
|
|
|
|
Text preceded by '>'s at the beginning of the line will have its layout
|
|
preserved.
|
|
|
|
=item *
|
|
All "TAG markup" will be appropriately converted
|
|
|
|
Any blocks of text preceded by two backticks and followed by two single
|
|
quotes will be turned into formatted text; three of each will mark it as
|
|
'<pre class="code"></pre>' (formatted in a nice colored box - preferred for
|
|
actual code examples.) Make sure that these markers start at the beginning
|
|
of the line, and that nothing else follows them on that line.
|
|
|
|
=item *
|
|
"Email enhancements" are appropriately applied
|
|
|
|
Single words in *asterisks* will be made bold; those in _underscores_ will
|
|
be made italic. Note that words in /slashes/ will be ignored although they
|
|
are commonly used in email: treating them specially would screw up file paths...
|
|
|
|
=item *
|
|
URLs are hotlinked
|
|
|
|
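Anything beginning with 'http://', 'https://', or 'ftp://' is treated as a
URL and is automatically converted to a hotlink; URLs longer than 85
characters are abbreviated in the link text. LG URLs
(http://linuxgazette.net/...) are turned into hotlinks with URLs pointing
to the local file structure.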
|
|
|
|
=item *
|
|
Smilies are replaced with images (and use the text as an 'alt' link); ditto
|
|
the 'frownies'.
|
|
|
|
':)', ':-)', ':>', and ':->' will point to 'smile.png'.
|
|
':(', ':-(', ':<', and ':-<' will point to 'frown.png'.
|
|
|
|
=item *
|
|
Blank lines delimit paragraphs
|
|
|
|
=item *
|
|
Editorial comments
|
|
|
|
If you wish to comment on something in the text, feel free: start with a
|
|
line consisting of '@#$', insert whatever you want to write, and finish it
|
|
with a line consisting of '$#@'. Your insert will become an editorial comment.
|
|
|
|
=back
|
|
|
|
=head1 AUTHOR
|
|
|
|
Ben Okopnik (ben@linuxgazette.net)
|
|
|
|
=cut
|
|
|