LDP/LDP/builder/db2omf/db2omf

497 lines
11 KiB
Perl

#!/usr/bin/perl
#
# DESC: Script for creating an OMF (OpenSource Metadata Framework) file
# from DocBook SGML.
#
# USAGE: db2omf {-o <output_file>} <docbook_file>
#
# Greg Ferguson <gferg@sgi.com>
#
# See Changelog for the revision history.
# Release identifier; usage() prints it via $_vrs.
$VERSION = '0.6-pre';
$_vrs    = $VERSION;

# Unbuffered output: flush after every write.
$| = 1;

# Text::Wrap is used to wrap the description text; wrap at 70 columns.
use Text::Wrap qw($columns &wrap);
$columns = 70;

# The OMF resource template and the <creator> template both start empty
# and are filled in further down.
($_res_template, $_creator_template) = ('', '');

# Input and output file names.  Only the input name starts as the empty
# string; the output name stays undefined until it is set.
my $in_f = '';
my $out_f;
# Read the command-line arguments.
#
#   -o <file>   name of the OMF file to write
#   -h          print the usage text and exit
#   <other>     taken as the DocBook input file (the last one wins)
#
# Parsing stops as soon as the next argument is missing or empty.
while (1) {
    my $arg = $ARGV[0];
    if (defined $arg && $arg eq '-o') {
        shift(@ARGV);
        $out_f = shift(@ARGV);
    }
    elsif (defined $arg && $arg eq '-h') {
        usage();
    }
    else {
        $in_f = shift(@ARGV);
    }
    last if !defined $ARGV[0] || $ARGV[0] eq '';
}

# An input file is mandatory and must be readable.  Both failures print
# the usage text and ask for a non-zero exit status.
if (!defined $in_f || $in_f eq '') {
    print "db2omf: ERROR <docbook_file> not specified.\n\n";
    usage(1);
}
elsif (!(-r $in_f)) {
    print "db2omf: ERROR cannot read $in_f ($!)\n\n";
    usage(1);
}

# Without -o, derive the output name from the input file: take the last
# path component, drop its final extension (if any) and append ".omf".
if (!defined $out_f || $out_f eq '') {
    my ($base) = ($in_f =~ m{([^/]*)\z});
    $base = '' unless defined $base;
    # The look-behind keeps a leading dot, so ".foo" is not reduced to ''.
    $base =~ s/(?<=.)\.[^.]*\z//;
    $out_f = $base . '.omf';
}
# Today's local date as YYYYMMDD; used to timestamp the template entries.
# localtime() reports the year as an offset from 1900 and the month
# counted from zero.
my ($mday, $mon, $year) = (localtime(time()))[3, 4, 5];
$_today = sprintf('%04d%02d%02d', $year + 1900, $mon + 1, $mday);
# Load the OMF resource template stored after __END__.
$_res_template .= join('', <DATA>);
close(DATA);

# Skeleton of the <creator> element; proc_file() fills in the name and
# e-mail placeholders.
$_creator_template = join("\n",
    ' <creator created="%%DATE">',
    ' <person created="%%DATE">',
    ' <firstName created="%%DATE">',
    ' %%FIRSTNAME',
    ' </firstName>',
    ' <lastName created="%%DATE">',
    ' %%LASTNAME',
    ' </lastName>',
    ' <email created="%%DATE">',
    ' %%EMAIL',
    ' </email>',
    ' </person>',
    ' </creator>');

# Stamp every "created" attribute in both templates with today's date.
# The loop variable aliases the globals, so they are modified in place.
foreach my $template ($_creator_template, $_res_template) {
    $template =~ s/%%DATE/$_today/g;
}

# Pull the document metadata out of the DocBook file into the templates.
proc_file($in_f);

# Finally splice the creator element and the source file name into the
# resource template.
$_res_template =~ s/%%CREATOR/$_creator_template/;
$_res_template =~ s/%%SGML/$in_f/;
# Write out the OMF file.
#
# The output name is passed to a three-argument open() so that it is
# always treated as a plain file name.  The single exception is "-",
# which selects STDOUT (the meaning it has for a two-argument open()).
my $omf;
my $to_stdout = ($out_f eq '-');
if ($to_stdout) {
    $omf = \*STDOUT;
}
else {
    open($omf, '>', $out_f)
        or die "db2omf: cannot write to $out_f ($!)\n";
}

print {$omf} $_res_template, "\n"
    or die "db2omf: cannot write to $out_f ($!)\n";

# Buffered write errors only surface at close(), so check it.  STDOUT is
# left open for the status message below.
if (!$to_stdout) {
    close($omf)
        or die "db2omf: cannot write to $out_f ($!)\n";
}

print "db2omf: created $out_f from $in_f\n";
exit(0);
#
#
#
# proc_file($f)
#
# Extracts metadata from the DocBook file $f and substitutes it into the
# global templates:
#
#   $_res_template      %%TITLE, %%TYPE, %%KEYWORDS, %%DESC, %%VERSION,
#                       %%V_DATE and %%C_DATE
#   $_creator_template  %%FIRSTNAME, %%LASTNAME and %%EMAIL
#
# Only the text from the first <article>/<book> line up to the closing
# </artheader>, </articleinfo> or </bookinfo> tag is examined; the one
# exception is the <revnumber> fallback, which scans the whole file.
#
# Dies if $f cannot be opened.  Returns nothing.
sub proc_file {
    my ($f) = @_;
    my ($vers, $tmp, $y, $m, $d) = ('', '', '', '', '');

    # Left-pad a one-character day or month number with a zero.
    my $pad2 = sub {
        my ($n) = @_;
        return length($n) == 1 ? "0$n" : $n;
    };

    my $buf = _read_info_area($f);

    # grab title; clean it up
    #
    my ($s) = ($buf =~ m#<title>\s*(.*?)\s*</title>#is);
    $s = '' unless defined $s;
    $s =~ s/\n//g;
    $s =~ s/<subtitle>\s*(.*?)\s*<\/subtitle>//gi;
    $s =~ s/<[^>]*>//gs;
    $s =~ s/[\s\.\ ]+$//g;
    $s =~ s/^[\s\.\ ]+//g;
    $s =~ s/&lowbar;/_/g;
    $_res_template =~ s/%%TITLE/$s/;

    # document type; based on title (or filename...)
    #
    if ($s =~ /how[\ ]*to/i || $f =~ /how[\ ]*to/i) {
        $_res_template =~ s/%%TYPE/HOWTO/;
    }
    elsif ($s =~ /guide/i) {
        $_res_template =~ s/%%TYPE/guide/;
    }
    else {
        $_res_template =~ s/%%TYPE//;
    }

    # keywords; based on title
    #
    # The title is lower-cased, punctuation becomes spaces, and common
    # filler words are dropped.  The text is padded with a space on each
    # side so that a filler word at the very start or end is removed too.
    my $kw = ' ' . lc($s) . ' ';
    $kw =~ s/[\-\;\,\:\(\)\+\/\\]+/\ /g;
    my @stop_words = ('to', 'the', 'and', 'is', 'a',
                      'of', 'for', 'from', 'but', 'brief', 'at', 'how',
                      'with', 'on', 'off', 'via', 'list', 'la', 'le', 'mini');
    foreach my $wrd (@stop_words) {
        # ${wrd} keeps the following [...] a character class rather than
        # an array subscript.
        $kw =~ s/[\ ]+${wrd}[\ ]+/\ /g;
    }
    # Restore two terms that the punctuation step above splits apart.  The
    # word boundaries stop e.g. "wifi overview" turning into "wifi/overview".
    $kw =~ s/\bcd\ rom/cd-rom/g;
    $kw =~ s/\bi\ o\b/i\/o/g;

    # split ' ' discards empty fields, so runs of spaces do not produce
    # empty <keywords> elements.
    my %seen;
    my @wrds = grep { !$seen{$_}++ } split(' ', $kw);
    my $keywords = '';
    foreach my $w (sort(@wrds)) {
        $keywords .= " <keywords created=\"${_today}\">" . $w . "</keywords>\n";
    }
    $_res_template =~ s/%%KEYWORDS/$keywords/g;

    # grab author/email
    # XXX TODO: only grabbing one author...
    #
    my ($auth) = ($buf =~ m#<author>\s*(.*?)\s*</author>#is);
    $auth = '' unless defined $auth;
    ($s) = ($auth =~ m#<firstname>\s*(.*?)\s*</firstname>#is);
    $s = '' unless defined $s;
    $_creator_template =~ s/%%FIRSTNAME/$s/;
    ($s) = ($auth =~ m#<surname>\s*(.*?)\s*</surname>#is);
    $s = '' unless defined $s;
    $_creator_template =~ s/%%LASTNAME/$s/;
    ($s) = ($auth =~ m#<email>\s*(.*?)\s*</email>#is);
    $s = '' unless defined $s;
    $_creator_template =~ s/%%EMAIL/$s/;

    # grab description/abstract; clean it up
    #
    ($s) = ($buf =~ m#<abstract>\s*(.*?)\s*</abstract>#is);
    $s = '' unless defined $s;
    # /s lets an <indexterm> that spans several lines be removed as a unit.
    $s =~ s/<indexterm>\s*(.*?)\s*<\/indexterm>//gis;
    $s =~ s/<[^>]*>//gs;
    $s =~ s/\n/\ /g;
    $s =~ s/[\ ]{2,}/\ /g;
    my $desc = wrap(' ', ' ', $s);
    $_res_template =~ s/%%DESC/$desc/;

    # grab rev/date
    # XXX TODO: sometimes <revhistory> resides outside the meta area...
    #
    ($tmp) = ($buf =~ m#<pubdate[^>]*>\s*(.*?)\s*</pubdate>#is);
    $tmp = '' unless defined $tmp;
    if ($tmp eq '' || $buf =~ /<revhistory>/i) {
        # Take the first <revnumber>.  These matches use /g in scalar
        # context on purpose: a successful match leaves pos($buf) just
        # after it, so the <date> search continues from there, while a
        # failed match resets the position to the start of $buf.
        if ($buf =~ m#<revnumber>\s*(.*?)\s*</revnumber>#gis) {
            $vers = $1;
        }
        if ($buf =~ m#<date>\s*(.*?)\s*</date>#gis) {
            $tmp = $1;
        }
        elsif ($buf =~ m#<releaseinfo>\s*(.*?)\s*</releaseinfo>#gis) {
            $tmp = $1;
        }
    }

    if ($vers eq '' && $tmp ne '') {
        # pull the version out of the <{pub}date> specification
        #
        if ($tmp =~ /[Rr]+evision:[\ ]+([\w\.\-]+)[,\ ]+/) {
            $vers = $1;
        }
        elsif ($tmp =~ /[Vv]+ersion[\ ]+([\w\.\-]+)[,\ ]+/) {
            $vers = $1;
        }
        elsif ($tmp =~ /[Vv]+([\w\.\-]+)[,\ ]+/) {
            $vers = $1;
        }
        elsif ($tmp =~ /[Vv\.\,]+\ ([\w\.\-]+)[,\ ]+/) {
            $vers = $1;
        }
        elsif ($tmp =~ /[Vv]+er[\.\ ]+([\w]+)\-([\w]+)\-([\w]+)\-([\w]+)/) {
            $vers = $4;
        }
        elsif ($tmp =~ /^([\d\.]+)[,\ ]+/) {
            $vers = $1;
        }

        if ($tmp =~ /[Vv]+[\,\ ]+[\d]+[\ ]+[A-Za-z]+[\ ]+[\d]{4}/
            || $tmp =~ /draft[\,\ ]+/i) {
            # date is the version, so null it
            #
            $vers = '';
        }
    }

    if ($vers eq '' || !($vers =~ /\d+/) || $vers =~ /\d\d\d\d/) {
        # a fallback, rarely needed: use the first line anywhere in the
        # file that mentions <revnumber>.  The file is read directly, so
        # its name never passes through a shell.
        #
        my $line = _first_revnumber_line($f);
        if (defined $line) {
            $vers = $line;
            $vers =~ s/<[^>]*>//gs;
            # Trim first, so the leading "v" is found on indented lines
            # and no trailing CR/LF is left behind.
            $vers =~ s/^\s+//;
            $vers =~ s/\s+\z//;
            $vers =~ s/^[Vv]//;
            $vers =~ s/^[\s\.\,\;\ \"\>]+//g;
        }
        if ($vers eq '') {
            ($vers) = ($buf =~ m#<edition>\s*(.*?)\s*</edition>#is);
            $vers = '' unless defined $vers;
            $vers =~ s/revision//i;
            $vers =~ s/version//i;
            $vers =~ s/ver//i;
            $vers =~ s/v//i;
            $vers =~ s/^[\s\.\,\;\ \"\>]+//g;
        }
        if ($vers eq '') {
            $vers = '1.0(?)';
        }
    }
    else {
        $vers =~ s/^[Vv]//;
        $vers =~ s/^\.//;
    }

    # Turn an ordinal day ("3rd", "21st,") into "3, " / "21, ".  The
    # suffix must follow a digit; otherwise month names such as "August"
    # would be mangled as well.
    $tmp =~ s/(\d)(?:rd|nd|st|th)[\ \,]+/$1, /g;

    # Try the known date layouts in turn; the first one that matches
    # supplies year, month and day.
    my @p;
    if (@p = $tmp =~ /(\d\d\d\d)[\/\-]+(\d\d)[\/\-]+(\d\d)/) {
        ($y, $m, $d) = @p;
    }
    elsif (@p = $tmp =~ /\w+[\ ]+([\w]+)[\ ]+([\d]+)[\ ]+[\d\:]+[\ ]+\w+[\ ]+(\d\d\d\d)/) {
        ($y, $m, $d) = ($p[2], get_date($p[0]), $pad2->($p[1]));
    }
    elsif (@p = $tmp =~ /(\w+)[\.,\ ]+(\d+)[,\ ]+(\d\d\d\d)/) {
        ($y, $m, $d) = ($p[2], get_date($p[0]), $pad2->($p[1]));
    }
    elsif (@p = $tmp =~ /[\ ]+(\w+)[\,\.\ ]+(\d\d\d\d)/) {
        ($y, $m, $d) = ($p[1], get_date($p[0]), '01');
        # a day number may precede the month name
        if ($tmp =~ /(\d+)[\ ]+[\w\.]+[\ ]+\d\d\d\d/) {
            $d = $pad2->($1);
        }
    }
    elsif (@p = $tmp =~ /(\d+)[\/\.]+(\d+)[\/\.]+(\d\d\d\d)/) {
        ($y, $m, $d) = ($p[2], $pad2->($p[0]), $pad2->($p[1]));
    }
    elsif (@p = $tmp =~ /[Vv]+er[\.\ ]+([\w]+)\-([\w]+)\-([\w]+)\-([\w]+)/) {
        ($y, $m, $d) = ($p[0], get_date($p[1]), $pad2->($p[2]));
    }
    elsif (@p = $tmp =~ /(\d\d\d\d)\-([\w]+)\-(\d\d)/) {
        ($y, $m, $d) = ($p[0], get_date($p[1]), $p[2]);
    }
    elsif (@p = $tmp =~ /([\d]+)[\ ]+([\w]+)[,\ ]+(\d\d\d\d)/) {
        ($y, $m, $d) = ($p[2], get_date($p[1]), $pad2->($p[0]));
    }
    elsif (@p = $tmp =~ /([\w]+)[\ \,]+(\d\d\d\d)/) {
        ($y, $m, $d) = ($p[1], get_date($p[0]), '01');
    }

    # Accept the date only if the year is four digits starting with 1 or
    # 2 and the month was recognised (get_date() returns '00' otherwise).
    if (!($y =~ /^[12]\d\d\d$/) || $m eq '00') {
        $y = '';
    }

    # Fall back to today's date when no usable date was found.
    my $stamp = ($y eq '') ? $_today : "${y}${m}${d}";

    $_res_template =~ s/%%VERSION/$vers/;
    $_res_template =~ s/%%V_DATE/$stamp/;
    $_res_template =~ s/%%C_DATE/$stamp/;

    return;
}

# _read_info_area($f)
#
# Returns the text of $f from the first line mentioning <article or <book
# through the line holding the closing </artheader>, </articleinfo> or
# </bookinfo> tag (or to end of file if none is found), with all carriage
# returns removed.  Dies if $f cannot be opened.
sub _read_info_area {
    my ($f) = @_;
    my $buf    = '';
    my $in_doc = 0;

    open(my $dbf, '<', $f) or die "db2omf: cannot open $f ($!)\n";
    while (my $line = <$dbf>) {
        if (!$in_doc) {
            next unless $line =~ /<article/i || $line =~ /<book/i;
            $in_doc = 1;
        }
        # A line that starts with a tag but does not end with ">" is joined
        # to the following line.  chomp only ever removes a newline, so a
        # final line without one keeps its last character.
        if ($line =~ /^</ && !($line =~ />\n$/)) {
            chomp($line);
        }
        $buf .= $line;
        last if $line =~ /<\/artheader/i
             || $line =~ /<\/articleinfo/i
             || $line =~ /<\/bookinfo/i;
    }
    close($dbf);

    $buf =~ s/\r//g;
    return $buf;
}

# _first_revnumber_line($f)
#
# Returns the first line of $f that contains "<revnumber>" (matched
# case-insensitively), or '' if there is none.  Returns undef if $f
# cannot be opened.
sub _first_revnumber_line {
    my ($f) = @_;

    open(my $fh, '<', $f) or return;
    my $hit = '';
    while (my $line = <$fh>) {
        if ($line =~ /<revnumber>/i) {
            $hit = $line;
            last;
        }
    }
    close($fh);
    return $hit;
}
#
#
#
# get_date($str)
#
# Maps a month name to its two-digit number.  Only the first three
# letters of $str are looked at, case-insensitively, so 'Jan', 'january'
# and 'JANUARY' all give '01'.  Anything unrecognised gives '00'.
sub get_date {
    my ($str) = @_;

    my %number_of = (
        jan => '01', feb => '02', mar => '03', apr => '04',
        may => '05', jun => '06', jul => '07', aug => '08',
        sep => '09', oct => '10', nov => '11', dec => '12',
    );

    my $prefix = defined $str ? lc(substr($str, 0, 3)) : '';
    return exists $number_of{$prefix} ? $number_of{$prefix} : '00';
}
#
#
#
# usage([$status])
#
# Prints the command synopsis and the script version ($_vrs) to STDOUT,
# then terminates the script.
#
# $status is the process exit code.  It defaults to 0, so calls without
# an argument behave exactly as before; callers reporting an error can
# pass a non-zero value.  Anything that is not a plain non-negative
# integer is ignored and treated as 0.
sub usage {
    my ($status) = @_;
    $status = 0 unless defined $status && $status =~ /\A\d+\z/;

    print "\ndb2omf {-o <output_file>} <docbook_file>\n\n",
          "db2omf - version $_vrs\n\n";
    exit($status);
}
# __END__ delimits the script from the data section;
# the following data is read in by the "DATA" filehandle
__END__
<?xml version='1.0'?>
<!DOCTYPE omf PUBLIC "-//Open Source Metadata Framework (OMF) //DTD OMF.dtd V1.1//EN"
"http://www.ibiblio.org/pub/Linux/metadata/OMF.dtd">
<omf xmlns="http://www.ibiblio.org/osrt/omf/" created="%%DATE" agent="db2omf">
<resource created="%%DATE">
<title created="%%DATE">%%TITLE</title>
<date created="%%DATE">%%C_DATE</date>
%%CREATOR
<versionGroup created="%%DATE">
<version created="%%DATE">
<id created="%%DATE">
%%VERSION
</id>
<date created="%%DATE">
%%V_DATE
</date>
</version>
</versionGroup>
%%KEYWORDS
<description created="%%DATE">
%%DESC
</description>
<type created="%%DATE">%%TYPE</type>
<format created="%%DATE" dtd="DocBook" mime="text/sgml" />
<identifier created="%%DATE" url="file:/%%SGML" />
</resource>
</omf>