new entry - DocBook-to-OMF (db2omf) package

This commit is contained in:
gferg 2001-06-06 22:16:38 +00:00
parent 4bf8df24bb
commit 14944f4b7b
2 changed files with 531 additions and 0 deletions

LDP/builder/db2omf/README Normal file
View File

@ -0,0 +1,33 @@
# db2omf - perl filter for producing OMF files from DocBook SGML #
# version 0.3 / June 2001 #
Copyright (C) 2001-2000 - Greg Ferguson (
This program is free software; you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation; either version 2 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program; if not, write to the Free Software
Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA 02111-1307 USA
Execute script as : db2omf {-o <output_file>} <docbook_file>.sgml
Perl referenced as : /usr/bin/perl
See the 'samples' directory for output examples derived from
actual DocBook (v3.x and v4.x) files.
For more information, see the OMF site -
or contact me (Greg Ferguson -

LDP/builder/db2omf/db2omf Normal file
View File

@ -0,0 +1,498 @@
# DESC: Script for creating an OMF (OpenSource Metadata Framework) file
# from DocBook SGML.
# USAGE: db2omf {-o <output_file>} <docbook_file>.sgml
# Greg Ferguson <>
# 12Mar2001 0.1 initial release
# 26Mar2001 0.2 modified public identifier
# 06jun2001 0.3 various clean-up, rm'd some elements that are not clear
# output stream flush
$| = 1;

use Text::Wrap qw($columns &wrap);
$columns = 70;

# Script-wide state.  These are package globals (not lexicals) because the
# subs further down read and modify them directly.
$_vrs = '0.3';
$_res_template = '';
$_creator_template = '';

# List assignment so that BOTH variables start out as empty strings.
my($in_f, $out_f) = ('', '');

# read in cmd-line arguments
#   -o <output_file>   explicit output file name
#   -h                 show usage and exit
#   anything else      the DocBook SGML input file
# NOTE(review): the bodies of the -o / -h branches and the loop exit were
# incomplete in the reviewed copy; they are reconstructed from the USAGE
# text in the file header.
while(@ARGV) {
    my $arg = shift(@ARGV);

    if($arg eq "-o") {
        # the file name is the argument FOLLOWING the flag, not the flag
        $out_f = @ARGV ? shift(@ARGV) : '';
    } elsif($arg eq "-h") {
        usage();
        exit(0);
    } else {
        $in_f = $arg;
    }
}

if( $in_f eq '' ) {
    print "db2omf: ERROR <docbook_file>.sgml not specified.\n\n";
    usage();
    exit(1);
} elsif( !(-r $in_f) ) {
    print "db2omf: ERROR cannot read $in_f ($!)\n\n";
    usage();
    exit(1);
}

# Default output name: basename of the input with .sgml replaced by .omf.
# If the input does not end in .sgml, just append .omf instead of relying
# on a stale or undefined $1.
if( $out_f eq '' ) {
    if( $in_f =~ /([^\/]+)\.sgml$/i ) {
        $out_f = $1 . ".omf";
    } else {
        $out_f = $in_f . ".omf";
    }
}

# get today's date (YYYYMMDD); timestamp the template entries
my(@t_comps) = localtime(time());
$_today = sprintf("%04d%02d%02d",
                  $t_comps[5] + 1900, $t_comps[4] + 1, $t_comps[3]);

# read in the OMF resource template (the data section after __END__)
while(<DATA>) {
    $_res_template .= $_;
}

$_creator_template =" <creator created=\"%%DATE\">\n" .
" <person created=\"%%DATE\">\n" .
" <firstName created=\"%%DATE\">\n" .
" %%FIRSTNAME\n" .
" </firstName>\n" .
" <lastName created=\"%%DATE\">\n" .
" %%LASTNAME\n" .
" </lastName>\n" .
" <email created=\"%%DATE\">\n" .
" %%EMAIL\n" .
" </email>\n" .
" </person>\n" .
" </creator>";

$_creator_template =~ s/%%DATE/$_today/g;
$_res_template =~ s/%%DATE/$_today/g;

# Fill in the document-specific placeholders.  This has to run before the
# %%CREATOR substitution below, because proc_sgml() is what replaces the
# name/email placeholders inside $_creator_template.
proc_sgml($in_f);

$_res_template =~ s/%%CREATOR/$_creator_template/;
$_res_template =~ s/%%SGML/$in_f/;

# write out the OMF file
# (three-argument open: the file name can never be mistaken for a mode)
open(my $omf_fh, '>', $out_f) || die "db2omf: cannot write to $out_f ($!)\n";
print $omf_fh $_res_template, "\n";
# buffered write errors only surface at close, so check it
close($omf_fh) || die "db2omf: cannot write to $out_f ($!)\n";

print "db2omf: created $out_f from $in_f\n";
exit(0);

# proc_sgml($f)
#
# Reads the DocBook SGML file named by $f and fills in the document-specific
# placeholders of the global templates:
#   $_res_template     : %%TITLE, %%TYPE, %%KEYWORDS, %%DESC, %%VERSION,
#                        %%V_DATE, %%C_DATE
#   $_creator_template : %%FIRSTNAME, %%LASTNAME, %%EMAIL
# Also reads the global $_today (used to stamp keywords and as the fallback
# date).  Dies if $f cannot be opened.
#
# NOTE(review): in the copy of this sub under review, the braces opened by
# the if/elsif/else/while/foreach statements are not all closed, and some
# conditional bodies contain no statements.  Lines appear to have been lost;
# confirm against the repository version before relying on the control flow.
sub proc_sgml {
my($f) = @_;
# NOTE(review): assigning a single scalar to a my() list only initialises
# the FIRST variable ($buf here, $i on the next line); the rest stay undef.
my($buf,$vers,$auth,$tmp,$y,$m,$d,$s) = '';
my($i, $notfound) = 0;
# read in file (book/article info area only)
open(DBF, "$f") || die "db2omf: cannot open $f ($!)\n";
while(<DBF>) {
# $notfound is undef until an <article or <book tag is seen, and undef
# compares equal to 0, so this test is true up to that point.
if($notfound == 0) {
if( $_ =~ /<article/i || $_ =~ /<book/i ) {
$notfound = 1;
} else {
# line starts with '<' but does not end with '>' + newline
if($_ =~ /^</ && !($_ =~ />\n$/)) {
# closing tag of the meta-information area; the middle alternative has no
# explicit "$_ =~" but a bare match tests $_ as well
if($_ =~ /<\/artheader/i || /<\/articleinfo/i || $_ =~ /<\/bookinfo/i) {
$buf .= $_;
if($_ ne '') {
$buf .= $_;
# drop carriage returns (DOS line endings)
$buf =~ s/\r//g;
# grab title; clean it up
($s) = ($buf =~ m#<title>\s*(.*?)\s*</title>#is);
$s =~ s/\n//g;
# remove an embedded <subtitle>, then any remaining markup
$s =~ s/<subtitle>\s*(.*?)\s*<\/subtitle>//gi;
$s =~ s/<[^>]*>//gs;
# trim whitespace and periods from both ends
$s =~ s/[\s\.\ ]+$//g;
$s =~ s/^[\s\.\ ]+//g;
$s =~ s/&lowbar;/_/g;
$_res_template =~ s/%%TITLE/$s/;
# document type; based on title (or filename...)
if( $s =~ /how[\ ]*to/i || $f =~ /how[\ ]*to/i ) {
$_res_template =~ s/%%TYPE/HOWTO/;
} elsif( $s =~ /guide/i ) {
$_res_template =~ s/%%TYPE/guide/;
} else {
$_res_template =~ s/%%TYPE//;
# keywords; based on title
$s = lc($s);
# punctuation becomes a word separator
$s =~ s/[\-\;\,\:\(\)\+\/\\]+/\ /g;
my($wrd) = '';
# stop words that are removed from the keyword list
my(@wrds) = ('to', 'the', 'and', 'is', 'a',
'of', 'for', 'from', 'but', 'brief', 'at', 'how',
'with', 'on', 'off', 'via', 'list', 'la', 'le', 'mini');
foreach $wrd (@wrds) {
# only removes a stop word that has a space on BOTH sides
$s =~ s/[\ ]+$wrd[\ ]+/\ /g;
# re-join terms that the punctuation step above split apart
$s =~ s/cd\ rom/cd-rom/g;
$s =~ s/i\ o/i\/o/g;
# @wrds is reused: from here on it holds the title words, not stop words
@wrds = split(/\ /, $s);
# de-duplicate, keeping the first occurrence of each word
%seen = ();
@wrds = grep(!$seen{$_}++, @wrds);
$s = '';
foreach (sort(@wrds)) {
$s .= " <keywords created=\"${_today}\">" . $_ . "</keywords>\n";
$_res_template =~ s/%%KEYWORDS/$s/g;
# grab author/email
# XXX TODO: only grabbing one author...
($auth) = ($buf =~ m#<author>\s*(.*?)\s*</author>#is);
($s) = ($auth =~ m#<firstname>\s*(.*?)\s*</firstname>#is);
$_creator_template =~ s/%%FIRSTNAME/$s/;
($s) = ($auth =~ m#<surname>\s*(.*?)\s*</surname>#is);
$_creator_template =~ s/%%LASTNAME/$s/;
($s) = ($auth =~ m#<email>\s*(.*?)\s*</email>#is);
$_creator_template =~ s/%%EMAIL/$s/;
# grab description/abstract; clean it up
($s) = ($buf =~ m#<abstract>\s*(.*?)\s*</abstract>#is);
$s =~ s/<indexterm>\s*(.*?)\s*<\/indexterm>//gim;
$s =~ s/<[^>]*>//gs;
# flatten to one line and collapse runs of spaces before re-wrapping
$s =~ s/\n/\ /g;
$s =~ s/[\ ]{2,}/\ /g;
# wrap() comes from Text::Wrap
$tmp = wrap(' ', ' ', $s);
$_res_template =~ s/%%DESC/$tmp/;
# grab rev/date; take the last entry
# XXX TODO: sometimes <revhistory> resides outside the meta area...
# From here on $tmp holds the raw date text and $vers the version string.
$vers = '';
$tmp = '';
($tmp) = ($buf =~ m#<pubdate[^>]*>\s*(.*?)\s*</pubdate>#is);
if( $tmp eq '' || $buf =~ /<revhistory>/i ) {
# grab rev/date; take the first entry, or should it be the last?!
if( $buf =~ m#<revnumber>\s*(.*?)\s*</revnumber>#gis ) {
$vers = $1;
if( $buf =~ m#<date>\s*(.*?)\s*</date>#gis ) {
$tmp = $1;
} elsif( $buf =~ m#<releaseinfo>\s*(.*?)\s*</releaseinfo>#gis ) {
$tmp = $1;
if( $vers eq '' && $tmp ne '' ) {
# pull the version out of the <{pub}date> specification
if( $tmp =~ /[Rr]+evision:[\ ]+([\w\.\-]+)[,\ ]+/ ) {
$vers = $1;
} elsif( $tmp =~ /[Vv]+ersion[\ ]+([\w\.\-]+)[,\ ]+/ ) {
$vers = $1;
} elsif( $tmp =~ /[Vv]+([\w\.\-]+)[,\ ]+/ ) {
$vers = $1;
} elsif( $tmp =~ /[Vv\.\,]+\ ([\w\.\-]+)[,\ ]+/ ) {
$vers = $1;
} elsif( $tmp =~ /[Vv]+er[\.\ ]+([\w]+)\-([\w]+)\-([\w]+)\-([\w]+)/ ) {
$vers = $4;
} elsif( $tmp =~ /^([\d\.]+)[,\ ]+/ ) {
$vers = $1;
# NOTE(review): the two matches in the condition below have no operator
# between them in this copy - presumably "||" was lost; confirm.
if( $tmp =~ /[Vv]+[\,\ ]+[\d]+[\ ]+[A-Za-z]+[\ ]+[\d]{4}/
$tmp =~ /draft[\,\ ]+/i ) {
# date is the version, so null it
$vers = '';
# version is missing, has no digit, or contains four digits in a row
if( $vers eq '' || !($vers =~ /\d+/) || $vers =~/\d\d\d\d/ ) {
# a fallback, rarely needed...
# NOTE(review): $f is interpolated into a shell command line here.
if( open(HTF_REV, "grep -i '<revnumber>' $f |") ) {
# scalar context: only the first matching line is read
$vers = <HTF_REV>;
$vers =~ s/<[^>]*>//gs;
$vers =~ s/^[Vv]//;
$vers =~ s/^[\s\.\,\;\ \"\>]+//g;
if( $vers eq '' ) {
($vers) = ($buf =~ m#<edition>\s*(.*?)\s*</edition>#is);
# strip a leading-word such as "revision"/"version"/"ver"/"v"
$vers =~ s/revision//i;
$vers =~ s/version//i;
$vers =~ s/ver//i;
$vers =~ s/v//i;
$vers =~ s/^[\s\.\,\;\ \"\>]+//g;
if( $vers eq '' ) {
# nothing usable found; emit a visibly uncertain default
$vers = '1.0(?)';
} else {
$vers =~ s/^[Vv]//;
$vers =~ s/^\.//;
# Normalise the date text into $y (year), $m (month), $d (day).
# First turn ordinal suffixes ("3rd ", "21st,") into ", ".
$tmp =~ s/(?:rd|nd|st|th)[\ \,]+/,\ /g;
# YYYY/MM/DD or YYYY-MM-DD
if( $tmp =~ /(\d\d\d\d)[\/\-]+(\d\d)[\/\-]+(\d\d)/ ) {
$y = $1;
$m = $2;
$d = $3;
# word, month name, day, time, word, year
} elsif( $tmp =~ /\w+[\ ]+([\w]+)[\ ]+([\d]+)[\ ]+[\d\:]+[\ ]+\w+[\ ]+(\d\d\d\d)/ ) {
$y = $3;
$m = &get_date($1);
$d = $2;
if( length($d) == 1 ) {
$d = "0$d";
# month name, day, year
} elsif( $tmp =~ /(\w+)[\.,\ ]+(\d+)[,\ ]+(\d\d\d\d)/ ) {
$m = &get_date($1);
$d = $2;
$y = $3;
if( length($d) == 1 ) {
$d = "0$d";
# month name and year; day defaults to 01 unless a leading day is found
} elsif( $tmp =~ /[\ ]+(\w+)[\,\.\ ]+(\d\d\d\d)/ ) {
$y = $2;
$m = &get_date($1);
$d = '01';
if( $tmp =~ /(\d+)[\ ]+[\w\.]+[\ ]+\d\d\d\d/ ) {
$d = $1;
if( length($d) == 1 ) {
$d = "0$d";
# numeric with a 4-digit year last; the first number is taken as the month
} elsif( $tmp =~ /(\d+)[\/\.]+(\d+)[\/\.]+(\d\d\d\d)/ ) {
$y = $3;
$m = $1;
if( length($m) == 1 ) {
$m = "0$m";
$d = $2;
if( length($d) == 1 ) {
$d = "0$d";
# "ver. <year>-<month>-<day>-<version>" style release strings
} elsif( $tmp =~ /[Vv]+er[\.\ ]+([\w]+)\-([\w]+)\-([\w]+)\-([\w]+)/ ) {
$y = $1;
$m = &get_date($2);
if( length($m) == 1 ) {
$m = "0$m";
$d = $3;
if( length($d) == 1 ) {
$d = "0$d";
# YYYY-<month name>-DD
} elsif( $tmp =~ /(\d\d\d\d)\-([\w]+)\-(\d\d)/ ) {
$y = $1;
$m = &get_date($2);
$d = $3;
if( length($d) == 1 ) {
$d = "0$d";
# day, month name, year
} elsif( $tmp =~ /([\d]+)[\ ]+([\w]+)[,\ ]+(\d\d\d\d)/ ) {
$y = $3;
$m = &get_date($2);
$d = $1;
if( length($d) == 1 ) {
$d = "0$d";
# month name and year only
} elsif( $tmp =~ /([\w]+)[\ \,]+(\d\d\d\d)/ ) {
$y = $2;
$m = &get_date($1);
$d = '01';
# Sanity checks: an implausible year or an unrecognised month ('00' from
# get_date) blanks $y, which selects the $_today fallback below.
if( !($y =~ /^2/ || $y =~ /^1/) ) {
$y = '';
if( !($y =~ /^\d\d\d\d$/) ) {
$y = '';
if( $m eq '00' ) {
$y = '';
$s = "${y}${m}${d}";
if( $y eq '' ) {
$s = $_today;
$_res_template =~ s/%%VERSION/$vers/;
# the same date string is used for both the version and creation dates
$_res_template =~ s/%%V_DATE/$s/;
$_res_template =~ s/%%C_DATE/$s/;
# get_date($str)
#
# Maps an English month name to its two-digit month number.
#   $str    : month name or abbreviation; only the first three characters
#             are significant and case is ignored ("Jan", "january",
#             "SEPT" ...).
#   returns : '01' .. '12', or '00' when $str is not a recognised month
#             (callers treat '00' as "no usable date").
sub get_date {
    my($str) = @_;

    my %month_num = (
        'jan' => '01', 'feb' => '02', 'mar' => '03', 'apr' => '04',
        'may' => '05', 'jun' => '06', 'jul' => '07', 'aug' => '08',
        'sep' => '09', 'oct' => '10', 'nov' => '11', 'dec' => '12',
    );

    # An undefined argument is treated like an empty string.
    my $key = lc(substr(defined($str) ? $str : '', 0, 3));

    return exists($month_num{$key}) ? $month_num{$key} : '00';
}

# usage()
#
# Prints the command-line synopsis and the script version (taken from the
# global $_vrs) to standard output.  Takes no arguments.
sub usage {
    print "\ndb2omf {-o <output_file>} <docbook_file>.sgml\n\n",
          "db2omf - version $_vrs\n\n";
}

# __END__ delimits the script from the data section;
# the following data is read in by the "DATA" filehandle
__END__
<?xml version='1.0'?>
<!DOCTYPE omf PUBLIC "-//Open Source Metadata Framework (OMF) //DTD OMF.dtd V1.1//EN"
<omf xmlns="" created="%%DATE" agent="db2omf">
<resource created="%%DATE">
<title created="%%DATE">%%TITLE</title>
<date created="%%DATE">%%C_DATE</date>
<versionGroup created="%%DATE">
<version created="%%DATE">
<id created="%%DATE">
<date created="%%DATE">
<description created="%%DATE">
<type created="%%DATE">%%TYPE</type>
<format created="%%DATE" dtd="DocBook" mime="text/sgml" />
<identifier created="%%DATE" url="file:/%%SGML" />