mirror of https://github.com/tLDP/LDP
398 lines
9.6 KiB
Perl
Executable File
398 lines
9.6 KiB
Perl
Executable File
#!/usr/bin/perl -W
|
|
|
|
# (P) & (C) 2006 by Peter Bieringer <pb at bieringer dot de>
|
|
|
|
# This program extracts URLs from a Lyx file and checks them
|
|
|
|
# 20060822/PB: major improvement, add support for persistent XML database
|
|
# 20081109/PB: enhancement to detect URLs in newer lyx file format
|
|
# 20090214/PB: detect IPv6 addresses in brackets and remove brackets, otherwise it won't work
|
|
# 20131112/PB: support newer Perl
|
|
|
|
use strict;
|
|
use Net::HTTP;
|
|
use Net::FTP;
|
|
use Net::NNTP;
|
|
use Crypt::SSLeay;
|
|
use LWP::UserAgent;
|
|
use XML::Dumper;
|
|
use Socket qw(NI_NUMERICSERV NI_NUMERICHOST getaddrinfo inet_pton);
|
|
|
|
my $debug = 0xffff & ~(0x20);
|
|
|
|
my %urls;
|
|
my $p_urls = \%urls;
|
|
|
|
my %hosts;
|
|
|
|
my $time = time;
|
|
|
|
my $dbfile;
|
|
|
|
my $dbfile_suffix = ".url-database.xml";
|
|
|
|
|
|
sub quote($) {
|
|
$_[0] =~ s/\`/#60/g;
|
|
$_[0] =~ s/[\200-\377]/\?/g;
|
|
|
|
return $_[0];
|
|
};
|
|
|
|
sub extract_urls($) {
|
|
|
|
my ($url, $desc);
|
|
|
|
print STDERR "DEBUG/extract_urls: open file: $_[0]\n";
|
|
|
|
open FILE, "<" . $_[0] || die "ERROR : can't open file: " . $_[0];
|
|
|
|
my $linecounter = 0;
|
|
while (<FILE>) {
|
|
$linecounter++;
|
|
|
|
chomp $_;
|
|
|
|
if ($_ =~ /LatexCommand \\url\[([^]]*)\]{([^}]*)}/) {
|
|
|
|
$desc = $1;
|
|
$url = $2;
|
|
|
|
print STDERR "DEBUG/extract_urls: desc='$desc' URL=$url\n" if ($debug & 0x10);
|
|
|
|
if (defined $$p_urls{$url}->{'line'}) {
|
|
print STDERR "DEBUG/extract_urls: URL already found earlier - skip\n" if ($debug & 0x10);
|
|
|
|
if ($$p_urls{$url}->{'time'} == $time) {
|
|
|
|
} else {
|
|
# from database, update now
|
|
$$p_urls{$url}->{'time'} = $time;
|
|
$$p_urls{$url}->{'line'} = $linecounter;
|
|
$$p_urls{$url}->{'desc'} = quote($desc);
|
|
};
|
|
next;
|
|
} else {
|
|
$$p_urls{$url}->{'desc'} = quote($desc);
|
|
$$p_urls{$url}->{'time'} = $time;
|
|
$$p_urls{$url}->{'line'} = $linecounter;
|
|
};
|
|
|
|
$url = "";
|
|
$desc = "";
|
|
|
|
} elsif ($_ =~ /name \"([^"]*)\"/) {
|
|
# name "IPv6 & Linux - HowTo"
|
|
$desc = $1;
|
|
|
|
} elsif ($_ =~ /target \"([^"]*)\"/) {
|
|
# target "http://www.bieringer.de/linux/IPv6/"
|
|
$url = $1;
|
|
|
|
print STDERR "DEBUG/extract_urls: desc='$desc' URL=$url\n" if ($debug & 0x10);
|
|
|
|
if (defined $$p_urls{$url}->{'line'}) {
|
|
print STDERR "DEBUG/extract_urls: URL already found earlier - skip\n" if ($debug & 0x10);
|
|
|
|
if ($$p_urls{$url}->{'time'} == $time) {
|
|
|
|
} else {
|
|
# from database, update now
|
|
$$p_urls{$url}->{'time'} = $time;
|
|
$$p_urls{$url}->{'line'} = $linecounter;
|
|
$$p_urls{$url}->{'desc'} = quote($desc);
|
|
};
|
|
next;
|
|
} else {
|
|
$$p_urls{$url}->{'desc'} = quote($desc);
|
|
$$p_urls{$url}->{'time'} = $time;
|
|
$$p_urls{$url}->{'line'} = $linecounter;
|
|
};
|
|
};
|
|
};
|
|
|
|
close(FILE);
|
|
};
|
|
|
|
sub load_urls() {
|
|
if (! -f $dbfile) {
|
|
print STDERR "DEBUG/load_urls: database file doesn't exist, skip load: $dbfile\n" if ($debug & 0x10);
|
|
return 2;
|
|
};
|
|
|
|
my $dump = new XML::Dumper;
|
|
print STDERR "DEBUG/load_urls: load database file: $dbfile\n" if ($debug & 0x10);
|
|
$p_urls = $dump->xml2pl($dbfile);
|
|
};
|
|
|
|
sub store_urls() {
|
|
my $dump = new XML::Dumper;
|
|
$dump->pl2xml($p_urls, $dbfile);
|
|
};
|
|
|
|
sub cleanup_old_urls() {
|
|
for my $url (keys %$p_urls) {
|
|
if ($$p_urls{$url}->{'time'} < $time) {
|
|
print STDERR "DEBUG/cleanup_old_urls: remove old URL from database: $url\n" if ($debug & 0x10);
|
|
my $p_h = $$p_urls{$url};
|
|
delete $$p_urls{$url};
|
|
};
|
|
};
|
|
};
|
|
|
|
sub check_ipv6only($$) {
|
|
print STDERR "DEBUG/check_ipv6only: begin\n" if ($debug & 0x10);
|
|
print STDERR "DEBUG/check_ipv6only: check: " . $_[0] . " on port " . $_[1] . "\n" if ($debug & 0x10);
|
|
|
|
my ($family, $socktype, $proto, $saddr, $canonname, @res, $err);
|
|
my ($host, $port);
|
|
|
|
$family = -1;
|
|
|
|
if ($_[0] =~ /^\[([0-9a-fA-F:]+)\]$/) {
|
|
# Strip [...]
|
|
$host = $1;
|
|
$port = $_[1];
|
|
|
|
print STDERR "DEBUG/check_ipv6only: host: " . $host . "\n" if ($debug & 0x10);
|
|
|
|
socket(Socket_Handle, PF_INET6, SOCK_STREAM, 0) || return 1;
|
|
$saddr = pack_sockaddr_in6($port, inet_pton(AF_INET6, $host));
|
|
connect(Socket_Handle, $saddr) && return 0;
|
|
close(Socket_Handle);
|
|
} else {
|
|
my %hints = ( socktype => SOCK_STREAM, family => AF_INET6);
|
|
($err, @res) = getaddrinfo($_[0], "echo", \%hints);
|
|
|
|
if (defined $err) {
|
|
print STDERR "ERROR/check_ipv6only: getaddrinfo fails: $err\n" if ($debug & 0x10);
|
|
return 1;
|
|
};
|
|
if (scalar(@res) < 5) {
|
|
print STDERR "ERROR/check_ipv6only: getaddrinfo fails\n" if ($debug & 0x10);
|
|
return 1;
|
|
};
|
|
|
|
$family = -1;
|
|
|
|
while (scalar(@res) >= 5) {
|
|
($family, $socktype, $proto, $saddr, $canonname, @res) = @res;
|
|
|
|
($host, $port) = getnameinfo($saddr, NI_NUMERICHOST | NI_NUMERICSERV);
|
|
|
|
print STDERR "Trying to connect to $host port $port...\n";
|
|
|
|
socket(Socket_Handle, $family, $socktype, $proto) || next;
|
|
connect(Socket_Handle, $saddr) && last;
|
|
|
|
close(Socket_Handle);
|
|
$family = -1;
|
|
};
|
|
|
|
if ($family != -1) {
|
|
print STDERR "connected to $host port $port\n";
|
|
close(Socket_Handle);
|
|
return 0;
|
|
} else {
|
|
warn "connect attempt failed\n";
|
|
return 1;
|
|
};
|
|
};
|
|
};
|
|
|
|
sub check_urls() {
|
|
print STDERR "DEBUG/check_urls: begin\n" if ($debug & 0x10);
|
|
|
|
for my $url (sort keys %$p_urls) {
|
|
if (defined $$p_urls{$url}->{'checktime'}) {
|
|
if ($$p_urls{$url}->{'checktime'} > $time - 60*60*24*7) {
|
|
if (defined $$p_urls{$url}->{'checkresult'} && $$p_urls{$url}->{'checkresult'} =~ /^ok/) {
|
|
# Checked during last 7 days - skip
|
|
print STDERR "DEBUG/check_urls: checked during last 7 days - skip: $url\n" if ($debug & 0x10);
|
|
next;
|
|
};
|
|
};
|
|
};
|
|
|
|
print STDERR "DEBUG/check_urls: check now: $url\n" if ($debug & 0x10);
|
|
|
|
my ($host, $port);
|
|
|
|
my $desc = $$p_urls{$url}->{'desc'};
|
|
|
|
my $status = "undef";
|
|
# Extract host
|
|
my ($proto, $hostport, $uri) = $url =~ /^([^:]+):\/\/([^\/]+)(.*)$/;
|
|
|
|
if ($hostport =~ /^([^:]):([0-9]+)$/) {
|
|
$host = $1;
|
|
$port = $2;
|
|
} else {
|
|
$host = $hostport;
|
|
if ($proto eq "http") {
|
|
$port = 80;
|
|
} elsif ($proto eq "ftp") {
|
|
$port = 21;
|
|
} elsif ($proto eq "nntp") {
|
|
$port = 119;
|
|
} elsif ($proto eq "https") {
|
|
$port = 443;
|
|
};
|
|
};
|
|
|
|
# Strip trailing #
|
|
$uri =~ s/#.*//;
|
|
|
|
if (length($uri) == 0) {
|
|
$status = "URI is empty";
|
|
goto ("LABEL_PRINT");
|
|
};
|
|
|
|
my $s;
|
|
|
|
if ($proto eq "ftp") {
|
|
# Check FTP
|
|
print STDERR "DEBUG/check_urls: open FTP connection: $host:$port\n" if ($debug & 0x20);
|
|
$s = Net::FTP->new(Host => $host, Port => $port, Timeout => 30, Passive => 1);
|
|
|
|
if (! defined $s) {
|
|
$status = "Host not found";
|
|
if (! check_ipv6only($host,$port)) {
|
|
$status = "ok (IPv6 only)";
|
|
};
|
|
goto ("LABEL_PRINT");
|
|
};
|
|
|
|
if (!$s->login("anonymous",'-anonymous@')) {
|
|
$status = "FTP anonymous login failed";
|
|
goto ("LABEL_PRINT");
|
|
};
|
|
|
|
if (!$s->cwd($uri)) {
|
|
$status = "FTP can't change to directory $uri";
|
|
goto ("LABEL_PRINT");
|
|
};
|
|
|
|
$status = "ok";
|
|
$s->quit;
|
|
|
|
} elsif ($proto eq "nntp") {
|
|
my $s = Net::NNTP->new(Host => $host, Timeout => 30);
|
|
|
|
if (! defined $s) {
|
|
$status = "Host not found";
|
|
if (! check_ipv6only($host,$port)) {
|
|
$status = "ok (IPv6 only)";
|
|
};
|
|
goto ("LABEL_PRINT");
|
|
};
|
|
$status = "ok";
|
|
|
|
$s->quit;
|
|
|
|
} elsif ($proto eq "https") {
|
|
my $ua = new LWP::UserAgent;
|
|
my $req = new HTTP::Request('HEAD', $url);
|
|
my $res = $ua->request($req);
|
|
|
|
my $code = $res->code;
|
|
|
|
if ($code !~ /^[23]/) {
|
|
$status = "HTTPS reports: $code";
|
|
} else {
|
|
$status = "ok";
|
|
};
|
|
} elsif ($proto eq "http") {
|
|
# Check HTTP
|
|
print STDERR "DEBUG/check_urls: open HTTP connection: $host:$port\n" if ($debug & 0x20);
|
|
$s = Net::HTTP->new(Host => $host, PeerPort => $port, Timeout => 30);
|
|
if (! defined $s) {
|
|
$status = "Host not found";
|
|
if (! check_ipv6only($host,$port)) {
|
|
$status = "ok (IPv6 only)";
|
|
};
|
|
goto ("LABEL_PRINT");
|
|
};
|
|
|
|
print STDERR "DEBUG/check_urls: send HEAD request: $uri\n" if ($debug & 0x20);
|
|
if ($s->write_request(HEAD => $uri, 'User-Agent' => "Mozilla/5.0") == 0) {
|
|
$status = "trouble with uri";
|
|
goto ("LABEL_PRINT");
|
|
};
|
|
|
|
print STDERR "DEBUG/check_urls: wait for response\n" if ($debug & 0x20);
|
|
my($code, $mess, %h) = $s->read_response_headers;
|
|
|
|
print STDERR "DEBUG/check_urls: check response\n" if ($debug & 0x10);
|
|
if ($code !~ /^[23]/) {
|
|
$status = "HTTP reports: $code";
|
|
} else {
|
|
$status = "ok";
|
|
};
|
|
};
|
|
LABEL_PRINT:
|
|
if ($status ne "ok") {
|
|
print "desc='$desc' URL=$url proto=$proto host=$host port=$port uri='$uri'";
|
|
print " status=$status\n\n";
|
|
die;
|
|
};
|
|
LABEL_END:
|
|
$$p_urls{$url}->{'checktime'} = $time;
|
|
$$p_urls{$url}->{'checkresult'} = $status;
|
|
undef $s;
|
|
store_urls();
|
|
};
|
|
};
|
|
|
|
|
|
sub report_urls() {
|
|
print STDERR "DEBUG/report_urls: begin\n" if ($debug & 0x10);
|
|
|
|
for my $url (sort { $$p_urls{$a}->{'line'} <=> $$p_urls{$b}->{'line'} } ( keys %$p_urls)) {
|
|
if ($$p_urls{$url}->{'checkresult'} =~ /^ok/) {
|
|
next;
|
|
};
|
|
|
|
print "NOTICE: URL has a problem: $url\n";
|
|
print " Description : " . $$p_urls{$url}->{'desc'} . "\n";
|
|
print " Line number : " . $$p_urls{$url}->{'line'} . "\n";
|
|
print " Result : " . $$p_urls{$url}->{'checkresult'} . "\n";
|
|
print "\n";
|
|
};
|
|
};
|
|
|
|
|
|
sub check_rfc_urls() {
|
|
print STDERR "DEBUG/check_rfc_urls: begin\n" if ($debug & 0x10);
|
|
|
|
for my $url (sort { $$p_urls{$a}->{'line'} <=> $$p_urls{$b}->{'line'} } ( keys %$p_urls)) {
|
|
if ($url =~ /rfc[0-9]{1,4}/) {
|
|
print "NOTICE: URL points to RFC: $url\n";
|
|
print " Description : " . $$p_urls{$url}->{'desc'} . "\n";
|
|
print " Line number : " . $$p_urls{$url}->{'line'} . "\n";
|
|
print "\n";
|
|
};
|
|
};
|
|
};
|
|
|
|
|
|
##### Main
|
|
|
|
if (! defined $ARGV[0] || $ARGV[0] eq "") {
|
|
die "Missing file name (arg1)";
|
|
};
|
|
|
|
if (! -f $ARGV[0]) {
|
|
die "Argument 1 is not an existing file: " . $ARGV[0];
|
|
};
|
|
|
|
$dbfile = $ARGV[0] . $dbfile_suffix;
|
|
|
|
load_urls();
|
|
extract_urls($ARGV[0]);
|
|
cleanup_old_urls();
|
|
check_urls();
|
|
store_urls();
|
|
check_rfc_urls();
|
|
report_urls();
|