#!/usr/bin/perl -w
# churl - tchrist@perl.com
# v0.1 (prototype)
#
# Extract URLs from HTML documents and verify their validity.
# Only looks for FTP, HTTP, and FILE schemes, stored in A or IMG tags.
#
# retrieve Perl source from
#   http://www.perl.com/CPAN/src/5.0/latest.tar.gz
# retrieve the LWP library from
#   http://www.perl.com/cgi-bin/cpan_mod?module=LWP

require 5.002;
BEGIN { die "usage: $0 URL ...\n" unless @ARGV }

use strict;
use URI::URL;
use HTML::Parse qw(parse_html);
use LWP::Simple qw(get head);

$| = 1;    # unbuffer STDOUT so each link's status appears as it is checked

# Schemes we know how to HEAD-check; anything else is reported SKIPPED.
my %check_this = map { $_ => 1 } qw(file ftp http);

my %saw;    # fully-qualified URLs already reported (dedup across all docs)

foreach my $base (@ARGV) {
    print "$base:\n";

    # Fetch and parse the document; get() returns undef on failure,
    # which parse_html() would turn into an empty/false tree.
    my $ht_tree = parse_html(get($base))
        or die "no doc: $base\n";

    # extract_links() returns [link, element] pairs for the given tags.
    # FIX: the original qw list was garbled (bare "qw)"), breaking
    # compilation; A and IMG are the tags this tool is documented to scan.
    foreach my $linkpair (@{ $ht_tree->extract_links(qw(a img)) }) {
        my ($link, $elem) = @$linkpair;

        # Resolve the link relative to the document URL.
        my $url = url($link, $base);    # XXX not real base

        # Canonical absolute form for dedup; fall back to the raw string
        # if absolutizing dies (e.g. malformed URL).
        my $fqurl = eval { $url->abs->as_string } || $url->as_string;
        next if $saw{$fqurl}++;

        print " $url: ";

        # Guard scheme extraction the same way: a malformed URL must not
        # kill the whole run after we already printed its name.
        my $scheme = eval { lc $url->abs->scheme } || '';
        if ($check_this{$scheme}) {
            # head() returns the header list on success, empty on failure.
            my @headers = head($fqurl);
            print @headers ? "OK" : "BAD";
        }
        else {
            print "SKIPPED";
        }
        print "\n";
    }
}