#!/usr/bin/perl -w

# churl - tchrist@perl.com
# v0.1 (prototype)
# 
# extract urls and verify validity.
# only looks for FTP, HTTP, and FILE schemata,
# stored in A or IMG tags.
#
# retrieve Perl source from 
#   http://www.perl.com/CPAN/src/5.0/latest.tar.gz
# retrieve the LWP library from
#   http://www.perl.com/cgi-bin/cpan_mod?module=LWP

# Minimum interpreter version needed by the LWP/URI modules below.
require 5.002;

# Usage check in a BEGIN block so we bail out at compile time, before
# the relatively heavy LWP and HTML modules are even loaded.
BEGIN { die "usage: $0 URL ...\n" unless @ARGV } 

use strict;
use URI::URL;                    # url() constructor; ->abs, ->scheme, ->as_string
use HTML::Parse qw(parse_html);  # NOTE(review): long deprecated; HTML::TreeBuilder is the modern replacement
use LWP::Simple qw(get head);    # procedural GET (page body) and HEAD (liveness probe)

# Unbuffer STDOUT so each URL's status is shown as soon as it is checked,
# not when the whole run finishes.
$| = 1;

# Only these URL schemes are worth probing with a HEAD request;
# anything else (mailto:, news:, ...) is reported as SKIPPED.
my %check_this = map { ($_ => 1) } qw(file ftp http);

# %saw spans the whole run so a URL that appears on several of the
# argument pages is checked and reported only once.
my %saw;

foreach my $base ( @ARGV ) {
    print "$base:\n";

    # Fetch the page ourselves so a failed GET produces a useful
    # message instead of handing undef straight to the HTML parser.
    my $doc = get($base);
    defined $doc or die "$0: can't fetch $base\n";

    my $ht_tree = parse_html($doc)
        or die "$0: can't parse document from $base\n";

    foreach my $linkpair ( @{ $ht_tree->extract_links(qw<a img>) } ) {
        my ($link, $elem) = @$linkpair;
        my $url = url($link, $base);    # XXX not real base

        # Absolutize relative links.  abs() can die on malformed URLs,
        # so guard it and fall back to the raw link text for dedup.
        my $fqurl = eval { $url->abs->as_string };
        $fqurl = $url->as_string unless defined $fqurl;
        next if $saw{$fqurl}++;

        print "  $url:  ";

        # Guard this abs() call too: the original called
        # $url->abs->scheme unprotected and could die mid-run.
        my $scheme = eval { $url->abs->scheme };
        if ( defined $scheme and $check_this{ lc $scheme } ) {
            # Any headers back from HEAD means the URL resolves.
            my @headers = head($fqurl);
            print @headers ? "OK" : "BAD";
        } else {
            print "SKIPPED";
        }
        print "\n";
    }

    # Old HTML::Parse/HTML::Element trees are self-referential and are
    # not garbage-collected; delete explicitly or long runs leak memory.
    $ht_tree->delete;
}