(1) For example:
#!/opt/gnu/bin/perl
use strict;
$|++;
use LWP::UserAgent;
use URI;
use HTML::LinkExtor;
use CGI;
#
# LinkCheck.pl
# Accept a CGI parameter url and validate
the links appear in the page
# of the URL.
# Kwok-Bun Yue September 20, 2001
#
# Get user parameter: url.
my $q = new CGI;
my $url = $q->param("url");
# Use LWP::UserAgent to get the page.
my $ua = new LWP::UserAgent;
my $response = $ua->request(new HTTP::Request('GET',
$url));
my $page = $response->content();
my $baseUrl = "";
if ($response->is_error()) {
badUrlPage();
} else {
$baseUrl = $response->base();
validateLinkPage($url, $page, $baseUrl);
}
exit 0;
# Error page: bad URL.
sub badUrlPage {
print $q->header;
print <<__BAD_URL_PAGE;
<html>
<head>
<title>Link Validator: Bad URL</title>
<meta http-equiv="Content-Type" content="text/html;
charset=iso-8859-1">
<link rel="stylesheet" href="LinkChecker.css">
</head>
<body bgcolor="#FFFFFF">
<h3>Link Validation Result</h3>
<p>Thank you for using our services. Unfortunately,
the submitted link
<span class="lightEmp">$url</span>
is either not an URL address or
is not available.
</body>
</html>
__BAD_URL_PAGE
} # badUrl
sub validateLinkPage {
my $NA = "unknown size";
my ($url, $page, $baseUrl) = @_;
# Parse links.
my $currLink = '';
my @links = ();
my $parser = HTML::LinkExtor->new;
$parser->parse($page);
my @parsed_tags = $parser->links();
foreach (@parsed_tags) {
my ($tag, %attr) = @$_;
push(@links, $attr{'href'})
if $tag eq 'a';
}
my $numLinks = 0;
my $numGoodLinks = 0;
my $numBadLinks = 0;
# Sizes, if available, of good links.
my %goodLinkSizes = ();
# Number of occurrences of the bad
links.
my %badLinkCounts = ();
my $ua = new LWP::UserAgent;
# Process links.
foreach (@links) {
# Skip mailto or javascript links.
next if (/^mailto:/i || /^javascript:/i);
# Assume good link syntax.
$numLinks++;
$currLink = /^http:/i ? $_ : new_abs
URI($_, $url)->as_string;
( $numGoodLinks++, next ) if $goodLinkSizes{$currLink};
( $numBadLinks++, next ) if $goodLinkSizes{$currLink};
my $response = $ua->request(new HTTP::Request('GET',
$currLink));
if ($response->is_success) {
$numGoodLinks++;
my $header = $response->headers_as_string;
$goodLinkSizes{$currLink} =
length $response->content();
#if ($header =~ /Content-Length:\s*(\d+)/si)
{
# $goodLinkSizes{$currLink}
= $1;
#}
#else {
# $goodLinkSizes{$currLink}
= $NA;
#}
}
else {
$numBadLinks++;
$badLinkCounts{$currLink}++;
}
}
print $q->header;
print <<__HTML_HEAD;
<html>
<head>
<title>Link Validator</title>
<meta http-equiv="Content-Type" content="text/html;
charset=iso-8859-1">
<link rel="stylesheet" href="LinkChecker.css">
</head>
<body bgcolor="#FFFFFF">
<h3>Link Validation Result</h3>
<p>Thank you for using our services. For
the submitted link <span class="lightEmp">$url</span>,
__HTML_HEAD
if (!$numLinks) {
print <<__NO_LINK_HTML_FOOTER;
there is no link in the page</p>
<html>
__NO_LINK_HTML_FOOTER
return;
}
# print statistics
my $numDistinctLinks = (scalar keys
%goodLinkSizes) + (scalar keys %badLinkCounts);
my $numDistinctGoodLinks = scalar keys
%goodLinkSizes;
my $numDistinctBadLinks = scalar keys
%badLinkCounts;
my $goodLinkPercent = sprintf("%.1f",
$numGoodLinks * 100/ $numLinks);
my $badLinkPercent = sprintf("%.1f",
$numBadLinks * 100/ $numLinks);
print <<__STATISTICS;
there are:</p>
<ul>
<li>$numLinks links.</li>
<li>$numDistinctLinks distinct
links.</li>
<li>$numGoodLinks good links ($goodLinkPercent%).</li>
<li>$numDistinctGoodLinks distinct
good links.</li>
<li>$numBadLinks broken links ($badLinkPercent%).</li>
<li>$numDistinctBadLinks distinct
broken links.</li>
</ul>
__STATISTICS
# print distinct good links.
if ($numDistinctGoodLinks) {
print <<__GOOD_LINKS_HEAD;
<h5>Distinct good links with document
sizes:</h5>
<ul>
__GOOD_LINKS_HEAD
foreach (sort keys %goodLinkSizes)
{
print " <li><a href=\"$_\">$_</a>:
$goodLinkSizes{$_} Bytes.</li>\n";
}
print "</ul>\n";
}
else {
print "There is no good link.<br><br>\n";
}
# print distinct bad links.
if ($numDistinctBadLinks) {
print <<__BAD_LINKS_HEAD;
<h5>Distinct bad links:</h5>
<ul>
__BAD_LINKS_HEAD
foreach (sort keys %badLinkCounts)
{
print " <li>$_.</li>\n";
}
print "</ul>\n";
}
else {
print "There is no bad link.<br><br>\n";
}
print "</body>\n</html>\n";
} # validateLinkPage