CSCI 4230
Internet Applicaiton Development
Fall 2001
Suggested Solution to Homework #4

(1)    For example:

#!/opt/gnu/bin/perl
use strict;
$|++;
use LWP::UserAgent;
use URI;
use HTML::LinkExtor;
use CGI;
#
# LinkCheck.pl
# Accept a CGI parameter url and validate the links appear in the page
# of the URL.
# Kwok-Bun Yue September 20, 2001
#

# Get user parameter: url.
my $q = new CGI;
my $url = $q->param("url");

# Use LWP::UserAgent to get the page.
my $ua = new LWP::UserAgent;
my $response = $ua->request(new HTTP::Request('GET', $url));
my $page = $response->content();
my $baseUrl = "";

if ($response->is_error()) {
 badUrlPage();
} else {
 $baseUrl = $response->base();
 validateLinkPage($url, $page, $baseUrl);
}
exit 0;

# Error page: bad URL.
sub badUrlPage {
 print $q->header;
 print <<__BAD_URL_PAGE;
<html>
<head>
<title>Link Validator: Bad URL</title>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<link rel="stylesheet" href="LinkChecker.css">
</head>

<body bgcolor="#FFFFFF">
<h3>Link Validation Result</h3>
<p>Thank you for using our services. Unfortunately, the submitted link
<span class="lightEmp">$url</span> is either not an URL address or
is not available.
</body>
</html>
__BAD_URL_PAGE
} # badUrl
 

sub validateLinkPage {
 my $NA = "unknown size";
 my ($url, $page, $baseUrl) = @_;

 #    Parse links.
 my $currLink = '';
 my @links = ();
 my $parser = HTML::LinkExtor->new;
 $parser->parse($page);
 my @parsed_tags = $parser->links();
 foreach (@parsed_tags) {
    my ($tag, %attr) = @$_;
    push(@links, $attr{'href'}) if $tag eq 'a';
 }

 my $numLinks = 0;
 my $numGoodLinks = 0;
 my $numBadLinks = 0;
 # Sizes, if available, of good links.
 my %goodLinkSizes = ();
 # Number of occurrences of the bad links.
 my %badLinkCounts = ();
 my $ua = new LWP::UserAgent;

 #   Process links.
 foreach (@links) {
  # Skip mailto or javascript links.
  next if (/^mailto:/i || /^javascript:/i);
  # Assume good link syntax.
  $numLinks++;
  $currLink = /^http:/i ? $_ : new_abs URI($_, $url)->as_string;

  ( $numGoodLinks++, next ) if $goodLinkSizes{$currLink};
  ( $numBadLinks++, next ) if $goodLinkSizes{$currLink};

  my $response = $ua->request(new HTTP::Request('GET', $currLink));
  if ($response->is_success) {
   $numGoodLinks++;
   my $header = $response->headers_as_string;
   $goodLinkSizes{$currLink} = length $response->content();
   #if ($header =~ /Content-Length:\s*(\d+)/si) {
   # $goodLinkSizes{$currLink} = $1;
   #}
   #else {
   # $goodLinkSizes{$currLink} = $NA;
   #}
  }
  else {
   $numBadLinks++;
   $badLinkCounts{$currLink}++;
  }
 }

 print $q->header;
 print <<__HTML_HEAD;
<html>
<head>
<title>Link Validator</title>
<meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1">
<link rel="stylesheet" href="LinkChecker.css">
</head>

<body bgcolor="#FFFFFF">
<h3>Link Validation Result</h3>
<p>Thank you for using our services. For the submitted link <span class="lightEmp">$url</span>,
__HTML_HEAD

 if (!$numLinks) {
  print <<__NO_LINK_HTML_FOOTER;
there is no link in the page</p>
<html>
__NO_LINK_HTML_FOOTER
  return;
 }

 # print statistics
 my $numDistinctLinks = (scalar keys %goodLinkSizes) + (scalar keys %badLinkCounts);
 my $numDistinctGoodLinks = scalar keys %goodLinkSizes;
 my $numDistinctBadLinks = scalar keys %badLinkCounts;
 my $goodLinkPercent = sprintf("%.1f", $numGoodLinks * 100/ $numLinks);
 my $badLinkPercent = sprintf("%.1f", $numBadLinks * 100/ $numLinks);

 print <<__STATISTICS;
  there are:</p>
<ul>
  <li>$numLinks links.</li>
  <li>$numDistinctLinks distinct links.</li>
  <li>$numGoodLinks good links ($goodLinkPercent%).</li>
  <li>$numDistinctGoodLinks distinct good links.</li>
  <li>$numBadLinks broken links ($badLinkPercent%).</li>
  <li>$numDistinctBadLinks distinct broken links.</li>
</ul>
__STATISTICS

 # print distinct good links.
 if ($numDistinctGoodLinks) {
  print <<__GOOD_LINKS_HEAD;
<h5>Distinct good links with document sizes:</h5>
<ul>
__GOOD_LINKS_HEAD
  foreach (sort keys %goodLinkSizes) {
   print "  <li><a href=\"$_\">$_</a>: $goodLinkSizes{$_} Bytes.</li>\n";
  }
  print "</ul>\n";
 }
 else {
  print "There is no good link.<br><br>\n";
 }
 # print distinct bad links.
 if ($numDistinctBadLinks) {
  print <<__BAD_LINKS_HEAD;
<h5>Distinct bad links:</h5>
<ul>
__BAD_LINKS_HEAD
  foreach (sort keys %badLinkCounts) {
   print "  <li>$_.</li>\n";
  }
  print "</ul>\n";
 }
 else {
  print "There is no bad link.<br><br>\n";
 }
 print "</body>\n</html>\n";
} # validateLinkPage