CSCI 4230
Software Tools
Fall 1999
Suugested Solution to Homework #4

(1)    For example:

#!/opt/gnu/bin/perl
use strict;
$|++;
use LWP::Simple;
use LWP::UserAgent;
use URI::URL;
use CGI qw/:form :html param header/;

my $url_link = param("url");
my $url_doc = get($url_link);

print header, start_html("Check Links Result"), h2("Result of links checking"), "\n";
if (not defined $url_doc) {
   print "Sorry, the link you submitted, <font color=green>$url_link</font> " .
         "is broken.\n" . end_html;
   exit 0;
}

$url_link =~ /(.*)\//;
my $base_dir = $1 . "/";

my $num_links = 0;
my $num_good_links = 0;
my $num_bad_links = 0;
my @links = ();
my @link_names = ();
my @goodlinkp = ();
my $doc = $url_doc;
my $ua = new LWP::UserAgent;

#   Find links.
while ($doc =~ /<A HREF=("?)([^\s>]*)\1[^>]*>/i) {
   # check protocol.
   $_ = $2;
   $doc = $';

   next if (/^mailto:/);
   if (/^http:/) {
      $links[$num_links] = $_;
   }
   else {
      $links[$num_links] = $base_dir . $_ ;
   }

   $link_names[$num_links] = $_;
   if ($goodlinkp[$num_links] = $ua->request(new HTTP::Request('GET', $links[$num_links]))->is_success) {
      $num_good_links++;
   }
   else {
      $num_bad_links++;
   }

   $num_links++;
}

#   Print links
if ($num_links == 0) {
   print "There is no link in <font color=green>$url_link</font>.\n";
}
else {
   print p,
         "There are $num_links links in the submitted url: ",
         "<font color=green>$url_link</font>: ",
         "$num_good_links are good and $num_bad_links are broken.",
         p,
         "<UL>";
   for (my $i=0; $i < $num_links; $i++) {
      if ($goodlinkp[$i]) {
         print "<li><A HREF=\"$links[$i]\">$link_names[$i]</A>";
      }
      else {
         print "<li>$link_names[$i] <FONT COLOR=\"RED\">is broken.</FONT>";
      }
   }
}
print end_html;

Alternatively, you may use the HTML packages.  Note that the results are somewhat different as repeated links are counted as one and links within comments are not counted.

#!/opt/gnu/bin/perl
use strict;
$|++;
use LWP::Simple;
use LWP::UserAgent;
use URI;
use HTML::LinkExtor;
use CGI qw(:standard);

my $url_link = param("url");
my $url_doc = get($url_link);

print header, start_html("Check Links Result"), h2("Result of links checking"), "\n";
if (not defined $url_doc) {
   print "Sorry, the link you submitted, <font color=green>$url_link</font> " .
         "is broken.\n" . end_html;
   exit 0;
}

#   Get base directory.
$url_link =~ /(.*)\//;
my $base_dir = $1 . "/";

#    Parse links.
my @links = ();
my $parser = HTML::LinkExtor->new;
$parser->parse($url_doc);
my @parsed_tags = $parser->links();
foreach (@parsed_tags) {
   my ($tag, %attr) = @$_;
   push(@links, $attr{'href'}) if $tag eq 'a';
}

my $num_links = 0;
my $num_good_links = 0;
my $num_bad_links = 0;
my @link_names = ();
my @goodlinkp = ();
my $ua = new LWP::UserAgent;

#   Find links.
foreach (@links) {
   next if (/^mailto:/);
   if (/^http:/) {
      $links[$num_links] = $_;
   }
   else {
      $links[$num_links] = $base_dir . $_ ;
   }

   $link_names[$num_links] = $_;
   if ($goodlinkp[$num_links] = $ua->request(new HTTP::Request('GET', $links[$num_links]))->is_success) {
      $num_good_links++;
   }
   else {
      $num_bad_links++;
   }
 
   $num_links++;
}

#   Print links
if ($num_links == 0) {
   print "There is no link in <font color=green>$url_link</font>.\n";
}
else {
   print p,
         "There are $num_links links in the submitted url: ",
         "<font color=green>$url_link</font>: ",
         "$num_good_links are good and $num_bad_links are broken.",
         p,
         "<UL>";
   for (my $i=0; $i < $num_links; $i++) {
      if ($goodlinkp[$i]) {
         print "<li><A HREF=\"$links[$i]\">$link_names[$i]</A>", br;
      }
      else {
         print "<li>$link_names[$i] <FONT COLOR=\"RED\">is broken.</FONT>", br;
      }
   }
   print "</UL>";
}
print end_html;