CSCI 4230
Internet Application Development
Spring 2001
Suggested Solution to Homework #3

(1 and 2)    For example:

my @students = ("Bun Yue;CSCI;A",
                "Joe Go;SWEN;B+",
                "Kenny Bee;CSCI;B",
                "Sadegh Davari;SWEN;A",
                "Mary Matalin;CSCI;C");

my %result = hashify(@students);

foreach my $major (sort keys %result) {
    print "Major $major:\n";
    foreach (sort keys %{$result{$major}}) {
        print "   $_ => ${$result{$major}}{$_}\n";
    }
}
exit 0;

# hashify: turn a list of "name;major;grade" records into a hash of
#   major => { name => grade } hash references.
sub hashify {
    my %result = ();
    foreach (@_) {
        my ($name, $major, $grade) = split /;/;
        if ($result{$major}) {
            ${$result{$major}}{$name} = $grade;
        }
        else {
            $result{$major} = {$name=>$grade};
        }
    }
    %result;
}

# Output:
#Major CSCI:
#   Bun Yue => A
#   Kenny Bee => B
#   Mary Matalin => C
#Major SWEN:
#   Joe Go => B+
#   Sadegh Davari => A
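
Note that the explicit existence test in hashify is not strictly
necessary: Perl autovivifies the inner hash reference on first
assignment.  A minimal equivalent sketch of the same subroutine:

sub hashify {
    my %result = ();
    foreach (@_) {
        my ($name, $major, $grade) = split /;/;
        # Assigning through two subscripts creates the inner hash
        #   reference automatically the first time a major is seen.
        $result{$major}{$name} = $grade;
    }
    %result;
}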

(3)    For example:

use strict;
use LWP::Simple;
use LWP::UserAgent;
use URI;
use HTML::LinkExtor;

#
# PageDownloader.pl url dir report.txt
# Bun Yue   February, 2001
#
# This program retrieves the page specified by the full url, saves it
#   into the directory dir, and generates a report in the file
#   report.txt.  The program creates the subdirectory dir in the
#   current directory and exits with an error message if the
#   directory already exists.  The program saves the page
#   as main.html.  It also extracts all html links (with
#   extensions .htm or .html) and saves them in the
#   subdirectory as link00001.html, link00002.html, and
#   so on.
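#
# Example invocation (the url below is only an illustration):
#   perl PageDownloader.pl http://www.example.com/index.html pages report.txt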
#

# Constant strings for output html filenames.
my $MAIN_PAGE_FILENAME = 'main.html';
my $LINK_PAGE_FILENAME_PREFIX = 'link';
my $LINK_PAGE_LEADING_DIGITS = 5;

# Link page extensions that should be extracted.
my $LINK_PAGE_EXTENSION = 'html|htm';

# Get command line arguments
@ARGV < 3 && die "Usage: PageDownloader.pl url dir report.txt\n";
#   (input url, input directory for storing page files
#   and report, report file name)
my ($url, $dir, $reportFileName) = @ARGV;

# Get the URL page.
my $urlContents = get($url)
    || die "Error: Could not retrieve the page for $url.  Check the validity of the url.\n";

# Create the subdirectory under the current directory.
-e $dir && die "The directory name $dir already exists as a directory or file name.  Please use another directory name.\n";
mkdir($dir, 0777) || die "Can't create the directory $dir.\n";
chdir($dir) || die "Can't change to the directory $dir.\n";

# Open report file.
open(REPORT, ">$reportFileName") || die "Can't open report file $reportFileName.\n";

# Save the main page.
open (OUTPUT, ">$MAIN_PAGE_FILENAME") || die "Can't open output file $MAIN_PAGE_FILENAME.\n";
print OUTPUT $urlContents;
close OUTPUT;
print REPORT "This report: $reportFileName\n" .
             "URL for extraction: $url\n" .
             "Subdirectory: $dir\n" .
             "Extraction time: " . (localtime) . "\n\n";

print REPORT "Main URL $url is saved as: $MAIN_PAGE_FILENAME.\n\n";

# Get the base of the url (host and directory path), used to
#   resolve relative links.
$url =~ /http:\/\/(.*)\//;
my $baseDNS = $1;

# Variables for parsing links.
my $savedLinkReport = "";  # Report string for link page saving.
my @savedLinks = ();       # URLs of link pages saved successfully
                           #   (kept for future uses).
my @failedLinks = ();      # URLs of link pages that failed to be saved.

# Parsing links.
my $parser = HTML::LinkExtor->new;
$parser->parse($urlContents);
my @parsedTags = $parser->links();
foreach (@parsedTags) {
    # Get the link tag.
    my ($tag, %attr) = @$_;
    next if $tag ne 'a';
    my $linkUrl = $attr{'href'};

    # Skip mailto links.
    next if ($linkUrl =~ /^mailto:/i);
    # Skip urls that do not have the wanted extensions.
    next unless $linkUrl =~ /\.(?:$LINK_PAGE_EXTENSION)$/i;

    # Add baseDNS for relative links.
    $linkUrl = "http://" . $baseDNS . "/" . $linkUrl unless $linkUrl =~ /^http:/i;

    my $linkUrlContents = "";
    if ($linkUrlContents = get($linkUrl)) {
        # Got the file successfully.
        push @savedLinks, $linkUrl;

        # Build the link file name.
        my $linkFileName = $LINK_PAGE_FILENAME_PREFIX .
                           digitString(scalar @savedLinks, $LINK_PAGE_LEADING_DIGITS) .
                           ".html";

        # Write the link file.
        open(LINK, ">$linkFileName") || die "Can't open link file $linkFileName.\n";
        print LINK $linkUrlContents;
        close LINK;
        # Update string for the report file.
        $savedLinkReport .= "$linkUrl => $linkFileName\n";
    }
    else {
        push @failedLinks, $linkUrl;
    }
}

# Print report heading and statistics.
print REPORT "Total links: " . (scalar @savedLinks + scalar @failedLinks) .
             "\n";
print REPORT "Total pages saved: " . (scalar @savedLinks) . "\n";
print REPORT "Total pages failed to be saved: " . (scalar @failedLinks) . "\n\n";

# Print saved links.
print REPORT "Saved links are: \n\n" .
    $savedLinkReport . "\n";

# Print failed links
if (@failedLinks) {
    print REPORT "Links that could not be saved: \n\n";
    foreach (@failedLinks) {
        print REPORT "$_\n";
    }
}

close(REPORT);
exit 0;  # main

# Return a string of numDigits digits containing the
#   number num, padded with leading zeros.
sub digitString {
    my ($num, $numDigits) = @_;
    my $result = "0" x $numDigits . $num;
    substr($result, (length $result) - $numDigits, $numDigits);
}
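
A side note on resolving relative links: the URI module loaded at the
top of the program could do the baseDNS fix-up more robustly.  The
sketch below (not part of the graded solution; the urls are made up
for illustration) shows how a raw href would be resolved against the
page it came from:

use strict;
use URI;

# Hypothetical page url and a relative href taken from it.
my $url     = "http://www.example.com/courses/index.html";
my $linkUrl = "notes/week1.html";

# URI->new_abs() resolves a (possibly relative) link against its base,
#   handling absolute links, relative paths, and parent references
#   (../) uniformly.
my $absoluteLinkUrl = URI->new_abs($linkUrl, $url)->as_string;
print "$absoluteLinkUrl\n";   # http://www.example.com/courses/notes/week1.html

Similarly, digitString could be written with sprintf's zero-padded
format, sprintf("%0${numDigits}d", $num), although unlike digitString
that version does not truncate a number longer than numDigits digits.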