CSCI 4230
Internet Applicaiton Development
Summer 2001
Suggested Solution to Homework #2

(1)    For example:

use strict;
use LWP::Simple;
use HTML::TokeParser;
use URI;

#
# imageLinkExtractor.pl url dir
# Bun Yue   March 8, 2001
#
# This program extracts and saves the images within the
# specified url into the directory dir and generates an
# image descriptin file in the directory.  The images
# are saved using the file names of their url.
#
# Only the images specified by the <img> tags are saved.
# Furthermore, the images should not have an alt
# attribute value that ends with .jpg, .jpeg or .gif.
#
# The image description file stores the description of
# one image per line in the format of
#
# filename::description
#
# The description is the alt attribute value.
#
# The first line of the image description file is
# the title of the image collection.  The title
# is obtained from the third command line argument.
#
# The program creates the subdirectory dir in the
#   current directory and generates an error message if the
# directory already exists.

# Constant strings for output html filenames.
my $IMAGES_DESCRIPTION_FILENAME = 'image_desc.txt';
my $IMAGES_DESCRIPTION_SEPARATOR = '::';

# Check command line arguments
@ARGV < 3 && die "Usage: imageLinkExtractor.pl url dir title\n";

# Get URL, dir and title
my ($url, $dir, $title) = @ARGV;
my $baseDNS = "";    #    Base DNS of the url.
#   Set base directory.
$url =~ /http:\/\/(.*)\//;
$baseDNS = $1;

# Get the URL page content.
my $urlContents = "";    #   Contents of input url.
$urlContents = get($url)
    || die "Error: Unsuccessful to obtain the page for $url.  Check validity of the url.\n";

#    Parse the url content for images (<img> tag).
#    Save the src and alt attribute values as key and value
#    of the hash %images.
my %images = ();
my $parser = HTML::TokeParser->new(\$urlContents);
while (my $token = $parser->get_tag("img")) {
    my $url = $token->[1]{src} || next;
    my $alt = $token->[1]{alt} || next;
    #    Remove images with alt description ending with .jpg, .jpeg and .gif.
    $alt =~ /\.(jpe?g|gif)$/ && next;
    $images{$url} = $alt;
}

if (%images) {
    #    Create the subdirectory under the current directory.
    -e $dir && die "The directory name $dir already exist as a directory or file name.  Please use another directory name.\n";
    mkdir($dir, 0777) || die "Can't create the directory $dir.\n";
    chdir($dir) || die "Can't change to the directory $dir.\n";

    #    open image description file.
    open (IMAGES_DESC, ">$IMAGES_DESCRIPTION_FILENAME")
        || die "Can't open file $IMAGES_DESCRIPTION_FILENAME.";
    print IMAGES_DESC "$title\n";

    #    Loop through collected image url.
    foreach my $imageUrl (keys %images) {
        print "Processing image url $imageUrl ... \n";
        #    Add baseDNS for relative urls.
        $imageUrl = "http://" . $baseDNS . "/" . $imageUrl unless $imageUrl =~ /^http:/i;
        my $imageFilename = $imageUrl;
        #    Remove path.
        $imageFilename =~ s/(.*\/)//;
        #    Get and store the image file.
        getstore($imageUrl, $imageFilename) || next;
        #    Add to image description file.
        print IMAGES_DESC "${imageFilename}${IMAGES_DESCRIPTION_SEPARATOR}$images{$imageUrl}\n";
    }
    close IMAGES_DESC;
    print "\nThe URL $url has successully been processed.\n",
          (keys %images) . " images are added to the directory $dir.\n",
          "Image description file is $IMAGES_DESCRIPTION_FILENAME\n";
}
else {
    print "Sorry, no images can be harvested from the URL $url\n";
}

exit 0;

(2)

#    merge two hashes; no error checking
sub mergeHash {
    my ($href1, $href2) = @_;
    my %result = %$href1;

    foreach (keys %$href2) {
        $result{$_} = $result{$_} ? [$result{$_}, $$href2{$_}] : $$href2{$_};
    }
    %result;
}

A more general solution that merges any number of hashes:

# merge any number of hashes.  No error checking.
sub mergeHash {
    my $href1 = shift;
    my %result = %$href1;

    foreach my $nexthref (@_) {
        foreach (keys %$nexthref) {
            if ($result{$_}) {
                if (ref $result{$_} eq 'ARRAY') {
                    push @{$result{$_}}, $$nexthref{$_};
                }
                else {
                    $result{$_} = [$result{$_}, $$nexthref{$_}];
                }
            }
            else {
                $result{$_} = $$nexthref{$_};
            }
        }
    }
    %result;
}

(3)

foreach my $key (sort keys %result) {
    print "$key => ";
    if (ref $result{$key} eq 'ARRAY') {
        foreach (@{$result{$key}}) {
            print "$_  ";
        }
    }
    else {
        print $result{$key};
    }
    print "\n";
}