(1) For example:
use strict;
use LWP::Simple;
use HTML::TokeParser;
use URI;
#
# imageLinkExtractor.pl url dir title
# Bun Yue March 8, 2001
#
# This program extracts and saves the images within the
# specified url into the directory dir and generates an
# image description file in the directory. The images
# are saved using the file names of their url.
#
# Only the images specified by the <img> tags are saved.
# Furthermore, the images should not have an alt
# attribute value that ends with .jpg, .jpeg or .gif.
#
# The image description file stores the description of
# one image per line in the format of
#
# filename::description
#
# The description is the alt attribute value.
#
# The first line of the image description file is
# the title of the image collection. The title
# is obtained from the third command line argument.
#
# The program creates the subdirectory dir in the
# current directory and generates an error message if the
# directory already exists.
# Constant strings for the output image description file.
my $IMAGES_DESCRIPTION_FILENAME = 'image_desc.txt';
my $IMAGES_DESCRIPTION_SEPARATOR = '::';
# Check command line arguments
@ARGV < 3 && die "Usage: imageLinkExtractor.pl url dir title\n";
# Get URL, dir and title
my ($url, $dir, $title) = @ARGV;
my $baseDNS = "";    # Base DNS of the url.
# Set base directory.
$url =~ /http:\/\/(.*)\//;
$baseDNS = $1;
# Get the URL page content.
my $urlContents = "";    # Contents of input url.
$urlContents = get($url)
    || die "Error: Unable to obtain the page for $url. Check the validity of the url.\n";
# Parse the url content for images (<img> tag).
# Save the src and alt attribute values as key and value
# of the hash %images.
my %images = ();
my $parser = HTML::TokeParser->new(\$urlContents);
while (my $token = $parser->get_tag("img")) {
    my $url = $token->[1]{src} || next;
    my $alt = $token->[1]{alt} || next;
    # Remove images with alt description ending with .jpg, .jpeg or .gif.
    $alt =~ /\.(jpe?g|gif)$/ && next;
    $images{$url} = $alt;
}
if (%images) {
    # Create the subdirectory under the current directory.
    -e $dir && die "The directory name $dir already exists as a directory or file name. Please use another directory name.\n";
    mkdir($dir, 0777) || die "Can't create the directory $dir.\n";
    chdir($dir) || die "Can't change to the directory $dir.\n";
    # Open the image description file.
    open (IMAGES_DESC, ">$IMAGES_DESCRIPTION_FILENAME")
        || die "Can't open file $IMAGES_DESCRIPTION_FILENAME.";
    print IMAGES_DESC "$title\n";
    # Loop through the collected image urls.
    foreach my $imageUrl (keys %images) {
        print "Processing image url $imageUrl ... \n";
        # Remember the description before the url is rewritten below.
        my $description = $images{$imageUrl};
        # Add baseDNS for relative urls.
        $imageUrl = "http://" . $baseDNS . "/" . $imageUrl unless $imageUrl =~ /^http:/i;
        my $imageFilename = $imageUrl;
        # Remove path.
        $imageFilename =~ s/(.*\/)//;
        # Get and store the image file. getstore() returns the HTTP status
        # code, so test for success explicitly rather than for truth.
        my $status = getstore($imageUrl, $imageFilename);
        next unless $status >= 200 && $status < 300;
        # Add to image description file.
        print IMAGES_DESC "${imageFilename}${IMAGES_DESCRIPTION_SEPARATOR}${description}\n";
    }
    close IMAGES_DESC;
    print "\nThe URL $url has successfully been processed.\n",
        (keys %images) . " images are added to the directory $dir.\n",
        "Image description file is $IMAGES_DESCRIPTION_FILENAME\n";
}
else {
    print "Sorry, no images can be harvested from the URL $url.\n";
}
exit 0;
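The script is meant to be run from the command line with three arguments. For example (the url, directory name and title below are placeholders, not part of the exercise):

    perl imageLinkExtractor.pl http://www.example.com/gallery/index.html gallery "My Gallery"

This would create the subdirectory gallery under the current directory, save each image referenced by an <img> tag of the page into it, and write image_desc.txt with "My Gallery" on its first line followed by one filename::description line per saved image.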
(2)
# merge two hashes; no error checking
sub mergeHash {
    my ($href1, $href2) = @_;
    my %result = %$href1;
    foreach (keys %$href2) {
        $result{$_} = $result{$_} ? [$result{$_}, $$href2{$_}] : $$href2{$_};
    }
    %result;
}
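As a quick illustration (the hash contents here are made up), a key that occurs in both hashes ends up mapped to a reference to an array holding both values:

    my %h1 = (a => 1, b => 2);
    my %h2 = (b => 3, c => 4);
    my %merged = mergeHash(\%h1, \%h2);
    # %merged is now (a => 1, b => [2, 3], c => 4)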
A more general solution that merges any number of hashes:
# merge any number of hashes. No error checking.
sub mergeHash {
    my $href1 = shift;
    my %result = %$href1;
    foreach my $nexthref (@_) {
        foreach (keys %$nexthref) {
            if ($result{$_}) {
                if (ref $result{$_} eq 'ARRAY') {
                    push @{$result{$_}}, $$nexthref{$_};
                }
                else {
                    $result{$_} = [$result{$_}, $$nexthref{$_}];
                }
            }
            else {
                $result{$_} = $$nexthref{$_};
            }
        }
    }
    %result;
}
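For instance, with three small hashes (again, purely illustrative data), colliding values accumulate in an array reference in the order the hashes are passed:

    my %english = (apple => 'fruit',   carrot => 'vegetable');
    my %spanish = (apple => 'manzana', potato => 'patata');
    my %french  = (apple => 'pomme');
    my %result  = mergeHash(\%english, \%spanish, \%french);
    # $result{apple} is ['fruit', 'manzana', 'pomme'], while
    # $result{carrot} and $result{potato} keep their single values.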
(3)
foreach my $key (sort keys %result) {
    print "$key => ";
    if (ref $result{$key} eq 'ARRAY') {
        foreach (@{$result{$key}}) {
            print "$_ ";
        }
    }
    else {
        print $result{$key};
    }
    print "\n";
}
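Run against the illustrative %result built above, this loop would print something like:

    apple => fruit manzana pomme
    carrot => vegetable
    potato => patata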