CSCI 5733
XML Application Development
Summer 2003
Suggested Solution to Homework #1

(1) For example (not fully documented):

use strict;
use CGI;
use LWP::Simple qw/get/;
$|++;

#
#   Kwok-Bun Yue
#   h1sol.pl: suggested solution for HW #1, Summer 2003, CSCI 5733 XML Application Development.
#   See: http://dcm.uhcl.edu/yue/courses/xml/Summer2003/hw/h1.asp
#

my $q = CGI->new;

#   Porting Constants
#   URLs of input CSV (MS DOS Excel format) and XML documents.
my $CSV_URL = "http://dcm.uhcl.edu/yue/courses/xml/Summer2003/hw/h1dat1.csv";
#   my $CSV_URL = "http://dcm.uhcl.edu/yue/courses/xml/Summer2003/hw/h1dat1alt.csv";
my $XML_URL = "http://dcm.uhcl.edu/yue/courses/xml/Summer2003/hw/h1dat2.xml";

#   Get HTTP parameters: ranking, author and subject.  Each one is an
#   optional filter; a false value (missing, empty or "0") means that
#   no filtering is done on that field.
my $ranking = $q->param("ranking");
my $author = $q->param("author");
my $subject = $q->param("subject");

#   Retrieve contents of the input CSV and XML documents.
my $inputCsv = get($CSV_URL);
my $inputXml = get($XML_URL);

#   get() yields undef when a document cannot be fetched.  Fall back
#   to an empty document so that a well-formed (possibly empty)
#   result is still produced.
$inputCsv = "" unless defined $inputCsv;
$inputXml = "" unless defined $inputXml;

#   Convert to local \n.
$inputCsv =~ s/\015?\012/\n/g;
$inputXml =~ s/\015?\012/\n/g;

#   Get quotations from the CSV document.
my @result = getQuotationsFromCSV($inputCsv);

#   XML Encode values from CSV.  Values taken from the XML document
#   are already in XML form and are therefore appended afterwards,
#   untouched.
#   NOTE(review): the filter values are compared raw against these
#   already-encoded CSV values, so a filter containing & < > " or '
#   cannot match a CSV record -- confirm whether that is intended.
foreach my $csvRecord (@result) {
   %$csvRecord = xmlEncodeHash(%$csvRecord);
}
push @result, getQuotationsFromXML($inputXml);

#   output result
print <<__XML_HEAD;
Content-type:text/xml

<?xml version="1.0"?>
__XML_HEAD

#   print root start tag.  The filter values come straight from the
#   HTTP request, so they are XML-encoded before being written into
#   attribute values; otherwise a quote, "<" or "&" in a parameter
#   would break (or inject markup into) the output document.
#   NOTE(review): the DTD in part (2) names the root element
#   "quotations" while this script prints "quotation" -- confirm
#   which one is intended.
print "<quotation";
if ($author) {
   print " author=\"", xmlEncodeOne($author), "\"";
}
if ($subject) {
   print " subject=\"", xmlEncodeOne($subject), "\"";
}
if ($ranking) {
   print " ranking=\"", xmlEncodeOne($ranking), "\"";
}
print ">\n";

#   Print every matching record.  An attribute that was used as a
#   filter is already shown on the root element, so it is left out
#   of the individual <quote> elements.
foreach my $record (@result) {
   if (matchQuote($author, $subject, $ranking, %$record)) {
      print "   <quote";
      print " author=\"", ${$record}{"author"}, "\"" unless $author;
      print " subject=\"", ${$record}{"subject"}, "\"" unless $subject;
      print " ranking=\"", ${$record}{"ranking"}, "\"" unless $ranking;
      print ">", ${$record}{"quote"}, "</quote>\n";
   }
}

#   print root end tag.
print "</quotation>";
exit 0;   #   main

#
#   getQuotationsFromCSV
#      Turn the text of a CSV document into a list of records.
#      The first line supplies the field names; every following
#      line supplies the values of one record, in the same order.
#      Input:   the whole CSV document as one string, lines
#               separated by \n.
#      Returns: a list of hash references, one per data line,
#               mapping field name to field value.  A field that
#               is missing from a short line is stored as undef;
#               surplus values on a long line are dropped.
#      No error handling.
#
sub getQuotationsFromCSV {
   my ($csvText) = @_;

   #   Separate the heading line from the data lines.
   my ($headingLine, @dataLines) = split /\n/, $csvText;
   my @names = getOneQuotationFromCSV($headingLine);

   my @records;
   for my $dataLine (@dataLines) {
      my @fields = getOneQuotationFromCSV($dataLine);

      #   Pair names with values positionally through a hash slice.
      my %record;
      @record{@names} = @fields;
      push @records, \%record;
   }
   return @records;
}   #   getQuotationsFromCSV

#
#   getOneQuotationFromCSV
#      Break down a single line in CSV format into an array
#      of field values.
#      Input:   $line      - one CSV line without its line terminator.
#               $delimiter - optional one-character field separator.
#                            Default is "," when it is omitted or
#                            empty.
#      Returns: the list of field values.  There is always at least
#               one element: an empty line gives one empty field.
#      A field may be enclosed in double quotes, in which case it may
#      contain the delimiter, and "" inside it stands for one literal ".
#      No error handling: malformed input is parsed leniently.  A "
#      met outside a quoted section always opens one, even in the
#      middle of a field.
#   Existing modules, such as CSV.pm can be used as an alternative.
sub getOneQuotationFromCSV {
   my $line = shift;
   my $delimiter = shift;

   #   Test for definedness and length instead of truth, so that an
   #   explicit delimiter of "0" is honoured rather than replaced.
   $delimiter = "," unless defined $delimiter && length $delimiter;
   $line = "" unless defined $line;

   #   States of the scanner:
   #      $OUTSIDE:    not inside a quoted section.
   #      $QUOTED:     inside a quoted section.
   #      $QUOTE_SEEN: a " was met inside a quoted section; it is
   #                   either the closing quote or the first half
   #                   of an escaped "".
   my ($OUTSIDE, $QUOTED, $QUOTE_SEEN) = (0, 1, 2);

   my @result;      #   result array to be returned.
   my $state = $OUTSIDE;
   my $currentField = "";
   foreach my $char (split //, $line) {
      if ($char eq "\"") {
         if ($state == $OUTSIDE) {
            $state = $QUOTED;
         }
         elsif ($state == $QUOTED) {
            $state = $QUOTE_SEEN;
         }
         else {   #   second half of "": keep one literal quote.
            $currentField .= "\"";
            $state = $QUOTED;
         }
      }
      elsif ($char eq $delimiter) {
         if ($state == $QUOTED) {
            #   A delimiter inside quotes is ordinary data.
            $currentField .= $delimiter;
         }
         else {
            #   End of field (an empty one if nothing was collected).
            push @result, $currentField;
            $state = $OUTSIDE;
            $currentField = "";
         }
      }
      else {   #   other characters.
         $currentField .= $char;
      }
   }   #   foreach

   #   push remaining contents
   push @result, $currentField;

   return @result;
}   #   getOneQuotationFromCSV

#
#   getQuotationsFromXML
#      Extracting fields from XML string without using
#      an XML parser.  No error handling.
#      Input:   the XML document as one string.
#      Returns: a list of hash references, one per <quote> element.
#               Each hash holds the element content under the key
#               'quote' plus one entry per attribute found on the
#               start tag.  Values are kept exactly as written in
#               the document (entities are not decoded).
#      All newlines are removed from the document before scanning.
#
sub getQuotationsFromXML {
   my $line = shift;
   my @result = ();
   return @result unless defined $line;
   $line =~ s/\n//g;

   #   The attribute part of the start tag is optional, so that a
   #   <quote> element without any attribute is still picked up.
   #   Requiring ">" or whitespace right after "quote" keeps longer
   #   element names from matching.
   while ($line =~ /<quote(?:\s+([^>]*))?>(.*?)<\/quote\s*>/g) {
      my %record = ();
      $record{'quote'} = $2;
      my $attrs = defined $1 ? $1 : "";

      #   Pick up name="value" or name='value' pairs.
      while ($attrs =~ /\s*(\w+)\s*=\s*(["'])(.*?)\2/g) {
         $record{$1} = $3;
      }
      push @result, \%record;
   }
   @result;
}   #   getQuotationsFromXML

#   XML Encode one string: replace each of & < > " ' by its
#   predefined XML entity and return the encoded copy.
sub xmlEncodeOne {
   my ($text) = @_;

   #   Encoding in a single pass means an & produced by one
   #   replacement is never encoded a second time.
   my %entityFor = (
      "&"  => "&amp;",
      "<"  => "&lt;",
      ">"  => "&gt;",
      "\"" => "&quot;",
      "'"  => "&apos;",
   );
   $text =~ s/([&<>"'])/$entityFor{$1}/g;
   return $text;
}   #   xmlEncodeOne

#   XML Encode the values of a hash.
#   Input:   a hash, passed flattened as a key/value list.
#   Returns: a new hash with the same keys, each value replaced by
#            its XML-encoded form.  Keys are not encoded.
sub xmlEncodeHash {
   my %raw = @_;
   my %encoded = map { ($_, scalar xmlEncodeOne($raw{$_})) } keys %raw;
   return %encoded;
}   #   xmlEncodeHash

#   Decide whether a record passes the author, subject and ranking
#   filters.
#   Input:   the three filter values followed by the record hash
#            (flattened).  A filter whose value is false (undef,
#            empty or "0") is ignored.
#   Returns: 1 if every active filter is string-equal to the
#            record's field of the same name, 0 otherwise.
sub matchQuote {
   my ($author, $subject, $ranking, %record) = @_;

   my %wanted = (
      "author"  => $author,
      "subject" => $subject,
      "ranking" => $ranking,
   );
   foreach my $field (keys %wanted) {
      my $value = $wanted{$field};
      next unless $value;   #   filter not in use
      return 0 if $value ne $record{$field};
   }
   return 1;
}   #   matchQuote;


(2) For example

<!-- Root element: holds zero or more quote elements and nothing else. -->
<!-- NOTE(review): the script in part (1) prints the root element as
     "quotation", not "quotations"; confirm which name is intended. -->
<!ELEMENT quotations (quote*)>
<!-- All three root attributes are optional. -->
<!ATTLIST quotations
   author CDATA #IMPLIED
   subject CDATA #IMPLIED
   ranking NMTOKEN #IMPLIED>
<!-- A single quotation: character data only, no child elements. -->
<!ELEMENT quote (#PCDATA)>
<!-- All three quote attributes are optional. -->
<!ATTLIST quote
   author CDATA #IMPLIED
   subject CDATA #IMPLIED
   ranking NMTOKEN #IMPLIED>