autoexch02.pl to HTML.

index -|- end

Generated: Tue Feb 2 17:54:22 2010 from autoexch02.pl 2006/08/26 7.7 KB.

#!/Perl
use LWP::Simple;
require "logfile.pl" or die "Missing logfile.pl ...\n"; # my simple log file and some other utility subs
require "htmltools.pl" or die "Missing htmltools.pl ...\n";
# log file stuff
my ($LF);
my $outfile = 'temp'.$0.'.txt';
my $outfil1 = 'tempcurr02.htm';
my $outfil2 = 'tempcurr02.csv';
# program variables
my $URL = 'http://www.exchangerate.com/';
my @hrefs = ();
my @imgs = ();
my @currency = ();
my $m_date1 = '';
my $m_date2 = '';
my $test1 = 0;
my $etext = '';
open_log($outfile);
prt( "$0 ... Hello, World...\n" );
if ($test1) {
prt("Fetching text from $outfil1 ...\n");
open CIF, "<$outfil1" or mydie("ERROR: Unable to open $outfil1 ...\n");
my @atext = <CIF>; # slurp it into a line array
close CIF;
prt( "From $outfil1, got ".scalar @atext." lines ...\n" );
$etext = join( "", @atext);
prt( "len=".length($etext)."\n"); # "[$txt2]\n");
write2file($etext, 'tempcurr02a.htm');
} else {
prt("Fetching text from $URL ...\n");
my $text = get("$URL");
my $tcnt = length($text);
prt( "Got $tcnt characters from URL $URL ...\n");
$etext = htmlexpand($text);
write2file($etext,$outfil1);
}
##prt("[$text]\n");
##my $ctext = $etext;
my $ctext = htmlclean02($etext);
prt( "len=".length($ctext)."\n"); # "[$txt2]\n");
my $ccnt = extractcurrencies02($ctext);
###my $ccnt = scalar @currency;
prt( "Got $ccnt currencies ...\n" );
my $msg = '';
my $txt = '';
# AUSTRALIA Dollar AUD 1.310649 ### + 1.311001 -0.0268% + 1  
if ($ccnt) {
   open OF, ">$outfil2" or mydie("ERROR: Unable to open CSV out file!\n");
   prt("Writing CSV file $outfil2 ...\n");
   $msg = "Country,Currency,ISO,Rate $m_date1,Rate $m_date2"; 
   prt("$msg\n");
   print OF $msg."\n";
   for (my $i = 0; $i < $ccnt; $i++) {
      $msg = "".($currency[$i][0]).",".($currency[$i][1]).",".($currency[$i][2]).",".($currency[$i][3]).",".($currency[$i][4]);
      prt( "$msg\n" );
      print OF $msg."\n";
   }
   $msg = "From [$URL] on ".(scalar localtime);
   prt("$msg\n");
   print OF $msg."\n";
   close(OF);
} else {
   prt( "WARNING: FAILED TO EXTRACT ANY CURRENCIES!!!\n" );
}
prt( "All done ...\n" );
close_log($outfile,1);
###system( $outfil1 );
###system( $outfil2 );
exit(0);
# <TH COLSPAN=2> Country</TH>
# <TH> Currency </TH>
# <TH> ISO </TH>
# <TH> 08/24/06 </TH>
# <TH> 08/23/06 </TH>
# <TH COLSPAN=2> Change </TH>
my $m_stat1 = '^Country';
my $m_stat2 = '^Currency';
my $m_stat3 = '^ISO';
my $m_stat4 = '^\d{2}\D{1}\d{2}\D{1}\d{2}';   # <TH> 08/24/06 </TH>
my $m_stat5 = '^\d{2}\/\d{2}\/\d{2}';   # <TH> 08/23/06 </TH>
my $m_stat6 = ''; # <TH COLSPAN=2> Change </TH>
my $m_stat7 = ''; #
sub extractcurrencies02 {
   my ($txt) = shift;
   my $len = length($txt);
   my $i = 0;
   my $tag = '';
   my $tt = '';
   my $st = 0;
   my $cnt = 0;
   my @arr = ();
   my $rcnt = 0;
   my $isth = 0;
   prt( "Processing $len characters ...\n" );
   for ($i = 0; $i < $len; $i++) {
      $ch = substr($txt,$i,1);
      if ($ch eq '<') {
         $tag = $ch;
         $i++;
         for ( ; $i < $len; $i++) {
            $ch = substr($txt,$i,1);
            $tag .= $ch;
            if ($ch eq '>') {
               ###prt( "Have tag [$tag] ...\n" );
               last;
            }
         }
         $ch = '';
         if (($tag =~ /^<td.*>/i)||($tag =~ /^<th.*>/i)) {
            $i++;
            $tt = '';
            if ($tag =~ /^<td.*>/i) {
               ###prt( "Got TD tag [$tag] ...\n" );
               $isth = 0;
            } else {
               ###prt( "Got TH tag [$tag] ...\n" );
               $isth = 1;
            }
            for ( ; $i < $len; $i++) {
               $ch = substr($txt,$i,1);
               if ($ch eq '<') {
                  $tt = trimbothends($tt);
                  if (length($tt)) {
                     ###$i--; # back up to this char
                     last;
                  } else {
                     # eat this next - assume another tag
                     # note this 'eats' though the first <td><img ...> etc tag
                     $i++;
                     for ( ; $i < $len; $i++) {
                        $ch = substr($txt,$i,1);
                        if ($ch eq '>') {
                           last;
                        }
                     }
                  }
               } else {
                  $tt .= $ch
               }
            }
            ###prt( "Got TH/TD tag [$tag] ...txt=[$tt]\n" );
            if ($st == 7) {
               ##prt( "Got TH/TD tag [$tag] ...txt=[$tt]\n" );
               if ($cnt == 0) {
                  $cnt = 1;
                  @arr = ();
                  push(@arr, $tt);
               } elsif ($cnt == 1) {
                  $cnt = 2;
                  push(@arr, $tt);
               } elsif ($cnt == 2) {
                  $cnt = 3;
                  push(@arr, $tt);
               } elsif ($cnt == 3) {
                  $cnt = 4;
                  push(@arr, $tt);
               } elsif ($cnt == 4) {
                  $cnt = 5;
                  push(@arr, $tt);
               } elsif ($cnt == 5) {
                  $cnt = 6;
                  push(@arr, $tt);
               } elsif ($cnt == 6) {
                  push(@arr, $tt);
                  $cnt = 0;
               }
            } elsif ($isth) {
               if ($st == 0) {
                  ##if ($tt =~ /$m_stat1/) {
                  if ($tt =~ /^Country/) {
                     prt("At state 1 ...\n");
                     $st = 1;
                  }
               } elsif ($st == 1) {
                  ##if ($tt =~ /$m_stat2/) {
                  if ($tt =~ /^Currency/) {
                     prt("At state 2 ...\n");
                     $st = 2;
                  } else {
                     $st = 0;
                  }
               } elsif ($st == 2) {
                  ##if ($tt =~ /$m_stat3/) {
                  if ($tt =~ /^ISO/) {
                     prt("At state 3 ...\n");
                     $st = 3;
                  } else {
                     $st = 0;
                  }
               } elsif ($st == 3) {
                  ##if ($tt =~ /$m_stat3/) {
                  if ($tt =~ /^(\d{2}\D{1}\d{2}\D{1}\d{2})/) {
                     $m_date1 = $1;
                     prt("At state 4 ... $m_date1 ...\n");
                     $st = 4;
                  } else {
                     $st = 0;
                  }
               } elsif ($st == 4) {
                  ##if ($tt =~ /$m_stat4/) {
                  if ($tt =~ /^(\d{2}\D{1}\d{2}\D{1}\d{2})/) {
                     $m_date2 = $1;
                     prt("At state 5 ... $m_date2 ...\n");
                     $st = 5;
                  } else {
                     $st = 0;
                  }
               } elsif ($st == 5) {
                  prt("At state 6/7 ...\n");
                  $st = 7;
               } elsif ($st == 6) {
                  prt("At state 7 ...\n");
                  $st = 7;
               }
            }
         } elsif ($tag =~ /^<\/tr.*/i) {
            if (($st == 7) && ($cnt >= 5)) {
               ###prt("End of row $cnt - return to cnt 0 ...\n");
               # AUSTRALIA Dollar AUD 1.310649 1.311001 -0.0268% + <something>  
               prt( "Arr=[" );
               foreach my $itm (@arr) {
                  prt("$itm,");
               }
               prt("]\n");
               push(@currency, [$arr[0], $arr[1], $arr[2], $arr[3], $arr[4]]);
               $rcnt++;
            }
            $cnt = 0;
         } elsif ($tag =~ /^<\/table.*/i) {
            if ($st > 0) {
               ###prt("End table - return to state 0 ...\n");
               $st = 0;
            }
         }
      }
   }
   return $rcnt;
}
sub htmlclean02 {
   my ($rtxt) = shift;
prt( "len=".length($rtxt)." Drop comments <!--...--> ...\n");
$rtxt = dropcomments($rtxt);
prt( "len=".length($rtxt)." Strip <HEAD>...</HEAD> tag ...\n");
$rtxt = striptag($rtxt, 'HEAD');
prt( "len=".length($rtxt)." Strip <script>...</script> tag ...\n");
$rtxt = striptag($rtxt,'script');
prt( "len=".length($rtxt)." Strip <noscript>...</noscript> tag ...\n");
$rtxt = striptag($rtxt,'noscript');
prt( "len=".length($rtxt)." Strip <SELECT>...</SELECT> tag ...\n");
$rtxt = striptag($rtxt,'select');
prt( "len=".length($rtxt)." Remove <font ...> tags ...\n");
$rtxt = removefont($rtxt);
prt( "len=".length($rtxt)." Remove <b> tags ...\n");
$rtxt = removetag($rtxt,'b');
prt( "len=".length($rtxt)." Remove <nobr> tags ...\n");
$rtxt = removetag($rtxt,'nobr');
prt( "len=".length($rtxt)." Remove th attributes ...\n");
$rtxt = removetagattrib($rtxt,'th');
prt( "len=".length($rtxt)." Remove tr attributes ...\n");
$rtxt = removetrattrib($rtxt);
prt( "len=".length($rtxt)." Remove td attributes ...\n");
$rtxt = removetdattrib($rtxt);
prt( "len=".length($rtxt)." Delete <a...> & </a>\n");
$rtxt = collecthrefs($rtxt,1);
prt( "len=".length($rtxt)." Delete <img...>\n");
$rtxt = collectimgs($rtxt,1);
prt( "len=".length($rtxt)." Do substitutions ...\n");
$rtxt = substitutions($rtxt);
prt( "len=".length($rtxt)." Trim blank lines ...\n");
$rtxt = trimblanklines($rtxt);
prt( "len=".length($rtxt)." Trim inline td ...\n");
$rtxt = triminlinetd($rtxt);
   return $rtxt;
}
# eof - autoexch01.pl

index -|- top

checked by tidy  Valid HTML 4.01 Transitional