iso639.pl to HTML.

index -|- end

Generated: Tue Feb 2 17:54:43 2010 from iso639.pl 2006/09/25 7.1 KB.

#!/Perl
# iso639.pl
# AIM to download the table from the site, and build a 'validation' table ...
use strict;
use warnings;
use LWP::Simple;
require "logfile.pl" or die "Missing logfile.pl ...\n"; # my simple log file and some other utility subs
require "htmltools.pl" or die "Missing htmltools.pl ...\n";
# log file stuff
my ($LF);
my $outfile = 'temp'.$0.'.txt';
my $dodownload = 0;
my $URL = 'http://www.loc.gov/standards/iso639-2/englangn.html';
my $out_file = 'iso639.htm';
open_log($outfile);
prt( "$0 ... Hello, World...\n" );
prt( "Moment ... loading $URL ...\n" );
my $tbltext = '';
my @lines = ();
my @tlines = ();
my $cnt = 0;
my $line = '';
my $tln = '';
my $col = 0;
my $col1 = '';
my $col2 = '';
my $col3 = '';
my $col4 = '';
my $i = 0;
my @colist = ();
my %iso3 = ();
my %iso2 = ();
my $tblcnt = 0;
my $lncnt = 0;
my $maxcnt = 15; # maximum search for </td>
my $enttbl = 0;
my $bgntbl = 0;
my $thcnt = 0;
if ($dodownload) {
   $tbltext = get($URL);
   $cnt = length($tbltext);
   prt( "Downloaded $cnt characters ... writing $out_file ...\n" );
   write2file( $tbltext, $out_file );
} else {
   prt( "Loading $out_file ... moment ...\n" );
   open INF, "<$out_file" or mydie( "ERROR: Failed to open [$out_file] ... $! \n" );
   @lines = <INF>;   # slurp it into lines
   close INF;
   $tbltext = join("",@lines);
   $tbltext = tag2newline($tbltext, 'table');
   $tbltext = tag2newline($tbltext, 'tr');
   $tbltext = tag2newline($tbltext, 'th');
   $tbltext = tag2newline($tbltext, 'td');
   $tbltext = trimblanklines($tbltext);
   write2file($tbltext, 'temptbl.htm');
   $cnt = length($tbltext);
   prt( "Loaded $cnt characters ... from $out_file ...\n" );
}
@lines = split("\n", $tbltext);
$cnt = scalar @lines;
prt( "Got $cnt lines to process ... \n" );
foreach $line (@lines) {
   $tln = trimall( html_subs($line) );
   push(@tlines, $tln) if length($tln);
}
$cnt = scalar @tlines;
prt( "Got $cnt trimmed lines to process ... \n" );
##foreach $line (@tlines) {
for ($i = 0; $i < $cnt; $i++) {
   $lncnt++;
   $line = $tlines[$i];
   if ($line =~ /<table.*>/i) {
      $tblcnt++;
      $bgntbl = 1;
      $enttbl++;   # count another entry to a table
      prt( "$lncnt - Enter table $tblcnt ...[$line]\n" );
      $thcnt = 0;
   } elsif ($line =~ /<\/table>/i) {
      if ($tblcnt) {
         prt( "$lncnt - Exit table $tblcnt ...[$line]\n" );
         $tblcnt--;
      } else {
         prt( "$lncnt - Exit table NO COUNT! ...[$line]\n" );
      }
   } elsif ($line =~ /<tr.*?>/i) {
      if ($col == 4) {
         ###prt( "[$col1] [$col2] [$col3] [$col4] \n" );
         # Language Name English French 639-2 639-1 
         push(@colist, [ $col1, $col2, $col3, $col4 ]);
      } else {
         if ($bgntbl || ($enttbl < 4) || ($thcnt > 0)) {
            # $col should be zero if just entered, and not yet in second
         } else {
            prt( "$lncnt - $tblcnt CHECK col=$col ...[$col1] [$col2] [$col3] [$col4]\n" );
         }
      }
      $bgntbl = 0;
      $thcnt = 0;
      $col = 0;
   } elsif ($line =~ /<td.*?>/i) {
      my $lc = 0;
      while ( !($line =~ /<\/td>/i) ) {
         $i++;
         $lncnt++;
         if ($i < $cnt) {
            $line .= $tlines[$i];
         } else {
            last;
         }
         $lc++;
         if ($lc > $maxcnt) {
            last; # exit anyway ...
         }
      }
      $tln = removetag($line, 'td');
      $tln = removetag($tln, 'span');
      $tln = removetag($tln, 'p');
      $tln = removetag($tln, 'br');
      $tln = removetag($tln, 'font');
      $tln = striptag($tln, 'a');
      $col++;
      if ($col == 1) {
         $col1 = $tln;
      } elsif ($col == 2) {
         $col2 = $tln;
      } elsif ($col == 3) {
         $col3 = $tln;
      } elsif ($col == 4) {
         $col4 = $tln;
      }
   } elsif ($line =~ /<th.*?>/i) {
      my $lc = 0;
      while ( !($line =~ /<\/th>/i) ) {
         $i++;
         $lncnt++;
         if ($i < $cnt) {
            $line .= $tlines[$i];
         } else {
            last;
         }
         $lc++;
         if ($lc > $maxcnt) {
            last; # exit anyway ...
         }
      }
      $thcnt++;
   }
}
$cnt = scalar @colist;
prt( "Got $cnt in country list ... \n" );
# Language Name English French 639-2 639-1 
for ($i = 0; $i < $cnt; $i++) {
   $col1 = trimall($colist[$i][0]);
   $col2 = trimall($colist[$i][1]);
   $col3 = trimall($colist[$i][2]);
   $col4 = trimall($colist[$i][3]);
   if ($col3 =~ /\//) {
      my @arr = split("/",$col3);
      foreach my $i3 (@arr) {
         if (defined $iso3{$i3}) {
            if ( !is_the_same( $iso3{$i3}, $col1) ) {
               prt( "Exists! [$i3] = [" . $iso3{$i3} . "] adding [$col1] ...\n" );
               $iso3{$i3} .= ' ' . $col1;
            }
         } else {
            $iso3{$i3} = $col1;
         }
      }
   } else {
      if (defined $iso3{$col3}) {
         if ( !is_the_same( $iso3{$col3}, $col1) ) {
            prt( "Exists! [$col3] = [" . $iso3{$col3} . "] adding [$col1] ...\n" );
            $iso3{$col3} .= ' ' . $col1;
         }
      } else {
         $iso3{$col3} = $col1;
      }
   }
   if (length($col4)) {
      if (defined $iso2{$col4}) {
         if ( !is_the_same( $iso2{$col4}, $col1) ) {
            prt( "Exists! [$col4] = [" . $iso2{$col4} . "] adding [$col1] ...\n" );
            $iso2{$col4} .= ' ' . $col1;
         }
      } else {
         $iso2{$col4} = $col1;
      }
   }
   ###prt( "[$col1] [$col2] [$col3] [$col4] \n" );
}
my @keys3 = keys %iso3;
my @keys2 = keys %iso2;
my $kc3 = scalar @keys3;
my $kc2 = scalar @keys2;
prt( "Got $kc3 ISO 639-2 and $kc2 ISO 639-1 ... writting file ...\n" ); 
my $msg = '';
my $maxln = 75;
$line = '';
$cnt = 0;
$msg .= 'tmbstr ISO639_2 = {' . "\n";
foreach $tln (sort @keys3) {
   $cnt++;
   $line .= '"'.$tln.'"';
   ##if ($cnt < $kc3) {
      $line .= ', ';
   ##}
   if (length($line) > $maxln) {
      $msg .= $line . "\n";
      $line = '';
   }
}
if (length($line)) {
   $msg .= $line . "\n";
   $line = '';
}
$msg .= " 0 };\n\n";
$msg .= 'tmbstr ISO639_1[] = {' . "\n";
$cnt = 0;
foreach $tln (sort @keys2) {
   $cnt++;
   $line .= '"'.$tln.'"';
   ##if ($cnt < $kc2) {
      $line .= ', ';
   ##}
   if (length($line) > $maxln) {
      $msg .= $line . "\n";
      $line = '';
   }
}
if (length($line)) {
   $msg .= $line . "\n";
   $line = '';
}
$msg .= " 0 };\n";
write2file( $msg, "tempiso.txt" );
prt( "All written to tempiso.txt ...\n" );
if ($dodownload) {
   system( $out_file );
}
close_log($outfile,1);
exit(0);
sub is_the_same {
   my ($t1, $t2) = @_;
   my (@a1, @a2);
   my $ct = 0;
   my $ct1 = 0;
   my $ct2 = 0;
   @a1 = split(/[;,]/,$t1);
   @a2 = split(/[;,]/,$t2);
   foreach my $b1 (@a1) {
      $ct1++;
      $b1 = trimall($b1);
      foreach my $b2 (@a2) {
         $b2 = trimall($b2);
         if ($b1 eq $b2) {
            $ct2++;
            last;
         }
      }
   }
   if ($ct1 == $ct2) {
      return 1;
   }
   return 0;
}
sub write2file {
   my ($txt,$fil) = @_;
   open WOF, ">$fil" or mydie( "ERROR: Unable to open $fil! - $! \n" );
   print WOF $txt;
   close WOF;
}
sub trimall {
   my ($ln) = shift;
   chomp $ln;
   $ln =~ s/\r$//;
   $ln =~ s/\t/ /g;
   while ($ln =~ /\s\s/) {
      $ln =~ s/\s\s/ /g;
   }
   while ($ln =~ /^\s/) {
      $ln = substr($ln,1);
   }
   while ($ln =~ /\s$/) {
      $ln = substr($ln,0, length($ln) - 1);
   }
   return $ln;
}
sub html_subs {
   my ($htm) = shift;
   $htm = substitutions($htm);
   $htm =~ s/&eacute;/e/gm;
   $htm =~ s/&egrave;/e/gm;
   $htm =~ s/&ccedil;/c/gm;
   $htm =~ s/&aring;/a/gm;
   $htm =~ s/&#231;/c/gm;
   $htm =~ s/&agrave;/a/gm;
   $htm =~ s/&#180;/'/gm;
   $htm =~ s/&iuml;/i/gm;
   $htm =~ s/&acirc;/a/gm;
   $htm =~ s/&Icirc;/I/gm;
   $htm =~ s/&#252;/u/gm;
   return $htm;
}
# eof - iso639.pl

index -|- top

checked by tidy  Valid HTML 4.01 Transitional