h2h01.pl to HTML.

index -|- end

Generated: Tue Feb 2 17:54:42 2010 from h2h01.pl 2009/08/01 15.8 KB.

#!/perl -w
# NAME: h2h01.pl
# AIM: Convert a HTML file, to an online HTML page, color coded...
# 29/07/2009 geoff mclane http://geoffair.net/mperl3 
use strict;
use warnings;
use File::Basename;   # to split path into ($name, $dir, $ext)
use File::stat; # to get the file date
#require 'logfile.pl' or die "Unable to load logfile.pl ...\n";
require 'fgutils.pl' or die "Unable to load fgutils.pl ...\n";
# log file stuff
my ($LF);
my $pgmname = $0;
if ($pgmname =~ /\w{1}:\\.*/) {
   my @tmpsp = split(/\\/,$pgmname);
   $pgmname = $tmpsp[-1];
}
my $outfile = "temp.$pgmname.txt";
open_log($outfile);
my $in_file = 'C:\Projects\tidy\tests\2811690-04.html';
# options
my $tab_space = ' ';
my $out_file = 'temph2h.htm';
my $addspace = 0;
# coloring
my $a_class = 'a'; # RED
my $b_class = 'b'; # comments (#006666)
my $c_class = 'c'; # reserved words (blue)
my $e_class = 'e'; # known WIN32 words
my $t_class = 't'; # quoted - single and double (#006600)
my $red_count = 0;
my $blue_count = 0;
my $comm_count = 0;
my $quot_count = 0;
my $eclass_count = 0;
# debug
my $dbg1 = 0;  # show SET messages
my $dbg2 = 0;  # show type message
my $dbg3 = 0;
my $dbg4 = 0;
my $convspace = 1;
my $saveconv = 1;
prt( "$0 ... Processing $in_file...\n" );
my @delimiters_NOT_USED = ( ' ', ',', '(', ')', '{', '}', '[', ']', '-', '+', '*', '%', '/', '=', '"', "'", '~',
'!', '&', '|', '<', '>', '?', ':', ';', '.', '#', "\t" );
sub write_head($$$) {
   my ($fil, $title, $msg) = @_;
   my $head = <<EOF;
<!DOCTYPE html PUBLIC "-//W3C//DTD HTML 4.01 Transitional//EN"
"http://www.w3.org/TR/html4/loose.dtd">
<html>
 <head>
  <title>
   $titlee
  </title>
  <meta http-equiv="Content-Language"
        content="en-us">
  <meta http-equiv="Content-Type"
        content="text/html; charset=us-ascii">
  <link rel="stylesheet"
        href="cxx.css"
        type="text/css">
 </head>
 <body>
  <a name="top"
        id="top"></a>
  <h1>
   $titlee
  </h1>
  <p class="top"><a href="index.htm">index</a></p>
  <p>$msgg
  </p>
EOF
   write2file($head,$fil);   # create and write to file
}
sub open_pre($) {
   my ($fil) = shift;
   $saveconv = $convspace;
   append2file("\n<pre class=\"cd\">",$fil); # append to file
   $convspace = 0;
}
sub close_pre($) {
   my ($fil) = shift;
   append2file("\n</pre>\n",$fil); # append to file
   $convspace = $saveconv;
}
sub append_tail($$) {
   my ($fil, $msg) = @_;
   my $tail = <<EOF;
  <hr class="mini">
  <p class="top">
   <a target="_self"
      href="#top">top</a>
  </p>
  <p>
   <a name="end"
      id="end"></a> <a target="_blank"
      href="http://tidy.sourceforge.net/"><img border="0"
        src="images/checked_by_tidy.gif"
        alt="checked by tidy"
        width="32"
        height="32"></a>&nbsp; <a href="http://validator.w3.org/check?uri=referer"
      target="_blank"><img src="images/valid-html401.gif"
        alt="Valid HTML 4.01 Transitional"
        width="88"
        height="31"></a>
  </p>
  <!-- $msg -->
 </body>
</html>
EOF
   append2file($tail,$fil); # append to file
}
######################################################
# Converting SPACES to '&nbsp;'
# Of course this could be done just using perl's
# powerful search and replace, but this handles
# any number of spaces, only converting the number
# minus 1 to &nbsp; ... not sure how to have
# this level of control with regex replacement
######################################################
sub conv_spaces {
   my $t = shift;
   my ($c, $i, $nt, $ln, $sc, $sp);
   $nt = ''; # accumulate new line here
   $ln = length($t);
   for ($i = 0; $i < $ln; $i++) {
      $c = substr($t,$i,1);
      if ($c eq ' ') {
         $i++; # bump to next 
         $sc = 0;
         $sp = '';
         for ( ; $i < $ln; $i++) {
            $c = substr($t,$i,1);
            if ($c ne ' ') {
               last; # exit
            }
            $sc++;
            $sp .= $c;
         }
         if ($sc) {
            $sp =~ s/ /&nbsp;/g;
            $nt .= $sp;
         }
         $i--; # back up one
         $c = ' '; # add back the 1 space
      }
      $nt .= $c;
   }
   prt( "conv_space: from [$t] to [$nt] ...\n" ) if $dbg4;
   return $nt;
}
###########################################################################
# VERY IMPORTANT SERVICE
# This converts the 'text' into HTML text, but only does a partial job!
# 1. Convert '&' to '&amp;' to avoid interpreting as replacement
# 2. Convert '<' to '&lt;' and '>' to '&gt;', to avoid interpreting as HTML
# 3. Convert '"' to '&quot;'
# 4. Convert '\t' to SPACES
# 5. Finally, if there are double or more SPACES, convert to '&nbsp;'
###########################################################################
sub html_line {
   my $t = shift;
   my $ot = $t;
   $t =~ s/&/&amp;/g; # all '&' become '&amp;'
   $t =~ s/</&lt;/g; # make sure all '<' is/are swapped out
   $t =~ s/>/&gt;/g; # make sure all '>' is/are swapped out
   $t =~ s/\"/&quot;/g; # and all quotes become &quot;
   $t =~ s/\t/$tab_space/g; # tabs to spaces
   if ($convspace && ($t =~ /\s\s/)) { # if any two consecutive white space
      $t = conv_spaces($t);
   }
   prt( "html_line: from [$ot] to [$t] ...\n" ) if $dbg3;
   return $t;
}
sub add_red($) {
   my ($t) = shift;
   $red_count++;
   return ('<span class="'.$a_class.'">'.$t.'</span>');
}
sub add_comm($) {
   my ($t) = shift;
   $comm_count++;
   return ('<span class="'.$b_class.'">'.$t.'</span>');
}
# reserved words
sub add_blue($) {
   my ($t) = shift;
   $blue_count++;
   return ('<span class="'.$c_class.'">'.$t.'</span>');
}
sub add_eclass($) {
   my ($t) = shift;
   $eclass_count++;
   return ('<span class="'.$e_class.'">'.$t.'</span>');
}
sub add_quot($) {
   my ($t) = shift;
   $quot_count++;
   return ('<span class="'.$t_class.'">'.$t.'</span>');
}
sub space_split_keep {
   my ($txt) = shift;
   my $len = length($txt);
   my ($k, $ch, $tag, $incomm, $k2, $nch);
   my @arr = ();
   $tag = '';
   $incomm = 0;
   for ($k = 0; $k < $len; $k++) {
      $ch = substr($txt,$k,1);
      $k2 = $k + 1;
      $nch = ($k2 < $len) ? substr($txt,$k2,1) : "";
      if ($incomm) {
         $tag .= $ch;
         $incomm = 0 if ($ch eq '"');
      } elsif ($ch =~ /\s/) { # any spacey char
         $tag .= $ch;
         while( ($k2 < $len) && length($nch) && ($nch =~ /\s/) ) {
            $tag .= $nch;
            $k++;
            $k2 = $k + 1;
            $nch = ($k2 < $len) ? substr($txt,$k2,1) : "";
         }
         push(@arr, $tag);
         $tag = '';
      } elsif (($ch =~ /\//)&&($nch eq '>')) { # 04/10/2008, but only if before '>' 24/09/2008 add this as well
         push(@arr, $tag) if (length($tag));
         $tag = $ch; # restart tag with this character
      } else {
         $tag .= $ch;
         $incomm = 1 if ($ch eq '"');
      }
   }
   push(@arr, $tag) if (length($tag));
   return @arr;
}
sub get_tag_split($$$) {
   my ($tx,$ln,$rk) = @_;
   my ($k, $c, $sp);
   my $tg = '';
   for ($k = 0; $k < $ln; $k++) {
      $c = substr($tx,$k,1);
      if ($c eq '<') {  # start of tag
         $k++; # bump to next, and get whole tag
         for (; $k < $ln; $k++) {
            $c = substr($tx,$k,1);
            if ($c eq '>') {
               # $k++; # include this char
               last;
            }
            $tg .= $c;
         }
         last;
      }
   }
   if ($k == $ln) {
      prt("$tx");
      prt("\n") if (! $tx =~ /\n$/);
      prt("ERROR: NOT coded\n" );
      exit 1;
   }
   $$rk = $k;
   #prt( "tag: [$tg]\n" );
   #my @arr = space_split_keep($tg);
   #my $cnt = 0;
   #foreach my $tmp (@arr) {
   #   $cnt++;
   #   prt(" $cnt: [$tmp]\n");
   #}
   #return @arr;
   return space_split_keep($tg);
}
sub is_entity($) {
   my ($t) = shift;
   my $l = length($t);
   if (($l > 2)&&($t =~ /^&/)) {
      for (my $p = 1; $p < $l; $p++) {
         my $c = substr($t,$p,1);
         if (($c =~ /\w/) || (($c eq '#')&&($p == 1))) {    # alphanumeric or '#' char is first
            next; # while suitable chars
         } elsif (($p > 1)&&($c eq ';')) {
            return 1;   # reach the first ';' IT IS AN ENTITY!
         } else {
            last; # no go - out of here
         }
      }
   }
   return 0;
}
sub process_file($) {
   my ($fil) = shift;
   #my @html = ();
   my $html = ();
   if (open INF, "<$fil") {
      my @lines = <INF>;
      close INF;
      my $lncnt = scalar @lines;
      prt("Processing $lncnt lines, from $fil...\n");
      my ($i, $j, $k, $t, $line, $ch, $len, @atag, $tag, $cnt, $txt, $htm, $tmp, $ltx, $ch2);
      my ($lnbal, $iscomm, $tst, $isdata, $isphp, $isasp, $isjava, $isproc);
      $txt = '';
      $line = '';
      for ($i = 0; $i < $lncnt; $i++) {
         $line .= $lines[$i];
         # chomp $line;
         # $line = trim_all($line);
      }
         $len = length($line);
         for ($j = 0; $j < $len; $j++) {
            $ch = substr($line,$j,1);
            if ($ch eq '<') {
               $ltx = length($txt);
               if ($ltx > 0) {
                  prt("$txt");
                  #push(@html, html_line($txt));
                  if (($txt =~ /&/)&&($txt =~ /;/)) {
                     $tmp = '';  # appears text contains one or more entities
                     # so give these another color
                     for ($t = 0; $t < $ltx; $t++) {
                        $ch2 = substr($txt,$t,1);
                        if ( ($ch2 eq '&') && (is_entity(substr($txt,$t)) ) ) {
                           $html .= html_line($tmp) if length($tmp);
                           $tmp = $ch2;   # start the entity
                           $t++;    # bump to next
                           for (; $t < $ltx; $t++) {
                              $ch2 = substr($txt,$t,1);
                              $tmp .= $ch2;
                              if ($ch2 eq ';') {
                                 last; # end of entity
                              }
                           }
                           $html .= add_eclass(html_line($tmp));
                           $tmp = '';
                        } else {
                           $tmp .= $ch2;
                        }
                     }
                     $html .= html_line($tmp) if length($tmp);
                     # $html .= add_comm(html_line($txt));
                  } else {
                     $html .= html_line($txt);
                  }
                  $txt = '';
               }
               # need to got to the END of this tag - especially for say <![DATA[ ... ]]>
               $lnbal = substr($line,$j);
               # @atag = get_tag_split( substr($line,$j), $len - $j, \$k );
               @atag = get_tag_split( $lnbal, $len - $j, \$k );
               $cnt = scalar @atag;
               $tag = $atag[0];  # get the HTML TAG
               $iscomm = 0;
               $isdata = 0;
               $isphp = 0;
               $isasp = 0;
               $isjava = 0;
               $isproc = 0;
               if ($cnt > 1) {
                  if ($tag =~ /^!--/) {
                     prt("SET comment [$tag]\n") if ($dbg1);
                     $iscomm = 1;
                  } elsif ($tag =~ /^!\[CDATA\[/) {
                     prt("SET CDATA [$tag]\n") if ($dbg1);
                     $isdata = 1;
                  } elsif ($tag =~ /^?php/) {
                     prt("SET PHP [$tag]\n") if ($dbg1);
                     $isphp = 1;
                  } elsif ($tag =~ /^%/) {
                     prt("SET ASP [$tag]\n") if ($dbg1);
                     $isasp = 1;
                  } elsif ($tag =~ /^#/) {
                     prt("SET JAVA [$tag]\n") if ($dbg1);
                     $isjava = 1;
                  } elsif ($tag =~ /^\?/) {
                     prt("SET PROC [$tag]\n") if ($dbg1);
                     $isproc = 1;
                  } else {
                     prt("SET NONE [$tag]\n") if ($dbg1);
                  }
                  $html .= add_blue(html_line("<$tag"));
                  $htm = "<$tag";
                  $tmp = '';
                  for ($t = 1; $t < $cnt; $t++) {
                     $tst = $atag[$t];
                     if (($t + 1) == $cnt) {
                        last if ($isdata && ($tst eq ']]'));
                        last if ($iscomm && ($tst eq '--'));
                        last if ($isphp &&  ($tst eq '?'));
                        last if ($isasp &&  ($tst eq '%'));
                        last if ($isjava && ($tst eq '#'));
                     }
                     $tmp .= " " if (length($tmp) && $addspace);
                     $tmp .= $tst;
                  }
                  $htm .= $tmp;
                  $html .= " " if ($addspace);
                  if ($iscomm) {
                     prt("IS comment [$tmp]\n") if ($dbg2);
                     $html .= add_comm(html_line($tmp));
                     $html .= add_blue(html_line("-->"));
                     $htm .= "--";
                  } elsif ($isdata) {
                     prt("IS CDATA [$tmp]\n") if ($dbg2);
                     $html .= add_quot(html_line($tmp));
                     $html .= add_blue(html_line("]]>"));
                     $htm .= "]]>";
                  } elsif ($isphp) {
                     prt("IS PHP [$tmp]\n") if ($dbg2);
                     $html .= add_quot(html_line($tmp));
                     $html .= add_blue(html_line("?>"));
                     $htm .= "?>";
                  } elsif ($isasp) {
                     prt("IS ASP [$tmp]\n") if ($dbg2);
                     $html .= add_quot(html_line($tmp));
                     $html .= add_blue(html_line("%>"));
                     $htm .= "%>";
                  } elsif ($isjava) {
                     prt("IS JAVA [$tmp]\n") if ($dbg2);
                     $html .= add_quot(html_line($tmp));
                     $html .= add_blue(html_line("#>"));
                     $htm .= "#>";
                  } elsif ($isproc) {
                     prt("IS PROC [$tmp]\n") if ($dbg2);
                     $html .= add_quot(html_line($tmp));
                     $html .= add_blue(html_line(">"));
                     $htm .= ">";
                  } else {
                     prt("IS NOT [$tmp]\n") if ($dbg2);
                     $html .= add_red(html_line($tmp));
                     $htm .= ">";
                     $html .= add_blue(html_line(">"))
                  }
                  #push(@html, add_red(html_line($htm)));
                  #$html .= add_blue(html_line($htm));
               } else {
                  # simple tag - cover it in BLUE
                  $htm = "<$tag>";
                  $html .= add_blue(html_line($htm));
               }
               prt("$htm");
               $j += $k;
            } else {
               $txt .= $ch;
            }
         }
         #$html .= "\n";
      #}
   } else {
      prt("ERROR: Unable to open [$fil]...\n");
   }
   #return @html;
   return $html;
}
#my @html_lines = process_file($in_file);
#if (@html_lines) {
my $html_lines = process_file($in_file);
my ($in_title,$dir) = fileparse($in_file);
if ( length($html_lines) > 0) {
   prt( "Writing lines to $out_file ...\n" );
   my $sb = stat($in_file);
   my $tit = "$in_title to HTML";
   my $cur_tm = localtime(time());
   my $msg = "Generated: On $cur_tm,\n<br>From: $in_file, dated ".scalar localtime($sb->mtime).", with size ".$sb->size." bytes.";
   my $tmsg = "GA: Generated by $pgmname, on $cur_tm, from $in_title";
   #write_head($out_file, "HTML of $in_file", "HTML conversion of [$in_file]");
   write_head($out_file, $tit, $msg);
   open_pre( $out_file );
   #append2file(join("\n",@html_lines), $out_file);
   append2file($html_lines, $out_file);
   close_pre( $out_file );
   append_tail( $out_file, $tmsg);
   system($out_file);
}
close_log($outfile,0);
exit(0);
# eof - h2h01.pl

index -|- top

checked by tidy  Valid HTML 4.01 Transitional