msh2sh.pl to HTML.

index -|- end

Generated: Tue Feb 2 17:54:46 2010 from msh2sh.pl 2005/05/03 7.8 KB.

#!/usr/bin/perl
use HTML::Parser ();
use Data::Dump ();
my ($FH, $HH);
my $instyle = 0;
my $WHITE_PATTERN2 = "^[ \t\n\r]*\$"; # spacey if ($var ~= /$WHITE_PATTERN2/o ) { ...}
my $stripC = 1;
my $definp = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/My Webs/moon-01.htm";
my $defout = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/My Webs/tempmoon.htm";
my $inpfil;
my $inhtml = 0; # in document
my $inpara = 0; # in paragraph tag
my $inhead = 0; # processing header
my $inbody = 0; # body processing
my $instyle = 0; # style processing
my $clearhtml = 1; # clear HTML attributes
my $clearop = 1; # clear MS o:p paragraph thingy 
my $clearpatts = 1; # clear paragraph attributes
my $clearhstyl = 1; # no SYTLE statment in head - include through file, if required
my $clearbsyle = 1; # no BODY attributes
my $cleartdsty = 0; # clear TD attributes
my $fiximg = 1; # modify the IMG tag
my $clearhlink = 1; # clear a LINK REL statement
my $clearspan = 1; # remove all SPAN tags
my $cleardiv = 1; # remove all DIV tags
my $clearmeta = 1; # remove META (head) tag
my $doout = 1; # do the OUTPUT, but can be off'ed ...
open $HH, ">$defout" or die "No HTML file ... [$defout]!\n";
open $FH, ">temphtml.txt" or die "No OUT file ...\n";
open $CH, ">tempskip.txt" or die "No SKIP file ...\n";
## Event table
##  ["S",  $tag, $attr, $attrseq, $text]
##  ["E",  $tag, $text]
##  ["T",  $text, $is_data]
##  ["C",  $text]
##  ["D",  $text]
##  ["PI", $token0, $text]
sub addh {
   local ($typ,$txt,$atr) = @_;
   my $outtext = $txt;
   my $logtext = '';
   my $outon = 1;
   $outtext .= "\n";
   if ($txt =~ /$WHITE_PATTERN2/o) {
      # skip just space
      $logtext .= "WT[$txt]\n";
   } else {
      $logtext .= "TT[$txt]\n";
   }
   if ($atr) {
      local ($k, $v, $i);
      $logtext .= "Showing attributes ... \n";
      # print $FH %$atr, "\n";
      $i = 0;
      #while (($key, $value) = each %atr) {
      #    print $FH $key, "\n";
      #    # delete $hash{$key};   # This is safe
      #}
      #foreach $k (keys %atr) {
      #   $i++;
      #   print $FH "$i [$k] = [$atr{$k}]\n";
      #}
      #foreach $k (keys %atr) {
      #   $i++;
      #   $v = $atr{$k};
      #   #print $FH "$i $k = $atr{$k}\n";
      #   print $FH "$i $k = $v\n";
      #}
      while (($k,$v) = each %$atr){
         $i++;
         # print $FH "$i key[$k] = val[$v]\n";
         #print $FH "$i $k=$v\n";
         $logtext .= "$i $k=$v\n";
      }
      if ($i == 0) {
         $logtext .= "NULL LIST!\n";
      }
   } else {
      $logtext .= "NO attributes ... \n";
   }
   if ($typ eq 'C') {
      if ($stripC) {
         $outon = 0; # switch OFF output
      }
   }
   if ($outon) {
      print $HH $outtext;
      print $FH $logtext;
   } else {
      print $CH $outtext;
      print $CH $logtext;
   }
}
sub h {
    my($event, $line, $column, $text, $tagname, $attr) = @_;
   my $typ = uc(substr($event,0,1)); ## get TYPE
    my @d =  "$typ L$line C$column";
    #substr($text, 40) = "..." if length($text) > 40;
    push(@d, $text);
    push(@d, $tagname) if defined $tagname;
   push(@d, $attr) if $attr;
   my $otxt = Data::Dump::dump(@d);
   #print $FH Data::Dump::dump(@d), "\n";
    #print Data::Dump::dump(@d), "\n";
   print "$otxt\n";
   # now process the data ...
   my $tag = '*NO_TAG*';
   my $locout = 1; # one time only output flag
   my $i;
   if (defined $tagname) {
      $tag = uc($tagname);
   }
   # Event table
   ########################################################################
   if ($typ eq 'S') {
      ##  ["S",  $tag, $attr, $attrseq, $text]
      if ($tag eq 'HTML') {
         $inhtml = 1;
         if ($clearhtml) {
            $text = '<html>';
         }
      } elsif ($tag eq 'P') {
         $inpara = 1;
         if ($clearpatts) {
            print $FH "Paragraph from $text to <p> ...\n";
            $text = "<p>";
         }
      } elsif ($tag eq 'HEAD') {
         $inhead = 1;
      } elsif ($tag eq 'BODY') {
         $inbody = 1;
         if ($clearbsyle) { # no BODY attributes
            $text = '<BODY>';
         }
      } elsif ($tag eq 'STYLE') {
         $instyle =1;
         if ($clearhstyl) {
            # in head - close out S style to E sytle
            if ($inhead) {
               $doout = 0; # CLOSE output
            }
         }
      } elsif ($tag eq 'SPAN') {
         if ($clearspan) { # remove all SPAN tags
            $locout = 0;
         }
      } elsif ($tag eq 'O:P') {
         if ($clearop) { # clear MS o:p paragraph thingy 
            $locout = 0;
         }
      } elsif ($tag eq 'LINK') {
         if ($clearhlink) {
            $locout = 0;
         }
      } elsif ($tag eq 'DIV') {
         if ($cleardiv) {
            $locout = 0;
         }
      } elsif ($tag eq 'TD') {
         if ($cleartdsty) {
            $text = '<td>';
         }
      } elsif ($tag eq 'IMG') {
         if ($fiximg) { # modify the IMG tag
            my %att = %$attr; # copy the HASH, to do modifications
            $i = 0;
            foreach $key (keys %att) {
               if ($key eq 'v:shapes') {
                  print $FH "Deleting attrib $key ...\n";
                  delete $att{$key}; # remove this MS reference
                  $i++;
               }
            }
            if ($i) {
               $text = '<IMG'; # start IMG tag again
               while (($key,$value) = each %att) {
                  $text .= " $key=$value";
               }
               $text .= '>'; # close IMG tag
            }
         }
      } elsif ($tag eq 'META') {
         if ($clearmeta) { # remove META (head) tag
            $locout = 0;
         }
      }
   ########################################################################
   } elsif ($typ eq 'E') {
      ##  ["E",  $tag, $text]
      if ($tag eq 'HTML') {
         $inhtml = 0;
      } elsif ($tag eq 'P') {
         $inpara = 0;
      } elsif ($tag eq 'HEAD') {
         $inhead = 0;
      } elsif ($tag eq 'BODY') {
         $inbody = 0;
      } elsif ($tag eq 'STYLE') {
         $instyle = 0;
         if ($clearhstyl) {
            # in head - close out S style to E sytle
            if ($inhead) {
               $doout = 1; # OPEN output
               $locout = 0; # but NOT for this style one
            }
         }
      } elsif ($tag eq 'SPAN') {
         if ($clearspan) { # remove all SPAN tags
            $locout = 0;
         }
      } elsif ($tag eq 'O:P') {
         if ($clearop) { # clear MS o:p paragraph thingy 
            $locout = 0;
         }
      } elsif ($tag eq 'LINK') {
         if ($clearhlink) {
            $locout = 0;
         }
      } elsif ($tag eq 'DIV') {
         if ($cleardiv) {
            $locout = 0;
         }
      }
   } elsif ($typ eq 'T') {
      ##  ["T",  $text, $is_data]
   } elsif ($typ eq 'C') {
      ##  ["C",  $text]
   } elsif ($typ eq 'D') {
      ##  ["D",  $text]
   } elsif ($typ eq 'P') {
      ##  ["PI", $token0, $text]
   }
   ### end event table ###########################################################
   if ($text =~ /$WHITE_PATTERN2/o) {
      print $CH "ws[$otxt]\n";
      print $CH "ws[$text]\n";
   } else {
      if ($typ eq 'C') {
         print $CH "cd[$otxt]\n";
         print $CH "cd[$text]\n";
      } else {
         if ($doout && $locout) {
            print $FH "$otxt\n";
            print $HH "$text\n";
         } else {
            print $CH "$otxt\n";
            print $CH "$text\n";
         }
      }
   }
}
sub h2 {
    my($event, $line, $column, $text, $tagname, $attr) = @_;
   my $typ = uc(substr($event,0,1)); ## get TYPE
    my @d =  "$typ L$line C$column";
    #substr($text, 40) = "..." if length($text) > 40;
    push(@d, $text);
    push(@d, $tagname) if defined $tagname;
    push(@d, $attr) if $attr;
   if ($typ ne "XXX") {
      print $FH Data::Dump::dump(@d), "\n";
   }
    print Data::Dump::dump(@d), "\n";
   my $msg = "$typ = ";
   my $doadd = 0;
   my $tag;
   if (defined $tagname) {
      $tag = $tagname;
      $msg .= "<$tagname> ";
      $doadd++;
   } else {
      $tag = "*NO_TAG*";
      $msg .= "*NO TAG* ";
   }
   if ($text =~ /$WHITE_PATTERN2/o) {
      $msg .= "WHITE TEXT ONLY ";
   } else {
      $msg .= "with Text ";
      $doadd++;
   }
   if ($attr) {
      $msg .= "and attributes.";
      $doadd++;
   } else {
      $msg .= "NO ATTRIBUTES.";
   }
   print $FH $msg, "\n";
   if ($doadd) {
      addh($typ, $text, $attr);
   } else {
      print $FH "Skipped ... " . @d . "\n";
   }
}
my $p = HTML::Parser->new(api_version => 3);
$p->handler(default => \&h, "event, line, column, text, tagname, attr");
# $p->parse_file(@ARGV ? shift : die "No input given ....\n");
$inpfil = $definp;
$p->parse_file($inpfil);
close $FH; # log file output
close $HH; # ouput HTML file
close $CH; # log of discarded items
system $defout; # run the HTML file

index -|- top

checked by tidy  Valid HTML 4.01 Transitional