stripms.pl to HTML.

index -|- end

Generated: Tue Feb 2 17:54:57 2010 from stripms.pl 2005/05/01 6.9 KB.

#!/usr/bin/perl
use strict;
use FILE::stat;
use CGI;
use CGI qw(:standard);
use HTML::Parser ();
my $verbose = 0;
my $package = 'stripms.pl';
my $logfile = 'striplog.txt';
my $actfile = '';
my @input_files = ();
my $linebyline = 0;
my $usearray = 1;
my $msg;
my $htmdata = '';
my $definp = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/My Webs/moon-01.htm";
my $dotoken = 0;
parse_arguments(@ARGV);
die "program: no input files found or specified\n" if ! @input_files;
$msg = "Begin processing ...\n";
open (OP, "> $logfile") or die "No LOG file ...\n";
print OP $msg;
print $msg;
foreach $actfile (@input_files) {
 #my $sb = stat($actfile);
 my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size,
       $atime,$mtime,$ctime,$blksize,$blocks)
           = stat($actfile);
 print OP "Processing [$actfile] ... $size bytes ...\n";
 printf "File is %s, is %s bytes, perm %04o, mtime %s\n",
    $actfile,
    $size, $mode & 07777, scalar localtime $mtime;
 open (INP, $actfile) or die "ERROR: Unable to OPEN file $actfile!\n";
 if ($linebyline) {
  while ( <INP> ) {
     chomp;
     my $line = $_;
    print "$line\n";
  } ## WHILE THERE IS input
 } elsif ($usearray) {
  my @lines = <INP>;
  chomp @lines;
  print OP "\n*********************************************\n";
  print "@lines";
  print OP "@lines";
  my $all1 = join (' ',@lines);
  print OP "\n*********************************************\n";
  print $all1;
  $htmdata = $all1;
  print OP $all1;
  print OP "\n*********************************************\n";
  print "\n***** search for emails ****\n";
  my $str = $all1;
  while ($str =~ /(([\w._-]+)\@([\w._-]+))/) { ## look for an email addr
     $msg = "user:$2 host:$3 all:$1\n"; ## parts of the addr
    print OP $msg;
    print $msg;
    $str = $'; ## set the str to be the "rest" of the string
  }
  print "\n***** done search for emails ****\n";
 } else {
  $/ = undef; 
  my $all = <INP>;    ## read the whole file into one string 
  print $all;
 }
 close (INP); ## close this INPUT file
} ## end foreach input ...
print "\nDone foreach ...\n";
##$actfile = "c:/HOMEPAGE/P26/home2.htm";
sub dotoken {
print "\nDoning token parser ...\n";
use HTML::TokeParser;
my $p2 = HTML::TokeParser->new($actfile); ## shift||"index.html");
while (my $token = $p2->get_tag("a")) {
    my $url = $token->[1]{href} || "-";
    my $text = $p2->get_trimmed_text("/a");
    print "$url\t$text\n";
}
print "\nDone URL list ...$actfile\n";
my $p3 = HTML::TokeParser->new($actfile) ||
      die "Can't open: $!";
##  ["S",  $tag, $attr, $attrseq, $text]
##  ["E",  $tag, $text]
##  ["T",  $text, $is_data]
##  ["C",  $text]
##  ["D",  $text]
##  ["PI", $token0, $text]
##use Switch;
my $WHITE_PATTERN = "^[ \t]*\$"; # only spacey stuff, like if ( /$WHITE_PATTERN/o ) { ...}
my $WHITE_PATTERN2 = "^[ \t\n\r]*\$"; # only spacey stuff, like if ( /$WHITE_PATTERN/o ) { ...}
 while (my $tok3 = $p3->get_token) {
     # ...
    my $typ = $tok3->[0];
    my $txt = $tok3->[1];
    print OP "Before=[$txt]\n";
   chomp $txt;
    print OP "AfterC=[$txt]\n";
   if ($txt =~ /$WHITE_PATTERN2/o) {
      # skip just space
   } else {
##    switch ($typ) {
##       case "S" {    }
##       case "E" {    }
##     }
   if ($typ eq "S") {
      $msg = "S=<" . $txt . ">" . $tok3->[3];
   } elsif ($typ eq "E") {
      $msg = "E=</" . $txt . ">" . $tok3->[2];
   } elsif ($typ eq "T") {
      $msg = "T=$txt";
   } elsif ($typ eq "C") {
      $msg = "C=$txt";
   } elsif ($typ eq "D") {
      $msg = "D=$txt";
   } elsif ($typ eq "PI") {
      $msg = "PI=$txt";
   } else {
      $msg = "??" . $typ . $txt;
   }
   $msg .= "\n";
   print $msg;
   print OP $msg;
   }
 }
} ## if $dotoken
###use HTML::Parser ();
##use HTML::Parser;
#HTML::Parser->new(default_h => [\&my_text, "text"], ### [sub { print shift }, 'text'],
#                  comment_h => [\&my_comm, "text"],
#                  )->parse_file($actfile || die) || die $!;
# Create parser object
my $pp = HTML::Parser->new( api_version => 3,
                  ### start_h => [\&my_start, "tagname, attr"],
                  start_h => [\&my_start2, "self,tokens" ],
                        end_h   => [\&my_end,   "tagname"],
                  text_h  => [\&my_text,  "text"],
                        marked_sections => 1,
                       );
  sub start_handler
  {
    return if shift ne "title";
    my $self = shift;
    $self->handler(text => sub { print shift }, "dtext");
    $self->handler(end  => sub { shift->eof if shift eq "title"; },
                           "tagname,self");
  }
##  my $p = HTML::Parser->new(api_version => 3);
##  $p->handler( start => \&start_handler, "tagname,self");
##  $p->parse_file($actfile || die) || die $!;
 # Parse document text chunk by chunk
 # $pp->parse($chunk1);
# $pp->parse($htmdata);
 #...
# $pp->eof;                 # signal end of document
# Parse directly from file
print "\n********* parse directly from file ************************************\n";
$pp->parse_file("$actfile"); ### foo.html");
print "\n********* end parse directly from file ************************************\n";
 # or
# open(F, "$actfile") || die "Unable to open file ...\n";
# $pP->parse_file(*F);
print "THE END!\n";
close OP;
sub my_start2 {
   my ($self, @toks) = @_;
   $msg = '';
   foreach my $tkn (@toks) {
      $msg .= " $tkn"
   }
   print "@toks\n";
   print "$msg \n";
}
sub my_start {
   my ($tg,%atr) = @_;
   my @attr = %atr;
   #$msg = "Start=[$tg] Attr=[";
   #$msg .= join (" ", @attr);
   #$msg .= "]\n";
   #print $msg;
   print "size of hash:  " . keys( %atr ) . ".\n";
   $msg = "Start=[$tg] Attr=[";
    while ( my ($key, $value) = each(%atr) ) {
        $msg .= " $key => $value";
    }
   $msg .= "]\n";
   print $msg;
   $msg = "Start=[$tg] Attr=[";
   for my $key2 ( keys %atr ) {
        my $value2 = $atr{$key2};
        $msg .= " $key2 => $value2";
    }
   $msg .= "]\n";
   print $msg;
}
sub my_end {
   my ($tge) = @_;
   $msg = "End=[$tge]\n";
   #print $msg;
}
sub my_text {
   my ($txt) = @_;
   ## chomp $txt;
    ## print "text=[$txt]\n";
   ## print $txt;
}
sub my_comm {
   my ($txt) = @_;
   ## chomp $txt;
    ## print "text=[$txt]\n";
   print $txt;
}
sub parse_arguments {
 my @av = @_; # take it off the passed stack
 while (@av) {
  if ($av[0] eq '-v') {
   print "$package: Version 0.0.1 ... setting verbose ...\n";
   $verbose = 1;
  } elsif (($av[0] eq '-help') || ($av[0] eq '-h') || ($av[0] eq '-?')) {
   die "$package filename [-v = version] [-? = this help] - strip MS html from file ...\n";
  } elsif ($av[0] =~ /^-/) {
   die "$package: unrecognised option --$av[0]\nTry $package -help, for more information ;=)).\n";
  } else {
   $actfile = $av[0];
   print "Adding file [$actfile] ...\n";
   if ( !( -f $actfile) ) {
    die "$package: Unable to loacte [$actfile]!\n";
   }
   push(@input_files, $actfile);
  }
  shift @av; # move to next argument
 } # while arguments
 ### set default
 push(@input_files, $definp) if ! @input_files;
}

index -|- top

checked by tidy  Valid HTML 4.01 Transitional