Generated: Tue Feb 2 17:54:57 2010 from stripms.pl 2005/05/01 6.9 KB.
#!/usr/bin/perl use strict; use FILE::stat; use CGI; use CGI qw(:standard); use HTML::Parser (); my $verbose = 0; my $package = 'stripms.pl'; my $logfile = 'striplog.txt'; my $actfile = ''; my @input_files = (); my $linebyline = 0; my $usearray = 1; my $msg; my $htmdata = ''; my $definp = "C:/Documents and Settings/Geoff McLane.PRO-1/My Documents/My Webs/moon-01.htm"; my $dotoken = 0; parse_arguments(@ARGV); die "program: no input files found or specified\n" if ! @input_files; $msg = "Begin processing ...\n"; open (OP, "> $logfile") or die "No LOG file ...\n"; print OP $msg; print $msg; foreach $actfile (@input_files) { #my $sb = stat($actfile); my ($dev,$ino,$mode,$nlink,$uid,$gid,$rdev,$size, $atime,$mtime,$ctime,$blksize,$blocks) = stat($actfile); print OP "Processing [$actfile] ... $size bytes ...\n"; printf "File is %s, is %s bytes, perm %04o, mtime %s\n", $actfile, $size, $mode & 07777, scalar localtime $mtime; open (INP, $actfile) or die "ERROR: Unable to OPEN file $actfile!\n"; if ($linebyline) { while ( <INP> ) { chomp; my $line = $_; print "$line\n"; } ## WHILE THERE IS input } elsif ($usearray) { my @lines = <INP>; chomp @lines; print OP "\n*********************************************\n"; print "@lines"; print OP "@lines"; my $all1 = join (' ',@lines); print OP "\n*********************************************\n"; print $all1; $htmdata = $all1; print OP $all1; print OP "\n*********************************************\n"; print "\n***** search for emails ****\n"; my $str = $all1; while ($str =~ /(([\w._-]+)\@([\w._-]+))/) { ## look for an email addr $msg = "user:$2 host:$3 all:$1\n"; ## parts of the addr print OP $msg; print $msg; $str = $'; ## set the str to be the "rest" of the string } print "\n***** done search for emails ****\n"; } else { $/ = undef; my $all = <INP>; ## read the whole file into one string print $all; } close (INP); ## close this INPUT file } ## end foreach input ... print "\nDone foreach ...\n"; ##$actfile = "c:/HOMEPAGE/P26/home2.htm"; sub dotoken { print "\nDoning token parser ...\n"; use HTML::TokeParser; my $p2 = HTML::TokeParser->new($actfile); ## shift||"index.html"); while (my $token = $p2->get_tag("a")) { my $url = $token->[1]{href} || "-"; my $text = $p2->get_trimmed_text("/a"); print "$url\t$text\n"; } print "\nDone URL list ...$actfile\n"; my $p3 = HTML::TokeParser->new($actfile) || die "Can't open: $!"; ## ["S", $tag, $attr, $attrseq, $text] ## ["E", $tag, $text] ## ["T", $text, $is_data] ## ["C", $text] ## ["D", $text] ## ["PI", $token0, $text] ##use Switch; my $WHITE_PATTERN = "^[ \t]*\$"; # only spacey stuff, like if ( /$WHITE_PATTERN/o ) { ...} my $WHITE_PATTERN2 = "^[ \t\n\r]*\$"; # only spacey stuff, like if ( /$WHITE_PATTERN/o ) { ...} while (my $tok3 = $p3->get_token) { # ... my $typ = $tok3->[0]; my $txt = $tok3->[1]; print OP "Before=[$txt]\n"; chomp $txt; print OP "AfterC=[$txt]\n"; if ($txt =~ /$WHITE_PATTERN2/o) { # skip just space } else { ## switch ($typ) { ## case "S" { } ## case "E" { } ## } if ($typ eq "S") { $msg = "S=<" . $txt . ">" . $tok3->[3]; } elsif ($typ eq "E") { $msg = "E=</" . $txt . ">" . $tok3->[2]; } elsif ($typ eq "T") { $msg = "T=$txt"; } elsif ($typ eq "C") { $msg = "C=$txt"; } elsif ($typ eq "D") { $msg = "D=$txt"; } elsif ($typ eq "PI") { $msg = "PI=$txt"; } else { $msg = "??" . $typ . $txt; } $msg .= "\n"; print $msg; print OP $msg; } } } ## if $dotoken ###use HTML::Parser (); ##use HTML::Parser; #HTML::Parser->new(default_h => [\&my_text, "text"], ### [sub { print shift }, 'text'], # comment_h => [\&my_comm, "text"], # )->parse_file($actfile || die) || die $!; # Create parser object my $pp = HTML::Parser->new( api_version => 3, ### start_h => [\&my_start, "tagname, attr"], start_h => [\&my_start2, "self,tokens" ], end_h => [\&my_end, "tagname"], text_h => [\&my_text, "text"], marked_sections => 1, ); sub start_handler { return if shift ne "title"; my $self = shift; $self->handler(text => sub { print shift }, "dtext"); $self->handler(end => sub { shift->eof if shift eq "title"; }, "tagname,self"); } ## my $p = HTML::Parser->new(api_version => 3); ## $p->handler( start => \&start_handler, "tagname,self"); ## $p->parse_file($actfile || die) || die $!; # Parse document text chunk by chunk # $pp->parse($chunk1); # $pp->parse($htmdata); #... # $pp->eof; # signal end of document # Parse directly from file print "\n********* parse directly from file ************************************\n"; $pp->parse_file("$actfile"); ### foo.html"); print "\n********* end parse directly from file ************************************\n"; # or # open(F, "$actfile") || die "Unable to open file ...\n"; # $pP->parse_file(*F); print "THE END!\n"; close OP; sub my_start2 { my ($self, @toks) = @_; $msg = ''; foreach my $tkn (@toks) { $msg .= " $tkn" } print "@toks\n"; print "$msg \n"; } sub my_start { my ($tg,%atr) = @_; my @attr = %atr; #$msg = "Start=[$tg] Attr=["; #$msg .= join (" ", @attr); #$msg .= "]\n"; #print $msg; print "size of hash: " . keys( %atr ) . ".\n"; $msg = "Start=[$tg] Attr=["; while ( my ($key, $value) = each(%atr) ) { $msg .= " $key => $value"; } $msg .= "]\n"; print $msg; $msg = "Start=[$tg] Attr=["; for my $key2 ( keys %atr ) { my $value2 = $atr{$key2}; $msg .= " $key2 => $value2"; } $msg .= "]\n"; print $msg; } sub my_end { my ($tge) = @_; $msg = "End=[$tge]\n"; #print $msg; } sub my_text { my ($txt) = @_; ## chomp $txt; ## print "text=[$txt]\n"; ## print $txt; } sub my_comm { my ($txt) = @_; ## chomp $txt; ## print "text=[$txt]\n"; print $txt; } sub parse_arguments { my @av = @_; # take it off the passed stack while (@av) { if ($av[0] eq '-v') { print "$package: Version 0.0.1 ... setting verbose ...\n"; $verbose = 1; } elsif (($av[0] eq '-help') || ($av[0] eq '-h') || ($av[0] eq '-?')) { die "$package filename [-v = version] [-? = this help] - strip MS html from file ...\n"; } elsif ($av[0] =~ /^-/) { die "$package: unrecognised option --$av[0]\nTry $package -help, for more information ;=)).\n"; } else { $actfile = $av[0]; print "Adding file [$actfile] ...\n"; if ( !( -f $actfile) ) { die "$package: Unable to loacte [$actfile]!\n"; } push(@input_files, $actfile); } shift @av; # move to next argument } # while arguments ### set default push(@input_files, $definp) if ! @input_files; }