htmlparse02.pl to HTML.

Generated: Tue Feb 2 17:54:42 2010 from htmlparse02.pl 2008/07/24 4.4 KB.
#!/perl -w
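# NAME: htmlparse02.pl
# AIM: Try out several ways of registering HTML::Parser event handlers and log the events parsed from favorites.htm.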
use strict;
use warnings;
use HTML::Parser;
require 'logfile.pl' or die "Unable to load logfile.pl ...\n";
# log file stuff
my ($LF);
# Reduce $0 to the bare script name so the log file name never contains a
# directory part.  Both '\' and '/' are treated as separators, so this
# covers 'C:\dir\script.pl' as well as './script.pl' and 'dir/script.pl'.
my $pgmname = $0;
$pgmname =~ s{^.*[\\/]}{}s;
# The log is written next to the current directory as temp.<script>.txt
my $outfile = "temp.$pgmname.txt";
open_log($outfile);
prt( "$0 ... Hello, World ...testing HTML::Parser ...\n" );
# Shared script state.
my @accum;          # receives 'start' events when $type == 3
my $p;              # the HTML::Parser object built by one of the $type branches
# Selects which handler-registration variant is exercised; only the
# values 1, 2, 3 and 5 are handled by the branches in this script.
my $type = 1;
my @array;          # receives 'text' and 'comment' events when $type == 5
my $scnt = 0;       # event count used by the reports after parsing
if (1 == $type) {
    # Build a version 3 parser first, then attach one callback per event.
    $p = HTML::Parser->new( api_version => 3, marked_sections => 1 );
    $p->handler( start   => \&start_sub,   "self, tagname, attr, attrseq, text" );
    $p->handler( end     => \&end_sub,     "self, tagname, text" );
    $p->handler( text    => \&text_sub,    "self, text" );
    $p->handler( process => \&process_sub, "self, token0, text" );
}
#my $p = HTML::Parser->new( api_version => 3,
#                         marked_sections => 1,
#                       );
# Parse document text chunk by chunk
# $p->parse($chunk1);
# $p->parse($chunk2);
#...
# $p->eof;                 # signal end of document
if ($type == 2) {
    # Start from a version 2 compatible parser and then replace its
    # per-event handlers with the subs defined at the end of this script.
    $p = HTML::Parser->new( api_version => 2 );
    # handler() takes a flat list: (event => \&callback, $argspec).
    # Wrapping the callback and argspec in an array ref (the form used for
    # the *_h constructor options) does not work here: an array ref in the
    # callback position is treated as an array to accumulate events into,
    # so the subs were never called.
    $p->handler(start   => \&start_sub,   "self, tagname, attr, attrseq, text");
    $p->handler(end     => \&end_sub,     "self, tagname, text");
    $p->handler(text    => \&text_sub,    "self, text, is_cdata");
    $p->handler(process => \&process_sub, "self, token0, text");
    # Forward each comment token to the parser's comment() method.
    # NOTE(review): on a plain (non-subclassed) HTML::Parser object this
    # method is presumably a no-op stub - confirm.
    $p->handler(comment =>
             sub {
       my($self, $tokens) = @_;
       for (@$tokens) {$self->comment($_);}},
             "self, tokens");
    # Forward the declaration text, minus its first two and last one
    # characters, to the parser's declaration() method.
    $p->handler(declaration =>
             sub {
       my $self = shift;
       $self->declaration(substr($_[0], 2, -1));},
             "self, text");
}
if (3 == $type) {
    # Accumulate every 'start' event into @accum instead of calling a sub.
    # Event is one of text, start, end, declaration, comment, process or default.
    my $start_spec = '"S", attr, attrseq, text';
    $p = HTML::Parser->new();
    $p->handler( start => \@accum, $start_spec );
    # The array elements will be ['S', \%attr, \@attr_seq, $text].
}
elsif (5 == $type) {
    # Accumulate 'text' and 'comment' events into @array; each element
    # holds the event name followed by the event text.
    $p = HTML::Parser->new( api_version => 3 );
    for my $event (qw(text comment)) {
        $p->handler( $event => \@array, "event,text" );
    }
}
# Parse directly from file
my $in_file   = 'favorites.htm';
my $parsed_ok = 0;
if (!defined $p) {
    # No branch above built a parser for this $type value; calling
    # parse_file on undef would abort the script before the log is closed.
    prt( "ERROR: No parser was created for type $type ...\n" );
} elsif (!defined $p->parse_file($in_file)) {
    # parse_file returns undef, with the reason in $!, when the named
    # file can not be opened.
    prt( "ERROR: Unable to open $in_file: $! ...\n" );
} else {
    $parsed_ok = 1;
}
# or
# open(F, "foo.html") || die;
# $p->parse_file(*F);
# Report what the accumulator variants collected (successful parse only).
if ($parsed_ok && ($type == 3)) {
    $scnt = scalar @accum;
    prt( "$type: Got $scnt start events ...\n" );
}
if ($parsed_ok && ($type == 5)) {
    $scnt = scalar @array;
    prt( "$type: Got $scnt text, comment events ...\n" );
    foreach my $entry (@array) {
        # Each entry is [event name, event text].
        my ($ev, $tx) = @$entry;
        my $ttx = trim_all($tx);
        if (length($ttx)) {
            prt( "$ev - $tx ...\n" );
        } else {
            prt( "$ev - blank ...\n" );
        }
    }
}
close_log($outfile,1);
# Exit status is 0 after a successful parse, 1 otherwise.
exit($parsed_ok ? 0 : 1);
# Handler for 'start' events.  Logs the opening tag rebuilt from its
# attribute hash and, for tags with more than one attribute, the order
# the attributes were given in plus the original tag text.
# Arguments: parser object (unused), tag name, attribute hash ref,
# attribute-name sequence array ref, original text of the tag.
sub start_sub {
    my (undef, $tag, $attr_of, $attr_order, $raw) = @_;
    # name="value" pairs, in hash iteration order (not document order).
    my $pairs = join '', map { ' '.$_.'="'.$$attr_of{$_}.'"' } keys %$attr_of;
    prt( "$type:1: start tag=<$tag$pairs>\n" );
    # Attribute names in the order recorded by the parser.
    my $count = scalar @$attr_order;
    my $order = join '|', @$attr_order;
    if (($count > 1) && length($order)) {
        prt( "$type:2: start tag=$tag - attr order = [$order]\n" );
    }
    prt( "$type:3: start ot[$raw]\n" ) if ($count > 1);
}
# Handler for 'end' events: logs the closing tag name and its original text.
# Arguments: parser object (unused), tag name, original text of the tag.
sub end_sub {
    my (undef, $tag, $raw) = @_;
    my $line = "$type: end tag=$tag ot[$raw] ...\n";
    prt( $line );
}
# Handler for 'text' events: logs the text, or just 'blank' when nothing
# is left of it after trim_all().
# Arguments: parser object (unused), original text.
sub text_sub {
    my (undef, $raw) = @_;
    my $shown = length(trim_all($raw)) ? "text ot[$raw]" : 'text blank';
    prt( "$type: $shown ...\n" );
}
# Handler for 'process' events: only logs that it was called; the
# arguments passed by the parser are ignored.
sub process_sub {
    my $note = "$type: process called ...\n";
    prt( $note );
}
# eof
index -|- top