Generated: Tue Feb 2 17:54:25 2010 from chkxml.pl 2010/01/28 21.3 KB.
#!/usr/bin/perl -w # ########################################################### # NAME: chkxml.pl # AIM: Given an XML file, test it - a SIMPLE test # 27/01/2010 - add BOM support, even GUESSING when no BOM!!! # 23/01/2010 - update many many things, # but still does NOT handle UTF-16 encloded files - see below # 15/11/2008 - geoff mclane - http://geoffair.net/mperl # ########################################################### use strict; use warnings; my ($perl_root,$log_lib); if ($^O eq 'MSWin32') { $perl_root = 'C:\GTools\perl'; $log_lib = 'logfile.pl'; } else { $perl_root = '/home/geoff/bin'; $log_lib = 'logfile.pl'; } unshift(@INC, $perl_root); require $log_lib or die "Unable to load $log_lib...\n"; # log file stuff my ($LF); my $pgmname = $0; if ($pgmname =~ /\w{1}:\\.*/) { my @tmpsp = split(/\\/,$pgmname); $pgmname = $tmpsp[-1]; } my $outfile = $perl_root."/temp.$pgmname.txt"; open_log($outfile); # FOR UTF-16 (UNICODE) NOTES # ========================== # from : http://www.perlmonks.org/?node_id=649456 # Would need to 'use PerlIO::encoding;', then # open( IN, '<:raw:encoding(UTF-16LE)', $in_file); # or perhaps :raw:encoding(UTF-16LE):crlf:utf8 # or these parameters used on 'binmode' # see sub has_utf_16_BOM($) below... # UNICODE file my $def_file = 'C:\DTEMP\libxml2-2.6.30\result\slashdot16.xml'; # This could be detected by reading the BOM # 0000:0000 FF FE 3C 00 3F 00 78 00 6D 00 6C 00 20 00 76 00 ..<.?.x.m.l. .v. # 0000:0010 65 00 72 00 73 00 69 00 6F 00 6E 00 3D 00 22 00 e.r.s.i.o.n.=.". # 0000:0020 31 00 2E 00 30 00 22 00 20 00 65 00 6E 00 63 00 1...0.". .e.n.c. # 0000:0030 6F 00 64 00 69 00 6E 00 67 00 3D 00 22 00 75 00 o.d.i.n.g.=.".u. # 0000:0040 74 00 66 00 2D 00 31 00 36 00 22 00 3F 00 3E 00 t.f.-.1.6.".?.>. # NOW HANDLED # =========== #my $def_file = 'C:\DTEMP\libxml2-2.6.30\result\utf16bebom.xml'; # another #my $def_file = 'C:\DTEMP\libxml2-2.6.30\result\utf16lebom.xml'; #my $def_file = 'test4.xml'; #my $def_file = 'tests.xml'; #my $def_file = 'C:\DTEMP\libxml2-2.6.30\test\errors\attr2.xml'; #my $def_file = 'C:\FGCVS\FlightGear\data/Aircraft/B-2/B-2-set.xml'; #my $def_file = 'REC-xml-19980210.xml'; #my $def_file = 'C:\DTEMP\libxml2-2.6.30\result\valid\REC-xml-19980210.xml'; #my $def_file = 'C:\DTEMP\libxml2-2.6.30\result\noent\badcomment.xml'; #my $def_file = 'C:\DTEMP\FG\CubeServ420.xml'; #my $def_file = "C:\\FGCVS\\FlightGear\\data\\Aircraft\\787\\787.xml"; #my $def_file = 'C:\FG\26\data\Protocol\FgfsSharp.xml'; #my $def_file = "C:\\FGCVS\\FlightGear\\data\\Aircraft\\c172p\\c172p-set.xml"; # features my $debug_on = 1; # to run WITHOUT parameters my $load_log = 0; my $verbose = 0; my $dbg_cdata = 0; # show begin and end of CDATA my $dbg_lines = 0; # show EACH line # program variables my @warnings = (); my @in_files = (); # list of INPUT files... sub pgm_exit($$) { my ($ret,$msg) = @_; show_warnings($ret); if (length($msg)) { $msg .= "\n" if (!($msg =~ /\n$/)); prt($msg); } close_log($outfile,$load_log); exit($ret); } sub prtw($) { my ($tx) = shift; $tx =~ s/\n$// if ($tx =~ /\n$/); prt("$tx\n"); push(@warnings,$tx); } sub show_warnings($) { my ($dbg) = shift; if (@warnings) { prt( "\nGot ".scalar @warnings." WARNINGS ...\n" ); foreach my $line (@warnings) { prt("$line\n" ); } prt("\n"); } elsif ($dbg) { prt("\nNo warnings issued.\n\n"); } } sub prtv($) { my ($txt) = @_; prt($txt) if ($verbose); } sub has_utf_16_BOM($) { my ($fil) = shift; if (open INF, "<$fil") { binmode INF; my $buf = ""; if ((read INF, $buf, 2) == 2) { close INF; my $od1 = ord(substr($buf,0,1)); my $od2 = ord(substr($buf,1,1)); if (($od1 == 0xFF)&&($od2 == 0xFE)) { return (16+2); # LittleEndians (windows) } elsif (($od1 == 0xFE)&&($od2 == 0xFF)) { return (16+4); # BigEndians (unix) } elsif ($od1 == 0) { return 4; } elsif ($od2 == 0) { return 2; } return 1; } close INF; } return 0; } ############################################################# # main process sub get_attrs_ref($) { my ($txt) = @_; $txt = substr($txt,1) while ($txt =~ /^\s/); # clear any leading spaces my $len = length($txt); my ($i,$cc,$key,$val); my %h = (); $i = 0; while ($i < $len) { $key = ''; for (; $i < $len; $i++) { $cc = substr($txt,$i,1); if ($cc eq '=') { $i++; $cc = substr($txt,$i,1); last; } $key .= $cc; } return \%h if ($cc ne '"'); $i++; $val = ''; for (; $i < $len; $i++) { $cc = substr($txt,$i,1); last if ($cc eq '"'); $val .= $cc; } $h{$key} = $val; $i++; # bump over 2nd inverted commas # and eat any spaces for (; $i < $len; $i++) { $cc = substr($txt,$i,1); last if ( !($cc =~ /\s/) ); } } return \%h; } sub search_harder($$$$) { my ($bal,$rlines,$i,$max) = @_; my $iret = 1; my $len = length($bal); my ($j,$cc,$line); for ($j = 0; $j < $len; $j++) { $cc = substr($bal,$j,1); if ($cc eq '>') { return 0; } if (!($cc =~ /\s/)) { return 1; } } for (; $i < $max; $i++) { $line = ${$rlines}[$i]; chomp $line; $line = trim_all($line); $len = length($line); # get new length to process for ($j = 0; $j < $len; $j++) { $cc = substr($line,$j,1); if ($cc eq '>') { return 0; } if (!($cc =~ /\s/)) { return 1; } } } return $iret; } # WOW, found this as the DOCTYPE # <!DOCTYPE WMT_MS_Capabilities SYSTEM # "http://schemas.cubewerx.com/schemas/wms/1.1.0/WMT_MS_Capabilities.dtd" # [ # <!-- vendor-specific elements defined here --> # <!ELEMENT VendorSpecificCapabilities EMPTY> # ]> # which is parsed correctly by IE... sub process_xml_line_array($$$) { my ($fil,$bom,$rlines) = @_; my $iret = 0; my $lncnt = scalar @{$rlines}; my ($line, $i, $lnnum); my ($j, $ch, $tag, $intag, $nch, $j2, $len); my ($attrs, $hadsp, $text, $tmp, $ttag, $ltag); my ($pc,$ptag,$ppc,$pppc,$incomm,$ctag,$stkcnt); my ($attref,$pushed,$msg,$total_chars,$doc_root,$doc_head,$doc_type); my ($indt,$indtsqr,$comtext,$incdata,$plnn,$pref,$lentag); prtv( "Processing $lncnt lines, from $fil..." ); if ($bom & 6) { prtv(" in UTF-16"); prtv("LE") if ($bom & 2); prtv("BE") if ($bom & 4); prtv("(BOM)") if ($bom & 16); } prtv("\n"); $lnnum = 0; $tag = ''; $intag = 0; $nch = ''; $attrs = ''; $hadsp = 0; $text = ''; $ch = ''; $ppc = ''; $incomm = 0; $ctag = ''; $pushed = 0; my @tags = (); $total_chars = 0; $doc_root = ''; $doc_head = ''; $doc_type = ''; $indt = 0; $indtsqr = 0; $incdata = 0; $lentag = 0; for ($i = 0; $i < $lncnt; $i++) { $lnnum++; $line = ${$rlines}[$i]; $len = length($line); $total_chars += $len; # now tidy the line a little chomp $line; $line = trim_all($line); $len = length($line); # get new length to process # process the line of text prt("$lnnum: [$line]\n") if ($dbg_lines); for ($j = 0; $j < $len; $j++) { $j2 = $j + 1; $pppc = $ppc; $ppc = $pc; $pc = $ch; # store into previous $ch = substr($line,$j,1); $nch = (($j2 < $len) ? substr($line,$j2,1) : " "); if ( !$incdata && !$incomm && ($ch eq '-') && ($pc eq '-') && ($ppc eq '!') && ($pppc eq '<') ) { $incomm = 1; prtv(sprintf("%4d: <!-- Begin comment\n",$lnnum)); $tag .= $ch; # no space yet, so put it in TAG $j++; $j2 = $j + 1; $pppc = $ppc; $ppc = $pc; $pc = $ch; # store into previous $ch = substr($line,$j,1); $nch = (($j2 < $len) ? substr($line,$j2,1) : " "); next if ($j2 >= $len); $pc = " "; # ensure if this NEXT is a '>', that <!--> does NOT close comments $comtext = $tag; } if ($intag) { # IN A TAG - had '<' opening # ========================== if ($hadsp) { $attrs .= $ch; # after any SPACE while IN a TAG, put it in TEXT } else { $tag .= $ch; # else no space yet, to put it in TAG $lentag = length($tag); } if ($indt) { if ($incomm) { $incomm = 0 if (($ch eq '>')&&($pc eq '-')&&($ppc eq '-')); prtv(sprintf("%4d: Exit comment -->\n",$lnnum)) if (!$incomm); } elsif ($indtsqr) { $indtsqr = 0 if ($ch eq ']'); prtv(sprintf("%4d: End DOCTYPE CDATA] [$tag]\n",$lnnum)) if (!$indtsqr); } else { $indtsqr = 1 if ($ch eq '['); prtv(sprintf("%4d: Bgn DOCTYPE [CDATA [$tag]\n",$lnnum)) if ($indtsqr); } if ( !$indtsqr && ($ch eq '>') ) { $indt = 0; prtv(sprintf("%4d: End DOCTYPE [$tag]\n",$lnnum)); # is NOT an open element, SO... $attrs = ''; # clear ALL attributes $text = ''; # clear ALL text $tag = ''; $intag = 0; # and OUT of DOCTYPE tag } next; # stay to EAT whole DOCTYPE declaration... } # must EAT in line no space CDATA tags $incdata = 1 if (($lentag == 9)&&($tag =~ /^<!\[CDATA\[/) && !$hadsp); prt("$lnnum: BEGIN CDATA\n") if ($incdata && ($lentag == 9) && !$hadsp && $dbg_cdata); # just wait for TAG close # if ($ch eq '>') { #if ( !$incomm && ($ch eq '>')) { if ( !$incdata && !$incomm && ($ch eq '>')) { $pushed = 0; $msg = sprintf("%4d: '>'",$lnnum); $attrs =~ s/>$//; $attrs =~ s/\/$//; $attrs =~ s/\s+//; $attref = get_attrs_ref($attrs); if ($incomm) { $msg .= " COMMENT"; } elsif ($tag =~ /^<\?/) { $msg .= " HEADER"; $doc_head = $tag.$attrs; } elsif ($tag =~ /^<\//) { # prt("/"); $tag =~ s/^<\///; $tag =~ s/>$//; if ($tag eq '!--') { $msg .= " COMMENT"; } elsif (@tags) { $ttag = trim_all($tag); #$ltag = $tags[-1]; $pref = $tags[-1]; $ltag = ${$pref}[0]; $plnn = ${$pref}[1]; #if ($tags[-1] eq $tag) { if ($ttag eq $ltag) { #$ptag = pop @tags; $pref = pop @tags; $ptag = ${$pref}[0]; $plnn = ${$pref}[1]; $stkcnt = scalar @tags; $msg .= " POP plnn [$plnn] ($stkcnt)"; $ctag = $ptag; } else { $msg .= " FAILED!!!!!"; #prtw( "WARNING: Last tag [".$tags[-1]."] NOT $tag\n" ); prtw( "WARNING: Last tag [$ltag]($plnn) NOT [$ttag]($lnnum)\n" ); $iret |= 4; } } else { # NOT comment, NO tags $msg .= " MISSED!!!!!"; prtw( "WARNING: NO TAGS ON STACK $tag\n" ); $iret |= 4; } } elsif ($tag =~ /^</) { $tag =~ s/^<//; $tag =~ s/>$//; $ttag = trim_all($tag); #if ($tag eq '!--') { if ($tag =~ /^!--/) { $msg .= " COMMENT [$ttag]"; } elsif ($tag =~ /^!\[CDATA\[/) { $msg .= " CDATA [$ttag]"; $incdata = 0 if (($ch eq '>')&&($pc eq ']')&&($ppc eq ']')); # out of <![CDATA[...with... ]]> $msg .= " CDATA CLOSED" if (!$incdata); } elsif ($tag =~ /^!DOCTYPE/) { $msg .= " DOCTYPE [$ttag]"; } elsif ($pc eq '/') { $msg .= " COMPLETE [$ttag]"; } else { push(@tags, [$ttag,$lnnum]); $stkcnt = scalar @tags; $ctag = $ttag; $msg .= " PUSHED [$ttag] ($stkcnt)"; $pushed = 1; # flag an OPEN TAG if ($stkcnt == 1) { if (length($doc_root)) { prtw("WARNING: Appears to have MULTIPLE document ROOTS [$doc_root], NOW [$ttag]\n"); prtw("Only one top level element is allowed in an XML document.\n"); $iret |= 16; } $doc_root = $ttag; } } } else { prt( " ???? CHECK ME " ); $iret |= 8; } $msg .= " tag [$tag]"; $msg .= " attrs [$attrs]" if (length($attrs)); $msg .= " text [$text]" if (length($text)); prtv("$msg\n"); $tag = ''; if (!$pushed) { # is NOT an open element, SO... $attrs = ''; # clear ALL attributes $text = ''; # clear ALL text } $intag = 0; #} elsif ($ch =~ /\s/) { #} elsif ( !$incomm && ($ch =~ /\s/)) { } elsif ( !$incdata && !$incomm && ($ch =~ /\s/)) { $hadsp = 1; $indt = 1 if ($tag =~ /^<!DOCTYPE/); $incdata = 1 if ($tag =~ /^<!\[CDATA\[/); prtv(sprintf("%4d: Begin DOCTYPE\n",$lnnum)) if ($indt); } elsif ($incdata) { $incdata = 0 if (($nch eq '>')&&($ch eq ']')&&($pc eq ']')); # out of <![CDATA[...with... ]]> # BUT what about ]] + <newline(s)> # > is also an END of CDATA if (($ch eq ']') && ($pc eq ']') && ($nch =~ /\s/) && $incdata ) { # must search HARDER for close my $bal = (($j2+1) < $len) ? substr($line,($j2+1)) : ""; # more in this line $incdata = search_harder($bal,$rlines,$i,$lncnt); prt(sprintf("%4d: SPECIAL End CDATA\n",$lnnum)) if (!$incdata); } prt("$lnnum: EXIT CDATA\n") if (!$incdata && $dbg_cdata); } } else { # NOT yet in a TAG # ================ if ($ch eq '<') { # Enter a TAG with '<' # out any previous TEXT $attrs =~ s/>$//; if (length($text)||length($attrs)) { $msg = sprintf("%4d:",$lnnum); $msg .= " TEXT [$text] ctag [$ctag]"; $msg .= " attrs [$attrs]" if (length($attrs)); prtv("$msg\n"); } ###$attrs = ''; # clear any attributes ###$text = ''; # start the TAG $tag = $ch; $intag = 1; # set IN A TAG $hadsp = 0; # had no space in tag yet } else { $text .= $ch; # accumulate TEXT (between tags) } } # POST character processing if ($incomm) { if (($ch eq '>')&&($pc eq '-')&&($ppc eq '-')) { $incomm = 0; prtv(sprintf("%4d: Exit comment",$lnnum)); prtv(" $tag$attrs\n"); $tag = ''; $attrs = ''; # clear ALL attributes $text = ''; # clear ALL text $intag = 0; } } } # for line of text $pppc = $ppc; $ppc = $pc; $pc = $ch; # store into previous $ch = "\n"; $hadsp = 1; # is the SAME as a SPACE } # ============================= prt("File: [$fil]. Doc root [$doc_root]\n"); if (@tags) { $len = scalar @tags; #$msg = join('|',@tags); $msg = ''; for ($i = 0; $i < $len; $i++) { $pref = $tags[$i]; $msg .= " | " if (length($msg)); $msg .= ${$pref}[0]."(".${$pref}[1].")"; } prtw( "WARNING: Still $len ON STACK! List [$msg]\n" ); $iret |= 1; } elsif ($iret) { prt( "HAS WARNINGS... $lncnt lines, $total_chars characters.\n" ); } else { prt( "Appears CLEAN... $lncnt lines, $total_chars characters.\n" ); } return $iret; } sub process_xml_file($) { my ($fil) = @_; my $ret = 0; my $bom = has_utf_16_BOM($fil); my (@lines); if (open INF, "<$fil") { if ($bom & 2) { binmode INF, ":encoding(UTF-16LE)"; } elsif ($bom & 4) { binmode INF, ":encoding(UTF-16BE)"; } @lines = <INF>; close INF; $lines[0] = substr($lines[0],1) if ($bom & 16); # move PAST the BOM $ret = process_xml_line_array($fil,$bom,\@lines); } else { prtw( "WARNING: FAILED TO OPEN $fil ...\n" ); $ret = 3; } return $ret; } sub process_xml_files($) { my ($rfils) = @_; my ($fil); my $res = 0; foreach $fil (@{$rfils}) { $res |= process_xml_file($fil); } return $res; } ################################## # ### MAIN ### parse_args(@ARGV); #prt( "$pgmname: Processing file list...\n" ); my $r = process_xml_files( \@in_files ); pgm_exit($r,""); ################################## sub give_help { prt("$pgmname: Version 0.0.1 2010/01/25\n"); prt("Usage: [Options] input_xml_file\n"); prt("Options:\n"); prt(" -h (-?) - This help, and exit 0\n"); prt(" -l - Load log at end.\n"); prt(" -v - Set verbose output.\n"); prt("Given an XML input file, parse, display, and advise results.\n"); pgm_exit(0,"Help exit"); } sub set_verbosity { my (@av) = @_; my ($arg,$sarg); while (@av) { $arg = $av[0]; if ($arg =~ /^-/) { $sarg = substr($arg,1); $sarg = substr($sarg,1) while ($sarg =~ /^-/); #if ($sarg =~ /^v/i) { if ($sarg =~ /^v(.*)$/i) { $verbose++; $sarg = $1; if (length($sarg)) { if ($sarg =~ /(=|:)(.+)/) { $sarg = $2; if ($sarg =~ /^\d+$/) { $verbose = $sarg; } else { prt("ERROR: -v= can only be followed by a number, NOT [$sarg]!\n"); goto ON_ERROR; } } else { $sarg = $1; prt("ERROR: -v can only be followed by '=' or ':', NOT [$sarg]!\n"); goto ON_ERROR; } } prtv("Set verbosity to $verbose.\n"); } } shift @av; } return 0; ON_ERROR: prt("ERROR: Unknown argument [$arg]! Aborting...\n"); pgm_exit(2,"BAD ARGUMENT"); return 1; } sub parse_args { my (@av) = @_; set_verbosity(@av); while (@av) { my $arg = $av[0]; if ($arg =~ /^-/) { my $sarg = substr($arg,1); $sarg = substr($sarg,1) while ($sarg =~ /^-/); if (($sarg =~ /^h/i)||($sarg =~ /^\?/)) { prt("Got argument [$arg]...\n"); give_help(); } elsif ($sarg =~ /^v/i) { #prtv("Set verbosity to $verbose.\n"); } elsif ($sarg =~ /^l/i) { $load_log = 1; prtv("Set to load log at end.\n"); } else { prt("ERROR: Unknown argument [$arg]! Aborting...\n"); pgm_exit(2,"BAD ARGUMENT"); } } else { push(@in_files,$arg); prtv("Added input file to [$arg]\n"); } shift @av; } if ($debug_on) { if (!@in_files) { push(@in_files,$def_file); prtv("Set input file to default [$def_file]\n"); $load_log = 1; $verbose = 1; } } } # eof - chkxml.pl