Generated: Tue Feb 2 17:54:23 2010 from chkhlinks.pl 2007/06/01 8.6 KB.
#!/perl -w # NAME: chkhlinks.pl # AIM: Given a input FOLDER, check all the HTML found for a <a href="...." # reference and make sure that reference EXISTS, # either as a LOCAL file, # or that an IP address can be obtained for the HOST if http://<something> ... # 31/05/2007 - geoff mclane - geoffair.com/mperl/index.htm use strict; use warnings; use File::Basename; use Socket; unshift(@INC, 'C:/GTools/perl'); require 'logfile.pl' or die "Unable to load logfile.pl ...\n"; require 'htmltools.pl' or die "Unable to load htmltools.pl ...\n"; # log file stuff my ($LF); my $pgmname = $0; if ($pgmname =~ /\w{1}:\\.*/) { my @tmpsp = split(/\\/,$pgmname); $pgmname = $tmpsp[-1]; } my $outfile = "temp.$pgmname.txt"; open_log($outfile); prt( "$0 ... Hello, World ...\n" ); my $recurse = 1; # recursive my $ignfpd = 1; # ignore FRONTPAGE folders my @fpfolders = qw( _vti_cnf _vti_pvt _private _derived ); my $in_folder = "C:\\HOMEPAGE\\GeoffAir"; my @in_files = (); my $cnt = 0; my $file = ''; my $warnings = ''; my @httprefs = (); my @httpsrefs = (); my @ftprefs = (); my @mtrefs = (); my $hcnt = 0; my $href = ''; my %hrefs = (); my $val = ''; my $msg = ''; my @scripts = (); my $scnt = 0; # debug only bits my $dbg1 = 0; # show entering folder ... my $dbg2 = 0; # show ALL HREF entries ... my $dbg3 = 0; # show IP found ... my $dbg4 = 0; # show entered/exit script my $dbg5 = 0; # show 'ok' when found my $dbg6 = 0; # show processing lines my $dbg7 = 0; # show anchor count my $dbg8 = 0; # show unique anchor href my $dbg9 = 0; # show files with SCRIPTS my $dbg10 = 0; # show diag for get_img_srcs() ... my $dbg11 = 1; # show FULL filename for missing IP ... parse_args(@ARGV); process_folder( $in_folder ); $cnt = scalar @in_files; prt( "Found $cnt HTML files to process ...\n" ); foreach $file (@in_files) { my ($nm,$dir) = fileparse($file); if (open INF, "<$file") { my @lines = <INF>; close INF; @lines = dropcomments_from_array(@lines); my @srcs = get_href_srcs($file, @lines); my $scnt = scalar @srcs; if ($scnt) { prt( "Found $scnt anchor href= in $nm ...\n" ) if ($dbg7); for (my $i = 0; $i < $scnt; $i++) { my $src = $srcs[$i][0]; my $lnnos = $srcs[$i][1]; if ($src =~ /^http:/i) { # remote HREF push(@httprefs, [$src, $file, $lnnos] ); } elsif ($src =~ /^https:/i) { # remote HREF push(@httpsrefs, [$src, $file, $lnnos] ); } elsif ($src =~ /^ftp:/i) { # remote HREF push(@ftprefs, [$src, $file, $lnnos] ); } elsif ($src =~ /^mailto:/i) { # remote HREF push(@mtrefs, [$src, $file, $lnnos] ); } elsif ( $src =~ /^#/ ) { # local in page HREF } elsif ( $src =~ /^javascript:/i ) { # a JAVASCRIPT HREF } else { my $ind = index($src,'#'); if ( $ind != -1 ) { $src = substr($src,0,$ind); } $ind = index($src,'?'); if ( $ind != -1 ) { $src = substr($src,0,$ind); } my $ff = $dir.$src; if ( -f $ff ) { prt( "$src - ok\n" ) if ($dbg5); } else { my $msg = "WARNING: [$src] NOT FOUND! in [$file]$lnnos"; $warnings .= "\n" if length($warnings); $warnings .= $msg; prt( "$msg\n" ); } } } } else { prt( "Found NO HREFs in $nm ...\n" ); } } } $hcnt = scalar @httprefs; if ($hcnt) { prt( "Found $hcnt HREF entries ..." ); for (my $i = 0; $i < $hcnt; $i++) { $href = $httprefs[$i][0]; $file = $httprefs[$i][1]; my ($nm,$dir) = fileparse($file); if (defined( $hrefs{$href} )) { $val = $hrefs{$href}; if ($dbg11) { $val .= ' '.$file; } else { $val .= ' '.$nm; } $hrefs{$href} = $val; } else { if ($dbg11) { $val = $file; } else { $val = $nm; } $hrefs{$href} = $val; } prt( "$href in [$file]\n" ) if ($dbg2); } $hcnt = scalar keys(%hrefs); prt( "Found $hcnt different entries ..." ); foreach my $key (keys %hrefs) { $val = $hrefs{$key}; prt( "$key in $val\n" ) if ($dbg8); if ($key =~ /^http:\/\//i) { my $hkey = substr($key, 7); my @arr = split( /\//, $hkey ); $hkey = $arr[0]; if (showIPAddress( $hkey ) == 0) { $msg = "FAILED: NO IP FOR HOST [$hkey][$val]"; $warnings .= "\n" if length($warnings); $warnings .= $msg; prt( "$msg\n" ); } } } } $scnt = scalar @scripts; if ($scnt && $dbg9) { prt( "Got $scnt files containing SCRIPTS ...\n" ); # push(@scripts, [$fil, $lns]); for (my $i = 0; $i < $scnt; $i++) { $file = $scripts[$i][0]; $val = $scripts[$i][1]; prt( "$file $val\n" ); } } if (length($warnings)) { prt( "\nWARNINGS FOLLOW:\n$warnings\n" ); } else { prt( "No warnings ...\n" ); } close_log($outfile,1); exit(0); ################################## sub showIPAddress { my ($nm) = shift; my @addr = gethostbyname($nm); my $cnt = 0; if( !@addr ) { prt( "Can't resolve $nm: $!\n" ); return 0; } @addr = map { inet_ntoa($_) } @addr[4 .. $#addr]; foreach my $k (@addr) { $cnt++; prt( "$cnt: $nm resolves to IP [$k]\n" ) if ($dbg3); } return $cnt; } sub add_2_scripts { my ($fil, $lns) = @_; my $sc = scalar @scripts; for (my $i = 0; $i < $sc; $i++) { my $cf = $scripts[$i][0]; if ($cf eq $fil) { my $lc = $scripts[$i][1]; $lc .= ":$lns"; $scripts[$i][1] = $lc; return 0; } } push(@scripts, [$fil, $lns]); return 1; } sub get_href_srcs { my ($fil, @lns) = @_; my $lc = scalar @lns; my $scnt = 0; my $slns = 0; # count the SCRIPT lines my ($nm,$dir) = fileparse( $fil ); prt( "Processing $lc lines from [$nm] dir=[$dir]...\n" ) if ($dbg6); my @isrc = (); my $ln = ''; my $bal = ''; my $inscript = 0; $slns = 0; my $bgnln = 0; my $endln = 0; for (my $i = 0; $i < $lc; $i++) { $ln = $bal; $bal = ''; $ln .= $lns[$i]; chomp $ln; prt( "$i [$ln] ...\n" ) if ($dbg10); if ($inscript) { if ($ln =~ /<\/script>/i) { $inscript =0; prt( "EXIT a SCRIPT ...\n" ) if ($dbg4); add_2_scripts( $fil, $slns ); $slns = 0; next; } $slns++; next; } if ( $ln =~ /<a\s+(.*)/i ) { my $iln = $1; prt( "Found [$iln] ...\n" ) if ($dbg10); $bgnln = $i; while ( !($iln =~ />/) && ($i < $lc)) { $i++; my $nxln = $lns[$i]; chomp $nxln; prt( "Adding [$nxln] ...\n" ) if ($dbg10); $iln .= ' '.$nxln; } $endln = $i; my $ind = index($iln, '>'); if ($ind != -1) { $bal = substr($iln, $ind+1); $iln = substr($iln, 0, $ind+1); } #if ($iln =~ /src=\"(.+)\"/i) { if ($iln =~ /href=\s*\"(\S+)\"/i) { prt( "HREF = $1\nIn line [$iln]...\n" ) if ($dbg10); push(@isrc, [$1, "$bgnln:$endln"] ); $scnt++; } else { if ( $iln =~ /name=\s*\"(\S+)\"/i ) { # ignore BOOKMARKS } else { $msg = "WARNING: HREF NOT FOUND in [$iln]..."; $warnings .= "\n" if length($warnings); $warnings .= $msg; prt( "$msg\n" ); } } } elsif ( $ln =~ /<script.*>/i ) { $inscript = 1; prt( "Entered a SCRIPT ...\n" ) if ($dbg4); $slns = 0; $ln = substr($ln, 7); if ($ln =~ /<\/script>/i) { $inscript =0; prt( "EXIT a SCRIPT ...\n" ) if ($dbg4); add_2_scripts( $fil, 1 ); $slns = 0; } } } if ($inscript) { $msg = "WARNING: EXIT WHILE IN SCRIPT in [$fil]..."; $warnings .= "\n" if length($warnings); $warnings .= $msg; prt( "$msg\n" ); } prt( "Returning $scnt HREF sources ...\n") if ($dbg10); return @isrc; } sub parse_args { my (@av) = @_; while (@av) { $in_folder = $av[0]; shift @av; } } sub is_my_ext { my ($fil) = shift; my ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ ); if ((lc($ext) eq ".htm")||(lc($ext) eq ".html")) { return 1; } return 0; } # my $ignfpd = 1; # ignore FRONTPAGE folders sub is_fp_folder { my ($inf) = shift; foreach my $fil (@fpfolders) { if (lc($inf) eq lc($fil)) { return 1; } } return 0; } sub process_folder { my ($inf) = shift; my $fcnt = 0; prt( "Processing $inf folder ...\n" ) if ($dbg1); if ( opendir( DIR, $inf ) ) { my @files = readdir(DIR); closedir DIR; foreach my $fil (@files) { if (($fil eq ".")||($fil eq "..")) { next; } my $ff = $inf."\\".$fil; if ( -d $ff ) { if ($recurse) { if ($ignfpd && is_fp_folder($fil)) { # ignore FRONTPAGE folders next; } process_folder( $ff ); } } else { if (is_my_ext($fil)) { push(@in_files, $ff); $fcnt++; } } } prt( "Processed $inf folder finding $fcnt HTML files ...\n" ); } else { prt( "ERROR: Failed to open folder $inf ...\n" ); } } # eof - chkhlinks.pl