Generated: Sun Apr 15 11:46:19 2012 from gethrefs.pl 2011/10/20 25.9 KB.
#!/perl -w # NAME: gethrefs.pl # AIM: Parse a HTML file, and extract HREF links # 20/10/2011 - Add user options - parse_args() # 18/07/2010 - revisit and test... use strict; use warnings; use File::Basename; # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] ) use Cwd; my $perl_dir = 'C:\GTools\perl'; unshift(@INC, $perl_dir); require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' Check paths in \@INC...\n"; ###require 'logfile.pl' or die "Unable to load logfile.pl ...\n"; require 'htmltools.pl' or die "Unable to load htmltools.pl ...\n"; use constant { HRT_UNKNOWN => 0, HRT_LOCAL => 1, HRT_LINK => 2, HRT_SCRIPT => 3, HRT_FILE => 4, HRT_BASE => 5 }; use constant { FT_UNKNOWN => 0, FT_HTML => 1, FT_GRAF => 2, FT_CSS => 3, FT_SCRIPT => 4, FT_TEXT => 5, FT_ZIP => 6, FT_DIR => 7 }; # offsets in file array use constant { OF_FF => 0, # full file name OF_HR => 1, # array ref of href links OF_IM => 2, # array ref of image links OF_LK => 3, # linked count OF_SP => 4, # spare OF_TO => 5, # links TO OF_FM => 6, # links FROM OF_FT => 7 # file type }; # for htmltools, if functions used my @imgs = (); my @hrefs = (); # log file stuff our ($LF); my $pgmname = $0; if ($pgmname =~ /(\\|\/)/) { my @tmpsp = split(/(\\|\/)/,$pgmname); $pgmname = $tmpsp[-1]; } my $outfile = $perl_dir."\\temp.$pgmname.txt"; open_log($outfile); # user variables my $VERS = "0.0.2 2011-10-20"; my $load_log = 0; my $in_file = ''; my $debug_on = 0; my $def_file = 'C:\GTools\java\examples\JavaTech\Code_List.htm'; ##my $in_file = 'C:\HOMEPAGE\GA\travel\maroc\index.htm'; ###my $in_file = 'temphtml.htm'; my @all_hrefs = (); my $verbosity = 0; my $splout = $perl_dir."\\tempspec.txt"; # CONSTANTS ########### # File Type Extensions my @html_extension = qw( .htm .html .shtml .php ); my @graf_extension = qw( .jpg .jpeg .gif .png .bmp .ico .mpg ); my @css_extension = qw( .css ); my @script_extension = qw( .js .class .cgi .java ); my @zip_extension = qw( .zip .tar .gz .jar ); my @txt_extension = qw( .txt .doc ); # private FRONTPAGE folders my @fpfolders = qw( _vti_cnf _vti_pvt _private _derived ); # features my $show_not_found = 0; my $show_local_links = 0; # also show INTERNAL (in page) links my $ignfpd = 1; # ignore FRONTPAGE folders my @excludes = qw( desktop.ini php.ini blank.html blank.htm ); my $recurse = 1; # recursive my @splexcludes = qw( macpc ); my %ext_hash = (); my @all_files = (); my $refcnt = 0; my @done_files = (); my %not_found = (); my ($base_file,$base_dir); my $base_href = ''; # set if <BASE href="..."> found # DEBUG my $dbg1 = 0; # show discarded material my $dbg2 = 0; # show "$hcnt:HREF: hr[$hr] $ctyp tail[$tail] pre[$disc] tag[$tag] bgn[$bgn]... my $dbg3 = 0; # show Processing $lncnt lines from $fil ... my $dbg4 = 0; # show File [$name], in [$rdir] ... my $dbg5 = 0; # show HREF immediately my $dbg6 = 0; # show 'tag' immediately ### program variables my @warnings = (); my $cwd = cwd(); my $os = $^O; sub VERB1() { return $verbosity >= 1; } sub VERB2() { return $verbosity >= 2; } sub VERB5() { return $verbosity >= 5; } sub VERB9() { return $verbosity >= 9; } sub show_warnings($) { my ($val) = @_; if (@warnings) { prt( "\nGot ".scalar @warnings." WARNINGS...\n" ); foreach my $itm (@warnings) { prt("$itm\n"); } prt("\n"); } else { ### prt( "\nNo warnings issued.\n\n" ); } } sub pgm_exit($$) { my ($val,$msg) = @_; if (length($msg)) { $msg .= "\n" if (!($msg =~ /\n$/)); prt($msg); } show_warnings($val); close_log($outfile,$load_log); exit($val); } sub prtw($) { my ($tx) = shift; $tx =~ s/\n$//; prt("$tx\n"); push(@warnings,$tx); } ################################################################## # OF_FF => 0, # full file name # OF_HR => 1, # array ref of href links # OF_IM => 2, # array ref of image links # OF_LK => 3, # linked count # OF_SP => 4, # spare # OF_TO => 5, # links TO # OF_FM => 6, # links FROM # OF_FT => 7 # file type sub show_results { my $fcnt = scalar @all_files; my ($i, $ff, $ft, $cnt); for ($i = 0; $i < $fcnt; $i++) { $ft = $all_files[$i][OF_FT]; if ($ft == FT_HTML) { $cnt = $all_files[$i][OF_SP]; if ($cnt == 0) { $ff = $all_files[$i][OF_FF]; prt( "Missed [$ff]\n" ); } } } } ################################################################## sub get_href_type_const { my ($hrf) = shift; if ($hrf =~ /^http(s*):\/\//i) { return HRT_LINK; } elsif ($hrf =~ /^ftp:\/\//i) { return HRT_LINK; } elsif ($hrf =~ /^javascript:/i) { return HRT_SCRIPT; } elsif (substr($hrf,0,1) eq '#') { return HRT_LOCAL; } elsif (index($hrf,'.') >= 0) { return HRT_FILE; } return HRT_UNKNOWN; } sub href_type_to_string { my ($hrt) = shift; if ($hrt == HRT_LINK) { return "extern link"; } elsif ($hrt == HRT_SCRIPT) { return "script"; } elsif ($hrt == HRT_LOCAL) { return "local"; } elsif ($hrt == HRT_FILE) { return "file"; } elsif ($hrt == HRT_BASE) { return "BASE"; } elsif ($hrt == HRT_UNKNOWN) { return "unknown"; } return '***FIX ME*** uncased type'; } sub get_hrefs_from_string($) { my ($ln) = shift; my ($i, $j, $line, $ch, $ch2, $len, $tag, $disc, $hcnt); my ($bgn, $fhr, $hr, $tail, $max, $hrt, $ft, $ctyp); my ($sp,$tag2,$gottag); my @hrf = (); $ln =~ s/\n/ /g; $ln = trim_all($ln); # sub write2file { my ($txt,$fil) = @_; # write2file($fulln,'tempfl.txt'); $len = length($ln); $disc = ''; $hcnt = 0; # process single long string, char by char for ($i = 0; $i < $len; $i++) { $ch = substr($ln,$i,1); if ($ch eq '<') { $tag = $ch; # start a tag $i++; $ch = substr($ln,$i,1); # could check for things like <! ... later maybe $tag2 = ''; $gottag = 0; for (; $i < $len; $i++) { $ch = substr($ln,$i,1); $tag .= $ch; if ($ch eq '>') { last; } if (!$gottag) { if ($ch =~ /\w/) { $tag2 .= $ch; } else { $gottag = 1; } } } if ($tag =~ /(.*\s+)href(\s*)=/i) { $bgn = $1; $sp = length($2); $hcnt++; $fhr = substr($tag,length($bgn)+5+$sp); $fhr = substr($fhr,1) while ($fhr =~ /^\s/); # remove all LEADING space $ch = substr($fhr,0,1); prt("$tag [$tag2] [$fhr]\n") if ($dbg5); if (($ch eq '"')||($ch eq "'")) { $max = length($fhr); $hr = ''; $tail = ''; # collect actual HREF= for ($j = 1; $j < $max; $j++) { $ch2 = substr($fhr,$j,1); if ($ch eq $ch2) { $tail = substr($fhr,$j); last; } $hr .= $ch2; } if ($tag2 =~ /^BASE$/i) { $hrt = HRT_BASE; prt("Got [$tag2] [$fhr]\n"); } else { $hrt = get_href_type_const($hr); } $ctyp = ''; $ft = FT_UNKNOWN; if ($hrt == HRT_FILE) { $ft = get_file_type_const($hr); $ctyp = "ext[".file_type_const_to_string($ft)."] "; } $ctyp = 'type['.href_type_to_string($hrt)."] $ctyp"; prt("tag [$tag2] [$hr] $ctyp\n") if ($dbg6); $base_href = $hr if ($hrt == HRT_BASE); prt( "$hcnt:HREF: hr[$hr] $ctyp tail[$tail] pre[$disc] tag[$tag] bgn[$bgn]\n" ) if ($dbg2); # href HRT-type FT-file # 0 1 2 push(@hrf, [$hr, $hrt, $ft]); } else { prt( "$hcnt:HREF: fhr[$fhr] pre[$disc] tag[$tag] bgn[$bgn] CHECK ME\n" ); } } else { prt( "DISCARDED: pre[$disc] tag[$tag] ...\n" ) if ($dbg1); } $disc = ''; } else { $disc .= $ch; } } return @hrf; } sub parse_file($$) { my ($bdir,$bfil) = @_; my $fil = $bdir.$bfil; my ($lncnt, $full, $hrcnt, $i, $hfcnt, $typ, $filcnt); my @hrf = (); if ( ! open INF, "<$fil") { prt( "WARNING: Can NOT open file [$fil]...\n" ); return \@hrf; } my @lines = <INF>; close INF; $lncnt = scalar @lines; prt( "Processing $lncnt lines from [$fil] ...\n" ); $full = join('',@lines); # sub write2file { my ($txt,$fil) = @_; #my $scrp = return_tag($full,'script'); ##my $scrp = get_all_tag_text($full,'script'); ##write2file($scrp,'tempscript.txt'); ##prt( "Got script text [$scrp]\n" ); @hrf = get_hrefs_from_string($full); $hrcnt = scalar @hrf; $filcnt = 0; for ($i = 0; $i < $hrcnt; $i++) { $fil = $bdir.$hrf[$i][0]; $typ = $hrf[$i][1]; if ($typ == HRT_FILE) { $filcnt++; if (! -f $fil) { if (defined $not_found{$fil}) { $not_found{$fil}++; } else { prt( "WARNING: File [$fil] NOT found ...\n" ) if ($show_not_found); $not_found{$fil} = 1; } } } } prt( "Got $hrcnt HREF entries, from $bfil ... $filcnt link files\n" ); return \@hrf; } sub parse_file2($$) { my ($bdir,$bfil) = @_; my $fil = $bdir.$bfil; my ($lncnt, $full, $hrcnt, $i, $hfcnt, $typ, $filcnt); my @hrf = (); if ( ! open INF, "<$fil") { prt( "WARNING: Can NOT open file [$fil]...\n" ); return @hrf; } my @lines = <INF>; close INF; $lncnt = scalar @lines; prt( "Processing $lncnt lines from [$fil] ...\n" ); $full = join('',@lines); # sub write2file { my ($txt,$fil) = @_; #my $scrp = return_tag($full,'script'); ##my $scrp = get_all_tag_text($full,'script'); ##write2file($scrp,'tempscript.txt'); ##prt( "Got script text [$scrp]\n" ); @hrf = get_hrefs_from_string($full); $hrcnt = scalar @hrf; $filcnt = 0; for ($i = 0; $i < $hrcnt; $i++) { $fil = $bdir.$hrf[$i][0]; $typ = $hrf[$i][1]; if ($typ == HRT_FILE) { $filcnt++; if (! -f $fil) { if (defined $not_found{$fil}) { $not_found{$fil}++; } else { prt( "WARNING: File [$fil] NOT found ...\n" ) if ($show_not_found); $not_found{$fil} = 1; } } } } prt( "Got $hrcnt HREF entries, from $bfil ... $filcnt link files\n" ); return @hrf; } #################################### ######################################################### # Passed an array of extensions, # check if this is one of them? ######################################################### sub is_my_extension { my ($fil, @exts) = @_; my ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ ); foreach my $ex (@exts) { if (lc($ex) eq lc($ext)) { return 1; } } return 0; } ############################################ # only looking for HTM, HTML, PHP, # could be extended to others maybe ... ############################################ sub is_htm_extension { my ($fil) = shift; return( is_my_extension($fil, @html_extension) ); } sub is_graphic_extension { my ($fil) = shift; return( is_my_extension($fil, @graf_extension) ); } sub is_zip_extension { my ($fil) = shift; return( is_my_extension($fil, @zip_extension) ); } sub is_css_extension { my ($fil) = shift; return( is_my_extension($fil, @css_extension) ); } sub is_txt_extension { my ($fil) = shift; return( is_my_extension($fil, @txt_extension) ); } sub is_script_extension { my ($fil) = shift; return( is_my_extension($fil, @script_extension) ); } #use constant { # FT_UNKNOWN => 0, ## FT_HTML => 1, ## FT_GRAF => 2, ## FT_CSS => 3, ## FT_SCRIPT => 4, ## FT_TEXT => 5, ## FT_ZIP => 6, ## FT_DIR => 7 #}; sub get_file_type_const { my ($fil) = shift; if (is_htm_extension($fil)) { return FT_HTML; } elsif (is_graphic_extension($fil)) { return FT_GRAF; } elsif (is_zip_extension($fil)) { return FT_ZIP; } elsif (is_css_extension($fil)) { return FT_CSS; } elsif (is_txt_extension($fil)) { return FT_TEXT; } elsif (is_script_extension($fil)) { return FT_SCRIPT; } elsif ($fil =~ /\/$/) { return FT_DIR; } return FT_UNKNOWN; } sub file_type_const_to_string { my ($ft) = shift; if ($ft == FT_HTML) { return "html"; } elsif ($ft == FT_GRAF) { return "graphic"; } elsif ($ft == FT_ZIP) { return "zip"; } elsif ($ft == FT_CSS) { return "css"; } elsif ($ft == FT_TEXT) { return "text"; } elsif ($ft == FT_SCRIPT) { return "script"; } elsif ($ft == FT_DIR) { return "directory"; } elsif ($ft == FT_UNKNOWN) { return "unknown"; } return "***FIX ME*** uncased type $ft!"; } ############################################## #################################################################### # process_folder(folder) # Main DIRECTORY processing function # # Open the FOLDER given, and collect ALL files found, # iterate into sub-directories, if $recurse is non-zero, # and it is NOT a special FRONTPAGE (hidden) FOLDER. # # Files are collected into multidemensional arrays #################################################################### sub process_folder { my ($inf) = shift; my ($ft, $ff, $nm, $dir, $ext, $val, $fil); my $fcnt = 0; prt( "Processing $inf folder ...\n" ) if ($dbg1); if ( opendir( DIR, $inf ) ) { my @files = readdir(DIR); closedir DIR; foreach $fil (@files) { if (($fil eq ".")||($fil eq "..")) { next; } $ft = get_file_type_const($fil); $ff = $inf; $ff .= "\\" if !($inf =~ /[\\\/]$/); $ff .= $fil; if ( -d $ff ) { if ($recurse) { if ($ignfpd && is_fp_folder($fil)) { # ignore FRONTPAGE folders next; } if (@splexcludes && in_spl_excludes($fil)) { next; } process_folder( $ff ); } } else { # NOTE: multidimensional arrays pushed - offsets into arrays if ( !in_excludes($fil) ) { # NOT in @excludes ($nm,$dir,$ext) = fileparse( $fil, qr/\.[^.]*/ ); $val = 0; $val = $ext_hash{$ext} if ( defined $ext_hash{$ext} ); $val++; $ext_hash{$ext} = $val; push(@all_files, [$ff, '', '', 0, 0, '', '', $ft] ); $fcnt++; } } } prt( "Processed $inf folder finding $fcnt files ...\n" ); } else { prt( "ERROR: Failed to open folder $inf ...\n" ); } } ################################################ # my $ignfpd = 1; # ignore FRONTPAGE folders ################################################ sub is_fp_folder { my ($inf) = shift; foreach my $fil (@fpfolders) { if (lc($inf) eq lc($fil)) { return 1; } } return 0; } #################################### # Check if FILE is in EXCLUDE list #################################### sub in_excludes { my ($fil) = shift; my $lcf = lc($fil); foreach my $f (@excludes) { if (lc($f) eq $lcf) { return 1; } } return 0; } sub in_spl_excludes { my ($fldr) = shift; my $lfldr = lc($fldr); foreach my $f (@splexcludes) { if (lc($f) eq $lfldr) { return 1; } } return 0; } sub set_status_case { my ( $ch, $pch, $inccm, $inlnc, $inqot, $qot ) = @_; my $ldbg2 = 0; if ($$inccm) { if (($ch eq '/')&&($pch eq '*')) { $$inccm = 0; prt( "status: End C comment /* */ ...\n" ) if ($ldbg2); } } elsif ($$inlnc ) { if ($ch eq "\n") { $$inlnc = 0; prt( "status: End line comment // ...\n" ) if ($ldbg2); } } elsif ($$inqot ) { if ($ch eq $$qot) { prt( "status: End quote $$qot ...\n" ) if ($ldbg2); $$inqot = 0; $$qot = ''; } } else { if ($ch eq '/') { if ($pch eq '/') { $$inlnc = 1; prt( "status: Entered line comment // ...\n" ) if ($ldbg2); } } elsif ($ch eq '*') { if ($pch eq '/') { $$inccm = 1; prt( "status: Entered C comment /* */ ...\n" ) if ($ldbg2); } } elsif (($ch eq '"')||($ch eq "'")) { $$qot = $ch; $$inqot = 1; prt( "status: Entered quote $$qot ...\n" ) if ($ldbg2); } } } sub get_all_tag_text { my ($txt, $tag) = @_; my $len = length($txt); my $ldbg1 = 0; my $ntxt = ''; my $ch = ''; my $pch = ''; my $ftag = ''; my $nline = ''; my $i = 0; my $intag = 0; my $incomment = 0; my $inqot = 0; # in quotes ' or " my $qot = ''; my $inlnc = 0; # in line comment my $inccm = 0; # in C comment my ($part, $shlen); ###prt("Processing $len chars for $tag ...\n"); for ($i = 0; $i < $len; $i++) { $pch = $ch; $ch = substr($txt, $i, 1); set_status_case( $ch, $pch, \$inccm, \$inlnc, \$inqot, \$qot ); if ($incomment) { $ntxt .= $ch; if ($ch eq '>') { $shlen = -15; if (length($ntxt) < 15) { $shlen = 0 - length($ntxt); } prt( "Potential close [".substr($ntxt,$shlen)."] ...($i)" ) if ($ldbg1); if (substr($ntxt,-3) eq '-->') { if (!$inqot && !$inlnc && !$inccm) { prt( " Yes\n" ) if ($ldbg1); $incomment = 0; # no longer IN comment prt("End comment <!-- --> ...\n") if ($ldbg1); } else { if ($inqot) { prt( " NO DUE TO IN QUOTE\n" ) if ($ldbg1); } elsif ($inlnc) { prt( " NO DUE TO IN LINE COMMENT\n" ) if ($ldbg1); } elsif ($inccm) { prt( " NO DUE TO IN C COMMENT\n" ) if ($ldbg1); } else { prt( " NO DUE TO SOME REASON!!! **** CHECK ME!!! ****\n" ) if ($ldbg1); } } } else { prt( " NO!\n" ) if ($ldbg1); } } } elsif ($intag) { if ($ch eq "<") { ###prt("Got begin < ...\n"); $part = substr($txt,$i,4); if ($part eq '<!--') { # if a powerful comment starts prt("Entering comment <!-- ...\n") if ($ldbg1); $incomment = 1; # unconditionally go to end of this comment $ntxt .= $ch; next; } $i++; $ftag = ''; for ( ; $i < $len; $i++ ) { $pch = $ch; $ch = substr($txt, $i, 1); if ($ch eq '>') { last; } $ftag .= $ch; } $ntxt .= '<'.$ftag; ###prt("Got tag [$ftag] ...\n"); if (lc(substr($ftag,1)) eq lc($tag)) { $intag = 0; } } $ntxt .= $ch; } else { if ($ch eq "<") { ###prt("Got begin < ...\n"); $i++; $ftag = ''; for ( ; $i < $len; $i++ ) { $pch = $ch; $ch = substr($txt, $i, 1); if (($ch eq '>')||($ch eq ' ')||($ch =~ /\s/)) { last; } $ftag .= $ch; } ###prt("Got tag [$ftag] ...\n"); if (lc($ftag) eq lc($tag)) { $ntxt .= '<'.$ftag.$ch; if (($ch eq ' ')||($ch =~ /\s/)) { $i++; for ( ; $i < $len; $i++ ) { $pch = $ch; $ch = substr($txt, $i, 1); $ntxt .= $ch; if ($ch eq '>') { last; } } } ###prt( "Entered tag <$ftag...> ($tag)...\n" ); $intag = 1; } } } } return $ntxt; } sub unix_2_dos { my ($f) = shift; $f =~ s/\//\\/g; return $f; } # OF_FF => 0, # full file name # OF_HR => 1, # array ref of href links # OF_IM => 2, # array ref of image links # OF_LK => 3, # linked count # OF_SP => 4, # spare # OF_TO => 5, # links TO # OF_FM => 6, # links FROM # OF_FT => 7 # file type sub mark_in_all_files { my ($fil, $dir) = @_; my $max = scalar @all_files; my ($i, $ff, $nm, $dr, $fnd); $dir = unix_2_dos($dir); $fnd = 0; for ($i = 0; $i < $max; $i++) { $ff = $all_files[$i][OF_FF]; ($nm,$dr) = fileparse($ff); $dr = unix_2_dos($dr); if (($nm eq $fil)&&($dir eq $dr)) { $all_files[$i][OF_SP]++; $fnd++; } } if (!$fnd) { prt( "WARNING: [$fil] in [$dir] NOT found ...\n" ); } } sub in_done_files { my ($nm, $dr, @dn) = @_; my $ct = scalar @dn; my ($nnm, $ndr); for (my $j = 0; $j < $ct; $j++) { $nnm = $dn[$j][0]; $ndr = $dn[$j][1]; if (($nnm eq $nm) && ($ndr eq $dr)) { return 1; } } return 0; } sub process_references { my ($bdir) = @_; my $rcnt = scalar @all_hrefs; my ($i, $ff, $ft, $name, $rdir, $hrt, @new_hrefs, @nxt_hrefs, $dncnt); $bdir = unix_2_dos($bdir); prt( "Processing $rcnt HREF found ...\n" ); $dncnt = 0; for ($i = 0; $i < $rcnt; $i++) { $ff = $bdir.$all_hrefs[$i][0]; $hrt = $all_hrefs[$i][1]; $ft = $all_hrefs[$i][2]; if (($hrt == HRT_FILE)&&($ft == FT_HTML)) { ($name,$rdir) = fileparse( $ff ); $rdir = unix_2_dos($rdir); if ($ff =~ /\.\./) { # this back up ... prt( "SKIPPING file [$name], in [$rdir] ...\n" ); } else { prt( "File [$name], in [$rdir] ...\n" ) if ($dbg4); mark_in_all_files($name, $rdir); @new_hrefs = parse_file2($rdir, $name); $dncnt++; } push(@done_files, [$name, $rdir]); } } $rcnt = scalar @new_hrefs; prt( "Processed $dncnt file, for new $rcnt files ...\n" ); $dncnt = 0; while ($rcnt) { @nxt_hrefs = (); $dncnt = 0; prt( "Processing $rcnt NEW HREF found ...\n" ); for ($i = 0; $i < $rcnt; $i++) { $ff = $bdir.$new_hrefs[$i][0]; $hrt = $new_hrefs[$i][1]; $ft = $new_hrefs[$i][2]; if (($hrt == HRT_FILE)&&($ft == FT_HTML)) { ($name,$rdir) = fileparse( $ff ); $rdir = unix_2_dos($rdir); if ( !in_done_files($name, $rdir, @done_files) ) { if ($ff =~ /\.\./) { # this back up ... prt( "SKIPPING file [$name], in [$rdir] ...\n" ); } else { prt( "File [$name], in [$rdir] ...\n" ); mark_in_all_files($name, $rdir); @nxt_hrefs = parse_file2($rdir, $name); $dncnt++; } push(@done_files, [$name, $rdir]); } } } @new_hrefs = @nxt_hrefs; $rcnt = scalar @new_hrefs; prt( "Processed $dncnt file, for new $rcnt files ...\n" ); } } sub fits_special_filter($$) { my ($hr,$ra) = @_; # https://git.gitorious.org/flightgear-aircraft/717.git if ($hr =~ /^https:\/\/git\.gitorious\.org\/flightgear-aircraft\/(.+)\.git$/) { push(@{$ra},$1); return 1; } return 0; } sub show_href_list($) { my ($raorg) = @_; # href HRT-type FT-file # 0 1 2 #push(@hrf, [$hr, $hrt, $ft]); my $cnt = scalar @{$raorg}; my @arr = sort @{$raorg}; my $ra = \@arr; my %dupes = (); prt("Display of up to $cnt hrefs... eliminating duplicates\n"); my ($i,$i2,$ccnt,$hr,$hrt,$ft,$min,$len,$shrt,$form,$splcnt); my @specials = (); $i = int($cnt / 10); $form = '%d'; $i2 = 1; while ($i) { $i2++; $form = '%'.sprintf("%d",$i2).'d'; $i = int($i / 10); } $min = 0; for ($i = 0; $i < $cnt; $i++) { $hr = ${$ra}[$i][0]; $hrt = ${$ra}[$i][1]; $ft = ${$ra}[$i][2]; $len = length($hr); $min = $len if ($len > $min); } $i2 = 0; for ($i = 0; $i < $cnt; $i++) { $hr = ${$ra}[$i][0]; $hrt = ${$ra}[$i][1]; $ft = ${$ra}[$i][2]; if (!defined $dupes{$hr}) { # display it $dupes{$hr} = 1; $splcnt++ if (fits_special_filter($hr,\@specials)); $i2++; $ccnt = sprintf($form,$i2); $hr .= ' ' while (length($hr) < $min); $shrt = href_type_to_string($hrt); prt("$ccnt: $hr $shrt\n"); } } prt("Displayed $i2 different hrefs...\n"); if (@specials) { my @arr = sort @specials; write2file(join("\n",@arr)."\n",$splout); prt("Written $splcnt to 'special' list $splout\n"); } } ######################################################## ### MAIN ### parse_args(@ARGV); ($base_file,$base_dir) = fileparse( $in_file ); $base_dir = unix_2_dos($base_dir); # process_folder($base_dir); # get ALL the files, in mutidemensional array my $ref_hrefs = parse_file($base_dir, $base_file); show_href_list($ref_hrefs); push(@done_files, [$base_file, $base_dir]); # mark_in_all_files($base_file, $base_dir); # $refcnt = scalar @all_hrefs; # process_references($base_dir); # show_results(); pgm_exit(0,""); ######################################################## sub give_help { prt("$pgmname: version $VERS\n"); prt("Usage: $pgmname [options] in-file\n"); prt("Options:\n"); prt(" --help (-h or -?) = This help, and exit 0.\n"); prt(" --verb[n] (-v) = Bump [or set] verbosity. def=$verbosity\n"); prt(" --load (-l) = Load LOG at end.\n"); } sub need_arg { my ($arg,@av) = @_; pgm_exit(1,"ERROR: [$arg] must have a following argument!\n") if (!@av); } sub parse_args { my (@av) = @_; my ($arg,$sarg); while (@av) { $arg = $av[0]; if ($arg =~ /^-/) { $sarg = substr($arg,1); $sarg = substr($sarg,1) while ($sarg =~ /^-/); if (($sarg =~ /^h/i)||($sarg eq '?')) { give_help(); pgm_exit(0,"Help exit(0)"); } elsif ($sarg =~ /^v/) { if ($sarg =~ /^v.*(\d+)$/) { $verbosity = $1; } else { while ($sarg =~ /^v/) { $verbosity++; $sarg = substr($sarg,1); } } prt("Verbosity = $verbosity\n") if (VERB1()); } elsif ($sarg =~ /^l/) { $load_log = 1; prt("Set to load log at end\n") if (VERB1()); } else { pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n"); } } else { $in_file = $arg; prt("Set input to [$in_file]\n"); } shift @av; } if ((length($in_file) == 0) && $debug_on) { $in_file = $def_file; } if (length($in_file) == 0) { pgm_exit(1,"ERROR: No input files found in command!\n"); } if (! -f $in_file) { pgm_exit(1,"ERROR: Unable to find in file [$in_file]! Check name, location...\n"); } } # eof - gethrefs.pl