chkdupes.pl to HTML.

index -|- end

Generated: Sat Oct 12 17:22:45 2013 from chkdupes.pl 2013/07/10 14.8 KB. text copy

#!/perl -w
# NAME: chkdupes.pl
# AIM: Read a folder, and subfolders, and check for any duplicate file names
# This is so they can all be put in one folder, if possible
# 07/07/2013 - Tried to reduce the 'duplications', but this seems to have failed
# 01/02/2013 - Hopefully fix to run in linux
# 19/11/2011 - Allow first item of two to be a single file
# 18/11/2011 - Fix bug if two folder given - src and dest
# 15/08/2011 - Update...
# 22/07/2008 geoff mclane http://geoffair.net/mperl

use strict;
use warnings;
use File::stat;
use File::Basename;  # split path ($name,$dir,$ext) = fileparse($file [, qr/\.[^.]*/] )
use Cwd;
# Platform dependent locations. The defaults suit a unix-like system and are
# replaced below when running on a Windows-family perl.
my $os = $^O;
my $perl_dir = '/home/geoff/bin';
my $PATH_SEP = '/';
my $temp_dir = '/tmp';
# Match only the Windows-family values of $^O ('MSWin32', 'cygwin').
# A bare /win/i test would also match 'darwin' (macOS) and wrongly select
# the Windows paths there.
if ($os =~ /^(?:MSWin|cygwin)/i) {
    $perl_dir = 'C:\GTools\perl';
    $temp_dir = $perl_dir;
    $PATH_SEP = "\\";
}
# lib_utils.pl is loaded from $perl_dir, so put that directory first in @INC.
unshift(@INC, $perl_dir);
require 'lib_utils.pl' or die "Unable to load 'lib_utils.pl' Check paths in \@INC...\n";
# log file stuff
our ($LF);
# Reduce $0 to its last path component; either slash style is accepted.
my $pgmname = $0;
if ($pgmname =~ /(\\|\/)/) {
    my @tmpsp = split(/(\\|\/)/,$pgmname);
    $pgmname = $tmpsp[-1];
}
# Build the log path from the per-platform temp directory and separator.
# On Windows this is the same path as before ($temp_dir equals $perl_dir);
# elsewhere it avoids embedding a literal backslash in the file name.
my $outfile = $temp_dir.$PATH_SEP."temp.$pgmname.txt";
open_log($outfile);

# user variables
# These file-level settings are read by the subs below; several of them are
# changed by parse_args() according to the command line.
my $VERS = "0.0.4 2013-07-07";  # version string printed by give_help()
#my $VERS = "0.0.3 2011-11-19";
my $load_log = 0;       # passed to close_log() in pgm_exit(); -l sets 1, -ll sets 2
my $in_file = '';       # last non-option argument accepted by parse_args()
my $show_ext = 0;       # -s: call show_extensions() before exiting
my $max_name_wid = 65;  # upper limit for the padded path column in do_one_folder()
my $debug_on = 0;       # when true, parse_args() falls back to $def_file if no input given
my $def_file = 'def_file';
my $repos = ".git;.svn;.hg;CVS";    # ';' separated directory names excluded by -X
my $do_all_ext = 0;     # when true, is_right_type() accepts every file
my $headers_only = 1;   # when true, is_right_type() rejects files matched by is_c_source()

# NOTE(review): $def_folder and $in_folder are not referenced by the code shown in this file.
my $def_folder = "C:\\Users\\Public\\SAVES\\peru\\My Pictures\\Carla";   # DEFAULT, if NO command input
my $in_folder = "";
my @folder_list = ();   # inputs accepted by parse_args(); at most two entries

my @file_list = ();     # filled by process_folder(): [path, lc name, count, match index, \stat, flag]
my %exthash = ();       # lower-cased extension => number of files seen with it
my @dup_list = ();      # NOTE(review): not referenced by the code shown in this file
my @exl_dirs = ();      # directory names skipped by process_folder() (from -d and -X)
my @excludes_files = ();    # file names skipped by process_folder() (from -x), compared case-insensitively

my $verbose = 0;        # verbosity level, set by -v / -vN; tested through VERB1() .. VERB9()

### program variables
my @warnings = ();      # messages collected by prtw() and replayed by show_warnings()
my $cwd = cwd();        # working directory at start-up

### forward
# Pre-declare the recursive subs so their prototypes are known before first use.
sub process_folder($);
sub scan_dir($$$);

# Verbosity predicates: each one is true when the current $verbose level
# has reached the threshold in its name.
sub VERB1() {
    return ($verbose >= 1);
}
sub VERB2() {
    return ($verbose >= 2);
}
sub VERB5() {
    return ($verbose >= 5);
}
sub VERB9() {
    return ($verbose >= 9);
}

# Print every message collected in @warnings, preceded by a count line.
# Prints nothing when no warnings were collected.
# $val - the exit value supplied by the caller; not used here.
sub show_warnings($) {
    my ($val) = @_;
    my $total = scalar @warnings;
    return if ($total == 0);    # nothing collected, stay silent
    prt( "\nGot ".$total." WARNINGS...\n" );
    for my $itm (@warnings) {
        prt("$itm\n");
    }
    prt("\n");
}

# Common exit path: print an optional final message, replay the collected
# warnings, close the log file and terminate the program.
# $val - process exit code
# $msg - message to print first; an empty string prints nothing
sub pgm_exit($$) {
    my ($val,$msg) = @_;
    if (length($msg)) {
        # make sure the message ends with exactly the newline it needs
        if ($msg !~ /\n$/) {
            $msg = $msg."\n";
        }
        prt($msg);
    }
    show_warnings($val);
    close_log($outfile,$load_log);
    exit($val);
}


# Print a warning message and remember it in @warnings so that
# show_warnings() can repeat it when the program exits.
# A single trailing newline on the message is dropped before storing.
sub prtw($) {
    my ($tx) = @_;
    $tx =~ s/\n$//;
    prt($tx."\n");
    push(@warnings,$tx);
}

# Return 1 when the directory name exactly matches an entry of @exl_dirs
# (case-sensitive comparison), otherwise 0.
sub in_exclude_dirs($) {
    my ($dir) = @_;
    my $hits = grep { $dir eq $_ } @exl_dirs;
    return ($hits ? 1 : 0);
}

# Return 1 when the file name matches an entry of @excludes_files,
# ignoring letter case; otherwise return 0.
sub is_excluded_file($) {
    my ($file) = @_;
    my $wanted = lc($file);
    my $hits = grep { lc($_) eq $wanted } @excludes_files;
    return ($hits ? 1 : 0);
}

# Decide whether a file takes part in the duplicate check.
# Returns 1 when $do_all_ext is set, or when is_h_source() accepts the file.
# A file accepted by is_c_source() is taken only when $headers_only is off.
# Everything else returns 0.
sub is_right_type($) {
    my ($file) = @_;
    if ($do_all_ext || is_h_source($file)) {
        return 1;
    }
    if (!is_c_source($file)) {
        return 0;
    }
    return ($headers_only ? 0 : 1);
}

# Recursively collect the files below a folder into @file_list.
# Directories named in @exl_dirs are not entered; files rejected by
# is_excluded_file() or is_right_type() are ignored. Each accepted file is
# stored as [full path, lower-cased name, 0, 0, ref to stat object, 0] and
# its lower-cased extension is counted in %exthash.
# An unreadable folder is reported through prt() and otherwise skipped.
sub process_folder($) {
    my ($inf) = @_;
    my $dh;
    if (!opendir($dh, $inf)) {
        prt( "ERROR: Can NOT open $inf ... $! ... \n" );
        return;
    }
    # read the whole listing first so the handle is closed before recursing
    my @entries = readdir($dh);
    closedir($dh);

    my @subdirs = ();
    for my $file (@entries) {
        next if ($file eq '.');
        next if ($file eq '..');
        my $ff = $inf . $PATH_SEP . $file;
        if (-d $ff) {
            push(@subdirs,$ff) unless (in_exclude_dirs($file));
            next;
        }
        next if (is_excluded_file($file));
        next if (!is_right_type($file));
        my ($nm,$dir,$ext) = fileparse( $ff, qr/\.[^.]*/ );
        my $sb = stat($ff);
        $ext = lc($ext);
        my $name = lc($nm).$ext;
        #                  0    1      2  3  4     5
        push( @file_list, [$ff, $name, 0, 0, \$sb, 0]);
        $exthash{$ext}++;   # a missing key starts at 1
    }
    # descend only after this level has been fully processed
    for my $sub (@subdirs) {
        process_folder($sub);
    }
}

# Check a single folder tree for files that share the same (lower-cased) name.
# The tree under $folder_list[0] is loaded with process_folder(), entries with
# equal names are paired up, and the result is reported through prt():
#   always   - the number of files examined and of duplicate names found
#   -v1      - one "path == path" line per pair
#   -v2      - first path of each pair with its size and modification time
#   -v5      - both paths of each pair, each with its own size and time
# Takes no arguments and returns nothing useful.
sub do_one_folder() {
    # get all the files...
    process_folder($folder_list[0]);
    my $incnt = scalar @file_list;
    prt( "Got $incnt file items to check ...\n" );
    my $dup_cnt = 0;
    my ($ff1,$ff2,$i,$j,$name,$sb1,$sb2,$wid,$len,$nn1,$nn2,$tm1,$tm2);
    #                   0    1      2  3  4     5
    #push( @file_list, [$ff, $name, 0, 0, \$sb, 0]);
    # slot 2 = match count, slot 3 = index of the last matching entry,
    # slot 5 = work flag (DONE while pairing, SHOWN while printing)
    for ($i = 0; $i < $incnt; $i++) {
        $file_list[$i][2] = 0;  # clear all counts
        $file_list[$i][3] = 0;  # clear all matches
        $file_list[$i][5] = 0;  # clear all DONE
    }
    # Pair every not yet handled entry with all later entries of the same name.
    for ($i = 0; $i < $incnt; $i++) {
        next if ($file_list[$i][5] > 0);
        $name = $file_list[$i][1];
        for ($j = 0; $j < $incnt; $j++) {
            next if ($file_list[$j][5] > 0); # skip if DONE
            next if ($i == $j);
            if ($name eq $file_list[$j][1]) {
                $file_list[$j][2]++;
                $file_list[$i][2]++;
                $file_list[$j][3] = $i;
                $file_list[$i][3] = $j;
                $file_list[$j][5] = 1;  # mark as DONE
            }
        }
        $file_list[$i][5] = 1;  # mark as DONE
    }
    # Every entry with a non-zero count takes part in at least one pair.
    for ($i = 0; $i < $incnt; $i++) {
        $dup_cnt++ if ($file_list[$i][2] > 0);
    }
    prt( "Got $dup_cnt duplicate names...\n" );
    if (!VERB1()) {
        prt("Add -v1 to produces a list. v5 to show details.\n");
    }
    # Column width = longest path among the pairs, limited to $max_name_wid.
    $wid = 0;
    for ($i = 0; $i < $incnt; $i++) {
        if ($file_list[$i][2] > 0) {
            $j = $file_list[$i][3];
            $len = length($file_list[$i][0]);
            $wid = $len if ($len > $wid);
            $len = length($file_list[$j][0]);
            $wid = $len if ($len > $wid);
        }
    }
    $wid = $max_name_wid if ($wid > $max_name_wid);
    for ($i = 0; $i < $incnt; $i++) {
        $file_list[$i][5] = 0;  # clear SHOWN
    }
    $dup_cnt = 0;
    for ($i = 0; $i < $incnt; $i++) {
        next if ($file_list[$i][5] > 0);
        if ($file_list[$i][2] > 0) {
            $j = $file_list[$i][3];
            $ff1 = $file_list[$i][0];
            $ff2 = $file_list[$j][0];
            if (VERB5()) {
                # NOTE: stat() returns undef for a file that has vanished
                # since the scan; ->size would then be fatal.
                $sb1 = stat($ff1);
                $sb2 = stat($ff2);
                $nn1 = get_nn($sb1->size);
                $nn2 = get_nn($sb2->size);
                $tm1 = lu_get_YYYYMMDD_hhmmss($sb1->mtime);
                $tm2 = lu_get_YYYYMMDD_hhmmss($sb2->mtime);

                $ff1 .= ' ' while (length($ff1) < $wid);
                $ff2 .= ' ' while (length($ff2) < $wid);
                $nn1 = ' '.$nn1 while (length($nn1) < 12);
                $nn2 = ' '.$nn2 while (length($nn2) < 12);
                prt("$ff1 $nn1 $tm1\n");
                # the second file is shown with its OWN modification time
                prt("$ff2 $nn2 $tm2\n");
            } elsif (VERB2()) {
                $sb1 = stat($ff1);
                $nn1 = get_nn($sb1->size);
                $tm1 = lu_get_YYYYMMDD_hhmmss($sb1->mtime);
                $ff1 .= ' ' while (length($ff1) < $wid);
                $nn1 = ' '.$nn1 while (length($nn1) < 12);
                prt("$ff1 $nn1 $tm1\n");
            } elsif (VERB1()) {
                $ff1 .= ' ' while (length($ff1) < $wid);
                $ff2 .= ' ' while (length($ff2) < $wid);
                prt( "$ff1 == $ff2\n" );
            }
            $file_list[$i][5] = 1;  # set SHOWN
            $file_list[$j][5] = 1;  # set SHOWN
            $dup_cnt++;
        }
    }
    prt( "Done $dup_cnt duplicate names...\n" ) if (VERB1());
}

# Recursively list the files below a directory.
# $ra  - reference to the array receiving one entry per file:
#        [file name, full path, lower-cased name, 0]
# $inf - directory to read; must not be an empty string
# $lev - recursion depth, 0 for the top call (which prints a progress line)
# The lower-cased extension of every file is counted in %exthash.
# An empty $inf or an unreadable directory ends the program via pgm_exit(1,..).
sub scan_dir($$$) {
    my ($ra,$inf,$lev) = @_;
    pgm_exit(1,"ERROR: scan_dir: Passed null value!\n") if (length($inf) == 0);
    my @subdirs = ();
    my ($file,$ff,$name);
    prt("Scanning [$inf]...\n") if ($lev == 0);
    if (opendir( my $dh, $inf)) {
        # read everything first so the handle is closed before recursing
        my @files = readdir($dh);
        closedir($dh);
        # Terminate the directory with the separator of the running platform.
        # A hard-coded backslash produces paths that do not exist on
        # unix-like systems, so no sub-directory would ever be recognised.
        $inf .= $PATH_SEP if ( !($inf =~ /(\\|\/)$/) );
        foreach $file (@files) {
            next if (($file eq '.')||($file eq '..'));
            $ff = $inf.$file;
            if (-d $ff) {
                push(@subdirs,$ff);
            } else {
                my ($nm,$dir,$ext) = fileparse( $ff, qr/\.[^.]*/ );
                $nm = lc($nm);
                $ext = lc($ext);
                $name = $nm.$ext;
                #              0      1    2      3
                push( @{$ra}, [$file, $ff, $name, 0] );
                $exthash{$ext}++;   # a missing key starts at 1
            }
        }
        foreach my $fil (@subdirs) {
            scan_dir($ra,$fil,$lev+1);
        }
    } else {
        pgm_exit(1,"ERROR: Can NOT open [$inf] ... $! ... \n" );
    }
}

# Report, for every entry of the first list, whether a file with the same
# lower-cased name exists in the second list.
# $ra1, $ra2 - references to arrays of [file name, full path, lc name, 0]
#              entries as built by scan_dir()
# A match is reported through prtw() (so it is also kept as a warning),
# a miss through prt(). Names are padded to the longest name of list 1.
sub compare_lists($$) {
    my ($ra1,$ra2) = @_;    # = \@arr1,\@arr2
    my $cnt1 = scalar @{$ra1};
    my $cnt2 = scalar @{$ra2};
    prt("Comparing list 1 = $cnt1, with list 2 = $cnt2...\n");

    # widest file name in list 1 sets the column width
    my $wid = 0;
    foreach my $ent (@{$ra1}) {
        my $len = length($ent->[0]);
        $wid = $len if ($len > $wid);
    }

    foreach my $ent1 (@{$ra1}) {
        my $fil1 = $ent1->[0];
        my $nm1  = $ent1->[2];
        my $fnd  = 0;
        foreach my $ent2 (@{$ra2}) {
            next if ($nm1 ne $ent2->[2]);
            $fnd = 1;
            last;   # one match is enough
        }
        my $short = $wid - length($fil1);
        $fil1 .= ' ' x $short if ($short > 0);
        if (!$fnd) {
            prt("File $fil1 NOT found in list 2!\n");
        } else {
            prtw("File $fil1 is DUPLICATED in list 2!\n");
        }
    }
}

# List every file extension counted in %exthash together with the number of
# files that carried it. The extensions are printed in sorted order because
# the order returned by keys() is not stable between runs.
sub show_extensions() {
    my $cnt = scalar keys(%exthash);
    prt("Got list of $cnt extensions...\n");
    foreach my $key (sort keys %exthash) {
        prt( "Extension $key occurs ".$exthash{$key}." times ...\n" );
    }
    prt("Done list of $cnt extensions...\n");
}

# Main program: read the command line, then run the mode selected by the
# number of inputs that parse_args() accepted.
parse_args(@ARGV);
###prt( "$0 ... Processing $in_folder ...\n" );
if (scalar @folder_list == 2) {
    # two inputs: list both and report names of the first found in the second
    my ($f1,$f2) = @folder_list;
    my (@arr1,@arr2);
    if (-d $f1) {
        scan_dir(\@arr1,$f1,0);
    } elsif (-f $f1) {
        # a single file is turned into a one entry list
        #               0      1    2      3
        #push( @{$ra}, [$file, $ff, $name, 0] );
        my ($nm) = fileparse($f1);
        push(@arr1, [$nm, $f1, lc($nm), 0]);
    } else {
        pgm_exit(1,"First item is neither file, nor folder [$f1]!\n");
    }
    scan_dir(\@arr2,$f2,0);
    compare_lists(\@arr1,\@arr2);
} elsif (scalar @folder_list == 1) {
    do_one_folder();    # check folder for duplicate names???
}

if ($show_ext) {
    show_extensions();
}
pgm_exit(0,"");

################################

# Print the usage text, one prt() call per line.
sub give_help {
    my @help = (
        "$pgmname: version $VERS\n",
        "Usage: $pgmname [options] in-folder/in-file [in-folder2]\n",
        "Options:\n",
        " --help (-h or -?) = This help, and exit 0.\n",
        " --load       (-l) = Load log at end.\n",
        " --show       (-s) = Also show extension list.\n",
        " --verb       (-v) = Bump verbosity.\n",
        " --dir        (-d) = Exclude this directory.\n",
        " --xclude     (-x) = Exclude duplicates of these names.\n",
        " --XCLUDE     (-X) = Exclude repos folders (def=$repos)\n",
        "If just ONE directory given, then it will be checked for duplicate files.\n",
        "If TWO folders given, they will be compared, and duplicate files reported.\n",
    );
    foreach my $line (@help) {
        prt($line);
    }
}
# Verify that an option is followed by at least one more argument.
# Called with the option itself followed by the remaining arguments;
# ends the program with an error when nothing follows the option.
sub need_arg {
    my $arg = shift;
    my $following = scalar @_;
    if ($following == 0) {
        pgm_exit(1,"ERROR: [$arg] must have following argument!\n");
    }
}

# Process the command line.
# Arguments starting with '-' are options (any number of leading dashes is
# accepted); every other argument must name an existing file or directory
# and is appended to @folder_list, of which at most two are allowed.
# Any error, and the help option, ends the program through pgm_exit().
sub parse_args {
    my (@av) = @_;
    my ($arg,$sarg,$cnt);
    while (@av) {
        # take the current argument off the front; @av now holds what follows
        $arg = shift @av;

        if ($arg !~ /^-/) {
            # not an option: an input file or folder
            $in_file = $arg;
            if (!((-d $in_file)||(-f $in_file))) {
                pgm_exit(1,"ERROR: Can NOT locate folder [$in_file]!\n");
            }
            push(@folder_list,$in_file);
            $cnt = scalar @folder_list;
            if ($cnt > 2) {
                pgm_exit(1,"ERROR: Can only give two folders. folder [$in_file] is 3rd!\n");
            }
            prt("Set input $cnt to [$in_file]\n");
            next;
        }

        # option: drop all leading dashes, then look at the first letter
        ($sarg = $arg) =~ s/^-+//;
        if (($sarg eq '?')||($sarg =~ /^h/i)) {
            give_help();
            pgm_exit(0,"Help exit(0)");
        } elsif ($sarg =~ /^l/) {
            $load_log = ($sarg =~ /^ll/) ? 2 : 1;
            prt("Set to load log $load_log\n") if (VERB1());
        } elsif ($sarg =~ /^s/) {
            $show_ext = 1;
        } elsif ($sarg =~ /^v/i) {
            if ($sarg =~ /^v(\d+)$/) {
                # -vN sets the level directly
                $verbose = $1;
            } else {
                # otherwise each leading 'v' adds one level
                while ($sarg =~ /^v/) {
                    $verbose++;
                    $sarg = substr($sarg,1);
                }
            }
            prt("Set verbosity to $verbose\n") if (VERB1());
        } elsif ($sarg =~ /^d/) {
            need_arg($arg,@av);
            $sarg = shift @av;  # the value belonging to the option
            push(@exl_dirs,split(";",$sarg));
            prt("Exclude directory $sarg\n") if (VERB1());
        } elsif ($sarg =~ /^x/) {
            need_arg($arg,@av);
            $sarg = shift @av;  # the value belonging to the option
            push(@excludes_files,split(';',$sarg));
            prt("Exclude files $sarg\n") if (VERB1());
        } elsif ($sarg =~ /^X/) {
            push(@exl_dirs,split(";",$repos));
            prt("Exclude repo directories $repos\n") if (VERB1());
        } else {
            pgm_exit(1,"ERROR: Invalid argument [$arg]! Try -?\n");
        }
    }

    # debug aid: substitute the default name when nothing was given
    if ($debug_on && (length($in_file) == 0)) {
        $in_file = $def_file;
    }
    if (length($in_file) == 0) {
        pgm_exit(1,"ERROR: No input directory found in command!\n");
    }
    #if (! -f $in_file) {
    #    pgm_exit(1,"ERROR: Unable to find in file [$in_file]! Check name, location...\n");
    #}
}


# eof - chkdupes.pl

index -|- top

checked by tidy  Valid HTML 4.01 Transitional