#!/usr/local/bin/perl my $windows = $ENV{'windir'} || $ENV{'WINDIR'}; my $listsep = $ENV{'LISTSEP'} || ($windows? ';' : ':'); my $dirsep = $ENV{'DIRSEP'} || ($windows? '\\' : '/'); my $nocase = $windows; my $USAGE=" USAGE: alike OPTIONS DIRA [DIRB] SYNOPSIS: diff two dirs for common files (by contents,name,etc). AUTHOR: GPL(C) Mohsin Ahmed, http://www.cs.albany.edu/~mosh NOTES: DIRA : A dir or DIRA/SHPAT or SHPAT, (default DIRA is PWD). DIRB : list of dir(s), any dir can be a \$VAR (eg \$PATH). mode 1: DIRA-DIRB,DIRB-DIRA,DIRA*DIRB mode 2: Compare binary contents mode 3: Compare filename by sound mode 4: Approx name match filenames OUTPUT: ==bin= file contents equal, ==soundex= filenames sound similar, =~name= matching file name, OPTIONS: -? -h For this help. -format:str Output formatted by str, with DIRA/DIRB/FILE in str replaced by found material. Use with -u. -diff Same as '-fileonly -ua*b -format:diff DIRA/FILE DIRB/FILE' -fileonly Skip dirs -f/pat Restrict to filenames =~ m/pat/. -v Verbose. -i Toggle nocase, if set lowers all filenames. NT/95 default nocase=$nocase, opp. of Unix. \$VAR Get dirlist from \$VAR. mode 1a: -u File name diff between dira and dirb, default. -ua-b List File names in dira but not in dirb. -ub-a List File names in dirb but not in dira. -ua*b List File names in dira and dirb. -s/p/q/ Do FILE =~ s/p/q/ for above. mode 1b: -ul Only Same named files are binary compared. -ua==b List files that have same contents. -ua!=b List files that differ in contents. -tab Make a table of all files (in dira,dirlist) vs dirlist. -tab1 Show only rows, if dira has that file. -tab2 Show only rows, if two or more dirs have that file. mode 2: -b Different name files are binary compared for dups. mode 3: -snd Show similar sounding ndex(filename) mode 4: -sim Show similar named files. EXAMPLE: alike -ul /bin /usr/bin Finds binary duplicates, same names. alike -u /bin /usr/bin Finds common files, and unique files. alike -ua*b . '\$INCLUDE' Finds common files in PWD and \$INCLUDE. alike -s/\\..*// /bin /dos Ignore extensions and compare -ua*b Note first arg is the mandatory dira. alike -tab /bin/*.exe \$PATH Shows all files in PATH, and their full path. alike -tab2 \"*.h\" \$INCLUDE /usr/inc Show dup include files. BATCH: Produce a batch file to do the diffs. alike -ua*b -format:diff_DIRA/FILE_DIRB/FILE dir1 dir2 alike -diff-u dir1 dir2 "; $verbose = 0; while( $_ = $ARGV[0], /^-/ ){ shift; last if /^--$/; if( m/^-v/ ){ $verbose++; }elsif( m/^-b/ ){ warn "# Binary compare for dup files under different names.\n"; $do_binary++; }elsif( m/^-format:(.+)/ ){ $batchcmd = $1; print STDERR "# format='$batchcmd'.\n"; }elsif( m/^-diff(.*)/ ){ # can be -diff-w (-w is option to diff) unshift( @ARGV, "-format:diff $1 DIRA/FILE DIRB/FILE" ); unshift( @ARGV, "-ua*b"); unshift( @ARGV, "-fileonly"); }elsif( m/^-file/ ){ $fileonly = 1; warn "# fileonly.\n"; }elsif( m,^-f/(.+), ){ $filepat = $1; warn "# matching /$filepat/\n"; }elsif( m,^-s/(.+)/(.*)/, ){ $sre1 = $1; $sre2 = $2; $do_unique ||= '-ua*b'; warn "# FILE =~ -s/$sre1/$sre2/\n"; }elsif( m/^-sim$/ ){ $do_similarnames++; }elsif( m/^-snd$/ ){ $do_soundex++; }elsif( m/^-i/ ){ $nocase = ! $nocase; warn "# nocase=$nocase.\n"; }elsif( m/^-u/ ){ $do_unique = $_; }elsif( m/^-tab/ ){ $do_table_count = 0; if( m/^-tab(\d+)/ ){ $do_table_count = $1; } $do_table = 1; }elsif( m/^-[?h]/ ){ print $USAGE; exit; }elsif( m/^-d/ ){ die "unknown option '$_', see -? for help.\n"; } } # ========================================================== # Get dira and dirb. $dira = shift or die "Need dir, see -? for help\n"; if( (! -d $dira) && ($dira =~ m,[*?],) ){ # dira can be \windows\*.exe or /usr/include/std*.h if( $dira =~ m,^(.*)[/\\](.+), ){ $dira = $1; $filepat = $2; }else{ # dira can be *.exe, then let dira = pwd. $filepat = sh2pat($dira); $dira = '.'; } warn "Using: dira=$1, matching /$filepat/\n"; } foreach $dirb (@ARGV){ # remaining args are dirb. if( $dirb =~ m/^\$(.+)/ ){ # $VAR my $envvar = $1; my $dirlist = $ENV{$envvar} or die "ENV '$envvar' not set.\n"; push( @dirlist, split( /$listsep/, $dirlist ) ); next; } push( @dirlist, split( /$listsep/, $dirb ) ); } # default dirb is PWD. push( @dirlist, '.' ) unless @dirlist; # ========================================================== # find what to do. if( $do_similarnames ){ warn "# mode2. doing similarnames cmp\n"; }elsif( $do_soundex ){ warn "# mode3. doing soundex cmp\n"; }elsif( $do_binary ){ warn "# mode4. doing binary cmp\n"; }elsif( $do_unique ){ warn "# mode1. doing unique '$do_unique' filename\n"; }elsif( $do_table ){ warn "# mode1. doing table\n"; }else{ # die "nothing to do, see help -h\n"; $do_unique = '-u'; # Don't die, do the default. } my( %size, %time ); my( %similar_cachea, %similar_cacheb ); # Used by similarnames() my( %compared ); if( $do_unique ){ # mode 1, dira-dirb, dirb-dira, dira*dirb. foreach $dirb (@dirlist){ my $inodea = join('-',stat($dira)); my $inodeb = join('-',stat($dirb)); if( $inodea eq $inodeb ){ warn "# cmp_dirs($dira==$dirb), skipping.\n"; # next; } warn "# cmp_dirs(A=$dira,B=$dirb)\n"; cmp_dirs($dira,$dirb); } }elsif( $do_table ){ cmp_table( $dira, @dirlist ); }else{ # Compare files under different names, for same contents, # or sounding or similar names. foreach $dirb (@dirlist){ warn "# cmp_files(A=$dira,B=$dirb)\n"; cmp_files($dira,$dirb); } } # ========================================================== # Subroutines. sub cmp_table { # prints a table, one row per file, one column per dirlist # column has 'Y' if file is present in the dir. my @dirlist = @_; my %files; my %dircount; my %seeninodeb; my $dirnum = 0; my @dirlist2; foreach $dirb (@dirlist){ my $inode = join('-',stat($dirb)); if( $seeninodeb{ $inode }++ ){ warn "Duplicate dir:$dirb?\n"; next; } push @dirlist2, $dirb; } @dirlist = @dirlist2; foreach $dirb (@dirlist){ # print the header. print '|' x $dirnum,'+','-' x (20-$dirnum), ':',$dirb,"\n"; $dirnum++; } foreach $dirb (@dirlist){ my @ls_b = getdir( $dirb ); foreach $ls_b (@ls_b){ $files{$ls_b}{$dirb}++; $dircount{$ls_b}++; } } foreach $ls_b (keys %files){ my $line; next if $do_table_count == 1 and !$files{$ls_b}{$dirlist[0]}; # dira has it? next if $do_table_count && $do_table_count > $dircount{$ls_b}; foreach $dirb (@dirlist){ $line .= ($files{$ls_b}{$dirb})? 'Y':'-'; } $line .= sprintf(" %12s\n",$ls_b); print $line; } @dirlist = reverse(@dirlist); $dirnum = scalar( @dirlist); foreach $dirb (@dirlist){ # print the trailer, in reverse. $dirnum--; print '|' x $dirnum,'+','-' x (20-$dirnum), ':',$dirb,"\n"; } } sub cmp_files { my $dira = shift; my $dirb = shift; my @ls_a = getdir( $dira ); my @ls_b = getdir( $dirb ); for $filea (@ls_a){ my $fulla = "$dira$dirsep$filea"; next unless -f $fulla; my $sizea = $size{ $fulla } ||= -s $fulla; for $fileb (@ls_b){ my $fullb = "$dirb$dirsep$fileb"; next unless -f $fullb; my $sizeb = $size{ $fullb } ||= -s $fullb; next if $filea eq $fileb; # skip same names. next if $compared{ "$fullb\007$fulla" } ; $compared{ "$fulla\007$fullb" }++; # mode 3, compare filename by sound. if( $do_soundex ){ next unless soundex( $filea ) eq soundex( $fileb ); printf "%20s =soundex= %20s\n", $fulla, $fullb; next; } # mode 4, string match filenames. if( $do_similarnames ){ next unless similarnames( $filea, $fileb ); printf "%20s ~similar~ %20s\n", $fulla, $fullb; next; } # mode 2, compare binary contents, do if no option specified # if( $do_binary ){ next if $sizea != $sizeb; next if &cmpfileneq( $fulla, $fullb ); printf "%20s =bin= %20s\n", $fulla, $fullb; next; } } } sub getdir { my $dira = shift || '.'; unless( opendir(DIR, $dira ) ){ warn "Cannot opendir($dira)\n"; return 0; } my @lsa = readdir(DIR); closedir(DIR); my @ls_return; foreach (@lsa){ my $full = "$dira$dirsep$_"; next if $fileonly and ! -f $full; next if $_ =~ m/^\.\.?$/; $_ .= '/' if -d $full and $full !~ m,/$,; tr/A-Z/a-z/ if $nocase; next if $filepat and $_ !~ m/$filepat/o; s/$sre1/$sre2/ if $sre1; push( @ls_return, $_ ); } @ls_return = sort @ls_return; warn "dir($dira)=@ls_return\n" if $verbose; return @ls_return; } sub cmp_dirs { my( %xls_a, %xls_b ); my $dira = shift; my $dirb = shift; my @ls_a = getdir( $dira ); my @ls_b = getdir( $dirb ); grep( $xls_b{$_}++, @ls_b ); grep( $xls_a{$_}++, @ls_a ); my @aminusb = grep( !$xls_b{$_}, @ls_a ); my @bminusa = grep( !$xls_a{$_}, @ls_b ); my @ainterb = grep( $xls_a{$_}, @ls_b ); # compare binary contents of files, time consuming. my( @a_diff_b, @a_bin_b ); if( ($do_unique eq '-ua!=b') or ($do_unique eq '-ua==b') or ($do_unique eq '-ul') ) { foreach $file (@ainterb) { my $fulla = "$dira$dirsep$file"; my $fullb = "$dirb$dirsep$file"; my $sizea = $size{ $fulla } ||= -s $fulla; my $timea = $time{ $fulla } ||= filedate(-M _); # int(100*(-M _))/100; my $sizeb = $size{ $fullb } ||= -s $fullb; my $timeb = $time{ $fullb } ||= filedate(-M _); # int(100*(-M _))/100; my $samecontent; if( ($sizea != $sizeb ) || &cmpfileneq( $fulla, $fullb ) ){ $samecontent = '!=bin='; push( @a_diff_b, $file ); }else{ $samecontent = '==bin='; push( @a_bin_b, $file ); } if( $do_unique eq '-ul' ){ printf( "%20s: size:%6d %s %-6d, date:%s %s %s, %s\n", $file, $sizea, &signum( $sizea, $sizeb ), $sizeb, $timea, &signum( $timea, $timeb), $timeb, $samecontent ); } } } if( ($do_unique eq '-u') or ($do_unique eq '-ua-b') ){ print_list( $batchcmd, $dira, $dirb, "\n# A-B .. Only in $dira/ and not in $dirb/", @aminusb ); } if( ($do_unique eq '-u') or ($do_unique eq '-ub-a') ){ print_list( $batchcmd, $dira, $dirb, "\n# B-A .. Not in $dira/ but in $dirb/", @bminusa ); } if( ($do_unique eq '-u') or ($do_unique eq '-ua*b') ){ print_list( $batchcmd, $dira, $dirb, "\n# A*B .. Both $dira and $dirb", @ainterb ); } if( $do_unique eq '-ua==b' ){ print_list( $batchcmd, $dira, $dirb, "\n# A==B .. $dira/ ==bin= $dirb/", @a_bin_b ); } if( $do_unique eq '-ua!=b' ){ print_list( $batchcmd, $dira, $dirb, "\n# A!=B .. $dira/ !=bin= $dirb/", @a_diff_b ); } } sub similarnames { my( $filea, $fileb ) = @_; my( $shorta, $shortb ); if( defined $similar_cachea{ $filea } ){ $shorta = $similar_cachea{ $filea }; }else{ $shorta = lc( $filea ); $shorta =~ s/\..*//; # remove extensions. $shorta =~ s/[^a-zA-Z]//g # del non alphabets if $shorta =~ m/\w\w/; # $shorta =~ s/(.....).*/$1/; # shorten len to 5 char. $similar_cachea{ $filea } = $shorta; warn "## shorten($filea) => $shorta\n" if $verbose > 1; } if( defined $similar_cacheb{ $fileb } ){ $shortb = $similar_cacheb{ $fileb }; }else{ $shortb = lc( $fileb ); $shortb =~ s/\..*//; # del extension. $shortb =~ s/[^a-zA-Z]//g # del non alphabets. if $shortb =~ m/\w\w/; $similar_cacheb{ $fileb } = $shortb; warn "## shorten($fileb) => $shortb\n" if $verbose > 1; } return 0 unless length($shorta)>0 and length($shortb)>0; return ($shorta eq $shortb ) if length($shorta) == 1 ; return ($shorta eq $shortb ) if length($shortb) == 1 ; return 2 if index( $shorta, $shortb ) != -1; return 3 if index( $shortb, $shorta ) != -1; return 0; } # Check if two files are unequal - # Note that size comparison eliminates most combinations, sub cmpfileneq { my( $filea, $fileb ) = @_; my( $sizea, $sizeb, $buflena, $buflenb, $buffera, $bufferb ); return 0 if $filea eq $fileb; # same dir, same file? then ok. $sizea = $size{ $filea } || -s $filea; # Save it for later use. $sizeb = $size{ $fileb } || -s $fileb; # in $size{ } $size{ $filea } = $sizea; $size{ $fileb } = $sizeb; return -1 if $sizea != $sizeb; # size unequal; # Sizes are equal. # Try cached page. # if( $bufcac{ $filea } && $bufcac{ $fileb } and # ($bufcac{ $filea } ne $bufcac{ $fileb }) ) # { # return -2; # "byte differ"; # } if( !open( FA, "< $filea") ){ warn "Cannot read $filea.\n"; return -3; } if( !open( FB, "< $fileb") ){ warn "Cannot read $fileb.\n"; close( FA ); return -4; } binmode( FA ); binmode( FB ); $buflena = $buflenb = 4000; # pagesize. while( $buflena && $buflenb ){ $buflena = sysread( FA, $buffera, $buflena ); $buflenb = sysread( FB, $bufferb, $buflenb ); # $bufcac{ $filea } = $buflena; # cache page. # $bufcac{ $fileb } = $buflenb; if( $buflena != $buflenb ){ close( FA ); close( FB ); warn "read error $buflena != $buflenb\n"; return -5; } if( $buffera ne $bufferb ){ close( FA ); close( FB ); return -6; # byte differ } } close( FA ); close( FB ); return 0; # equal. } sub test_soundex { my( $word ) = @_; if( $word ){ my $value = &soundex( $word ); warn "soundex($word) => $value\n" if $verbose; return; } my %sound_test = ( # Knuth's names to demonstrate the Soundex algorithm. 'Euler' => 'E460', 'Gauss' => 'G200', 'Hilbert' => 'H416', 'Knuth' => 'K530', 'Lloyd' => 'L300', 'Lukasiewicz'=> 'L222', 'Ellery' => 'E460', 'Ghosh' => 'G200', 'Heilbronn' => 'H416', 'Kant' => 'K530', 'Ladd' => 'L300', 'Lissajous' => 'L222', ); foreach $word (keys %sound_test){ my $value = &soundex( $word ); my $realvalue = $sound_test{ $word }; warn "soundex($word) => $value vs Knuth's $realvalue\n" if $verbose or ($value ne $realvalue); } } # Verify this is ok with -t. # Knuth, Donald E. \"The Art of Computer Programming, Vol. 3: Sorting # and Searching\", Addison-Wesley (1973), pp. 391-392." sub soundex { my( $word ) = @_; my %sound_map = ( "B" => 1, "F" => 1, "P" => 1, "V" => 1, "C" => 2, "G" => 2, "J" => 2, "K" => 2, "Q" => 2, "S" => 2, "X" => 2, "Z" => 2, "D" => 3, "T" => 3, "L" => 4, "M" => 5, "N" => 5, "R" => 6, ); $word =~ tr/a-z/A-Z/; # $word =~ tr/A-Z//cd; # $word =~ tr/A-Z//s; my @word = split( /\W*/, $word ); my $key = shift( @word ); my $lastcode = $sound_map{ $key }; foreach $char (@word){ my $thiscode = $sound_map{ $char }; print STDERR "$key <- $char is $sound_map{$char} <- @word $word ", " $thiscode != $lastcode?\n" if $verbose > 1; if( ($thiscode != $lastcode) and ($thiscode > 0) ){ $key .= $thiscode; } $lastcode = $thiscode; last if length $key > 3 ; } $key .= "0" x (4-length($key)); } sub signum { my( $a, $b ) = @_; return '>' if( $a > $b ); return '<' if( $a < $b ); return '=' } sub filedate { my $filetime = shift; my( @date ) = localtime( time() - $filetime*24*3600 ); my $date = sprintf("%04d%02d%02d", # 19991231 for cmp # "%04d/%02d/%02d", # 1999-12-31. 1900+$date[5], 1+$date[4], $date[3] ); # my $date = sprintf("%04d%02d%02d%02d%02d", # 1900+$date[5], 1+$date[4], $date[3], $date[2], $date[1] ); return $date; } sub sh2pat { local( $pat ) = @_ ; # Enter shell pattern, and old value $pat =~ s/\./\227/g; # Just to protect the dots. $pat =~ s/\*/.*/g; # Shell * is Perl .* $pat =~ s/\?/./g; # Shell ? is Perl . $pat =~ s/\227/\\./g; # Shell . is Perl \. # $pat = "^$pat\$"; # Shell a*b is Perl /^a.*b$/ return "$pat"; } sub print_list { my( $batchcmd, $dira, $dirb, $title ) = (shift,shift,shift,shift); if( $batchcmd ){ $batchcmd =~ s/_/ /g; foreach $file (@_){ my $format = $batchcmd; $format =~ s,\bDIRA\b,$dira,g; $format =~ s,\bDIRB\b,$dirb,g; $format =~ s,\bFILE\b,$file,g; $format =~ s,[\\/]\.[\\/],$dirsep,g; # /./ is / print $format,"\n"; } # die "# Done\n"; exit 0; } print STDOUT $title, "\n"; printf "%15s %15s %15s %15s\n", shift, shift, shift, shift while @_; } # EOF