#!/usr/bin/perl
#
# ls-zip - sort, filter and de-duplicate archive listings.
#
# Reads the textual output of `unzip -l`, `unzip -lv` or `tar -tvf`
# (from the files named on the command line, or STDIN), records one entry
# per archived file, and then either
#   * prints the entries as they are parsed        (-list),
#   * writes `zip -d` commands for duplicate files (-nodup),
#   * writes `rename` commands for the archives    (-rad), or
#   * prints the entries sorted by the chosen key, optionally formatted
#     with a user supplied template                (-s:..., -fmt:...).
#
# Every entry is keyed by "archive:path"; entries parsed from tar listings
# carry no archive prefix, so their key is the bare path.

use strict;
use warnings;

my $USAGE = '
SYNOPSIS: Sort archive and file listings.
AUTHOR: (C) Mohsin Ahmed http://www.cs.albany.edu/~mosh
USAGE:
    > unzip -lv \\*.zip > LIST   .. Make a LISTing file from zip.
    > tar -tvf file.tar >> LIST  .. or from tar.
    > ls-zip [options] LIST      .. Sort and Search LIST
    > sh tmp.bat                 .. Run script to del dup files.
    Same archive listed twice will warn and die.
OPTIONS:
    -list        Plain list: date size archive:file
    -s:ps        sort by path and size
    -s:dp        sort by date and path (path is dir/file.ext).
    -s:pd        sort by path and date
    -s:ad        sort by archive, path and date
    -s:sp        sort by size and path
    -s:crc       sort by crc, unzip -lv gives crc.
    -/re         Show only matching paths.
    -!re         Skip matching paths.
    -rad         Rename archives by date - latest date of file.
    -nodup       Generate tmp.bat to del dup files in zipfiles,
                 Read Edit and Run tmp.bat, no guarantees.
                 Dup if same crc32 and size and name.
                 Keeps only first of the dups.
    -fmt:format  Print each line in the format specified.
                 ARCHIVE,PATH,FILE,DATE,SIZE are substituted for actuals,
                 __ by a tab. Used to extract/delete selective files.
    -tfile:file  Specify a file to save output, default is tmp.bat
    -v           Verbose
EXAMPLES:
    c:/> unzip -lv *.zip > LIST        ... Make LISTing file.
    c:/> ls-zip -s:sp -/zip -!\s LIST  ... Sort listing by size and path,
                                           and skip paths with spaces.
    c:/> ls-zip -nodup LIST            ... generate tmp.bat
    c:/> vi tmp.bat                    ... Make sure tmp.bat is ok.
    c:/> tmp                           ... Does zip -d to delete duplicates.
';

@ARGV or die $USAGE;

# ---------------------------------------------------------------- Options

my $sorttype  = '';    # -s:xx    sort key selector
my $filterin  = '';    # -/re     keep only paths matching this regex
my $filterout = '';    # -!re     drop paths matching this regex
my ( $tfile, $fmt, $list, $do_rad, $nodup, $verbose );
my $need_tfile = 0;    # set by options whose output is a script file

# Every pattern is anchored at the start of the argument so that the text
# of one option (for example the regex given to -/re) can never be mistaken
# for another option such as -list or -v.
OPTION:
while ( @ARGV and $ARGV[0] =~ /^-/ ) {
    my $opt = shift @ARGV;

    if ( $opt =~ m/^--$/ ) {
        last OPTION;
    }
    elsif ( $opt =~ m/^-tfile:(.*)/ ) {
        $tfile = $1 || 'tmp.bat';
    }
    elsif ( $opt =~ m/^-fmt:(.+)/ ) {
        $fmt        = $1;
        $need_tfile = 1;
    }
    elsif ( $opt =~ m/^-list/ ) {
        $list++;
    }
    elsif ( $opt =~ m/^-!(.+)/ ) {
        $filterout = $1;
        warn "filterout is $filterout\n";
    }
    elsif ( $opt =~ m{^-/(.+)} ) {
        $filterin = $1;
        warn "filterin is $filterin\n";
    }
    elsif ( $opt =~ m/^-rad/ ) {
        $do_rad     = 1;
        $need_tfile = 1;
    }
    elsif ( $opt =~ m/^-nodup/ ) {
        $nodup      = 1;
        $need_tfile = 1;
    }
    elsif ( $opt =~ m/^-s:(.+)/ ) {
        $sorttype = $1;
        warn "Sorting by $sorttype\n";
    }
    elsif ( $opt =~ m/^-v/ ) {
        $verbose++;
    }
    else {
        die "$USAGE Unknown option '$opt'.\n";
    }
}

# The output file is opened exactly once, after all options are known, so
# an explicit -tfile:name always wins over the tmp.bat default and no file
# is truncated without being written to.
my $tfh;
$tfile = 'tmp.bat' if $need_tfile and not defined $tfile;
if ( defined $tfile ) {
    open( $tfh, '>', $tfile ) or die "Cannot write $tfile $!\n";
    warn "Writing file $tfile\n";
}

# ------------------------------------------------------------ Parsed data

# Key for all of them is "archive:dira/base.extn" (see parselines).
my (
    %archive,                    # lower-cased archive names already seen
    %dira, %base, %extn, %file,  # directory, base name, extension, file name
    %size, %date,
    %basecount, %sizecount,      # occurrence counts per base name / size
    %archivedin, %filepath,      # key split at its first colon, for sorting
    %crc,
    %invcrc,                     # "crc:size" => [ keys ], for duplicates
);

parselines();

if ($list) {
    # parselines did the printing.
    # NOTE(review): ending a successful run with die gives a non-zero exit
    # status; kept because callers may rely on it.
    die "\n";
}

# ------------------------------------------------------------------ -nodup

if ($nodup) {
    my $dupcount = 0;

    # Keys are visited in sorted order so the generated script is
    # reproducible from run to run.
    ICRC:
    foreach my $crcsize ( sort keys %invcrc ) {
        my ( $crc, $size ) = split /:/, $crcsize;
        my $members = $invcrc{$crcsize};

        # Without a CRC (plain `unzip -l` or tar listings) equal size alone
        # would make two files "duplicates"; never emit deletes for those.
        next ICRC unless defined $crc and length $crc;
        next ICRC unless $size > 0;
        next ICRC unless @{$members} > 1;

        # NOTE(review): the usage text says duplicates must also share a
        # name and that the first one is kept; the code groups on crc and
        # size only and keeps the last entry in sorted order.
        my ($file) = ( $members->[-1] =~ m{([^\\/:]+)$} );
        $file = '' unless defined $file;

        ++$dupcount;
        emit( sprintf "\nrem %03d crc=%s, size=%s, file=%s\n",
            $dupcount, $crc, $size, $file );

        my @val = sort @{$members};
        while (@val) {
            my ( $zip, $path ) = split /:/, shift(@val), 2;
            $path = '' unless defined $path;
            $path =~ s,\./,?/,;    # bug: zip -d ./x => zip -d ?/x
            my $out = " zip -d $zip $path\n";
            $out = 'rem ' . $out unless @val;    # Keep last.
            emit($out);
        }
    }

    if ($tfh) {
        close $tfh or die "Cannot close $tfile $!\n";
        print "Wrote $tfile\n";
    }
    die "Found $dupcount dup files.\n";
}

# ----------------------------------------------------------- Sorted output

# Comparison routines, one per -s: selector. Each ends with a comparison of
# the keys themselves so that ties never depend on hash ordering.
my %compare_for = (

    # archive, then path, then newest first.
    ad => sub {
               $archivedin{$a} cmp $archivedin{$b}
            || $base{$a} cmp $base{$b}
            || $extn{$a} cmp $extn{$b}
            || $dira{$a} cmp $dira{$b}
            || $date{$b} <=> $date{$a}
            || $a cmp $b;
    },

    # newest first, then by name.
    dp => sub {
               $date{$b} cmp $date{$a}
            || $base{$a} cmp $base{$b}
            || $extn{$a} cmp $extn{$b}
            || $dira{$a} cmp $dira{$b}
            || $a cmp $b;
    },

    # path, then newest first.
    pd => sub {
               $base{$a} cmp $base{$b}
            || $extn{$a} cmp $extn{$b}
            || $dira{$a} cmp $dira{$b}
            || $date{$b} cmp $date{$a}
            || $a cmp $b;
    },

    # largest first, then by name.
    sp => sub {
               $size{$b} <=> $size{$a}
            || $base{$a} cmp $base{$b}
            || $extn{$a} cmp $extn{$b}
            || $dira{$a} cmp $dira{$b}
            || $a cmp $b;
    },

    # crc, date, name, size, dir.
    crc => sub {
               $crc{$a} cmp $crc{$b}
            || $date{$a} cmp $date{$b}
            || $base{$a} cmp $base{$b}
            || $extn{$a} cmp $extn{$b}
            || $size{$b} <=> $size{$a}
            || $dira{$a} cmp $dira{$b}
            || $a cmp $b;
    },

    # default: name, then largest first.
    ps => sub {
               $base{$a} cmp $base{$b}
            || $extn{$a} cmp $extn{$b}
            || $size{$b} <=> $size{$a}
            || $dira{$a} cmp $dira{$b}
            || $a cmp $b;
    },
);

# -rad needs each archive's newest file to come first, because the first
# entry seen for an archive supplies the date used in its new name.
my $compare_rad = sub {
           $archivedin{$a} cmp $archivedin{$b}
        || $date{$b} <=> $date{$a}
        || $a cmp $b;
};

my $compare
    = $do_rad                         ? $compare_rad
    : exists $compare_for{$sorttype}  ? $compare_for{$sorttype}
    :                                   $compare_for{ps};

# The -fmt template with "__" already turned into tabs (loop invariant).
my $template = $fmt;
$template =~ s/__/\t/g if defined $template;

my $lastarchive = '';    # archive of the previous entry, for -rad

ITEM:
foreach my $key ( sort $compare keys %size ) {

    # Example: if $key is back/etc.zip:/etc/vim.rc
    # then $archive = back/etc.zip, $path = /etc/vim.rc,
    # $file = vim.rc, base = vim, extn = rc (no dot).
    my $crc  = $crc{$key};
    my $file = $file{$key};
    my $size = $size{$key};
    my $date = $date{$key};
    my ( $archive, $path ) = split /:/, $key, 2;
    $path = '' unless defined $path;    # tar entries have no archive prefix

    my $out = sprintf " %-12s %8s %12s %s\n",    # Default.
        $file, $size, $date, $key;

    if ($do_rad) {
        if ( $archive eq $lastarchive ) {
            $out = 'rem ' . $out;
        }
        else {
            $out = "rename $archive $date-x.zip\n";
        }
    }
    elsif ( defined $template ) {
        my %value_of = (
            ARCHIVE => $archive,
            PATH    => $path,
            FILE    => $file,
            DATE    => $date,
            SIZE    => $size,
            CRC     => $crc,
        );

        # One pass, so text that was substituted in (say an archive called
        # PATH) is never itself treated as a placeholder.
        ( $out = $template )
            =~ s/\b(ARCHIVE|PATH|FILE|DATE|SIZE|CRC)\b/$value_of{$1}/g;
        $out .= "\n";
    }

    emit($out);
    $lastarchive = $archive;
}

if ($tfh) {
    close $tfh or die "Cannot close $tfile $!\n";
    warn "Wrote file $tfile\n";
}

# End of Main ===================================================

# emit( $text )
# Write $text to the output file when one is open, and to STDOUT when there
# is no output file or -v was given.
sub emit {
    my ($text) = @_;
    if ($tfh) {
        print {$tfh} $text or die "Cannot write $tfile $!\n";
    }
    print $text if !$tfh or $verbose;
    return;
}

# sortable_date( $first, $second, $third, $time )
# Build a "year-month-day-time" string from the three dash separated date
# fields of an unzip listing. A four digit first field is taken to be
# yyyy-mm-dd (as printed by some unzip builds), anything else mm-dd-yy.
sub sortable_date {
    my ( $first, $second, $third, $time ) = @_;
    return length($first) == 4
        ? "$first-$second-$third-$time"
        : "$third-$first-$second-$time";
}

# parselines()
# Read every listing line from <> and fill the file-level hashes (%size,
# %date, %base, %extn, %dira, %file, %crc, %archivedin, %filepath,
# %basecount, %sizecount, %invcrc), honouring -/re and -!re. Prints each
# accepted entry when -list or -v is in effect. Dies if the same archive
# name (case-insensitively) is announced twice.
sub parselines {
    my $archive = '';    # archive named by the latest "Archive:" line

    my $keep_re = length $filterin  ? qr/$filterin/  : undef;
    my $skip_re = length $filterout ? qr/$filterout/ : undef;

    LINELISTING:
    while ( my $line = <> ) {
        $line =~ s/\r\n\z/\n/;    # tolerate DOS line endings

        if ( $line =~ m/^Archive:\s+(.+)$/ ) {
            # unzip -l => Archive: ../tar/4gl.zip
            $archive = $1;
            $archive =~ s{.*[/\\]}{};    # Get rid of pathname (unix or dos).
            if ( $archive{ lc($archive) }++ ) {
                die "Stopped, $archive appears again on line $.\n";
            }
            print STDERR "Scanning archive: $archive\n" unless $list;
            next LINELISTING;
        }

        my $prefix = '';    # "archive:" for zip entries, empty for tar
        my $crc    = '';
        my ( $member, $size, $date );

        if (
            # unzip -l
            #   Size  Mo-Da-Ye  Ho:Mi  Dir/File
            #   5267  03-13-93  10:18  4gl/read-me.bat
            # Names ending in a slash (directories) are not matched.
            $line =~ m{^\s*
                (\d+)\s+                  # size
                (\d+)-(\d+)-(\d+)\s+      # date
                (\d+:\d+)\s+              # time
                (\S*[^/\s])\n             # name
            }x
            )
        {
            $prefix = "$archive:";
            $size   = $1;
            $date   = sortable_date( $2, $3, $4, $5 );
            $member = $6;
        }
        elsif (
            # unzip -lv
            # Length  Method  Size Ratio   Date    Time  CRC-32    Name
            #   4376  Defl:N  1756  60%  08-20-96  21:02 2816084a  qu/00.TXT
            $line =~ m{^\s*
                (\d+)\s+                  # length
                \S+\s+                    # method
                \d+\s+                    # compressed size
                \d+\%\s+                  # ratio
                (\d+)-(\d+)-(\d+)\s+      # date
                (\d+:\d+)\s+              # time
                (\S+)\s+                  # crc-32
                (\S*[^/\s])               # name
                \n
            }x
            )
        {
            $prefix = "$archive:";
            $size   = $1;
            $date   = sortable_date( $2, $3, $4, $5 );
            $crc    = $6;
            $member = $7;
        }
        elsif (
            # tar -tvf
            #   -rw-rw-rw- 0/0   2417 1998-09-08 09:41 mbin/zipdup.pl
            # zz: this tar output is not parsed.
            #   -rw-rw-rw- 0/0   2417 Sep 08 09:41 1998 mbin/zipdup.pl
            $line =~ m{
                (\S)(\S+)\s               # type letter, permissions
                \d+/\d+\s+                # numeric owner/group
                (\d+)\s+                  # size
                (\d\d\d\d-\d\d-\d\d\s+\d\d:\d\d)\s+
                (\S+)                     # name
            }x
            )
        {
            $size   = $3;
            $date   = $4;
            $member = $5;
        }
        else {
            next LINELISTING;    # headers, rulers, totals, blank lines
        }

        $member =~ s{\\}{/}g;     # dos\ to dos/.
        $date   =~ s/[-:\s]//g;   # digits only: numerically sortable.

        my $path = $prefix . $member;    # unique key: 'archive:path'

        next LINELISTING if $keep_re and $path !~ $keep_re;
        next LINELISTING if $skip_re and $path =~ $skip_re;

        # Split the member into directory, base name and extension. Only
        # the member is examined, so a file at the root of an archive does
        # not drag the archive name into its base name.
        ( my $dir = $member ) =~ s{[^/]*\z}{};
        my $file = lc substr( $member, length $dir );
        my $dira = lc( $prefix . $dir );

        my ( $base, $extn ) = ( $file, '' );
        if ( $file =~ m/\A(.*)\.([^.]*)\z/s ) {
            ( $base, $extn ) = ( $1, $2 );    # split at the LAST dot
        }

        # Save the parsed info.
        $size{$path} = $size;
        $date{$path} = $date;
        $base{$path} = $base;
        $extn{$path} = $extn;
        $dira{$path} = $dira;
        $file{$path} = $file;
        $crc{$path}  = $crc;
        $basecount{$base}++;
        $sizecount{$size}++;

        my ( $container, $inner ) = split /:/, $path, 2;
        $archivedin{$path} = $container;
        $filepath{$path}   = defined $inner ? $inner : '';

        printf "date=%10s, size=%05d, archivedin=%08s, filepath=%s\n",
            $date, $size, $archivedin{$path}, $filepath{$path}
            if $list or $verbose;

        # incase two files have same crc, include size also
        push @{ $invcrc{ $crc . ':' . $size } }, $path;
    }
    return;
}