#!/usr/bin/perl @wordb= ( # patterns to split words. '[\W\s]+', # keep only alphabetic words (short words). '\s+' # break words only on spaces (long words). ); $wordb = $wordb[0]; # default. $USAGE = " SYNOPSIS: Count words occurring in files. USAGE: $0 WORDPATTERN FILES EXAMPLE $0 . *.txt OPTIONS: Word split patterns: -0 split on $wordb[0], default is $wordb. -1 split on $wordb[1] -s:pat split on user supplied pat -f print files where found. -C count 2 letter subsequences. -v verbose -modules pass1: get modules names, pass2: count them. -h -? this help. NOTES: Case sensitive count: so warns of typos like Int and int AUTHOR: GPL(C) http://www.cs.albany.edu/~mosh "; while( $_ = $ARGV[0], /^-/ ){ shift; last if m/^--$/; if( m/^-s:(.+)/ ){ $wordb = $1; next; }elsif( m/^-([01])/ ){ $wordb = $wordb[$1]; next; }elsif( m/^-C/ ){ print STDERR "Counting subsequences\n"; $countsub++; }elsif( m/^-f$/ ){ $printfile++; }elsif( m/^-modules$/ ){ $modules++; }elsif( m/^-v$/ ){ $verbose++; }elsif( m/^-[?h]/ ){ print $USAGE; exit; }else{ die "Invalid option $_, see -? for help.\n"; } } # ========================================================================= $pat = shift or die $USAGE; print STDERR "Counting words ~= /$pat/io, split( /$wordb/ ) \n"; die "$USAGE Need filenames.\n" unless @ARGV; my( @files ) = @ARGV; # PASS1 my %modulename; if( $modules ){ while(<>){ if( m/\bmodule\s+([A-Za-z0-9_]+)/ ){ $modulename{$1}++; warn "module=$1.\n" if $verbose > 1; } } } # PASS2 @ARGV = @files; while(<>){ $linecount++; @words = split( /$wordb/ ); foreach $word (grep( /$pat/io, @words )){ next if $modules and !defined $modulename{$word}; $wordcount{ $word }++; # my $size, $head, $subword. if( $countsub ){ for( $size=2; $size < length( $word ); $size++ ){ for( $head = 0; $head + $size < length( $word ); $head++){ $subword = substr( $word, $head, $size ); $countsub{ $subword }++; print STDERR "$subword : $countsub{ $subword }\n" if $verbose; } last if $size > 2; # counting only two letter seq now. } } if( $printfile ){ $file = $ARGV; $wordfile{ $word } .= "$file, " unless $wordfilecount{ "$word\199$file" }++; } } if( eof ){ close(ARGV); print STDERR "processed $linecount lines in $ARGV\n"; $linecount = 0; } } if( $countsub ){ printf "SEQ Counts\n"; foreach $word (sort seqsort keys %countsub){ # Shorter seq must occur more often to be of use. # next if ($countsub{ $word } * length( $word ) ) < $linecount; # next if $countsub{ $word } < 5; # last if length( $word ) > 3; printf( "<%s> %d %d\n", $word, length( $word ), $countsub{ $word } ); } }else{ foreach $word (sort mysort keys %wordcount){ if( $printfile ){ printf( "%-20s %4d in %-30s\n", $word, $wordcount{ $word }, $wordfile{ $word } ); }else{ printf( "%4d %s\n", $wordcount{ $word }, $word ); } } } sub seqsort { length( $a ) cmp length( $b ); } sub mysort { # Sort case-insensitive my( $aa, $bb ); ($aa = $a) =~ tr/A-Z/a-z/; ($bb = $b) =~ tr/A-Z/a-z/; warn "* Warn dup $a ~ $b \n" if $aa eq $bb; $aa cmp $bb; } # EOF