#!/usr/bin/perl
#
# index - build an index of words (default) or function names (-f) found
# in a set of text files, optionally with caller/callee relations (-d).
#
# Output (STDOUT): one line per indexed word:
#     <word> <occurrence count> , file1, file2 ...
# followed, under -d, by a call tree for every function seen.
# Diagnostics and progress messages go to STDERR.

use strict;
use warnings;

# First the Defaults.
my $minwordlength = 5;      # -mNN: words shorter than this are skipped.
my $wordpat       = ".";    # -pPattern: only words matching this are indexed.
my $exclpat;                # -ePattern: words matching this are skipped.
my $totalfiles    = 0;      # Files actually indexed.
my $totallines    = 0;      # Lines read over all files.

# Option flags (undef/0 = off).
my ( $nocase, $index_functions, $depend, $no_comments, $verbose, $filelist );

# Index tables.
my %wordcount;        # word            -> total occurrences.
my %wordfile;         # word            -> ", file1, file2 ..." (first-seen order).
my %wordfilecount;    # word -> file    -> occurrences of word in that file.
my %seenfile;         # file            -> times we were asked to index it.

# Call-graph tables (filled only under -d).
# Eg. main(){ printf(); }
# $call_to{ "main()" }        = "printf()\007";
# $call_tonum{ "main()" }     = #subroutines main called.
# $call_from{ "printf()" }    = "main()\007";
# $call_fromnum{ "printf()" } = #places printf() was called.
my ( %call_to, %call_tonum, %call_from, %call_fromnum );
my %func_occur;               # function -> times its name was seen.
my %main_defn;                # function -> "file:line" of its guessed definition.
my %print_calling_printed;    # function -> already expanded in the call tree?

my $USAGE = <<"END_USAGE";

USAGE: $0 OPTIONS files
       $0 -f -pPattern -ePattern -mNN -n -v -l:FL files
SYNOPSIS: Index words/Functions in files.
OPTIONS:
    -l:FL      File FL contains names of files to index,
               Put one filename on each line of FL.
               - Newlines and spaces in FL are ignored,
               - Lines starting with # in FL are ignored.
    -f         Generate index of functions, default is keywords.
    -d         Generate functional Dependencues.
    -pPattern. Only index words matching Pattern (optional), else all.
    -ePattern  Exclude these patterns.
    -i         Make patterns options after this case insensitive.
    -mNN       List only words longer than NN chars, default is $minwordlength
               See the source, it filters out many common words.
    -n         No comments, // in C, ; in asm, # in perl, rem in bat.
               Doesn't handle multi line comments of C.
    -v         Verbose.
    -?         This help.
HISTORY: Was p-special-wordcount, key-index, p-index
AUTHOR: (C) GNU GPL, Mohsin Ahmed Mosh\@cs.albany.edu,
EXAMPLES:
    . index *.txt                    ... Index words in all txt files.
    . index *.c *.h > index.txt      ... Indexing words in c and h files.
    . index -f *.c                   ... List only the functions.
    . index -f -pdll *.c             ... List only the functions matching /dll/.
    . index -eWSA -i -pdll *         ... Index words !~ /exlude/ && m/dll/i in all files.
    . index -v -d *.c                ... Print caller-callee relations in *.c
    . find . -name * -print > x1; vi x1, index -l:x1
                                     ... Index whole tree listed in x1.
END_USAGE

# Process options.  Options are consumed from the front of @ARGV until the
# first argument not starting with '-', or an explicit '--'.
while ( @ARGV && $ARGV[0] =~ /^-/ ) {
    my $opt = shift @ARGV;
    last if $opt =~ /^--$/;

    if ( $opt =~ /^-p(.+)/ ) {
        $wordpat = $1;
        $wordpat = '(?i)' . $wordpat if $nocase;
        warn "-words =~ /$wordpat/\n";
    }
    elsif ( $opt =~ /^-e(.+)/ ) {
        $exclpat = $1;
        $exclpat = '(?i)' . $exclpat if $nocase;
        warn "-words !~ /$exclpat/\n";
    }
    elsif ( $opt =~ /^-m(\d+)$/ ) {
        $minwordlength = $1;
        warn "- length( words ) >= $minwordlength\n";
    }
    elsif ( $opt =~ /^-f/ ) {
        $index_functions++;
        warn "-function names\n";
    }
    elsif ( $opt =~ /^-d/ ) {
        # Dependencies need function indexing: queue an implicit -f.
        unshift( @ARGV, '-f' );
        $depend++;
        warn "-depend\n";
    }
    elsif ( $opt =~ /^-i$/ ) {
        $nocase++;
        warn "-case insensitive only for options that follow.\n";
    }
    elsif ( $opt =~ /^-n/ ) {
        $no_comments++;
        warn "-no_comments\n";
    }
    elsif ( $opt =~ /^-v/ ) {
        $verbose++;
        warn "-verbose\n";
    }
    elsif ( $opt =~ /^-l:(.+)$/ ) {
        $filelist = $1;
        warn "-filelist = $filelist\n";
        die "filelist $filelist not readable\n" unless -r $filelist;
    }
    elsif ( $opt =~ /^-[?h]/ ) {
        print $USAGE;
        exit;
    }
    else {
        die "Unknown option $opt\n";
    }
}

# Process files: names from the -l: list first, then the command line.
my @files;
@files = get_list($filelist) if $filelist;
push @files, @ARGV;

foreach my $file (@files) {
    print STDERR "Indexing File: ($file)\n";
    process_file($file);
}

unless ($totalfiles) {
    die "No files to index? See -? for help.\n";
}

# Print stats.
foreach my $word ( sort keys %wordcount ) {
    # One line per word, easier to grep later.
    my $thisfiles = $wordfile{$word};
    printf "%25s %3d %s\n", $word, $wordcount{$word}, $thisfiles;
}

if ($depend) {
    print_call_info();
}

print STDERR "Processed $totalfiles files $totallines lines.\n";

# End of main program.

# Sort comparator ($a, $b are the implicit sort args).  Orders functions by
# how many times they were called, fewest first, so that never-called
# functions such as main() come out at the top.  Counts are compared
# numerically (a missing count is 0); ties are broken by name so the
# output order is reproducible.
sub mysort {
    return ( ( $call_fromnum{$a} || 0 ) <=> ( $call_fromnum{$b} || 0 ) )
        || ( $a cmp $b );
}

# Print the call tree of every function seen, least-called first.
sub print_call_info {
    foreach my $word ( sort mysort keys %func_occur ) {
        print_calling( $word, 0 );
    }
    return;
}

# Print one function and, recursively, the functions it calls.
#   $main  - function name as stored in the tables, e.g. "main()".
#   $level - nesting depth; controls the "-->" indentation.
# A function's callees are expanded only the first time it is printed;
# later mentions print "[see before]".  Recursion stops past depth 10.
sub print_calling {
    my ( $main, $level ) = @_;

    print "-->" x $level, " $main";
    if ( $main_defn{$main} ) {
        print " [in $main_defn{ $main }]";
    }

    unless ( $call_tonum{$main} ) {
        # Leaf: calls nothing we know about.
        print ".\n";
        return;
    }
    if ( $print_calling_printed{$main}++ ) {
        print " [see before].\n";
        return;
    }
    print "--->\n";

    if ( $level > 10 ) {
        warn " nested > $level, breaking recursive loop?\n";
        return;
    }

    foreach my $callee ( split /\007/, $call_to{$main} ) {
        print_calling( $callee, $level + 1 );
    }
    return;
}

# Input: name of file containing names of files to index.
# Returns the list of filenames found.  '#' starts a comment, surrounding
# whitespace is ignored, and several whitespace-separated names may share
# a line.  Dies if the list cannot be opened.
sub get_list {
    my ($listname) = @_;
    my @names;

    open( my $list_fh, '<', $listname ) or die "Cannot read $listname\n";
    print STDERR "Processing filelist: $listname.\n";

    while ( my $line = <$list_fh> ) {
        print STDERR "Processing ($line)\n" if $verbose;
        $line =~ s/#.*//;     # ignore commented filenames..
        $line =~ s/\n//;      # ignore new lines in filelist.
        $line =~ s/^\s+//;
        $line =~ s/\s+$//;
        # can have multiple filenames.
        push @names, split( /\s+/, $line );
    }
    close($list_fh);
    return @names;
}

# Index one file into the tables above.
# Returns the number of words indexed from the file, or -1 if the file was
# skipped (not a text file, already indexed, or unreadable).
sub process_file {
    my ($file) = @_;
    my $currentfunction = '';    # Last function whose definition we guessed.
    my $wordsinthisfile = 0;

    if ( !-T $file ) {
        warn "Skipping non text file: $file?.\n";
        return -1;
    }
    if ( $seenfile{$file}++ ) {
        warn "Already seen file: $file.";
        return -1;
    }
    my $in_fh;
    unless ( open( $in_fh, '<', $file ) ) {
        warn "Cannot open for reading, skipping file: $file.\n";
        return -1;
    }

    $totalfiles++;

    while ( my $line = <$in_fh> ) {
        $totallines++;
        my $code = $line;    # Working copy; $line keeps the raw text.

        if ( $no_comments || $depend ) {
            # Remove comments, ignoring strings.
            # NOTE: the .[ch] test is unanchored, so it matches any name
            # containing ".c" or ".h", not only *.c / *.h files.
            if ( $file =~ /\.[ch]/ )  { $code =~ s,//.*,,; }
            if ( $file =~ /\.asm$/ )  { $code =~ s,;.*,,; }
            # if( $file =~ /\.\(bat\|cmd\)$/ ){ s,rem.*,,; }
            # if( $file =~ /\.pl$/ ){ s,#.*,,; }
        }

        my @words =
            $index_functions
          ? split( /[^\w\(]+/, $code )    # split for functions.
          : split( /[\W\s]+/,  $code );   # split for keywords.

        foreach my $word (@words) {
            my $wordlength = length($word);

            # Filter out common words.
            next if $wordlength < $minwordlength;
            next if $word !~ m/^\w/;
            next unless $word =~ m/$wordpat/;

            # Exclude patterns?
            next if ( defined($exclpat) && $word =~ m/$exclpat/o );

            if ($index_functions) {
                # Keep only tokens of the form name( ..., Eg. abc(..);
                next unless $word =~ s/^(\w.*\().*/$1/;

                # Found a function name on this line.
                $word .= ')';

                if ($depend) {
                    next if $code =~ m/^\#/;    # No macros.
                    $func_occur{$word}++;

                    # Try to guess if this is a defn of the function.
                    # ie. this line begins with a alphabet character.
                    if ( ( $file !~ m/\.h$/i ) && ( $line =~ m/^\w/ ) ) {
                        $currentfunction = $word;
                        $main_defn{$word} = "$file:$.";
                    }
                    else {
                        # A call inside $currentfunction.
                        # Indented lines start with space.
                        next unless $code =~ m/^\s/;
                        warn "What function: $file:$.:$line\n"
                          unless $currentfunction;
                        $call_to{$currentfunction} .= "$word\007";
                        $call_tonum{$currentfunction}++;
                        $call_from{$word} .= "$currentfunction\007";
                        $call_fromnum{$word}++;
                    }
                }
            }
            else {
                # NOTE: $wordpat always has a default, so this heuristic
                # (mixedCase or long words only) never rejects anything.
                next
                  unless ( defined($wordpat)
                    || ( $word =~ m/[_a-z][_A-Z]/ )
                    || ( $wordlength > 8 ) );
            }

            # print STDERR $file, " has ", $word, "\n" if $verbose > 2;
            $wordsinthisfile++;
            $wordcount{$word}++;
            # List each file only once per word.
            $wordfile{$word} .= ", $file"
              unless $wordfilecount{$word}{$file}++;
        }    # end for each word.
    }    # end while file.

    close($in_fh);
    return $wordsinthisfile;
}    # end sub.

# EOF