#!/usr/local/bin/perl
# bibexec.pl
# v1.0 Sept 1 1995  - handles SpatBib format files
# v1.1 Sept 15 1995 - recursively search files in TODO directory
# v1.2 Oct 1 1995   - added AND/OR search for bibtex and text entries,
#                     also MAX entries, Keyword info
# v1.3 Nov 3 1995   - added keyword equivalence feedback
#
# Copyright amit@cs.albany.edu (Amit Mukerjee) 1995
#
# Using ReadParse from cgi-lib.pl - Copyright 1993 Steven E. Brenner
# and many parts from bibsearch.pl (c) 1994 Andy Wood (amw@cs.bham.ac.uk)
# http://www.cs.bham.ac.uk/~amw/agents/bibtex/search.html
#
# Main mods by AM:
#   Multiple file formats handled
#   Added AND/OR concatenation, Case
#   Output filtering options
#   Modified bibtex Search function for AND/OR and Case Handling
#
# This script will respond to a call from a HTML form with the following
# fields:
#
# 'header'    - url filename of the header template file.
# 'footer'    - url filename of the footer template file.
# 'term'      - the search term(s), separated by single spaces.
# 'andor'     - 'OR' to match any term; any other value requires all terms.
# 'field'     - the field(s) to search in.
# 'Output'    - any of 'NoComments', 'NoKeywords', or 'Quiet'.
# 'max'       - Maximum number of entries to be printed
# 'mode'      - one of 'substr', 'word', or 'regexp'.
# 'filters'   - ignore case? (a value containing 'Ignore')
# 'files'     - list of url filenames to be searched separated by `\0'
# 'keyreview' - 'yes' to print the keyword review page instead of searching.
#
# It copies the specified header template file to stdout. This file should
# contain valid html and can be used to put a title on the search results.
# Any instances of "$term" will be replaced with the search term text, and
# any instances of "$mode" with the search mode.
#
# Then it searches the list of files, using the search term, in the manner
# specified by the search mode. The list of files can be a bib
# database as per the document ~amit/bib/format.info, or in BibTeX format,
# or an HTML or plaintext file, which is searched without fields.
# Any entries in the database that match the search will be outputted with
# a HTML title.
#
# When it has scanned all the files it will then copy the specified footer
# template to stdout, again replacing "$term" and "$mode".
#
# A url filename is of the form "~user/path/file" or "ftp:/path/file" and
# these only work on the local filesystem - see GetPath for more details.

#$DEBUG=1;
#$SrchDEBUG=1;

# ****
#
# Constants
#
# ****

# For use in GetPath
# USERHTMLDIR  - name of directory for user supplied pages
# LOCALFTPPATH - local path for public ftp site

$EOL_REPL="\0\0";                  # Used to fuse multiple lines together
$USERHTMLDIR = "/public_html/";
$LOCALFTPPATH = "/scratch/ftp";
#$MAXENTRIES = 200;
$MAXKEYS = 45;

# ****
#
# ReadParse
#
# Reads in GET or POST data, converts it to unescaped text, and puts
# one key=value in each member of the list "@in"
# Also creates key/value pairs in %in, using '\0' to separate multiple
# selections
#
# If a variable-glob parameter (e.g., *cgi_input) is passed to ReadParse,
# information is stored there, rather than in $in, @in, and %in.
#
# From cgi-lib.pl - Copyright 1993/1994 Steven E. Brenner
# http://www.bio.cam.ac.uk/web
#
# ****

sub ReadParse
{
    local (*in) = @_ if @_;
    my ( $key, $val );

    # Read in text
    if ( $ENV{'REQUEST_METHOD'} eq "GET" ) {
        $in = $ENV{'QUERY_STRING'};
    }
    elsif ( $ENV{'REQUEST_METHOD'} eq "POST" ) {
        read( STDIN, $in, $ENV{'CONTENT_LENGTH'} );
    }

    @in = split( /&/, $in );

    foreach my $i ( 0 .. $#in ) {
        # Convert plus's to spaces
        $in[$i] =~ s/\+/ /g;

        # Split into key and value.
        ( $key, $val ) = split( /=/, $in[$i], 2 );   # splits on the first =.

        # Convert %XX from hex numbers to alphanumeric
        $key =~ s/%(..)/pack("c",hex($1))/ge;
        $val =~ s/%(..)/pack("c",hex($1))/ge;

        # Associate key and value
        $in{$key} .= "\0" if ( defined( $in{$key} ) ); # \0 is the multiple separator
        $in{$key} .= $val;
    }

    return 1; # just for fun
}

# ****
#
# GetPath
#
# Converts its argument from a partial path ("~amw/file" or "ftp:/path/file")
# into its full equivalent ("/home/pg/amw/public_html/file" or
# "/scratch/ftp/path/file") ensuring that we don't inadvertently allow
# external users full access to the file system. We also remove any instances
# of ".." in the path.
#
# Returns "" for anything else: a name in neither form, "~user" without a
# path, or a user that getpwnam does not know.
#
# ****

sub GetPath
{
    my( $filename ) = $_[ 0 ];

    if ( $filename =~ /^~([^\/]*)\/(.*)/ ) {
        my( $name, $file ) = ( $1, $2 );
        my( @entry ) = getpwnam( $name );
        # The home directory is always element 7; counting back from the end
        # of the list picks the wrong field where getpwnam returns 10 fields.
        $filename = @entry ? $entry[ 7 ].$USERHTMLDIR.$file : "";
    }
    elsif ( $filename =~ /^ftp:(.*)/ ) {
        $filename = $LOCALFTPPATH.$1;
    }
    else {
        $filename = "";
    }

    $filename =~ s/\.\.//g; # Make sure we don't allow any ..'ing
    return $filename;
}

# ****
#
# PrintHeader
#
# Prints the line that tells WWW that we're an HTML document (honest!)
#
# ****

sub PrintHeader
{
    print "Content-type: text/html\n\n";
}

# ****
#
# Search
#
# See if entry ($_) matches required search term. The arguments are
# 1: Search term(s), split on single spaces (in this case $in{ 'term' }),
# 2: Search Mode ($in{ 'mode' }) - 'word' adds \b word boundaries,
# 3: Field(s) to look in ($in{ 'field' }); "" or "all" searches all of $_.
# The concatenation variable $OR and the case flag $ignore ("i" or "")
# are passed globally - AM
#
# Returns 1 on a match, 0 otherwise. With $OR set any one term is enough;
# without it every term must match. A term that is not a valid regular
# expression counts as not matching.
#
# ****

sub Search
{
    my( $term, $mode, $field ) = @_;
    my( @SrchTrms ) = split( / /, $term );
    my( $found ) = 0;
    my( $searchin ) = "";

    # Build the text to search: the named fields, or the whole entry.
    if ( $field ne "" && $field ne "all" ) {
        foreach my $name ( split( /[\s]+/, $field ) ) {
            $searchin .= GetField( $name, $_ );
        }
    }
    else {
        $searchin = $_;
    }

    foreach my $key ( @SrchTrms ) {
        my $pattern = ( $mode eq 'word' ) ? "\\b($key)\\b" : "($key)";
        # eval traps the die from a malformed user-supplied pattern.
        my $hit = eval {
            $ignore eq "i" ? $searchin =~ /$pattern/i
                           : $searchin =~ /$pattern/;
        };
        if ( $hit ) {
            $found = 1;
            last if ( $OR );
        }
        else {
            $found = 0;
            last if ( !$OR );
        }
    }
    return $found;
}

# ****
#
# MarkupEntry
#
# Create a line of HTML for each entry in the file. This pulls the
# title and the author (or editor) from the BibTeX entry in $_, and
# prints a HTML heading, followed by an availablity list of urls if
# there are any, followed by the full entry in <PRE> formatted form.
#
# ****
sub MarkupEntry
{
    # Prints the BibTeX entry held in $_: a "title, author" line, an
    # "Available as ..." line when the entry has a url field, then the raw
    # entry text from its first '@' onwards.  Takes no arguments, returns
    # nothing useful.  Editors are shown with "(Ed)" or "(Eds)" appended when
    # the entry has no author field.
    my $bibtex = '@' . ( split( '@', $_, 2 ) )[1];

    my $title  = &GetField( "title", $bibtex );
    my $author = "";
    if ( /author\s*=/i )
    {
        $author = &GetField( "author", $bibtex );
    }
    elsif ( /editor\s*=/i )
    {
        $author = &GetField( "editor", $bibtex );
        $author .= ( $author =~ /\band\b/ ) ? " (Eds)" : " (Ed)";
    }
    print "\n", $title, ", ", $author, "\n";

    if ( /url\s*=/i )
    {
        print "\nAvailable as ", &MarkupURL( &GetField( "url", $bibtex ) ), ".\n";
    }

    # Strip the <PRE> markers from $_.  The pattern must be spelled out: an
    # empty pattern re-uses the last successful match (e.g. /url\s*=/ above)
    # and would delete that text instead.
    s/<PRE>//i;
    s/<\/PRE>//i;
    print "\n", $bibtex, "\n\n";
}
# ****
#
# MarkupURL
#
# Takes a list of URLs separated by commas and expands them into an HTML list
# that you can click on.
#
# ****
sub MarkupURL
{
    # Argument: a comma separated list of URLs.
    # Returns HTML with one <A HREF> link per URL, joined with "," and a
    # final "and".  The link text names the document format ("postscript",
    # "text", "gif", "hypertext", each optionally prefixed by "compressed "),
    # or is the last path component when the format is not recognised.
    my @urls = split( ',', $_[0] );
    my $html = "";

    foreach my $i ( 0 .. $#urls )
    {
        my $url = $urls[ $i ];
        $url =~ s/^\s+//;
        $url =~ s/\s+$//;

        $html .= "\nand" if ( $i == $#urls && $#urls >= 1 );
        $html .= ","     if ( $i >= 1 && $i < $#urls );

        # Classify on the extension at the END of the URL only, so that a
        # host such as "www.cse.psu.edu" is not mistaken for postscript.
        my $name   = $url;
        my $format = ( $name =~ s/\.(?:Z|gz)$// ) ? "compressed " : "";
        if    ( $name =~ /\.ps$/i )   { $format .= "postscript"; }
        elsif ( $name =~ /\.txt$/i )  { $format .= "text"; }
        elsif ( $name =~ /\.gif$/i )  { $format .= "gif"; }
        elsif ( $name =~ /\.html$/i ) { $format .= "hypertext"; }
        else
        {
            ( $format = $url ) =~ s/.*\/([^\/]*)$/$1/;
            $format = $url if ( $format eq "" );   # URL ending in "/"
        }

        # The URL comes from the bibliography file; escape it for HTML.
        my $href = $url;
        foreach ( $href, $format )
        {
            s/&/&amp;/g;
            s/</&lt;/g;
            s/>/&gt;/g;
            s/"/&quot;/g;
        }
        $html .= "\n<A HREF=\"".$href."\">".$format."</A>";
    }
    return $html;
}
# ****
#
# GetField
#
# Gets the field specified in the first argument and strips it of quotes and/or
# squiggly brackets, removes excess spaces and returns it.
#
# ****
sub GetField
{
    # Arguments: field name, bibtex entry.
    # Returns the value of that field with quotes and braces removed and runs
    # of whitespace collapsed, or "" when the field is absent.  The value may
    # be written "quoted", {braced} or as a bare number.  The field name is
    # matched literally (case-insensitively), never as a pattern.  If the
    # name occurs more than once, the last occurrence wins.
    my( $field, $contents ) = @_;
    my $name = quotemeta( $field );

    # A value ends at a comma, or - for the last field of an entry - at the
    # brace or parenthesis that closes the entry.
    my $end = qr/(?:,|[})]\s*\z)/;

    $contents =~ s/\n/ /g;                     # Remove all \n's
    if ( $contents =~ /.*\b$name\s*=\s*"([^"]*)"\s*$end/i )
    {
        $contents = $1;
    }
    elsif ( $contents =~ /.*\b$name\s*=\s*\{(.*)\}\s*$end/i )
    {
        $contents = $1;                        # Contains remaining fields too
        $contents =~ s/\}\s*,.*//;             # So remove everything after },
    }
    elsif ( $contents =~ /.*\b$name\s*=\s*(\d*)\s*$end/i )
    {
        $contents = $1;
    }
    else
    {
        $contents = "";
    }
    $contents =~ s/["{}]//g;                   # Remove ""`s and {}'s
    $contents =~ s/\s+/ /g;                    # Make lots of spaces into 1.
    return $contents;
}
# ****
#
# ProcessFile
#
# Process the file (filename specified by the first argument). This involves
# opening it, printing the last part of the filename, stripping off the html
# header (if it is not a .bib file), printing all the entries that match
# the search term (or "none found" if there aren't any) and closing the
# file again.
#
# ****
sub ProcessFile
{
    # Arguments: url filename, field(s) to search, search mode, search term.
    # Prints the file's base name, every BibTeX entry that Search accepts
    # (via MarkupEntry), and a count line; adds the count to the global
    # $bibcount.  Calls MaxReached (which does not return) once the global
    # $Maxcount limit is passed.
    #
    # A ".bib" file is read in paragraph mode from the start.  Any other file
    # is treated as HTML: entries are only looked for between <PRE> and </PRE>.
    # NOTE(review): the two <PRE> patterns below were empty in the copy this
    # was restored from; they are reconstructed from MarkupEntry - confirm.
    # $/ is left as this routine last set it, as before.
    my( $file, $field, $mode, $term ) = @_;

    $filename = &GetPath( $file );
    if ( $field eq "key" || $field eq "comments" || $field eq "source" ) {
        $field = "all";     # SpatBib-only field names mean nothing here
    }

    my $nfound = 0;
    # Three-argument open: the name can never be taken as a pipe or a mode.
    open( FILE, '<', $filename ) ||
        print "Couldn't Open Input file - ".$filename."\n\n";
    print "".substr( $filename, rindex( $filename, '/' ) + 1 )."\n\n";

    if ( $filename =~ /\.bib$/ )
    {
        $stage = 'body';
        $/ = "";
    }
    else
    {
        $stage = 'header';
    }

    if ( $DEBUG ) {
        print "BIBTEX search for TERM: $term, MODE: $mode FIELD: $field\n\n";
    }

    while ( <FILE> )
    {
        if ( $stage eq 'body' )
        {
            if ( /\@.*\{/ && &Search( $term, $mode, $field ) ) {
                # If bibtex entry and matches search.
                &MarkupEntry();
                &MaxReached if ( $Maxcount - $bibcount < $nfound++ );
            }
            if ( /<\/PRE>/i )            # If end of entries
            {
                $stage = 'footer';
                $/ = "\n";
            }
        }
        elsif ( ( $stage eq 'header' ) && /<PRE>/i )
        {
            # If end of header.  This test used to sit inside the 'body'
            # branch, where it could never be true.
            $stage = 'body';
            $/ = "";
        }
    }

    $bibcount += $nfound;
    print "", $nfound, " Matching Entries found. (Total: ", $bibcount,
        ")\n\n";
    close( FILE );
    print "\n\n";
}
# ****
#
# PrintTemplate
#
# Copy a Template file substituting variables where necessary.
#
# ****
sub PrintTemplate
{
    # Argument: url filename of a template (resolved through GetPath).
    # Copies the template to stdout, replacing every occurrence of $term,
    # $mode, $field, $andor, $Maxcount and $output with the matching CGI
    # value from %in.  Prints an error line and returns if the template
    # cannot be opened.
    my $filename = &GetPath( $_[0] );

    unless ( open( TEMPLATE, '<', $filename ) )
    {
        print "Incorrect Template File!\n\n";
        return;
    }

    # Placeholder name => CGI value.  The values come straight from the
    # request, so they are HTML-escaped before being written into the page.
    my %value = (
        'term'     => $in{'term'},
        'mode'     => $in{'mode'},
        'field'    => $in{'field'},
        'andor'    => $in{'andor'},
        'Maxcount' => $in{'max'},
        'output'   => $in{'output'},
    );
    foreach my $text ( values %value )
    {
        $text = "" unless defined( $text );
        $text =~ s/&/&amp;/g;
        $text =~ s/</&lt;/g;
        $text =~ s/>/&gt;/g;
        $text =~ s/"/&quot;/g;
    }

    while ( <TEMPLATE> )
    {
        # One pass, so text inserted for one placeholder is never rescanned.
        s/\$(term|mode|field|andor|Maxcount|output)/$value{$1}/g;
        print;
    }
    close( TEMPLATE );
}
# ***************************** Keyword_Review ***************************** #
#
sub Keyword_Review{
    # Prints the keyword review page: the review header template, then up to
    # $MAXKEYS lines of ~amit/bib/keywords.freq in descending numeric order,
    # each followed by the equivalents listed for that keyword in
    # ~amit/bib/keywords.dict.  Fills the global %eqv.  Ends the script with
    # die, so it never returns to its caller.
    &PrintTemplate( "~amit/bib/review.header" );
    my $more = $MAXKEYS;
    print "";
    print" Frequency Keyword Also Means\n";
    print" --------- ------- ----------\n";

    my $filename = &GetPath( "~amit/bib/keywords.freq" );
    open( KEYFILE, '<', $filename ) || print "Couldn't Open Input file - ",
        $filename, "\n\n";
    $filename = &GetPath( "~amit/bib/keywords.dict" );
    open( DICTFILE, '<', $filename ) || print "Couldn't Open Input file - ",
        $filename, "\n\n";

    # Dictionary lines are "EQUIVALENT<TAB>word word ..."; index by word.
    while ( <DICTFILE> ) {
        chomp;
        last if ( /_ENDOFDICT/ );
        my( $equiv, $words ) = split( /\t/ );
        $equiv =~ s/\s+$//g;
        # $equiv =~ y/[A-Z]/[a-z]/;
        foreach my $w ( split( /\s+/, $words ) ) {
            $eqv{$w} .= " " if ( defined( $eqv{$w} ) );  # --> multiple separator
            $eqv{$w} .= $equiv;
            print $w, ": ", $eqv{$w}, "\n" if ( $DEBUG );
        }
    }
    close( DICTFILE );

    my @lines = sort { $b <=> $a } <KEYFILE>;
    close( KEYFILE );

    while ( @lines && $more-- ) {
        my $line = shift @lines;
        chomp $line;
        $line =~ s/ \.$//;
        # presumably the keyword starts after a 5-character frequency
        # column - verify against keywords.freq
        my $word = substr( $line, 5 );
        print " ", $line, ' ' x ( 22 - length( $line ) ), " ", $eqv{$word}, "\n";
    }
    die;
}
# ***************************** MaxReached ***************************** #
#
# MaxReached kills the search after finding the maximum
sub MaxReached{
    # Tell the user the entry limit ($Maxcount) was hit, offer the keyword
    # review form when the current file ($BibFile) is a SpatBib "bib." file,
    # print the footer template and stop the script.  Never returns.
    my $notice = "\nThe maximum number of entries was set at " . $Maxcount
               . "\n. You can increase it by going back to the\n"
               . "Search Page..\n";
    print $notice;

    &InsertKeyForm if ( $BibFile =~ /bib\./ );

    &PrintTemplate( $in{'footer'} );
    die;
}
# *************************** RecordKeys **************************** #
#
sub RecordKeys{
    # Argument: the search terms of a search that found nothing.
    # Appends them as one line to ~amit/bib/keywords.FAIL.  If the log
    # cannot be opened an error line is printed to the page and nothing is
    # written.
    my $terms = $_[0];
    my $filename = &GetPath( "~amit/bib/keywords.FAIL" );

    my $log;
    unless ( open( $log, '>>', $filename ) )
    {
        print "\nCouldn't Open Input file - ", $filename, "\n\n";
        return;
    }
    print $log $terms, "\n";
    close( $log );      # flush now rather than at script exit
}
# *************************** InsertKeyForm **************************** #
#
sub InsertKeyForm{
    # Prints a one-button form offering the keyword review page.  Submitting
    # it sends keyreview=yes, which the main program checks for.
    # NOTE(review): the original form markup was lost (the string literal
    # was left unterminated); this is a minimal reconstruction.  With no
    # ACTION the form posts back to the URL that produced the page - confirm
    # that this is the script's own URL in the deployed setup.
    print "Would you like to review the main keywords? \n";
    print '<FORM METHOD="POST">',
          '<INPUT TYPE="hidden" NAME="keyreview" VALUE="yes">',
          '<INPUT TYPE="submit" VALUE="Review keywords">',
          "</FORM>\n";
}
# ***************************** bibMarkup ****************************** #
#
# bibMarkup handles the output; whether to suppress comments, or keyword lines
# BUG: many global variables
sub bibMarkup{
    # Print one SpatBib entry.  The entry is split on newlines into five
    # parts: author line, key line, title, source and comments.  Fused
    # continuation lines (marked by $EOL_REPL) in the title and source are
    # unfolded again.  The key line is printed only when $PrintKeys is set,
    # the comments only when $PrintComments is set.
    my ($record) = @_;
    my ($authors, $keyline, $title, $source, $comments)
        = split(/\n/, $record, 5);
    chop($comments);

    foreach my $text ($title, $source) {
        $text =~ s/ $EOL_REPL /\n\t/go;
    }

    print "\n " . $title . ", " . $authors . "\n\n";
    print $authors, "\n";
    if ($PrintKeys) {
        print $keyline, "\n";
    }
    print $title, "\n", $source, "\n";
    if ($PrintComments) {
        print "$comments";
    }
    print "\n\n";
}
# **************** SearchFile: SpatBIB SEARCH FUNCTION ****************** #
#
# Searches a file in the format as in ~amit/bib/bib
#
sub SearchFile{
    # Arguments: url filename, index of the field to search (one of the
    # globals $author, $key, $title, $source, $comments, or $all), and the
    # search terms separated by single spaces.
    #
    # Reads the file a paragraph at a time, one entry per paragraph.  With
    # the global $OR set an entry matches if any term matches, otherwise all
    # terms must match; $WholeWord wraps each term in \b, and $ignore ("i")
    # makes the match case-insensitive.  Matching entries are printed with
    # bibMarkup; the count is added to the global $bibcount.  MaxReached
    # (which does not return) is called once the $Maxcount limit is passed.
    # $/ is reset to "\n" on return.
    $/ = "";              # Enable Paragraphs      #$* = 1; is not needed
    my( $BIBFILE, $SrchField, $Terms ) = @_;
    my $filename = &GetPath( $BIBFILE );
    my $nfound = 0;

    open( BIBFILE, '<', $filename ) || print "Couldn't Open Input file - ",
        $filename, "\n\n";
    print "".substr( $filename, rindex( $filename, '/' ) + 1 )."\n\n";

    my @SrchTrms = split( / /, $Terms );
    if ( $WholeWord ) {
        foreach ( @SrchTrms ) { $_ = "\\b" . $_ . "\\b"; }
    }
    $SrchMode = "/" . '$SRCH' . "/" . $ignore;    # shown in debug output only

    if ( $DEBUG ) {
        print "INPUT TO SearchFile - FILE: ".$BIBFILE;
        print "\n- SearchString: ".$Terms."\n";
        print "\n- SrchField = ".$SrchField."\n";
        print "\n- Search Terms: ";
        foreach ( @SrchTrms ) { print $_, " "; }
        print "\n\nSRCHMODE: ", $SrchMode, " FIELD: ", $SrchField,
            " Print C:", $PrintComments, " K:", $PrintKeys, " OR? ", $OR, "\n\n";
    }

    while ( <BIBFILE> ) {
        s/\n\t/ $EOL_REPL /g;       # fuse continuation lines onto their field
        my $entry = $_;
        my $Field;
        if ( $SrchField < $all ) {
            # Search a single line of the entry instead of all of it.
            my @Line = split( /\n/, $_, 5 );
            $_ = $Line[$SrchField];
            if ( $SrchField == $key ) {
                s/\t.*//s;          # keep only the part before the first tab
            }
            $Field = $_ if $SrchDEBUG;
        }

        # Loop through list of Search Terms.  $found starts false for every
        # entry so an empty term list can never inherit an earlier result.
        my $found = 0;
        foreach my $SRCH ( @SrchTrms ) {
            # eval traps the die from a malformed user-supplied pattern.
            $found = eval { $ignore eq "i" ? /$SRCH/i : /$SRCH/ };
            print( "FOUND: ", $SRCH, " " ) if ( $found && $SrchDEBUG );
            last if ( $found && $OR );
            last if ( !$found && !$OR );
        }

        if ( $found ) {
            # prints searchstring only if on a Field
            if ( $SrchDEBUG ) { print " ;FIELD=", $Field, "$nfound: "; }
            bibMarkup( $entry );
            &MaxReached if ( $Maxcount - $bibcount < $nfound++ );
        }
    }
    close( BIBFILE );

    $bibcount += $nfound;
    print "\n", 0+$nfound, " Matching Entries found. (Total: ",
        $bibcount, ")\n\n";
    $/ = "\n";
}
# ***************************** TXTSRCH ****************************** #
#
# TxtSrch Searches files in arbitrary syntax for the search string
#
sub TxtSrch{
    # Arguments: url filename and the search terms (separated by single
    # spaces).  Reads the file a paragraph at a time and prints every
    # paragraph that matches: any term when the global $OR is set, all terms
    # otherwise.  The global $mode eq 'word' adds \b word boundaries and
    # $ignore ("i") makes the match case-insensitive.  The count is added to
    # the global $bibcount; MaxReached (which does not return) is called once
    # the $Maxcount limit is passed.  Leaves $/ in paragraph mode, as before.
    my( $filename, $term ) = @_;
    my @SrchTrms = split( / /, $term );

    if ( $mode eq 'word' ) {
        $SrchMode = '/\b' . '($key)' . '\b/' . $ignore;
    }
    else { $SrchMode = '/' . '($key)' . "/" . $ignore; }
    print "\nSRCHMODE: ", $SrchMode, "TRM[0]: ", $SrchTrms[0],
        " \$OR ", $OR, "\n\n" if ( $DEBUG );

    my $nfound = 0;
    # Decided per file: a global flag would stay set for every later file.
    my $htmlmode = ( $filename =~ /\.html/ ) ? 1 : 0;
    # A private variable: the caller loops over a global $fn of its own.
    my $path = &GetPath( $filename );
    print "GETPATH: $filename mapped to $path \n" if ( $DEBUG );

    $/ = "";
    open( FILE, '<', $path ) ||
        print "\nCouldn't Open Input file ", $filename, " in $fdir\n\n";
    print "".substr( $filename, rindex( $filename, '/' ) + 1 )."\n\n";
    print "" if ( !$htmlmode );

    while ( <FILE> ) {
        my $found = 0;
        foreach my $key ( @SrchTrms )
        {
            my $pattern = ( $mode eq 'word' ) ? "\\b($key)\\b" : "($key)";
            # eval traps the die from a malformed user-supplied pattern.
            my $hit = eval { $ignore eq "i" ? /$pattern/i : /$pattern/ };
            if ( $hit ) {
                $found = 1;
                last if ( $OR );
            }
            else { $found = 0; last if ( !$OR ); }
        }
        if ( $found ) {
            print;
            print "\n\n";
            &MaxReached if ( $Maxcount - $bibcount < $nfound++ );
        }
    }
    close( FILE );
    print " " if ( !$htmlmode );

    $bibcount += $nfound;
    print "", 0+$nfound, " Matching Entries found. (Total: ",
        $bibcount, ")\n\n";
    # $/ = "\n";
}
# ***************************** CheckDict ****************************** #
#
# Checks the keyword dictionary for equivalent terms
#
sub CheckDict{
    # Argument: the search terms (separated by single spaces; presumably
    # already in uppercase).
    # Reads ~amit/bib/keywords.dict, whose lines are
    # "EQUIVALENT<TAB>word word ...", stopping at the first line that
    # contains a lowercase letter.  For every term that contains, or is
    # contained in, a line's EQUIVALENT a "Keyword Tips" note listing that
    # line's words is printed.  If nothing has been found so far
    # ($bibcount is 0) the keyword review form is offered and the terms are
    # logged with RecordKeys.
    my $term = $_[0];
    my $filename = &GetPath( "~amit/bib/keywords.dict" );
    open( DICTFILE, '<', $filename ) || print "Couldn't Open Input file - ",
        $filename, "\n\n";

    my @ST = split( / /, $term );
    while ( <DICTFILE> ) {
        chop;
        last if ( /[a-z]/ );
        my( $equiv, $words ) = split( /\t/ );
        $equiv =~ s/\s+$//g;
        next if ( $equiv eq "" );

        # Format the word list once per line; redoing the substitution for
        # each matching term kept multiplying the "and/or" separators.
        $words = "" unless defined( $words );
        ( my $choices = $words ) =~ s:\s+: and/or :g;

        foreach my $t ( @ST ) {
            next if ( $t eq "" );
            # Literal substring tests: the term is user input, not a pattern.
            if ( index( $equiv, $t ) >= 0 || index( $t, $equiv ) >= 0 ) {
                print "Keyword Tips:\nInstead of $t, you\nmay try using\n- $choices\nBased on the keywords dictionary\n";
            }
        }
    }
    close( DICTFILE );

    if ( !$bibcount ) {
        &InsertKeyForm;
        &RecordKeys( $term );    #! NOMIRR
    }
}
# ***************************** MAIN ********************************* #
# Main program: read the CGI request, print the header template, run the
# search appropriate to each requested file, then print the footer template.
# The variables set here are globals that the subroutines above read.
$ignore="";                 # Set to "i" if ignore is ON.
$PrintComments=$PrintKeys=1;
$author=0;                  # This and the next few define fields to search
$key=1;
$title=2;
$source=3;
$comments=4;
$all=99;

&ReadParse();
&PrintHeader();
if ($in{'keyreview'} eq "yes") {&Keyword_Review};
&PrintTemplate( $in{'header'} );
print "\n";

$TERMS = $in{'term'};
$OR = ($in{'andor'} eq "OR");
# Map the requested field name to its line index with a table.  The name
# comes from the request and must never be handed to eval.  Names that are
# not in the table leave $SrchField undefined.
%FieldIndex = ( 'author'   => $author,   'key'    => $key,
                'title'    => $title,    'source' => $source,
                'comments' => $comments, 'all'    => $all );
$SrchField = $FieldIndex{ $in{'field'} };
$mode = $in{'mode'};
$Maxcount = $in{'max'};
$WholeWord = ("$mode" eq "word");
@files = split( /\0/, $in{'files'} );

@filt = split( /\0/, $in{'filters'} );
foreach $filter ( @filt ) {
    if ($filter =~ /Ignore/) {$ignore="i";}
}

@output = split( /\0/, $in{'Output'} );
foreach $outp ( @output ) {
    # if ($outp eq "all") {}
    if ($outp eq "NoComments") {$PrintComments--;}
    elsif ($outp eq "NoKeywords") {$PrintKeys--;}
    elsif ($outp eq "Quiet") {$PrintKeys--;$PrintComments--;}
}

if ($SrchField == $key) { $TERMS =~ tr/a-z/A-Z/;}

if ($DEBUG) {
    print "\nINPUT TO BIBEXEC:\n\n\n";
    print "- TERMS: ".$in{'term'}."\n";
    print "\n- andor: ".$in{'andor'}."\n";
    print "\n- field: ".$in{'field'}." : SearchField = $SrchField\n";
    print "\n- mode: ".$in{'mode'}." : WholeWord = ", $WholeWord,"\n";
    print "\n- max: ".$in{'max'}." : Maxcount = ", $Maxcount,"\n";
    print "\n- output: ".$in{'output'}." : PrintComments= ", $PrintComments,
        " PrintKeys= ", $PrintKeys;
    print "\n- filters: ".$in{'filters'}, " \$ignore=", $ignore, "\n\n- Files: ";
    foreach $BibFile ( @files ) {print $BibFile, " ";}
    print "\n\n\n";
}

if (! (@files) ) { print "No Search Files Specified!\n\n";}
elsif ( $TERMS eq "" ) { print "No Search Term Given!\n\n"; }
else {
    # $BibFile stays a global loop variable: MaxReached reads it.
    foreach $BibFile ( @files ) {
        if ($BibFile =~ /bib\./) {
            &SearchFile( $BibFile, $SrchField, $TERMS );
            if ($SrchField == $key) { &CheckDict($TERMS) };
        }
        elsif ($BibFile =~ /\.bib/) { # || $BibFile =~ /\.html/)
            &ProcessFile( $BibFile , $in{'field'}, $in{'mode'}, $in{'term'});
        }
        elsif ($BibFile =~ /TODO/) {
            # A directory: search every file named in its FLIST file.
            $/="\n";
            local ($fdir) = &GetPath ( $BibFile );   # TxtSrch reports $fdir
            open (FILELIST, '<', $fdir."/FLIST") ||
                print "Couldn't Open Input file FLIST in $fdir\n\n";
            @flist = <FILELIST>;
            foreach $fn (@flist){
                chomp($fn);     # chop would eat a character of an unterminated last line
                &TxtSrch( $BibFile."/".$fn , $in{'term'} );
            }
            close(FILELIST);
        }
        else { # ($BibFile =~ /\.txt/)
            &TxtSrch( $BibFile , $TERMS );
        }
    }
}

print "\n\n";
&PrintTemplate( $in{'footer'} );
exit;
# EASY TODO: Add a "Exact" mode to the AND/OR which will not
# split the term into words. (See TxtSrch)
# HARD TODO: Output options in BibTex or REFER formats.