#!/usr/bin/perl -w use strict; # Check a TREC 2006 genomics track submission for various errors. # Usage: check_genomics.pl resultsfile # The 2006 genomics track uses a new run format which is similar # to trec_eval format but includes fields to indicate passages. # The (whitespace-separated) fields are: # Topic ID (from 160-187) # Doc ID (an integer XXX, corresponding to an XXX.html file in the coll) # Rank rank for passage, from 1 to 1000 # Score score for passage. # Passage start byte offset into document # Passage length number of bytes in passage # Run tag run tag # Messages regarding submission are printed to an error log # Script uses UNIX sort routine to ensure input is sorted by increasing # topic number and decreasing sim. If run on non-unix system, # use alternate open command, but make sure input file is sorted # Note that line numbers in the error output refer to the SORTED file! # The errors checked for for the ad hoc task include # * extra fields # * multiple run tags # * missing or extraneous topics # * invalid retrieved documents # * duplicate retrieved documents in a single topic # * too many documents retrieved for a topic # * fewer than maximum allowed retrieved for a topic (warning) # Change this variable to the directory where the error log should be put my $errlog_dir = "."; # If more than 25 errors, then stop processing; something drastically # wrong with the file. my $MAX_ERRORS = 25; my $results_file; # results file (argument) my $errlog; # name of error log file my ($num_errors, $line_num); my ($last_i, $i); my $usage = "Usage: $0 resultsfile\n"; $results_file = shift @ARGV or die $usage; $last_i = -1; while ( ($i=index($results_file,"/",$last_i+1)) > -1) { $last_i = $i; } $errlog = $errlog_dir . "/" . substr($results_file,$last_i+1) . ".errlog"; open ERRLOG, ">$errlog" || die "Cannot open error log for writing\n"; $num_errors = 0; $line_num = 0; &check_adhoc_task; print ERRLOG "Finished processing $results_file\n"; close ERRLOG || die "Close failed for error log $errlog: $!\n"; print "Finished processing $results_file\n"; if ($num_errors) { exit 255; } exit 0; # process an adhoc run sub check_adhoc_task { my $MINQ = 160; my $MAXQ = 187; my $MAX_RET = 1000; my %topic_docnos; # hash of docs retrieved for current topic my ($topic_string,$docno,$rank,$sim,$start,$len,$tag,$rest); my %num_ret; # number of docs retrieved per topic my $line; # line from input file; my ($topic, $old_topic, $t); my $run_id; my $found; my $sort_run; for ($t=$MINQ; $t<=$MAXQ; $t++) { $num_ret{$t} = 0; } # Sort the input file by topic_num, sim and read result # ASSUMES UNIX; FOR non-unix, comment out this open, and use # alternate open --- make sure file is sorted! open RESULTS, "nl -s' ' -nln -ba -fn $results_file | sort -k2,2n -k 5,5gr |" or die "Unable to open (or sort) results file $results_file: $!\n"; $sort_run = 1; #open RESULTS, "<$results_file" || # die "Unable to open results file $results_file: $!\n"; #$sort_run = 0; $old_topic = "-1"; $run_id = ""; while ($line = ) { chomp $line; next if ($line =~ /^\s*$/); undef $tag; my @fields = split " ", $line; $line_num++ if $sort_run; if ($sort_run and scalar(@fields) == 8) { ($line_num,$topic_string,$docno,$rank,$sim,$start,$len,$tag) = @fields; } elsif (!$sort_run and scalar(@fields) == 7) { # Input was sorted outside the check script ($topic_string,$docno,$rank,$sim,$start,$len,$tag) = @fields; } else { &error("Wrong number of fields"); exit 255; } # make sure runtag is ok if (! $run_id) { # first line --- remember tag $run_id = $tag; if ($run_id !~ /^[A-Za-z0-9]{1,12}$/) { &error("Run tag `$run_id' is malformed"); next; } } else { # otherwise just make sure one tag used if ($tag ne $run_id) { &error("Run tag inconsistent (`$tag' and `$run_id')"); next; } } # get topic number if ($topic_string ne $old_topic) { $old_topic = $topic_string; undef %topic_docnos; $topic_string =~ s/^0*//; $topic = $topic_string; if ($topic < $MINQ || $topic > $MAXQ) { &error("Unknown topic ($topic_string)"); $topic = 0; next; } } # approximate check for correct docno if ($docno !~ /^\d+$/) { # invalid DOCNO &error("Unknown document `$docno'"); next; } # check that span start is nonnegative, # and than length is greater than 0. if ($start !~ /^\d+$/ or $start < 0) { &error("Invalid passage start: must be greater than or equal to zero"); next; } if ($len !~ /^\d+$/ or $len < 1) { &error("Invalid passage length: must be greater than zero"); next; } my $docno_key = "$docno-$start-$len"; if (exists $topic_docnos{$docno_key}) { &error("Passage [$start, $len] in document `$docno' retrieved more than once for topic $topic_string"); next; } $topic_docnos{$docno_key} = $topic; # remove leading 0's from rank (but keep final 0!) $rank =~ s/^0*//; if (! $rank) { $rank = "0"; } $num_ret{$topic}++; } # Do global checks: # error if some topic has no (or too many) documents retrieved for it # warn if too few documents retrieved for a topic for ($t=$MINQ; $t<=$MAXQ; $t++) { if ($num_ret{$t} == 0) { &error("No passages retrieved for topic $t"); } elsif ($num_ret{$t} > $MAX_RET) { &error("Too many passages ($num_ret{$t}) retrieved for topic $t"); } elsif ($num_ret{$t} < $MAX_RET) { print ERRLOG "$0 of $results_file: WARNING: only $num_ret{$t} passages retrieved for topic $t\n" } } } # print error message, keeping track of total number of errors # line numbers refer to SORTED file since that is the actual input file sub error { my $msg_string = pop(@_); print ERRLOG "$0 of $results_file: Error on line $line_num --- $msg_string\n"; $num_errors++; if ($num_errors > $MAX_ERRORS) { print ERRLOG "$0 of $results_file: Quit. Too many errors!\n"; close ERRLOG || die "Close failed for error log $errlog: $!\n"; exit 255; } }