#!/usr/bin/perl -w

use strict;

# Check a TREC 2006 genomics track submission for various errors.
# Usage: check_genomics.pl resultsfile

# The 2006 genomics track uses a new run format which is similar
# to trec_eval format but includes fields to indicate passages.
# The (whitespace-separated) fields are:
#   Topic ID (from 160-187)
#   Doc ID   (an integer XXX, corresponding to an XXX.html file in the coll)
#   Rank     rank for passage, from 1 to 1000
#   Score    score for passage.
#   Passage start    byte offset into document
#   Passage length   number of bytes in passage
#   Run tag  run tag

# Messages regarding submission are printed to an error log

# Script uses UNIX sort routine to ensure input is sorted by increasing
# topic number and decreasing sim.  If run on non-unix system,
# use alternate open command, but make sure input file is sorted
# Note that line numbers in the error output refer to the SORTED file!
# The errors checked for for the ad hoc task include
#      * extra fields
#      * multiple run tags
#      * missing or extraneous topics
#      * invalid retrieved documents
#      * duplicate retrieved documents in a single topic
#      * too many documents retrieved for a topic
#      * fewer than maximum allowed retrieved for a topic (warning)


# Change this variable to the directory where the error log should be put
my $errlog_dir = ".";

# If more than 25 errors, then stop processing; something drastically
# wrong with the file.
my $MAX_ERRORS = 25; 


my $results_file;       # results file (argument)
my $errlog;             # name of error log file
my ($num_errors, $line_num);
my ($last_i, $i);

my $usage = "Usage: $0 resultsfile\n";
$results_file = shift @ARGV or die $usage;

$last_i = -1;
while ( ($i=index($results_file,"/",$last_i+1)) > -1) {
    $last_i = $i;
}
$errlog = $errlog_dir . "/" . substr($results_file,$last_i+1) . ".errlog";
open ERRLOG, ">$errlog" ||
	die "Cannot open error log for writing\n";


$num_errors = 0;
$line_num = 0;

&check_adhoc_task;

print ERRLOG "Finished processing $results_file\n";
close ERRLOG || die "Close failed for error log $errlog: $!\n";
print "Finished processing $results_file\n";

if ($num_errors) { exit 255; }
exit 0;





# process an adhoc run
sub check_adhoc_task {

    my $MINQ = 160;
    my $MAXQ = 187;
    my $MAX_RET = 1000;

    my %topic_docnos;	    # hash of docs retrieved for current topic
    my ($topic_string,$docno,$rank,$sim,$start,$len,$tag,$rest);
    my %num_ret;	    # number of docs retrieved per topic
    my $line;               # line from input file;
    my ($topic, $old_topic, $t);
    my $run_id;
    my $found;
    my $sort_run;

    for ($t=$MINQ; $t<=$MAXQ; $t++) {
        $num_ret{$t} = 0;
    }

    # Sort the input file by topic_num, sim and read result
    # ASSUMES UNIX; FOR non-unix, comment out this open, and use
    # alternate open --- make sure file is sorted!
    open RESULTS, "nl -s' ' -nln -ba -fn $results_file | sort -k2,2n -k 5,5gr |"
	or die "Unable to open (or sort) results file $results_file: $!\n";
    $sort_run = 1;

    #open RESULTS, "<$results_file" ||
    #	die "Unable to open results file $results_file: $!\n";
    #$sort_run = 0;


    $old_topic = "-1";
    $run_id = "";
    while ($line = <RESULTS>) {
        chomp $line;
        next if ($line =~ /^\s*$/);

	undef $tag;
	my @fields = split " ", $line;
	$line_num++ if $sort_run;

	if ($sort_run and scalar(@fields) == 8) {
	    ($line_num,$topic_string,$docno,$rank,$sim,$start,$len,$tag) = @fields;
	} elsif (!$sort_run and scalar(@fields) == 7) {
	    # Input was sorted outside the check script
	    ($topic_string,$docno,$rank,$sim,$start,$len,$tag) = @fields;
	} else {
	    &error("Wrong number of fields");
	    exit 255;
	}

        # make sure runtag is ok
        if (! $run_id) { 	# first line --- remember tag 
	    $run_id = $tag;
	    if ($run_id !~ /^[A-Za-z0-9]{1,12}$/) {
	        &error("Run tag `$run_id' is malformed");
	        next;
   	    }
        }
        else {			# otherwise just make sure one tag used
	    if ($tag ne $run_id) {
	        &error("Run tag inconsistent (`$tag' and `$run_id')");
	        next;
	    }
        }

        # get topic number
        if ($topic_string ne $old_topic) {
	    $old_topic = $topic_string;
	    undef %topic_docnos;
            $topic_string =~ s/^0*//;
	    $topic = $topic_string;
	    if ($topic < $MINQ || $topic > $MAXQ) {
                &error("Unknown topic ($topic_string)");
                $topic = 0;
                next;
            }  
        }


        # approximate check for correct docno
        if ($docno !~ /^\d+$/) {	# invalid DOCNO
            &error("Unknown document `$docno'");
	    next;
        }

	# check that span start is nonnegative, 
	# and than length is greater than 0.
	if ($start !~ /^\d+$/ or
	    $start < 0) {
	    &error("Invalid passage start: must be greater than or equal to zero");
	    next;
	}

	if ($len !~ /^\d+$/ or
	    $len < 1) {
	    &error("Invalid passage length: must be greater than zero");
	    next;
	}

	my $docno_key = "$docno-$start-$len";
        if (exists $topic_docnos{$docno_key}) {
            &error("Passage [$start, $len] in document `$docno' retrieved more than once for topic $topic_string");
            next;
        }
        $topic_docnos{$docno_key} = $topic;
	

        # remove leading 0's from rank (but keep final 0!)
	$rank =~ s/^0*//;
        if (! $rank) {
            $rank = "0";
        }


        $num_ret{$topic}++;
    }



    # Do global checks:
    #   error if some topic has no (or too many) documents retrieved for it
    #   warn if too few documents retrieved for a topic
    for ($t=$MINQ; $t<=$MAXQ; $t++) { 
        if ($num_ret{$t} == 0) {
            &error("No passages retrieved for topic $t");
        }
        elsif ($num_ret{$t} > $MAX_RET) {
            &error("Too many passages ($num_ret{$t}) retrieved for topic $t");
        }
        elsif ($num_ret{$t} < $MAX_RET) {
	    print ERRLOG "$0 of $results_file:  WARNING: only $num_ret{$t} passages retrieved for topic $t\n"
        }
    }
}


# print error message, keeping track of total number of errors
# line numbers refer to SORTED file since that is the actual input file
sub error {
   my $msg_string = pop(@_);

    print ERRLOG 
    "$0 of $results_file: Error on line $line_num --- $msg_string\n";

    $num_errors++;
    if ($num_errors > $MAX_ERRORS) {
        print ERRLOG "$0 of $results_file: Quit. Too many errors!\n";
        close ERRLOG ||
		die "Close failed for error log $errlog: $!\n";
	exit 255;
    }
}
