#!/usr/local/bin/perl

# Check a TREC 2003 genomics track primary task submission for various
# common errors:
#      * extra fields
#      * multiple run tags
#      * missing or extraneous topics
#      * invalid retrieved documents
#      * duplicate retrieved documents in a single topic
#      * too many documents retrieved for a topic
#      * fewer than maximum allowed retrieved for a topic (warning)
# Messages regarding submission are printed to an error log

# Results input file is in the form
#     topic_num Q0 docno rank sim tag
# Script uses UNIX sort routine to ensure input is sorted by increasing
# topic number and decreasing sim.  If run on non-unix system,
# use alternate open command, but make sure input file is sorted
# Note that line numbers in the error output refer to the SORTED file!

# Site configuration: point $docno_dir at the directory in which the
# lists of DOCNO's reside, and $errlog_dir at the directory where the
# error log should be put.
$docno_dir   = "/trec/trec12/aux";
$errlog_dir  = ".";
$docnos_file = $docno_dir . "/docnos.genome";

# Processing stops once more than this many errors have been logged;
# by then something is drastically wrong with the file.
$MAX_ERRORS = 25;

# Topics are numbered $MINQ through $MAXQ, and at most $MAX_RET documents
# may be retrieved for any one topic.
($MINQ, $MAXQ, $MAX_RET) = (1, 50, 1000);


# Command line: exactly two arguments, the task name and the results file.
die "Usage: $0 task resultsfile\n\twhere task is either 'primary' or 'secondary'.\n"
    unless @ARGV == 2;

($task, $results_file) = @ARGV;

unless ($task eq "primary" || $task eq "secondary") {
    die "Task must be either 'primary' or 'secondary', not '$task'\n";
}

# Only the primary task has a defined submission format; for the
# secondary task there is nothing to check, so report that and stop.
if ($task eq "secondary") {
    print STDERR "Format of secondary task not yet specified.\n";
    exit 0;
}


# Read in the list of valid docno's for this task.
# The file is expected to hold one numeric document id per line, with
# optional surrounding whitespace; any other line aborts the run.
# Each id becomes a key of %docnos with the initial value "-1"; the main
# loop later replaces that value with the topic the document was most
# recently retrieved for, which is how duplicates are detected.
#
# A three-argument open on a lexical handle keeps the path from ever being
# parsed as part of the open mode.  No separate -e test is made: open
# itself reports a missing file through $!.
open(my $docno_fh, "<", $docnos_file)
    or die "can't open docno's file $docnos_file: $!\n";
while (my $id_line = <$docno_fh>) {
    chomp $id_line;
    if ($id_line !~ /^\s*([0-9]+)\s*$/) { # extract from whitespace
        die "Document id (`$id_line') is malformed\n";
    }
    $docnos{$1} = "-1";
}
close $docno_fh;


# Sort the input file by topic_num (field 1, compared as text) and then
# by decreasing sim (field 5, general-numeric), and read the result.
# ASSUMES UNIX; FOR non-unix, comment out this open, and use
# alternate open --- make sure file is sorted!
#
# Notes on the form used here:
#   * "or" is required: with "||" the die would attach to the last
#     argument of open (always true) and could never run.
#   * "-k1,1 -k5,5gr" is the POSIX spelling of the obsolete
#     "+0 -1 +4 -5gr" keys, which current sort implementations reject.
#   * The list form of open runs sort directly, without a shell, so
#     spaces or metacharacters in the file name are harmless; "--" keeps
#     a name starting with "-" from being read as an option.
#   * A pipe open cannot tell that sort failed to read its input, so
#     the file's readability is checked up front.
-r $results_file
    or die "Unable to open (or sort) results file $results_file: $!\n";
open(RESULTS, "-|", "sort", "-k1,1", "-k5,5gr", "--", $results_file)
    or die "Unable to open (or sort) results file $results_file: $!\n";
#open(RESULTS, "<", $results_file)
#    or die "Unable to open results file $results_file: $!\n";


# The error log is named after the results file: drop any leading
# directory components, append ".errlog", and place it in $errlog_dir.
# rindex returns -1 when the name contains no "/", so the substr offset
# becomes 0 and the whole name is kept.
$errlog = $errlog_dir . "/"
        . substr($results_file, rindex($results_file, "/") + 1)
        . ".errlog";

# "or" (not "||") so that a failed open really is fatal; with "||" the
# die attached to the file-name string and was never evaluated.
open(ERRLOG, ">", $errlog)
    or die "Cannot open error log $errlog for writing: $!\n";

# Check every line of the results input, logging each problem through
# error().  Apart from the one-time "Q0" complaint, a line that fails a
# check is skipped and therefore not counted in @num_ret.  The rank and
# sim fields are split out but not examined here.
$q0warn = 0;        # set once a bad second field has been reported
$num_errors = 0;
$line_num = 0;      # current input line number, read by error()
$old_topic = "-1";  # topic field of the previous line, exactly as written
$run_id = "";       # run tag remembered from the first line that had one
while ($line = <RESULTS>) {
    chomp $line;
    $line_num++;
    next if ($line =~ /^\s*$/);

    ($topic_string,$q0,$docno,$rank,$sim,$tag,$rest) = split " ", $line;
    # Test definedness rather than truth so that an extra field whose
    # text is "0" is still caught.
    if (defined $rest) {
        error("Too many fields");
        die "\n";
    }

    # make sure runtag is ok
    # Compare against the empty string rather than testing truth, so that
    # a run tag of "0" is remembered like any other tag.
    if (! defined $run_id || $run_id eq "") {   # no tag remembered yet
        $run_id = $tag;
        if ($run_id !~ /^[A-Za-z0-9]{1,12}$/) {
            error("Run tag `$run_id' is malformed");
            next;
        }
    }
    else {                      # otherwise just make sure one tag used
        if ($tag ne $run_id) {
            error("Run tag inconsistent (`$tag' and `$run_id')");
            next;
        }
    }

    # get topic number
    # The topic is only re-parsed when the topic field changes, so an
    # unknown topic is reported once; its remaining lines are processed
    # under topic 0.
    if ($topic_string ne $old_topic) {
        $old_topic = $topic_string;
        $topic_string =~ s/^0+//;       # drop leading zeros
        $topic = $topic_string;
        # Require plain digits before the numeric range test; otherwise
        # strings such as "1abc" or "1e1" would be coerced to a number
        # and accepted as a valid topic.
        if ($topic !~ /^[0-9]+$/ || $topic < $MINQ || $topic > $MAXQ) {
            error("Unknown topic ($topic_string)");
            $topic = 0;
            next;
        }
    }


    # make sure second field is "Q0" (reported only once per file)
    if ($q0 ne "Q0" && ! $q0warn) {
        $q0warn = 1;
        error("Field 2 is `$q0' not `Q0'");
    }


    # make sure DOCNO known and not duplicated
    # $docnos{$docno} holds the topic this document was last retrieved
    # for; seeing the current topic there again means a duplicate.
    if (exists $docnos{$docno}) {       # valid DOCNO
        if ($docnos{$docno} eq $topic) {
            error("Document `$docno' retrieved more than once for topic $topic");
            next;
        }
        $docnos{$docno} = $topic;
    }
    else {                              # invalid DOCNO
        error("Unknown document `$docno'");
        next;
    }

    $num_ret[$topic]++;
}



# Per-topic checks over the whole submission, for every topic from
# $MINQ through $MAXQ:
#   error   if no documents, or more than $MAX_RET, were retrieved
#   warning if fewer than $MAX_RET were retrieved
# Warnings are written to the log but not counted as errors.
$t = $MINQ;
while ($t <= $MAXQ) {
    my $retrieved = $num_ret[$t];   # undefined when the topic never appeared

    if ($retrieved == 0) {
        error("No documents retrieved for topic $t");
    }
    elsif ($retrieved > $MAX_RET) {
        error("Too many documents ($retrieved) retrieved for topic $t");
    }
    elsif ($retrieved < $MAX_RET) {
        print ERRLOG "$0 of $results_file:  WARNING: only $retrieved documents retrieved for topic $t\n";
    }
}
continue {
    $t++;
}

# Wrap up: note completion in the log and on STDERR, making sure the log
# was written out successfully in between.
print ERRLOG "Finished processing $results_file\n";
close(ERRLOG) or die "Close failed for error log $errlog: $!\n";
print STDERR "Finished processing $results_file\n";

# Exit status 255 means at least one error was logged; 0 means none.
exit($num_errors ? 255 : 0);


# Log one error against the current input line and count it.
#   Argument: the message text (when several arguments are passed, the
#             last one is used).
#   Reads the globals $results_file and $line_num; increments $num_errors.
# Line numbers refer to the SORTED file, since that is the actual input.
# Once the count exceeds $MAX_ERRORS a final note is logged, the log is
# closed, and the script exits with status 255.
sub error {
    my $msg_string = $_[-1];

    print ERRLOG "$0 of $results_file: Error on line $line_num --- "
               . $msg_string . "\n";

    if (++$num_errors > $MAX_ERRORS) {
        print ERRLOG "$0 of $results_file: Quit. Too many errors!\n";
        close(ERRLOG) or die "Close failed for error log $errlog: $!\n";
        exit 255;
    }
}
