# checkrun.py
# This program checks a submitted run file for the appropriate syntax and
# format, including the following:
#   Topic number is valid (160-187)
#   PMID is valid (in collection)
#   Number of fields is 7
#   Same tag is used throughout
import sys
import glob
import os.path

# Inclusive range of topic numbers accepted in the first column.
MIN_TOPIC = 160
MAX_TOPIC = 187

# A well-formed row is: topic, pmid, rank, score, start, length, tag.
NUM_FIELDS = 7

# Message templates; each is filled in with (filename, linenum).
BAD_TAG = "Error (bad tag) in file %s at line number %d\n"
BAD_TOPIC = "Error (invalid topic) in file %s at line number %d\n"
BAD_PMID = "Error (invalid pmid) in file %s at line number %d\n"
BAD_FIELD_COUNT = "Invalid number of fields(columns) in file %s at line %d.\n"


def load_pmids(path="pmids.txt"):
    """Return the set of integer PMIDs listed one per line in *path*.

    Blank lines are skipped.  A non-blank line that is not an integer
    raises ValueError, because a corrupt reference list would make every
    later check meaningless.
    """
    pmids = set()
    with open(path, 'r') as pmid_file:
        for line in pmid_file:
            line = line.strip()
            if line:
                pmids.add(int(line))
    return pmids


def check_fields(fields, runname, pmids):
    """Validate one whitespace-split row of a run file.

    fields  -- list of column strings from a single line
    runname -- the tag every row of this run file is expected to carry
    pmids   -- set of integer PMIDs accepted as valid

    Returns None when the row is valid, otherwise the message template
    (one of BAD_FIELD_COUNT, BAD_TAG, BAD_TOPIC, BAD_PMID) describing the
    first problem found.  Only one problem is reported per row.
    """
    if len(fields) != NUM_FIELDS:
        return BAD_FIELD_COUNT

    topic, pmid, tag = fields[0], fields[1], fields[6]

    if tag != runname:
        return BAD_TAG

    # A non-numeric topic or PMID is reported as invalid rather than
    # letting int() abort the whole check with a ValueError.
    try:
        topic = int(topic)
    except ValueError:
        return BAD_TOPIC
    if topic < MIN_TOPIC or topic > MAX_TOPIC:
        return BAD_TOPIC

    try:
        pmid = int(pmid)
    except ValueError:
        return BAD_PMID
    if pmid not in pmids:
        return BAD_PMID

    return None


def check_run_file(filename, pmids):
    """Check every line of *filename* and return the number of invalid rows.

    One message per invalid row is written to standard output.  The
    reference tag is taken from the first row that has the expected number
    of fields, so an empty file or a malformed first line is reported as
    an ordinary error instead of raising IndexError.
    """
    invalid = 0
    runname = None
    with open(filename, 'r') as run_file:
        for linenum, line in enumerate(run_file, 1):
            fields = line.split()
            if runname is None and len(fields) == NUM_FIELDS:
                runname = fields[6]
            problem = check_fields(fields, runname, pmids)
            if problem is not None:
                sys.stdout.write(problem % (filename, linenum))
                invalid += 1
    return invalid


def main(argv=None):
    """Check all run files matching the glob pattern given in argv[1].

    argv defaults to sys.argv.  The valid PMIDs are read from "pmids.txt"
    in the current working directory.  Progress, per-row errors and the
    totals are written to standard output.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) != 2:
        sys.stdout.write("usage:python %s [path-to-run-files] > STDOUT\n" % argv[0])
        # Exit status 0 is kept here for compatibility with existing callers.
        sys.exit(0)

    # Reading all valid pmids from pmids.txt...
    sys.stdout.write("Reading pmids...\n")
    pmids = load_pmids()
    sys.stdout.write("done.\n")

    # add "zero" PMID that people may use to mean no document...
    pmids.add(0)

    netinvalid = 0
    # processing each run matched by the specified pattern....
    for filename in glob.glob(argv[1]):
        sys.stdout.write("Checking passages in %s...\n" % filename)
        invalid = check_run_file(filename, pmids)
        sys.stdout.write("Done, found %d invalid rows in this run file.\n\n" % invalid)
        netinvalid += invalid

    sys.stdout.write("Done, found %d invalid rows in all the run files.\n" % netinvalid)


if __name__ == "__main__":
    main()