"""Pool submitted passages into legal spans, per topic.

The script reads every checked ("clean") run file matching a glob pattern
and a legal-spans file, maps each submitted passage onto the legal span that
fully contains it, and keeps, for every topic, the best (lowest) rank at
which each legal span was retrieved.  The POOL-SIZE best-ranked spans of each
topic form the pool.

Usage:
    python SCRIPT [path-to-cleanrun-files] [path-to-legal-spans-file] [POOL-SIZE] > STDOUT

Input formats (whitespace separated):
    legal spans file:  pmid start length
    run files:         topicid pmid rank score start length tag

Output (STDOUT), one line per pooled span, every field followed by a tab:
    topicid<TAB>pmid<TAB>span-offset<TAB>span-length<TAB>

Progress and diagnostics are written to STDERR.
"""

import sys
import glob
import os.path

# Topics for which a pool is produced (160..187 inclusive).
TOPIC_IDS = list(range(160, 188))


def parse_legal_spans(lines):
    """Build the PMID -> legal spans table from the lines of the spans file.

    Lines that do not consist of exactly three whitespace-separated fields
    are ignored.  A three-field line with a non-integer field raises
    ValueError.

    Returns a dict mapping int PMID to a list of (start, length) tuples in
    file order.
    """
    spans_by_pmid = {}
    for line in lines:
        fields = line.split()
        if len(fields) != 3:
            continue
        pmid, start, length = [int(s) for s in fields]
        spans_by_pmid.setdefault(pmid, []).append((start, length))
    return spans_by_pmid


def find_enclosing_span(spans, start, length):
    """Return the first (offset, count) span fully containing the passage.

    `spans` is a sequence of (offset, count) tuples; the passage covers
    [start, start + length).  Returns None when no span contains it.
    """
    end = start + length
    for offset, count in spans:
        if start >= offset and end <= offset + count:
            return (offset, count)
    return None


def record_run_line(line, filename, spans_by_pmid, best_by_topic):
    """Validate one run-file line and record its span's best rank.

    `best_by_topic` maps each pooled topic id to a dict of
    (pmid, offset, count) -> (rank, tag); it is updated in place.  Lines for
    topics that are not keys of `best_by_topic` are validated but not stored.
    Blank lines are ignored.  Malformed lines, unknown PMIDs and passages
    that fit no legal span are reported on STDERR and skipped.
    """
    line = line.strip()
    if not line:
        return

    fields = line.split()
    if len(fields) != 7:
        sys.stderr.write("Invalid line in file %s:%s\n" % (filename, line))
        return

    topicid, pmid, rank, _score, start, length, tag = fields
    try:
        topic = int(topicid)
        pmid_number = int(pmid)
        rank = int(rank)
        start = int(start)
        length = int(length)
    except ValueError:
        # A non-numeric field makes the line unusable; report it like any
        # other malformed line instead of aborting the whole run.
        sys.stderr.write("Invalid line in file %s:%s\n" % (filename, line))
        return

    spans = spans_by_pmid.get(pmid_number)
    if spans is None:
        sys.stderr.write("Invalid PMID in span %s %d %d in file %s.\n"
                         % (pmid, start, length, filename))
        return

    span = find_enclosing_span(spans, start, length)
    if span is None:
        sys.stderr.write("Invalid span %s %d %d in file %s.\n"
                         % (pmid, start, length, filename))
        return

    best = best_by_topic.get(topic)
    if best is None:
        return  # topic is not being pooled

    # The PMID keeps its submitted spelling and the span bounds are stored
    # as strings: they are emitted verbatim and, when rank and tag are both
    # equal, break ties lexicographically.
    key = (pmid, str(span[0]), str(span[1]))
    # Strict "<" keeps the first-seen tag when two runs tie on rank.
    if key not in best or rank < best[key][0]:
        best[key] = (rank, tag)


def select_pool(best, pool_size):
    """Return up to `pool_size` span keys ordered by rank, then run tag.

    `best` maps (pmid, offset, count) -> (rank, tag).  Remaining ties are
    broken by the key itself.
    """
    ranked = sorted((val, key) for key, val in best.items())
    return [key for _val, key in ranked[:pool_size]]


def main(argv):
    """Run the pooling pipeline; return the process exit status.

    `argv` is the full argument vector (program name first).  Returns 0 on
    success and 2 on a usage error.
    """
    if len(argv) != 4:
        sys.stderr.write("usage:python %s [path-to-cleanrun-files] [path-to-legal-spans-file] [POOL-SIZE] > STDOUT\n" % argv[0])
        return 2

    # Validate the pool size before doing any expensive work.
    try:
        pool_size = int(argv[3])
    except ValueError:
        pool_size = -1
    if pool_size < 0:
        sys.stderr.write("POOL-SIZE must be a non-negative integer, got: %s\n" % argv[3])
        return 2

    sys.stderr.write("Reading valid spans file...")
    with open(argv[2], 'r') as spans_file:
        spans_by_pmid = parse_legal_spans(spans_file)
    sys.stderr.write("\ndone.\n")

    sys.stderr.write("Mapping submission passages to spans...")
    best_by_topic = dict((tid, {}) for tid in TOPIC_IDS)
    # glob returns files in arbitrary order; sorting makes rank ties (which
    # keep the first-seen run tag) resolve the same way on every machine.
    for filename in sorted(glob.glob(argv[1])):
        with open(filename, 'r') as run_file:
            for line in run_file:
                record_run_line(line, filename, spans_by_pmid, best_by_topic)
    sys.stderr.write("\ndone.\n")

    # The span table is large and no longer needed.
    spans_by_pmid = None

    for tid in TOPIC_IDS:
        sys.stderr.write("Pooling for topic %d..." % tid)
        for pmid, offset, count in select_pool(best_by_topic[tid], pool_size):
            sys.stdout.write("%d\t%s\t%s\t%s\t\n" % (tid, pmid, offset, count))
        sys.stdout.flush()
        sys.stderr.write("done.\n")
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))