# makeforms.py
# Create judging forms for TREC 2006 Genomics Track.
#
# Note that primarily implicit error checking is done: operations will fail if
# files are not in the proper format or if expected data elements are missing.
# This is actually what we want, because we need to check the files and data
# if the program fails.
#
# This program creates one output file per topic.
#
# Written for Python 3: zip member contents are handled as bytes and decoded
# explicitly (see IO_ENCODING below).

import glob
import os.path
import re
import sys
import zipfile

# Excel syntax magic, when these are imported into Excel using tab-separated
# values they become hyperlinks...
PUBMED_URL_TEMPLATE = '=HYPERLINK("http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=PureSearch&db=pubmed&details_term=<>%5BUID%5D", <>)'
MESH_URL = '=HYPERLINK("http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=mesh", "MESH ASPECTS")'

# constants...
COLUMN_HEADINGS = [
    "TOPIC",
    "QUESTION",
    "PMID",
    "OFFSET",
    "LENGTH",
    "SPANID",
    "PLAIN TEXT",
    "RELEVANCE",
    "ANSWER TEXT",
    MESH_URL,
]
DROPDOWN_LIST_ITEMS = ["?", "Not", "Possibly", "Definitely"]
ANGLE_BRACKET_REGEX = re.compile(r"<.*?>", re.DOTALL)

# latin-1 maps every byte value to exactly one code point, so reading and
# writing everything with it passes the input bytes through unchanged and can
# never raise a decode/encode error.
# NOTE(review): the real encoding of the HTML and topics files is not known
# from this script -- confirm if the output must be in a specific encoding.
IO_ENCODING = "latin-1"


def plaintext(text):
    """Return `text` with markup removed and whitespace normalized.

    Anything enclosed in angle brackets is deleted (the match may span
    newlines), then every run of whitespace is collapsed to a single space
    and leading/trailing whitespace is dropped.
    """
    # remove anything enclosed in angle brackets...
    text = ANGLE_BRACKET_REGEX.sub("", text)
    # normalize whitespace...
    return " ".join(text.split())


# these functions need to be customized for the final judging format....

def outputJudgingFormHeader(file):
    """Write the tab-separated column heading line to `file`."""
    file.write("%s\n" % '\t'.join(COLUMN_HEADINGS + DROPDOWN_LIST_ITEMS))


def outputJudgingFormFooter(file):
    """Write the form footer (a single empty line) to `file`."""
    file.write("\n")


def outputJudgingFormLine(file, topic, question, pmid, offset, length, plaintext):
    """Write one tab-separated judging row for a span to `file`.

    All arguments after `file` are strings.  The PMID column is emitted as an
    Excel HYPERLINK formula, and the three judgment columns (relevance,
    answer text, MeSH aspects) are pre-filled with "?".

    Note that the `plaintext` parameter shadows the module-level function of
    the same name inside this body; here it is the already-cleaned span text.
    """
    spanid = "%s.%s.%s" % (pmid, offset, length)
    url = PUBMED_URL_TEMPLATE.replace("<>", pmid)
    relevance = "?"
    answertext = "?"
    meshaspects = "?"
    items = [topic, question, url, offset, length, spanid, plaintext,
             relevance, answertext, meshaspects]
    file.write("%s\n" % '\t'.join(items))


def _data_lines(lines):
    """Yield each line stripped, skipping blank lines and lines starting with a hash."""
    for line in lines:
        line = line.strip()
        if not line or line.startswith('#'):
            continue
        yield line


def _index_html_zips(zipped_html_glob):
    """Map pmid -> (zip file name, member name) for every zip matching the glob.

    The pmid is the member's base file name up to its first dot.
    """
    pmid2htmlfile = {}
    for zipfilename in glob.glob(zipped_html_glob):
        with zipfile.ZipFile(zipfilename, 'r') as zfile:
            for filename in zfile.namelist():
                pmid = os.path.split(filename)[1].split(".")[0]
                pmid2htmlfile[pmid] = (zipfilename, filename)
    return pmid2htmlfile


def _index_pooled_spans(pooled_spans_file):
    """Map topic -> list of (pmid, offset, length) string tuples, in file order."""
    topic2pooledspans = {}
    with open(pooled_spans_file, 'r', encoding=IO_ENCODING) as file:
        for line in _data_lines(file):
            # note, this will fail if line does not contain exactly 4 fields!
            (topic, pmid, offset, length) = line.split()
            topic2pooledspans.setdefault(topic, []).append((pmid, offset, length))
    return topic2pooledspans


def _read_span_text(pmid2htmlfile, pmid, offset, length):
    """Return the cleaned text of the span `offset`/`length` in the document for `pmid`.

    Raises KeyError if no zipped html file was indexed for `pmid`.
    """
    (zipfilename, filename) = pmid2htmlfile[pmid]
    with zipfile.ZipFile(zipfilename, 'r') as zfile:
        allhtml = zfile.read(filename)
    start = int(offset)
    count = int(length)
    # ZipFile.read() returns bytes.  Slice the raw bytes first (the offsets
    # are applied to the stored member contents, exactly as before), then
    # decode so the str-based regex in plaintext() can process the span.
    span = allhtml[start:start + count].decode(IO_ENCODING)
    return plaintext(span)


def main(argv=None):
    """Build one judging form file per topic.

    `argv` defaults to sys.argv and must hold the program name followed by:
    pooled spans file, topics file, zipped html glob, output base filepath.
    Each topic with pooled spans is written to
    "<output base filepath>_topic<topic>.tsv.txt".  Progress and errors are
    reported on stderr.
    """
    if argv is None:
        argv = sys.argv

    # check command line...
    if len(argv) != 5:
        sys.stderr.write("usage:%s [pooled spans file] [topics file] [zipped html glob] [output base filepath]\n" % argv[0])
        # a usage error is a failure, so do not report success to the caller
        sys.exit(2)

    pooled_spans_file = argv[1]
    topics_file = argv[2]
    zipped_html_glob = argv[3]
    output_base_filepath = argv[4]

    # create index of fulltext zips...
    sys.stderr.write("Creating html zip file index...")
    pmid2htmlfile = _index_html_zips(zipped_html_glob)
    sys.stderr.write("OK.\n")

    # create index of pooled spans by topic...
    sys.stderr.write("Creating pooled spans topic index...")
    topic2pooledspans = _index_pooled_spans(pooled_spans_file)
    sys.stderr.write("OK.\n")

    # process the topics...
    sys.stderr.write("Processing topics...")
    with open(topics_file, 'r', encoding=IO_ENCODING) as topics:
        for line in _data_lines(topics):
            # extract topicid and question, this will fail if improper format...
            mo = re.match(r"<(\d+)>(.*)", line)
            topic = mo.group(1)
            question = mo.group(2)
            sys.stderr.write("\nTopic %s..." % topic)

            # check for topic spans...
            if topic not in topic2pooledspans:
                sys.stderr.write("ERROR, NO POOLED SPANS FOR Topic %s!\n" % topic)
                continue

            # open output file for this topic...
            output_path = "%s_topic%s.tsv.txt" % (output_base_filepath, topic)
            with open(output_path, 'w', encoding=IO_ENCODING) as output:
                outputJudgingFormHeader(output)

                # process all of the pooled spans for this topic...
                for (pmid, offset, length) in topic2pooledspans[topic]:
                    sys.stderr.write("(%s.%s.%s)..." % (pmid, offset, length))
                    # grab span text from the appropriate file...
                    text = _read_span_text(pmid2htmlfile, pmid, offset, length)
                    outputJudgingFormLine(output, topic, question, pmid, offset, length, text)

                outputJudgingFormFooter(output)

    sys.stderr.write("OK.\n")


if __name__ == "__main__":
    main()