# makeforms.py
# Create judging forms for TREC 2006 Genomics Track.
# Note that error checking is primarily implicit: operations will fail if files are not in the proper format
# or if expected data elements are missing.
# This is intentional, because a failure tells us that the files and data need to be checked.
# This program creates one output file per topic.
import sys
import glob
import os.path
import re
import zipfile

# check command line...
# Exactly four arguments are required after the script name.
if len(sys.argv) != 5:
	sys.stderr.write("usage:%s [pooled spans file] [topics file] [zipped html glob] [output base filepath]\n" % sys.argv[0])
	# a usage error is a failure, so exit non-zero to let calling shells/scripts detect it...
	sys.exit(2)

# positional arguments, in the order shown in the usage message...
pooled_spans_file = sys.argv[1]
topics_file = sys.argv[2]
zipped_html_glob = sys.argv[3]
output_base_filepath = sys.argv[4]

# Excel formula templates: when the tab-separated output is imported into Excel,
# cells holding these =HYPERLINK(...) formulas become clickable hyperlinks.
# Every "<<PMID>>" placeholder is substituted with the actual PMID when a form line is written...
PUBMED_URL_TEMPLATE = '=HYPERLINK("http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?cmd=PureSearch&db=pubmed&details_term=<<PMID>>%5BUID%5D", <<PMID>>)'
# fixed link, used as the heading of the last form column...
MESH_URL = '=HYPERLINK("http://www.ncbi.nlm.nih.gov/entrez/query.fcgi?db=mesh", "MESH ASPECTS")'

# headings of the judging form columns, in output order
# (kept as a list because it is concatenated with DROPDOWN_LIST_ITEMS for the header row)...
COLUMN_HEADINGS = [
	"TOPIC", "QUESTION", "PMID", "OFFSET", "LENGTH", "SPANID",
	"PLAIN TEXT", "RELEVANCE", "ANSWER TEXT",
	MESH_URL,
]

# extra cells appended after the column headings in the header row
# (presumably the allowed relevance choices for a dropdown in the spreadsheet -- confirm)...
DROPDOWN_LIST_ITEMS = ["?", "Not", "Possibly", "Definitely"]
	
# non-greedy match of anything enclosed in angle brackets; DOTALL lets a match span line breaks...
ANGLE_BRACKET_REGEX = re.compile("<.*?>", re.DOTALL)
def plaintext(text):
	"""Return text with every <...> sequence deleted and whitespace normalized.

	All runs of whitespace (including newlines) collapse to a single space and
	leading/trailing whitespace is dropped.
	"""
	untagged = ANGLE_BRACKET_REGEX.sub("", text)
	# split() with no arguments discards all whitespace, so re-joining normalizes it...
	words = untagged.split()
	return " ".join(words)

# these functions need to be customized for the final judging format....
def outputJudgingFormHeader(file):
	"""Write the form's header row to file.

	The row is the column headings followed by the dropdown list items,
	tab-separated and terminated by a newline.
	"""
	header_cells = COLUMN_HEADINGS + DROPDOWN_LIST_ITEMS
	file.write('\t'.join(header_cells) + "\n")
	
def outputJudgingFormFooter(file):
	"""Write the form's footer to file: a single empty line."""
	blank_line = "\n"
	file.write(blank_line)

def outputJudgingFormLine(file, topic, question, pmid, offset, length, plaintext):
	"""Write one tab-separated judging row for a single span to file.

	All arguments other than file are strings. The row holds, in order: topic,
	question, the PubMed hyperlink formula for pmid, offset, length, the span id
	"pmid.offset.length", the span's plain text, and three "?" cells for the
	relevance, answer text and MeSH aspects columns.
	"""
	# placeholder written into the three trailing cells...
	unjudged = "?"
	row = [
		topic,
		question,
		PUBMED_URL_TEMPLATE.replace("<<PMID>>", pmid),
		offset,
		length,
		"%s.%s.%s" % (pmid, offset, length),
		plaintext,
		unjudged,
		unjudged,
		unjudged,
	]
	file.write('\t'.join(row) + "\n")
	
# create index of fulltext zips...
# pmid2htmlfile maps a PMID (the archive member's base filename up to its first '.')
# to a (zip file path, member name) pair; a PMID seen again overwrites the earlier entry.
sys.stderr.write("Creating html zip file index...")
pmid2htmlfile = {}
for zipfilename in glob.glob(zipped_html_glob):
	# the context manager closes the archive even if listing it fails...
	with zipfile.ZipFile(zipfilename, 'r') as zfile:
		for filename in zfile.namelist():
			pmid = os.path.split(filename)[1].split(".")[0]
			# directory entries ("dir/") and dot-files yield an empty id, these are not documents...
			if not pmid:
				continue
			pmid2htmlfile[pmid] = (zipfilename, filename)
sys.stderr.write("OK.\n")

# create index of pooled spans by topic...
# topic2pooledspans maps a topic id to the list of (pmid, offset, length) string
# tuples pooled for it, in file order.
sys.stderr.write("Creating pooled spans topic index...")
topic2pooledspans = {}
# the context manager closes the file even when a malformed line makes the loop fail...
with open(pooled_spans_file, 'r') as file:
	for line in file:
		line = line.strip()
		# skip blank lines and lines starting with a hash...
		if len(line) == 0 or line[0] == '#':
			continue
		# note, this will fail if line does not contain exactly 4 fields!
		(topic, pmid, offset, length) = line.split()
		topic2pooledspans.setdefault(topic, []).append((pmid, offset, length))
sys.stderr.write("OK.\n")

# process the topics...
# One judging form ("<output base>_topic<id>.tsv.txt") is written for every topic line
# of the form "<topicid>question" that has pooled spans.
# Malformed topic lines, PMIDs missing from the zip index and non-numeric offsets or
# lengths are deliberately left to raise, so that bad input data gets noticed.
sys.stderr.write("Processing topics...")
# context managers make sure the topics file (previously never closed), the output
# file and the zip archive are closed even when one of those failures occurs...
with open(topics_file, 'r') as file:
	for line in file:
		line = line.strip()
		# skip blank lines and lines starting with a hash...
		if len(line) == 0 or line[0] == '#':
			continue
		# extract topicid and question, this will fail if improper format...
		mo = re.match(r"<(\d+)>(.*)", line)
		topic = mo.group(1)
		question = mo.group(2)
		sys.stderr.write("\nTopic %s..." % topic)
		# check for topic spans...
		if topic not in topic2pooledspans:
			sys.stderr.write("ERROR, NO POOLED SPANS FOR Topic %s!\n" % topic)
			continue
		# open output file for this topic...
		with open("%s_topic%s.tsv.txt" % (output_base_filepath, topic), 'w') as output:
			outputJudgingFormHeader(output)
			# process all of the pooled spans for this topic...
			for (pmid, offset, length) in topic2pooledspans[topic]:
				sys.stderr.write("(%s.%s.%s)..." % (pmid, offset, length))
				# grab span text from the appropriate file...
				(zipfilename, filename) = pmid2htmlfile[pmid]
				with zipfile.ZipFile(zipfilename, 'r') as zfile:
					allhtml = zfile.read(filename)
				start = int(offset)
				count = int(length)
				# slice the raw archive data first, so offset and length keep counting bytes...
				span = allhtml[start:start+count]
				# On Python 3 ZipFile.read() returns bytes, which plaintext()'s str regex rejects
				# with a TypeError; on Python 2 the data is already a str and is left untouched.
				# latin-1 maps every byte to exactly one character and can never fail.
				# NOTE(review): confirm latin-1 is an acceptable rendering for this corpus.
				if not isinstance(span, str):
					span = span.decode("latin-1")
				text = plaintext(span)
				outputJudgingFormLine(output, topic, question, pmid, offset, length, text)
			outputJudgingFormFooter(output)
		sys.stderr.write("OK.\n")