# makegldstd.py
# Create gold standard from filled-out judging forms for TREC 2006 Genomics Track.
# Note that primarily implicit error checking is done, operations will fail if files are not in the proper format
# or if expected data elements are missing.
# This is actually what we want, because we need to check the files and data if the program fails.
# Also includes a check mode for syntax only when no spans or zip glob arguments are given.

import sys
import glob
import os.path
import re
import zipfile
import os
import cPickle

# check command line...
# 2 args = syntax-check-only mode; 4 args = full gold standard generation.
if len(sys.argv) not in (2, 4):
    sys.stderr.write("usage:%s [completed judging forms glob] [legal spans file] [zipped html glob] > STDOUT\n" % sys.argv[0])
    # exit nonzero on a usage error (was sys.exit(0), which signalled success to callers)...
    sys.exit(1)
else:
    # save command line arguments...
    completed_judging_forms_glob = sys.argv[1]
    if len(sys.argv) == 4:
        legal_spans_file = sys.argv[2]
        zipped_html_glob = sys.argv[3]
    else:
        # syntax-check-only mode: no spans file or zipped html to process...
        legal_spans_file = None
        zipped_html_glob = None

# define constants...
MAXIMUM_FLOAT = 1.7976E+308  # IEEE-754 double precision maximum floating point value (used as an "infinite" cost sentinel)
PICKLED_HTML_INDEX_FILE = "trecgen.2006.makegoldstd.pickle"  # cached pmid -> (zipfile, member) index

# judged entry fieldnames...
TOPIC_FIELD = "TOPIC"
QUESTION_FIELD = "QUESTION"
PMID_FIELD = "PMID"
OFFSET_FIELD = "OFFSET"
LENGTH_FIELD = "LENGTH"
SPANID_FIELD = "SPANID"
PLAINTEXT_FIELD = "PLAINTEXT"
RELEVANCE_FIELD = "RELEVANCE"
ANSWERTEXT_FIELD = "ANSWERTEXT"
MESHASPECTS_FIELD = "MESHASPECTS"

# judging form column labels, in the tab-separated column order of the forms...
COLUMN_HEADINGS = [
    TOPIC_FIELD,
    QUESTION_FIELD,
    PMID_FIELD,
    OFFSET_FIELD,
    LENGTH_FIELD,
    SPANID_FIELD,
    PLAINTEXT_FIELD,
    RELEVANCE_FIELD,
    ANSWERTEXT_FIELD,
    MESHASPECTS_FIELD
]

# positive relevance definitions...
# substrings looked for (case-insensitively) in the judge-supplied relevance column...
POSITIVE_RELEVANCE = ["possibly", "definitely"]
NEGATIVE_RELEVANCE = ["not"]
POSITIVE_RELEVANCE_LABEL = "RELEVANT"
NEGATIVE_RELEVANCE_LABEL = "NOT_RELEVANT"

# constants...
NONRELEVANT_COLUMN_HEADING_COUNT = 8   # columns present when a passage is judged not relevant
RELEVANT_COLUMN_HEADING_COUNT = 10     # columns present when a passage is judged relevant


def fatalError(msg):
    # Report a fatal processing error on stderr and abort by raising RuntimeError.
    sys.stderr.write("%s\n\n" % msg)
    raise RuntimeError


def stripQuotes(s):
    # Strip any mix of leading/trailing whitespace and single/double quote
    # characters from s.  Both scans are bounds-checked so that a string
    # consisting entirely of quotes/whitespace (e.g. '""') returns ""
    # instead of raising IndexError (bug fix).
    n = len(s)
    i = 0
    while i < n and (s[i].isspace() or s[i] == '"' or s[i] == "'"):
        i += 1
    s = s[i:]
    i = len(s)
    while i > 0 and (s[i-1].isspace() or s[i-1] == '"' or s[i-1] == "'"):
        i -= 1
    return s[0:i]


def mapRelevance(s):
    # Map a free-form judge relevance string to one of the canonical labels
    # by case-insensitive substring search; raises RuntimeError if no match.
    s = s.lower()
    for v in POSITIVE_RELEVANCE:
        if s.find(v) >= 0:
            return POSITIVE_RELEVANCE_LABEL
    for v in NEGATIVE_RELEVANCE:
        if s.find(v) >= 0:
            return NEGATIVE_RELEVANCE_LABEL
    # unable to map relevance value...
    raise RuntimeError


def normalizeMesh(s):
    # Normalize a pipe-separated list of MeSH aspects:
    # strip leading/trailing quotes, spaces and pipe characters,
    # trim each term, drop '*' central-concept markers (useless to us),
    # and uppercase everything for consistency.
    if len(s) == 0:
        return s
    s = stripQuotes(s)
    # strip leading/trailing pipe characters; bounds-checked so an
    # all-pipe input (e.g. '|') yields "" instead of IndexError (bug fix)...
    n = len(s)
    i = 0
    while i < n and s[i] == '|':
        i += 1
    s = s[i:]
    i = len(s)
    while i > 0 and s[i-1] == '|':
        i -= 1
    s = s[0:i]
    # now split by pipe, remove extra spaces, and put back together...
    s = '|'.join([t.strip() for t in s.split('|')])
    # remove any asterisks, which designate central concepts and are useless to us...
    s = s.replace('*', '')
    # normalize case, convert everything to capitals for consistency...
    s = s.upper()
    return s


def normalizeWhitespace(s):
    # Collapse all runs of whitespace to single spaces and trim the ends.
    return " ".join(s.split())


def removeDoubledQuotes(s):
    # Undo Excel-style quote escaping ("" -> ").
    return s.replace('""', '"')


def normalizeAnswer(s):
    # Full normalization pipeline for judge-entered answer text.
    return removeDoubledQuotes(normalizeWhitespace(stripQuotes(s)))


# read in single passage judgement from judging file, each on a single line...
# returns a populated dictionary, or None if EOF.
def readJudgedEntry(file, filename):
    # skip blank lines...
    line = None
    while not line:
        line = file.readline()
        # check for EOF (readline returns "" only at EOF)...
        if not line:
            return None
        else:
            line = line.strip()
    # process fields in line...
    fields = line.split('\t')
    # check field validity...
    if len(fields) < NONRELEVANT_COLUMN_HEADING_COUNT or \
       len(fields) > RELEVANT_COLUMN_HEADING_COUNT:
        fatalError("Incorrect number of fields in file %s at line '%s'" % (filename, line))
    # convert lists to dictionary (zip pairs each present field with its heading)...
    dx = {}
    for (label, value) in zip(COLUMN_HEADINGS, fields):
        dx[label] = value
    # convert numeric fields from strings to integers...
    try:
        dx[OFFSET_FIELD] = int(dx[OFFSET_FIELD])
        dx[LENGTH_FIELD] = int(dx[LENGTH_FIELD])
    except:
        fatalError("Invalid expected numeric value in file %s at line '%s'" % (filename, line))
    # map relevance data...
    try:
        dx[RELEVANCE_FIELD] = mapRelevance(dx[RELEVANCE_FIELD])
    except:
        fatalError("Invalid relevance value in file %s at line '%s'" % (filename, line))
    # strip leading/trailing spaces and quotes from text fields...
    # this is necessary because exporting string data from Excel sometimes quotes strings...
    try:
        if PLAINTEXT_FIELD in dx:
            dx[PLAINTEXT_FIELD] = stripQuotes(dx[PLAINTEXT_FIELD])
        if ANSWERTEXT_FIELD in dx:
            dx[ANSWERTEXT_FIELD] = normalizeAnswer(dx[ANSWERTEXT_FIELD])
    except:
        fatalError("Invalid text field in file %s at line '%s'" % (filename, line))
    # normalize mesh aspects...
    try:
        if MESHASPECTS_FIELD in dx:
            dx[MESHASPECTS_FIELD] = normalizeMesh(dx[MESHASPECTS_FIELD])
    except:
        fatalError("Invalid MeSH aspects field in file %s at line '%s'" % (filename, line))
    # check field consistency: a relevant passage must carry answer text...
    if dx[RELEVANCE_FIELD] == POSITIVE_RELEVANCE_LABEL and ANSWERTEXT_FIELD not in dx:
        fatalError("No answertext for relevant passage in file %s at line '%s'" % (filename, line))
    # return entry...
    return dx


def includeRelevance(relevance):
    # Only positively judged passages go into the gold standard.
    return relevance == POSITIVE_RELEVANCE_LABEL


def writeGoldStandardEntry(file, topicid, pubmedid, offset, length, mesh, html, answer):
    # Write one tab-separated gold standard line.
    # NOTE: the answer text and whitespace-normalized html passage are extra
    # debugging columns appended after the core topic/pmid/offset/length/mesh fields.
    # (A dead duplicate assignment of the non-debug field set was removed.)
    fields = []  # pre-bind so the except clause can always render it
    try:
        fields = [topicid, pubmedid, str(offset), str(length), mesh, answer, " ".join(html.split())]
        file.write("%s\n" % "\t".join(fields))
    except:
        fatalError("Unable to convert and write gold standard entry:%s" % str(fields))
    return


def writeGoldStandardHeader(file):
    # do nothing...
    return


def writeGoldStandardFooter(file):
    # do nothing...
    return
def findLegalSpanContainedMatch(passage, pmidSpans):
    # Return the first legal (offset, length) span that fully CONTAINS the
    # judged passage (offset, length) tuple, or None if none does.
    end = passage[0] + passage[1]
    for (offset, length) in pmidSpans:
        if passage[0] >= offset and end <= (offset + length):
            return (offset, length)
    return None


def findLegalSpanExactMatch(passage, pmidSpans):
    # Return the legal span that EXACTLY equals the judged passage, or None.
    for (offset, length) in pmidSpans:
        if passage[0] == offset and passage[1] == length:
            return (offset, length)
    return None


# use exact span match version...
findLegalSpan = findLegalSpanExactMatch


class StringAlign(object):
    # Generic dynamic-programming string aligner (Needleman-Wunsch style).
    # Subclasses supply the insertion/deletion/substitution cost functions;
    # costs may depend on the previous operation ("prior") so runs of the
    # same operation can be made cheaper, encouraging contiguous alignments.

    # define class constants (operation codes stored in the P traceback matrix)...
    INSERTION = 'I'
    DELETION = 'D'
    SUBSTITUTION = 'S'

    def __init__(self):
        super(StringAlign, self).__init__()
        return

    def insertion_cost(self, prior):
        # Cost of inserting a character of B; prior is the previous operation code.
        raise NotImplementedError

    def deletion_cost(self, prior):
        # Cost of deleting a character of A; prior is the previous operation code.
        raise NotImplementedError

    def substitution_cost(self, a, b, prior, gap):
        # Cost of aligning character a (from A) with b (from B).
        # NOTE(review): this abstract signature takes an extra 'gap' argument
        # that neither align() nor the concrete subclass uses - looks vestigial;
        # confirm before relying on it.
        raise NotImplementedError

    def align(self, A, B, gapchar):
        # Align string B against string A, returning
        # (alignedA, offset, alignedB, length, total_cost) where offset/length
        # locate the aligned region of B within A (0-based, via minAlign/maxAlign).
        # Leading and trailing deletions of A are free, so B is effectively
        # located as a substring match inside A.

        # dereference for speed...
        substitution_cost = self.substitution_cost
        deletion_cost = self.deletion_cost
        insertion_cost = self.insertion_cost
        # loop constants...
        lenA = len(A)
        lenB = len(B)
        # compute F (cost) and P (traceback operation) matrices...
        F = [[0.0]*(lenB+1) for i in range(lenA+1)]
        P = [[None]*(lenB+1) for i in range(lenA+1)]
        F[0][0] = 0.0
        P[0][0] = None
        for i in range(1, lenA+1):
            # entries are zero for zero cost for header deletions...
            # (i.e. skipping a prefix of A costs nothing)
            F[i][0] = 0.0
            P[i][0] = StringAlign.DELETION
        for j in range(1, lenB+1):
            # skipping a prefix of B is forbidden (infinite cost)...
            F[0][j] = MAXIMUM_FLOAT
            P[0][j] = StringAlign.INSERTION
        for i in range(1, lenA+1):
            for j in range(1, lenB+1):
                diag = F[i-1][j-1] + substitution_cost(A[i-1], B[j-1], P[i-1][j-1])
                left = F[i-1][j] + deletion_cost(P[i-1][j])
                up = F[i][j-1] + insertion_cost(P[i][j-1])
                best = min(diag, left, up)
                F[i][j] = best
                # record which operation produced the best cost for traceback...
                if best == diag:
                    P[i][j] = StringAlign.SUBSTITUTION
                elif best == left:
                    P[i][j] = StringAlign.DELETION
                elif best == up:
                    P[i][j] = StringAlign.INSERTION
                else:
                    raise RuntimeError
        # determine alignment...
        alignedA = ""
        alignedB = ""
        i = lenA
        j = lenB
        minAlign = lenA + 1   # smallest A index touched by a substitution
        maxAlign = -1         # largest A index touched by a substitution
        total_cost = F[lenA][lenB]
        # start in minimum cost of the last row to enable penalty-free tail-end deletions...
        imin = lenA
        iminCost = F[lenA][lenB]
        for i in range(1, lenA+1):
            if F[i][lenB] < iminCost:
                iminCost = F[i][lenB]
                imin = i
        # pad the alignment strings for the skipped tail of A...
        # NOTE(review): A[i-1] with i starting at 0 prepends A[-1], A[0], ... -
        # these look like the wrong characters for the tail pad; harmless for
        # the returned offset/length (alignedA/alignedB are debug-only), but
        # confirm before trusting the aligned strings themselves.
        for i in range(lenA - imin):
            alignedA = A[i-1] + alignedA
            alignedB = gapchar + alignedB
        i = imin
        # traceback: re-derive at each cell which operation produced its cost...
        while i > 0 and j > 0:
            score = F[i][j]
            diag = F[i-1][j-1]
            up = F[i][j-1]
            left = F[i-1][j]
            if score == diag + substitution_cost(A[i-1], B[j-1], P[i-1][j-1]):
                alignedA = A[i-1] + alignedA
                alignedB = B[j-1] + alignedB
                # track the extent of the substituted (matched) region in A...
                if i > maxAlign:
                    maxAlign = i
                if i < minAlign:
                    minAlign = i
                i -= 1
                j -= 1
            elif score == left + deletion_cost(P[i-1][j]):
                alignedA = A[i-1] + alignedA
                alignedB = gapchar + alignedB
                i -= 1
            elif score == up + insertion_cost(P[i][j-1]):
                alignedA = gapchar + alignedA
                alignedB = B[j-1] + alignedB
                j -= 1
            else:
                raise RuntimeError
        # consume any remaining prefix of A (free header deletions)...
        while i > 0:
            alignedA = A[i-1] + alignedA
            alignedB = gapchar + alignedB
            i -= 1
        # consume any remaining prefix of B...
        while j > 0:
            alignedA = gapchar + alignedA
            alignedB = B[j-1] + alignedB
            j -= 1
        # clean up (release the potentially large DP matrices promptly)...
        F = None
        P = None
        # return results...
        # print lenA, lenB, minAlign, maxAlign
        return (alignedA, minAlign - 1, alignedB, maxAlign - minAlign + 1, total_cost)


class AlignPlaintext2HTML(StringAlign):
    # Aligner specialized for locating judge-edited plaintext inside raw HTML:
    # characters of the HTML (A) may be deleted freely-ish, characters of the
    # plaintext (B) may never be inserted, and only identical characters
    # (or whitespace-for-whitespace) may be substituted.

    def deletion_cost(self, prior):
        # it is better (lower cost) to have multiple deletions in a row,
        # this ensure maximal contiguous alignment substring length...
        if prior == StringAlign.DELETION:
            return +1.0
        else:
            return +2.0

    def insertion_cost(self, prior):
        # insertions are forbidden: every plaintext character must appear in the HTML...
        return MAXIMUM_FLOAT

    def substitution_cost(self, a, b, prior):
        # only allow subsitutions of identical characters,
        # and treat all whitespace as identical...
        if a == b or (a.isspace() and b.isspace()):
            # it is better (lower cost) to have multiple substitutions in a row
            # as opposed to substitutions separated by deletions,
            # this ensure maximal contiguous alignment substring length...
            if prior == StringAlign.SUBSTITUTION:
                return -2.0
            else:
                return -1.0
        else:
            return MAXIMUM_FLOAT

    def align(self, html, plaintext):
        # Locate plaintext within html; returns (offset, length) into html.
        # call base class with '~' as the gap character...
        (alignedA, offset, alignedB, length, score) = super(AlignPlaintext2HTML, self).align(html, plaintext, "~")
        # debugging...
        #print alignedA
        #print alignedB
        return (offset, length)


# speed up with psyco if available...
try:
    import psyco
    psyco.bind(StringAlign)
    psyco.bind(AlignPlaintext2HTML)
    sys.stderr.write("(psyco library installed - using speed optimization)\n")
except:
    sys.stderr.write("(psyco library not installed - no speed optimization)\n")
    pass

# build (or reload) the pmid -> (zipfile, member filename) index of the HTML corpus,
# cached on disk as a pickle because scanning all the zip files is slow...
if zipped_html_glob:
    if os.access(PICKLED_HTML_INDEX_FILE, os.R_OK):
        sys.stderr.write("Reading pickled html zip file index...")
        # NOTE(review): 'file' shadows the builtin here and below - preexisting style.
        file = open(PICKLED_HTML_INDEX_FILE, "rb")
        pmid2htmlfile = cPickle.load(file)
        file.close()
    else:
        sys.stderr.write("Creating html zip file index...")
        pmid2htmlfile = {}
        for zipfilename in glob.glob(zipped_html_glob):
            zfile = zipfile.ZipFile(zipfilename, 'r')
            for filename in zfile.namelist():
                # pmid is the member's base filename up to the first dot...
                pmid = os.path.split(filename)[1].split(".")[0]
                pmid2htmlfile[pmid] = (zipfilename, filename)
            zfile.close()
        sys.stderr.write("writing pickle...")
        file = open(PICKLED_HTML_INDEX_FILE, "wb")
        cPickle.dump(pmid2htmlfile, file)
        file.close()
    sys.stderr.write("OK.\n")

# read every completed judging form matched by the glob into judgedEntries...
sys.stderr.write("Loading judging results...\n")
judgedEntries = []
for filename in glob.glob(completed_judging_forms_glob):
    sys.stderr.write("Parsing '%s'.\n" % os.path.split(filename)[1])
    file = open(filename, 'r')
    # skip first header line...
    file.readline()
    # read all valid entries in file...
    while True:
        entry = readJudgedEntry(file, filename)
        if entry:
            judgedEntries.append(entry)
        else:
            break
    file.close()

# compute and print some descriptive counts...
# compute and print relevance label distribution over all judged entries...
sys.stderr.write("\nRelevance counts:\n")
relevanceCounts = {}
for entry in judgedEntries:
    relevanceCounts[entry[RELEVANCE_FIELD]] = relevanceCounts.get(entry[RELEVANCE_FIELD], 0) + 1
total = 0
for (label, count) in relevanceCounts.items():
    sys.stderr.write("%s:%d\n" % (label, count))
    total += count
sys.stderr.write("TOTAL:%d\n" % total)

# compute and print MeSH aspect usage, excluding not-relevant entries...
sys.stderr.write("\nMeSH aspect counts:\n")
meshCounts = {}
for entry in judgedEntries:
    if entry[RELEVANCE_FIELD] == NEGATIVE_RELEVANCE_LABEL:
        continue
    s = entry.get(MESHASPECTS_FIELD, None)
    if s:
        for t in s.split('|'):
            meshCounts[t] = meshCounts.get(t, 0) + 1
for (label, count) in meshCounts.items():
    sys.stderr.write("%s:%d\n" % (label, count))
sys.stderr.write("\nOK.\n")

# exit early if only checking syntax...
if len(sys.argv) == 2:
    sys.stderr.write("Finished Syntax Check, No Gold Standard Generated.\n")
    sys.exit(0)

# collect the pmids of all positively judged entries so we only
# keep legal spans that can possibly be needed...
sys.stderr.write("Loading judged RELEVANT legal spans...")
judgedPMIDs = set()
for entry in judgedEntries:
    if entry[RELEVANCE_FIELD] == POSITIVE_RELEVANCE_LABEL:
        judgedPMIDs.add(entry[PMID_FIELD])

# load the legal spans file: "pmid offset length" per line,
# blank lines and '#' comment lines skipped...
legalSpans = {}
spansfile = open(legal_spans_file, 'r')
for line in spansfile:
    # skip blank lines and lines starting with a hash...
    line = line.strip()
    if len(line) == 0 or line[0] == '#':
        continue
    # parse span...
    (pmid, offset, length) = line.split()
    offset = int(offset)
    length = int(length)
    # save spans in judged pmids...
    if pmid in judgedPMIDs:
        legalSpans.setdefault(pmid, []).append((offset, length))
# close the spans file explicitly (previously the handle was leaked)...
spansfile.close()
sys.stderr.write("OK.\n")

# determine and output gold standard passages...
sys.stderr.write("Writing gold standard...\n")
writeGoldStandardHeader(sys.stdout)
aligner = AlignPlaintext2HTML()
for entry in judgedEntries:
    try:
        # pick apart fields...
        topicid = entry[TOPIC_FIELD]
        pmid = entry[PMID_FIELD]
        offset = entry[OFFSET_FIELD]
        length = entry[LENGTH_FIELD]
        span = entry[SPANID_FIELD]
        relevance = entry[RELEVANCE_FIELD]
        answer = entry.get(ANSWERTEXT_FIELD, None)
        mesh = entry.get(MESHASPECTS_FIELD, None)
    except:
        fatalError("\nERROR - MISSING REQUIRED VALUES IN ENTRY:%s\n" % str(entry))
    # progress report (carriage return keeps it on one console line)...
    sys.stderr.write("Checking topic %s, span %s \r" % (topicid, span))
    if includeRelevance(relevance):
        # ensure that all data fields are present for a relevant passage...
        if not answer or not mesh:
            fatalError("\nERROR - MISSING REQUIRED VALUES FOR RELEVANT ENTRY:%s\n" % str(entry))
        # verify passage within a legal span; .get() gives a clean fatalError
        # instead of a raw KeyError when the pmid has no legal spans at all...
        span = findLegalSpan((offset, length), legalSpans.get(pmid, []))
        if not span:
            fatalError("\nERROR - (%s.%s.%s) IS NOT A LEGAL SPAN!" % (pmid, offset, length))
        # get text for legal span that entry fits into;
        # report a clean error if the pmid is missing from the html index...
        if pmid not in pmid2htmlfile:
            fatalError("\nERROR - NO HTML FILE INDEXED FOR PMID %s!" % pmid)
        (zipfilename, filename) = pmid2htmlfile[pmid]
        zfile = zipfile.ZipFile(zipfilename, 'r')
        allhtml = zfile.read(filename)
        spanhtml = allhtml[span[0]:span[0]+span[1]]
        zfile.close()
        # align answer with legal HTML span text...
        (goldoffset, goldlength) = aligner.align(spanhtml, answer)
        # compute the answer passage relative to start of html file...
        goldoffset = span[0] + goldoffset
        # mark as suspicious any gold passages that require the entire span...
        if goldlength >= span[1]:
            sys.stderr.write("\nSuspect answer text for passage (%s.%s.%s), check editing!\n" % (pmid, goldoffset, goldlength))
            answer = "(SUSPECT)%s" % answer
        # output the gold standard entry...
        writeGoldStandardEntry(sys.stdout, topicid, pmid, goldoffset, goldlength, mesh, allhtml[goldoffset:goldoffset+goldlength], answer)
sys.stderr.write("\nOK.\n")
writeGoldStandardFooter(sys.stdout)
sys.stderr.write("Finished.\n")