# makegldstd.py
# Create gold standard from filled-out judging forms for TREC 2006 Genomics Track.
# Note that primarily implicit error checking is done, operations will fail if files are not in the proper format
# or if expected data elements are missing.
# This is actually what we want, because we need to check the files and data if the program fails.
# Also includes a check mode for syntax only when no spans or zip glob arguments are given.

import sys
import glob
import os.path
import re
import zipfile
import os
import cPickle

# check command line...
# 2 args = syntax-check-only mode; 4 args = full gold standard generation.
if len(sys.argv) not in (2, 4):
    sys.stderr.write("usage:%s [completed judging forms glob] [legal spans file] [zipped html glob] > STDOUT\n" % sys.argv[0])
    # exit nonzero on a usage error (was sys.exit(0), which signalled success to callers)...
    sys.exit(1)
else:
    # save command line arguments...
    completed_judging_forms_glob = sys.argv[1]
    if len(sys.argv) == 4:
        legal_spans_file = sys.argv[2]
        zipped_html_glob = sys.argv[3]
    else:
        # syntax-check-only mode: no spans file or zipped html to process...
        legal_spans_file = None
        zipped_html_glob = None

# define constants...
MAXIMUM_FLOAT = 1.7976E+308  # IEEE-754 double precision maximum floating point value (used as an "infinite" cost sentinel)
PICKLED_HTML_INDEX_FILE = "trecgen.2006.makegoldstd.pickle"  # cached pmid -> (zipfile, member) index

# judged entry fieldnames...
TOPIC_FIELD = "TOPIC"
QUESTION_FIELD = "QUESTION"
PMID_FIELD = "PMID"
OFFSET_FIELD = "OFFSET"
LENGTH_FIELD = "LENGTH"
SPANID_FIELD = "SPANID"
PLAINTEXT_FIELD = "PLAINTEXT"
RELEVANCE_FIELD = "RELEVANCE"
ANSWERTEXT_FIELD = "ANSWERTEXT"
MESHASPECTS_FIELD = "MESHASPECTS"

# judging form column labels, in the tab-separated column order of the forms...
COLUMN_HEADINGS = [
    TOPIC_FIELD,
    QUESTION_FIELD,
    PMID_FIELD,
    OFFSET_FIELD,
    LENGTH_FIELD,
    SPANID_FIELD,
    PLAINTEXT_FIELD,
    RELEVANCE_FIELD,
    ANSWERTEXT_FIELD,
    MESHASPECTS_FIELD
]

# positive relevance definitions...
# substrings looked for (case-insensitively) in the judge-supplied relevance column...
POSITIVE_RELEVANCE = ["possibly", "definitely"]
NEGATIVE_RELEVANCE = ["not"]
POSITIVE_RELEVANCE_LABEL = "RELEVANT"
NEGATIVE_RELEVANCE_LABEL = "NOT_RELEVANT"

# constants...
NONRELEVANT_COLUMN_HEADING_COUNT = 8   # columns present when a passage is judged not relevant
RELEVANT_COLUMN_HEADING_COUNT = 10     # columns present when a passage is judged relevant


def fatalError(msg):
    # Report a fatal processing error on stderr and abort by raising RuntimeError.
    sys.stderr.write("%s\n\n" % msg)
    raise RuntimeError


def stripQuotes(s):
    # Strip any mix of leading/trailing whitespace and single/double quote
    # characters from s.  Both scans are bounds-checked so that a string
    # consisting entirely of quotes/whitespace (e.g. '""') returns ""
    # instead of raising IndexError (bug fix).
    n = len(s)
    i = 0
    while i < n and (s[i].isspace() or s[i] == '"' or s[i] == "'"):
        i += 1
    s = s[i:]
    i = len(s)
    while i > 0 and (s[i-1].isspace() or s[i-1] == '"' or s[i-1] == "'"):
        i -= 1
    return s[0:i]


def mapRelevance(s):
    # Map a free-form judge relevance string to one of the canonical labels
    # by case-insensitive substring search; raises RuntimeError if no match.
    s = s.lower()
    for v in POSITIVE_RELEVANCE:
        if s.find(v) >= 0:
            return POSITIVE_RELEVANCE_LABEL
    for v in NEGATIVE_RELEVANCE:
        if s.find(v) >= 0:
            return NEGATIVE_RELEVANCE_LABEL
    # unable to map relevance value...
    raise RuntimeError


def normalizeMesh(s):
    # Normalize a pipe-separated list of MeSH aspects:
    # strip leading/trailing quotes, spaces and pipe characters,
    # trim each term, drop '*' central-concept markers (useless to us),
    # and uppercase everything for consistency.
    if len(s) == 0:
        return s
    s = stripQuotes(s)
    # strip leading/trailing pipe characters; bounds-checked so an
    # all-pipe input (e.g. '|') yields "" instead of IndexError (bug fix)...
    n = len(s)
    i = 0
    while i < n and s[i] == '|':
        i += 1
    s = s[i:]
    i = len(s)
    while i > 0 and s[i-1] == '|':
        i -= 1
    s = s[0:i]
    # now split by pipe, remove extra spaces, and put back together...
    s = '|'.join([t.strip() for t in s.split('|')])
    # remove any asterisks, which designate central concepts and are useless to us...
    s = s.replace('*', '')
    # normalize case, convert everything to capitals for consistency...
    s = s.upper()
    return s


def normalizeWhitespace(s):
    # Collapse all runs of whitespace to single spaces and trim the ends.
    return " ".join(s.split())


def removeDoubledQuotes(s):
    # Undo Excel-style quote escaping ("" -> ").
    return s.replace('""', '"')


def normalizeAnswer(s):
    # Full normalization pipeline for judge-entered answer text.
    return removeDoubledQuotes(normalizeWhitespace(stripQuotes(s)))


# read in single passage judgement from judging file, each on a single line...
# returns a populated dictionary, or None if EOF.
def readJudgedEntry(file, filename):
    # skip blank lines...
    line = None
    while not line:
        line = file.readline()
        # check for EOF (readline returns "" only at EOF)...
        if not line:
            return None
        else:
            line = line.strip()
    # process fields in line...
    fields = line.split('\t')
    # check field validity...
    if len(fields) < NONRELEVANT_COLUMN_HEADING_COUNT or \
       len(fields) > RELEVANT_COLUMN_HEADING_COUNT:
        fatalError("Incorrect number of fields in file %s at line '%s'" % (filename, line))
    # convert lists to dictionary (zip pairs each present field with its heading)...
    dx = {}
    for (label, value) in zip(COLUMN_HEADINGS, fields):
        dx[label] = value
    # convert numeric fields from strings to integers...
    try:
        dx[OFFSET_FIELD] = int(dx[OFFSET_FIELD])
        dx[LENGTH_FIELD] = int(dx[LENGTH_FIELD])
    except:
        fatalError("Invalid expected numeric value in file %s at line '%s'" % (filename, line))
    # map relevance data...
    try:
        dx[RELEVANCE_FIELD] = mapRelevance(dx[RELEVANCE_FIELD])
    except:
        fatalError("Invalid relevance value in file %s at line '%s'" % (filename, line))
    # strip leading/trailing spaces and quotes from text fields...
    # this is necessary because exporting string data from Excel sometimes quotes strings...
    try:
        if PLAINTEXT_FIELD in dx:
            dx[PLAINTEXT_FIELD] = stripQuotes(dx[PLAINTEXT_FIELD])
        if ANSWERTEXT_FIELD in dx:
            dx[ANSWERTEXT_FIELD] = normalizeAnswer(dx[ANSWERTEXT_FIELD])
    except:
        fatalError("Invalid text field in file %s at line '%s'" % (filename, line))
    # normalize mesh aspects...
    try:
        if MESHASPECTS_FIELD in dx:
            dx[MESHASPECTS_FIELD] = normalizeMesh(dx[MESHASPECTS_FIELD])
    except:
        fatalError("Invalid MeSH aspects field in file %s at line '%s'" % (filename, line))
    # check field consistency: a relevant passage must carry answer text...
    if dx[RELEVANCE_FIELD] == POSITIVE_RELEVANCE_LABEL and ANSWERTEXT_FIELD not in dx:
        fatalError("No answertext for relevant passage in file %s at line '%s'" % (filename, line))
    # return entry...
    return dx


def includeRelevance(relevance):
    # Only positively judged passages go into the gold standard.
    return relevance == POSITIVE_RELEVANCE_LABEL


def writeGoldStandardEntry(file, topicid, pubmedid, offset, length, mesh, html, answer):
    # Write one tab-separated gold standard line.
    # NOTE: the answer text and whitespace-normalized html passage are extra
    # debugging columns appended after the core topic/pmid/offset/length/mesh fields.
    # (A dead duplicate assignment of the non-debug field set was removed.)
    fields = []  # pre-bind so the except clause can always render it
    try:
        fields = [topicid, pubmedid, str(offset), str(length), mesh, answer, " ".join(html.split())]
        file.write("%s\n" % "\t".join(fields))
    except:
        fatalError("Unable to convert and write gold standard entry:%s" % str(fields))
    return


def writeGoldStandardHeader(file):
    # do nothing...
    return


def writeGoldStandardFooter(file):
    # do nothing...
    return
def findLegalSpanContainedMatch(passage, pmidSpans):
    # Return the first legal (offset, length) span that fully CONTAINS the
    # judged passage (offset, length) tuple, or None if none does.
    end = passage[0] + passage[1]
    for (offset, length) in pmidSpans:
        if passage[0] >= offset and end <= (offset + length):
            return (offset, length)
    return None


def findLegalSpanExactMatch(passage, pmidSpans):
    # Return the legal span that EXACTLY equals the judged passage, or None.
    for (offset, length) in pmidSpans:
        if passage[0] == offset and passage[1] == length:
            return (offset, length)
    return None


# use exact span match version...
findLegalSpan = findLegalSpanExactMatch


class StringAlign(object):
    # Generic dynamic-programming string aligner (Needleman-Wunsch style).
    # Subclasses supply the insertion/deletion/substitution cost functions;
    # costs may depend on the previous operation ("prior") so runs of the
    # same operation can be made cheaper, encouraging contiguous alignments.

    # define class constants (operation codes stored in the P traceback matrix)...
    INSERTION = 'I'
    DELETION = 'D'
    SUBSTITUTION = 'S'

    def __init__(self):
        super(StringAlign, self).__init__()
        return

    def insertion_cost(self, prior):
        # Cost of inserting a character of B; prior is the previous operation code.
        raise NotImplementedError

    def deletion_cost(self, prior):
        # Cost of deleting a character of A; prior is the previous operation code.
        raise NotImplementedError

    def substitution_cost(self, a, b, prior, gap):
        # Cost of aligning character a (from A) with b (from B).
        # NOTE(review): this abstract signature takes an extra 'gap' argument
        # that neither align() nor the concrete subclass uses - looks vestigial;
        # confirm before relying on it.
        raise NotImplementedError

    def align(self, A, B, gapchar):
        # Align string B against string A, returning
        # (alignedA, offset, alignedB, length, total_cost) where offset/length
        # locate the aligned region of B within A (0-based, via minAlign/maxAlign).
        # Leading and trailing deletions of A are free, so B is effectively
        # located as a substring match inside A.

        # dereference for speed...
        substitution_cost = self.substitution_cost
        deletion_cost = self.deletion_cost
        insertion_cost = self.insertion_cost
        # loop constants...
        lenA = len(A)
        lenB = len(B)
        # compute F (cost) and P (traceback operation) matrices...
        F = [[0.0]*(lenB+1) for i in range(lenA+1)]
        P = [[None]*(lenB+1) for i in range(lenA+1)]
        F[0][0] = 0.0
        P[0][0] = None
        for i in range(1, lenA+1):
            # entries are zero for zero cost for header deletions...
            # (i.e. skipping a prefix of A costs nothing)
            F[i][0] = 0.0
            P[i][0] = StringAlign.DELETION
        for j in range(1, lenB+1):
            # skipping a prefix of B is forbidden (infinite cost)...
            F[0][j] = MAXIMUM_FLOAT
            P[0][j] = StringAlign.INSERTION
        for i in range(1, lenA+1):
            for j in range(1, lenB+1):
                diag = F[i-1][j-1] + substitution_cost(A[i-1], B[j-1], P[i-1][j-1])
                left = F[i-1][j] + deletion_cost(P[i-1][j])
                up = F[i][j-1] + insertion_cost(P[i][j-1])
                best = min(diag, left, up)
                F[i][j] = best
                # record which operation produced the best cost for traceback...
                if best == diag:
                    P[i][j] = StringAlign.SUBSTITUTION
                elif best == left:
                    P[i][j] = StringAlign.DELETION
                elif best == up:
                    P[i][j] = StringAlign.INSERTION
                else:
                    raise RuntimeError
        # determine alignment...
        alignedA = ""
        alignedB = ""
        i = lenA
        j = lenB
        minAlign = lenA + 1   # smallest A index touched by a substitution
        maxAlign = -1         # largest A index touched by a substitution
        total_cost = F[lenA][lenB]
        # start in minimum cost of the last row to enable penalty-free tail-end deletions...
        imin = lenA
        iminCost = F[lenA][lenB]
        for i in range(1, lenA+1):
            if F[i][lenB] < iminCost:
                iminCost = F[i][lenB]
                imin = i
        # pad the alignment strings for the skipped tail of A...
        # NOTE(review): A[i-1] with i starting at 0 prepends A[-1], A[0], ... -
        # these look like the wrong characters for the tail pad; harmless for
        # the returned offset/length (alignedA/alignedB are debug-only), but
        # confirm before trusting the aligned strings themselves.
        for i in range(lenA - imin):
            alignedA = A[i-1] + alignedA
            alignedB = gapchar + alignedB
        i = imin
        # traceback: re-derive at each cell which operation produced its cost...
        while i > 0 and j > 0:
            score = F[i][j]
            diag = F[i-1][j-1]
            up = F[i][j-1]
            left = F[i-1][j]
            if score == diag + substitution_cost(A[i-1], B[j-1], P[i-1][j-1]):
                alignedA = A[i-1] + alignedA
                alignedB = B[j-1] + alignedB
                # track the extent of the substituted (matched) region in A...
                if i > maxAlign:
                    maxAlign = i
                if i < minAlign:
                    minAlign = i
                i -= 1
                j -= 1
            elif score == left + deletion_cost(P[i-1][j]):
                alignedA = A[i-1] + alignedA
                alignedB = gapchar + alignedB
                i -= 1
            elif score == up + insertion_cost(P[i][j-1]):
                alignedA = gapchar + alignedA
                alignedB = B[j-1] + alignedB
                j -= 1
            else:
                raise RuntimeError
        # consume any remaining prefix of A (free header deletions)...
        while i > 0:
            alignedA = A[i-1] + alignedA
            alignedB = gapchar + alignedB
            i -= 1
        # consume any remaining prefix of B...
        while j > 0:
            alignedA = gapchar + alignedA
            alignedB = B[j-1] + alignedB
            j -= 1
        # clean up (release the potentially large DP matrices promptly)...
        F = None
        P = None
        # return results...
        # print lenA, lenB, minAlign, maxAlign
        return (alignedA, minAlign - 1, alignedB, maxAlign - minAlign + 1, total_cost)


class AlignPlaintext2HTML(StringAlign):
    # Aligner specialized for locating judge-edited plaintext inside raw HTML:
    # characters of the HTML (A) may be deleted freely-ish, characters of the
    # plaintext (B) may never be inserted, and only identical characters
    # (or whitespace-for-whitespace) may be substituted.

    def deletion_cost(self, prior):
        # it is better (lower cost) to have multiple deletions in a row,
        # this ensure maximal contiguous alignment substring length...
        if prior == StringAlign.DELETION:
            return +1.0
        else:
            return +2.0

    def insertion_cost(self, prior):
        # insertions are forbidden: every plaintext character must appear in the HTML...
        return MAXIMUM_FLOAT

    def substitution_cost(self, a, b, prior):
        # only allow subsitutions of identical characters,
        # and treat all whitespace as identical...
        if a == b or (a.isspace() and b.isspace()):
            # it is better (lower cost) to have multiple substitutions in a row
            # as opposed to substitutions separated by deletions,
            # this ensure maximal contiguous alignment substring length...
            if prior == StringAlign.SUBSTITUTION:
                return -2.0
            else:
                return -1.0
        else:
            return MAXIMUM_FLOAT

    def align(self, html, plaintext):
        # Locate plaintext within html; returns (offset, length) into html.
        # call base class with '~' as the gap character...
        (alignedA, offset, alignedB, length, score) = super(AlignPlaintext2HTML, self).align(html, plaintext, "~")
        # debugging...
        #print alignedA
        #print alignedB
        return (offset, length)


# speed up with psyco if available...
try:
    import psyco
    psyco.bind(StringAlign)
    psyco.bind(AlignPlaintext2HTML)
    sys.stderr.write("(psyco library installed - using speed optimization)\n")
except:
    sys.stderr.write("(psyco library not installed - no speed optimization)\n")
    pass

# build (or reload) the pmid -> (zipfile, member filename) index of the HTML corpus,
# cached on disk as a pickle because scanning all the zip files is slow...
if zipped_html_glob:
    if os.access(PICKLED_HTML_INDEX_FILE, os.R_OK):
        sys.stderr.write("Reading pickled html zip file index...")
        # NOTE(review): 'file' shadows the builtin here and below - preexisting style.
        file = open(PICKLED_HTML_INDEX_FILE, "rb")
        pmid2htmlfile = cPickle.load(file)
        file.close()
    else:
        sys.stderr.write("Creating html zip file index...")
        pmid2htmlfile = {}
        for zipfilename in glob.glob(zipped_html_glob):
            zfile = zipfile.ZipFile(zipfilename, 'r')
            for filename in zfile.namelist():
                # pmid is the member's base filename up to the first dot...
                pmid = os.path.split(filename)[1].split(".")[0]
                pmid2htmlfile[pmid] = (zipfilename, filename)
            zfile.close()
        sys.stderr.write("writing pickle...")
        file = open(PICKLED_HTML_INDEX_FILE, "wb")
        cPickle.dump(pmid2htmlfile, file)
        file.close()
    sys.stderr.write("OK.\n")

# read every completed judging form matched by the glob into judgedEntries...
sys.stderr.write("Loading judging results...\n")
judgedEntries = []
for filename in glob.glob(completed_judging_forms_glob):
    sys.stderr.write("Parsing '%s'.\n" % os.path.split(filename)[1])
    file = open(filename, 'r')
    # skip first header line...
    file.readline()
    # read all valid entries in file...
    while True:
        entry = readJudgedEntry(file, filename)
        if entry:
            judgedEntries.append(entry)
        else:
            break
    file.close()

# compute and print some descriptive counts...
# compute and print relevance label distribution over all judged entries...
sys.stderr.write("\nRelevance counts:\n")
relevanceCounts = {}
for entry in judgedEntries:
    relevanceCounts[entry[RELEVANCE_FIELD]] = relevanceCounts.get(entry[RELEVANCE_FIELD], 0) + 1
total = 0
for (label, count) in relevanceCounts.items():
    sys.stderr.write("%s:%d\n" % (label, count))
    total += count
sys.stderr.write("TOTAL:%d\n" % total)

# compute and print MeSH aspect usage, excluding not-relevant entries...
sys.stderr.write("\nMeSH aspect counts:\n")
meshCounts = {}
for entry in judgedEntries:
    if entry[RELEVANCE_FIELD] == NEGATIVE_RELEVANCE_LABEL:
        continue
    s = entry.get(MESHASPECTS_FIELD, None)
    if s:
        for t in s.split('|'):
            meshCounts[t] = meshCounts.get(t, 0) + 1
for (label, count) in meshCounts.items():
    sys.stderr.write("%s:%d\n" % (label, count))
sys.stderr.write("\nOK.\n")

# exit early if only checking syntax...
if len(sys.argv) == 2:
    sys.stderr.write("Finished Syntax Check, No Gold Standard Generated.\n")
    sys.exit(0)

# collect the pmids of all positively judged entries so we only
# keep legal spans that can possibly be needed...
sys.stderr.write("Loading judged RELEVANT legal spans...")
judgedPMIDs = set()
for entry in judgedEntries:
    if entry[RELEVANCE_FIELD] == POSITIVE_RELEVANCE_LABEL:
        judgedPMIDs.add(entry[PMID_FIELD])

# load the legal spans file: "pmid offset length" per line,
# blank lines and '#' comment lines skipped...
legalSpans = {}
spansfile = open(legal_spans_file, 'r')
for line in spansfile:
    # skip blank lines and lines starting with a hash...
    line = line.strip()
    if len(line) == 0 or line[0] == '#':
        continue
    # parse span...
    (pmid, offset, length) = line.split()
    offset = int(offset)
    length = int(length)
    # save spans in judged pmids...
    if pmid in judgedPMIDs:
        legalSpans.setdefault(pmid, []).append((offset, length))
# close the spans file explicitly (previously the handle was leaked)...
spansfile.close()
sys.stderr.write("OK.\n")

# determine and output gold standard passages...
sys.stderr.write("Writing gold standard...\n")
writeGoldStandardHeader(sys.stdout)
aligner = AlignPlaintext2HTML()
for entry in judgedEntries:
    try:
        # pick apart fields...
        topicid = entry[TOPIC_FIELD]
        pmid = entry[PMID_FIELD]
        offset = entry[OFFSET_FIELD]
        length = entry[LENGTH_FIELD]
        span = entry[SPANID_FIELD]
        relevance = entry[RELEVANCE_FIELD]
        answer = entry.get(ANSWERTEXT_FIELD, None)
        mesh = entry.get(MESHASPECTS_FIELD, None)
    except:
        fatalError("\nERROR - MISSING REQUIRED VALUES IN ENTRY:%s\n" % str(entry))
    # progress report (carriage return keeps it on one console line)...
    sys.stderr.write("Checking topic %s, span %s \r" % (topicid, span))
    if includeRelevance(relevance):
        # ensure that all data fields are present for a relevant passage...
        if not answer or not mesh:
            fatalError("\nERROR - MISSING REQUIRED VALUES FOR RELEVANT ENTRY:%s\n" % str(entry))
        # verify passage within a legal span; .get() gives a clean fatalError
        # instead of a raw KeyError when the pmid has no legal spans at all...
        span = findLegalSpan((offset, length), legalSpans.get(pmid, []))
        if not span:
            fatalError("\nERROR - (%s.%s.%s) IS NOT A LEGAL SPAN!" % (pmid, offset, length))
        # get text for legal span that entry fits into;
        # report a clean error if the pmid is missing from the html index...
        if pmid not in pmid2htmlfile:
            fatalError("\nERROR - NO HTML FILE INDEXED FOR PMID %s!" % pmid)
        (zipfilename, filename) = pmid2htmlfile[pmid]
        zfile = zipfile.ZipFile(zipfilename, 'r')
        allhtml = zfile.read(filename)
        spanhtml = allhtml[span[0]:span[0]+span[1]]
        zfile.close()
        # align answer with legal HTML span text...
        (goldoffset, goldlength) = aligner.align(spanhtml, answer)
        # compute the answer passage relative to start of html file...
        goldoffset = span[0] + goldoffset
        # mark as suspicious any gold passages that require the entire span...
        if goldlength >= span[1]:
            sys.stderr.write("\nSuspect answer text for passage (%s.%s.%s), check editing!\n" % (pmid, goldoffset, goldlength))
            answer = "(SUSPECT)%s" % answer
        # output the gold standard entry...
        writeGoldStandardEntry(sys.stdout, topicid, pmid, goldoffset, goldlength, mesh, allhtml[goldoffset:goldoffset+goldlength], answer)
sys.stderr.write("\nOK.\n")
writeGoldStandardFooter(sys.stdout)
sys.stderr.write("Finished.\n")