# cleangoldstd.py
# Clean the MeSH aspects in the gold standard file created by makegoldstd.py,
# according to string mappings agreed upon as equivalent or typos.
#
# Reads tab-separated lines from STDIN, rewrites the fifth column (a
# '|'-separated list of MeSH terms) and writes the lines to STDOUT.  A report
# of term counts is written to STDERR.  If a MeSH "bin" file path is given as
# the only argument, output terms are also checked against it.

import sys

# Maps an upper-case term as found in the input to its replacement.  A
# replacement may contain several terms separated by '|'.  Identity entries
# (key == value) change nothing but keep the term out of the "(?)" error
# section of the report, which skips every term that is a key of this table.
CLEANUP_MAPPING = {
    # these mappings were manually added...
    "MUTATIONS": "MUTATION",
    "APOLIPOPROTEIN E": "APOLIPOPROTEINS E",
    "POINT MUTATIONS;": "POINT MUTATION",
    "UBIQUITIN-PROTEIN LIGASE": "UBIQUITIN-PROTEIN LIGASES",
    "RAS-GTPASE ACTIVATING PROTEINS": "RAS GTAPASE-ACTIVATING PROTEINS",
    "RAS-GTPASE-ACTIVATING PROTEINS": "RAS GTAPASE-ACTIVATING PROTEINS",
    "SACCHARAMYCES CEREVISIAE PROTEINS": "SACCHAROMYCES CEREVISIAE PROTEINS",
    "TUMORSUPPRESSOR PROTEINS": "TUMOR SUPPRESSOR PROTEINS",
    "WNT PROTIENS": "WNT PROTEINS",
    "GENE EXPESSION": "GENE EXPRESSION",
    "MITOCHONDRIAL MEMBRANE": "MITOCHONDRIAL MEMBRANES",
    "TRANS ACTIVATION (GENETICS)": "TRANS-ACTIVATION (GENETICS)",
    "UPREGULATION": "UP-REGULATION",
    "HAIR FOLLICLES": "HAIR FOLLICLE",
    "BIOCHEMICAL PHENOMENOM": "BIOCHEMICAL PHENOMENA",
    "BIOCHEMICAL PHENOMENON": "BIOCHEMICAL PHENOMENA",
    "CHROMOSOMES ARTIFICIAL": "CHROMOSOMES, ARTIFICIAL",
    "HEREDODEGENERATIVE DISORDERS, NERVOUS": "HEREDODEGENERATIVE DISORDERS, NERVOUS SYSTEM",
    "HEREDODEGNERATIVE DISORDERS, NERVOUS SYSTEM": "HEREDODEGENERATIVE DISORDERS, NERVOUS SYSTEM",
    "HEREDOGENERATIVE DISORDERS, NERVOUS SYSTEM": "HEREDODEGENERATIVE DISORDERS, NERVOUS SYSTEM",
    "INTRACELLULAR SPACES": "INTRACELLULAR SPACE",
    "NERVE TISSUE PROTIENS": "NERVE TISSUE PROTEINS",
    "NERVE TISSUE PRTOEINS": "NERVE TISSUE PROTEINS",
    "RECEPTOR PROTEIN-TYROSINE KINASE": "RECEPTOR PROTEIN-TYROSINE KINASES",
    "TOXINS, BIOLOGICS": "TOXINS, BIOLOGICAL",
    "TRANSPORT VESICLESS": "TRANSPORT VESICLES",
    "TRANSPORT VESICLESSS": "TRANSPORT VESICLES",
    "TRINUCLEOTIDE REPEAT EXPANDED": "TRINUCLEOTIDE REPEAT EXPANSION",
    "TRINUCLEOTIDE REPEATS EXPANDED": "TRINUCLEOTIDE REPEAT EXPANSION",
    "DIGESTIVE SYSTEM ABNORMALITES": "DIGESTIVE SYSTEM ABNORMALITIES",
    "NERVOUS SYSTEM MALFORMATION": "NERVOUS SYSTEM MALFORMATIONS",
    "PAX2 TRANSCRIPITION FACTOR": "PAX2 TRANSCRIPTION FACTOR",
    "ZINC FINGER": "ZINC FINGERS",
    "ANALYTICAL, DIAGNOSTIC AND THERAPEUTIC TECHNIQUES AND EQUIPMENT CATEGORY": "ANALYTICAL, DIAGNOSTIC AND THERAPEUTIC TECHNIQUES AND EQUIPMENT CATEGORY",
    # these mappings were automatically generated by MeSH entry terms...
    "HORMONE": "HORMONES",
    "ACTIN-BINDING PROTEINS": "MICROFILAMENT PROTEINS",
    "F-ACTIN": "ACTINS",
    "TAU PROTEIN": "TAU PROTEINS",
    "DEPOTENTIATION": "LONG-TERM DEPRESSION (PHYSIOLOGY)",
    "ENKEPHALIN": "ENKEPHALINS",
    "DYNORPHIN": "DYNORPHINS",
    "GROWTH FACTOR": "GROWTH SUBSTANCES",
    "POLYADENYLATION FACTOR": "MRNA CLEAVAGE AND POLYADENYLATION FACTORS",
    "ARMADILLO PROTEINS": "ARMADILLO DOMAIN PROTEINS",
    "NUCLEAR EXPORT": "ACTIVE TRANSPORT, CELL NUCLEUS",
    "MASS SPECTROMETRY": "SPECTRUM ANALYSIS, MASS",
    "NERVOUS SYSTEM ABNORMALITIES": "NERVOUS SYSTEM MALFORMATIONS",
    "GENETIC PREDISPOSITION": "GENETIC PREDISPOSITION TO DISEASE",
    "SPASTIC PARAPARESIS": "PARAPARESIS, SPASTIC",
    "PROTEASOME": "PROTEASOME ENDOPEPTIDASE COMPLEX",
    "MOUSE, KNOCKOUT": "MICE, KNOCKOUT",
    "NUCLEAR IMPORT": "ACTIVE TRANSPORT, CELL NUCLEUS",
    "VASCULAR DISEASE": "VASCULAR DISEASES",
    "AMINO ACID": "AMINO ACIDS",
    "HYDROPHOBIC INTERACTION": "HYDROPHOBICITY",
    "HAPLOTYPE": "HAPLOTYPES",
    "LYMPHATIC DISEASE": "LYMPHATIC DISEASES",
    "SUBCELLULAR FRACTION": "SUBCELLULAR FRACTIONS",
    "TUMOR SUPPRESSOR GENE": "GENES, TUMOR SUPPRESSOR",
    "PROTEIN": "PROTEINS",
    "ALBUMIN": "ALBUMINS",
    "GASTROINTESTINAL DISEASE": "GASTROINTESTINAL DISEASES",
    "ENDOTHELIN-CONVERTING ENZYME 2": "ENDOTHELIN-CONVERTING ENZYME 1",
    "PREPRO-OREXIN": "OREXINS",
    "CELL MIGRATION": "CELL MOVEMENT",
    "ISLET AMYLOID POLYPEPTIDE": "AMYLIN",
    "ADENOSINETRIPHOSPHATASE": "ADENOSINE TRIPHOSPHATASES",
    "LYMPHOMAS": "LYMPHOMA",
    "GENE, TUMOR SUPPRESSOR": "GENES, TUMOR SUPPRESSOR",
    "RNA BINDING PROTEINS": "RNA-BINDING PROTEINS",
    "DNA BINDING PROTEINS": "DNA-BINDING PROTEINS",
    "ONCOGENIC VIRUS": "ONCOGENIC VIRUSES",
    "UBIQUITIN PROTEIN LIGASES": "UBIQUITIN-PROTEIN LIGASES",
    "NEUROBLASTOMAS": "NEUROBLASTOMA",
    # these mappings were automatically discovered but manually generated...
    "VIRUS VACCINE": "VIRAL VACCINES",
    "ANIAMLS, GENETICALLY MODIFIED": "ANIMALS, GENETICALLY MODIFIED",
    "ANTICIPATION": "ANTICIPATION, GENETIC",
    "ANTIGENS VIRAL TUMOR": "ANTIGENS, VIRAL, TUMOR",
    "ARMADILLO PROTEIN DOMAINS": "ARMADILLO DOMAIN PROTEINS",
    "BETA-AMYLOID PEPTIDE": "AMYLOID BETA-PROTEIN",
    "CACLIUM SIGNALING": "CALCIUM SIGNALING",
    "CELL GROWTH PROCESSESS": "CELL GROWTH PROCESSES",
    "COP-COATED VESICLESENDOPLASMIC RETICULUM/METABOLISM": "COP-COATED VESICLES|ENDOPLASMIC RETICULUM/METABOLISM",
    "CYSTIC FIBROSIS TRANSMEMBRANE CONDUCTANCE REGULATORENDOPLASMIC RETICULUM/METABOLISM": "CYSTIC FIBROSIS TRANSMEMBRANE CONDUCTANCE REGULATOR|ENDOPLASMIC RETICULUM/METABOLISM",
    "ENDOCRINE DISORDERS": "ENDOCRINE SYSTEM DISEASES",
    "ENDOCRINE SYSTEM DISORDERS": "ENDOCRINE SYSTEM DISEASES",
    "GENE SLIENCING": "GENE SILENCING",
    "GENETIC PHENOMENOM": "GENETIC PHENOMENA",
    "GENETIC SCREEING": "GENETIC SCREENING",
    "HEAT SCHOCK PROTEINS": "HEAT-SHOCK PROTEINS",
    "HEREDODEGENERATIVE DISORDERS": "HEREDODEGENERATIVE DISORDERS, NERVOUS SYSTEM",
    "HISTONE DEACETYLACES": "HISTONE DEACETYLASES",
    "INTRACELLULAR SIGNALLING PEPTIDES AND PROTEINS": "INTRACELLULAR SIGNALING PEPTIDES AND PROTEINS",
    "LATERALITY": "LATERALITY",
    "LEUKEMIA, MYELOID, ACUTE": "LEUKEMIA, MYELOCYTIC, ACUTE",
    "LEUKEMIA, MYELOTIC, ACUTE": "LEUKEMIA, MYELOCYTIC, ACUTE",
    "LIMB DEFORMITIES": "LIMB DEFORMITIES, CONGENITAL",
    "LYMPHATIC NEOPLASMS": "LYMPHATIC VESSEL TUMORS",
    "MICROTUBULE PRTOEINS": "MICROTUBULE PROTEINS",
    "MODELS ANIMAL": "MODELS, ANIMAL",
    "MUTATION, PRPSC PROTEINS": "MUTATIONS|PRPSC PROTEINS",
    "NEPRILISYN": "NEPRILYSIN",
    "NERVE TISSUE CELLS": "NERVE TISSUE",
    "NERVOUS TISSUE": "NERVE TISSUE",
    "NOP7P": "NOP7 PROTEIN, S CEREVISIAE",
    "NUCLEAR INPORT": "ACTIVE TRANSPORT, CELL NUCLEUS",
    "NUCLEOSIDE-DIPHOSPHATE KINASE A": "NUCLEOSIDE DIPHOSPHATE KINASE A",
    "ORGANELLESL": "ORGANELLES",
    "PANCREASTATIN": "PANCREASTATIN",
    "PAPILLOMAVIRUS": "PAPILLOMAVIRUS",
    "PAPILLOMAVIRUS, BOVINE": "PAPILLOMAVIRUS, BOVINE",
    "PAPILLOMAVIRUS, HUMAN": "PAPILLOMAVIRUS, HUMAN",
    "PHOSPHORYLATE": "PHOSPHORYLATION",
    "PSPSC PROTEINS": "PRPSC PROTEINS",
    "RAS GTAPASE-ACTIVATING PROTEINS": "RAS GTPASE-ACTIVATING PROTEINS",
    "RECEPTOR, ANTIGEN, T-CELL": "RECEPTORS, ANTIGEN, T-CELL",
    "RECEPTORS CYTOPLASMIC AND NUCLEAR": "RECEPTORS, CYTOPLASMIC AND NUCLEAR",
    "RECEPTORS, GROWTH FACTORS": "RECEPTORS, GROWTH FACTOR",
    "RECEPTORS,DOPAMINE": "RECEPTORS, DOPAMINE",
    "REPEAT SEQUENCES": "REPETITIVE SEQUENCES, NUCLEIC ACID",
    "SPECTRUM ANALYSIS, MASS": "SPECTRUM ANALYSIS, MASS",
    "THIMET OLIGOPEPTIDASE": "THIMET OLIGOPEPTIDASE",
    "TRANFECTION": "TRANSFECTION",
    "TRANS-ACTIVATION": "TRANS-ACTIVATION (GENETICS)",
    "TWO HYBRID ASSAYS": "TWO-HYBRID SYSTEM TECHNIQUES",
    # These mappings manually determined by Phoebe....
    "DNA REPAIR, UBIQUITIN-CONJUGATING ENZYMES, DNA DAMAGE": "DNA REPAIR|UBIQUITIN-CONJUGATING ENZYMES|DNA DAMAGE",
    "CYTOPLASMIC PROTEINS": "CYTOPLASM",
    "EXPANSION": "TRINUCLEOTIDE REPEAT EXPANSION",
    "HYDRATASES": "HYDRO-LYASES",
    "IMMUNOTOXICITY": "IMMUNOTOXINS",
    "PROTEIN BINDING, DNA REPAIR": "PROTEIN BINDING|DNA REPAIR",
    "PROTEIN EXPORTING SIGNALS": "PROTEIN SORTING SIGNALS",
    "RING FINGER": "ZINC FINGERS",
    "TO DISEASE": "GENETIC PREDISPOSITION TO DISEASE",
    "TRINUCLEOTIDE REPEAT MUTATION": "TRINUCLEOTIDE REPEAT EXPANSION|MUTATION",
}

# zero-based index of the tab-separated column holding the MeSH terms...
MESH_COLUMN = 4


def _commit_mesh_record(official, entrydx, mesh, st, terms):
    """Store one parsed record, if it has a heading, an ST field and terms."""
    if mesh and st and terms:
        # normalize everything to upper case...
        heading = mesh.upper()
        official.add(heading)
        for t in terms:
            entrydx[t.upper()] = heading


def parse_mesh_bin_file(mesh_bin_filename):
    """Read a MeSH bin file of "FIELD = value" lines grouped by "*NEWRECORD".

    Args:
        mesh_bin_filename: path of the file to read.

    Returns:
        A tuple ``(official, entrydx)``: ``official`` is the set of
        upper-cased MH/NM values, and ``entrydx`` maps every upper-cased
        entry term (including the heading itself) to its upper-cased heading.
        Records lacking a heading, an ST field or any term are skipped.
    """
    official = set()
    entrydx = {}
    # fields of interest for the record currently being read...
    mesh = None
    st = None
    terms = []
    with open(mesh_bin_filename, 'rt') as mesh_file:
        for line in mesh_file:
            line = line.strip()
            if not line:
                continue
            if line == "*NEWRECORD":
                # a new record starts: store the previous one and reset...
                _commit_mesh_record(official, entrydx, mesh, st, terms)
                mesh = None
                st = None
                terms = []
                continue
            # split on the first "=" only, so values containing "=" survive;
            # lines without any "=" carry no field and are ignored...
            element, sep, value = line.partition("=")
            if not sep:
                continue
            element = element.strip()
            value = value.strip()
            if element in ("MH", "NM"):
                # this is the concept official name...
                mesh = value
                # each MeSH term is also its own entry term...
                terms.append(value)
            elif element in ("ENTRY", "PRINT ENTRY", "SY"):
                # the value may hold several '|'-separated fields; the last
                # field is then a string of one-letter codes, one per
                # preceding field, and only fields coded 'a' are kept
                # (see MeSH docs)...
                tms = value.split("|")
                if len(tms) > 1:
                    terms.extend(
                        t for t, code in zip(tms[:-1], tms[-1]) if code == 'a')
                else:
                    terms.append(tms[0])
            elif element == "ST":
                st = value
    # the last record has no "*NEWRECORD" after it; store it as well...
    _commit_mesh_record(official, entrydx, mesh, st, terms)
    return (official, entrydx)


def parseMeSH(str):
    """Split a '|'-separated MeSH string into a list of stripped terms."""
    return [s.strip() for s in str.split('|')]


def formatMeSH(lst):
    """Join a list of MeSH terms into a '|'-separated string."""
    return '|'.join(lst)


def countMeSH(dx, lst):
    """Increment, in the dictionary ``dx``, the count of each term in ``lst``."""
    for t in lst:
        dx[t] = dx.get(t, 0) + 1


def _resolve_term(term, _active=()):
    """Return the set of terms ``term`` maps to, following chained mappings.

    ``_active`` holds the terms currently being expanded; meeting one of them
    again means the table contains a cycle, and the term is kept as it is.
    """
    mapped = CLEANUP_MAPPING.get(term)
    if mapped is None or mapped == term or term in _active:
        return {term}
    resolved = set()
    for part in mapped.split('|'):
        part = part.strip()
        if part:
            resolved |= _resolve_term(part, _active + (term,))
    return resolved


def cleanMeSH(terms):
    """Return the sorted, de-duplicated, cleaned version of a list of terms.

    Blank terms are dropped, the "[SUBSTANCE NAME]" marker is removed and
    CLEANUP_MAPPING is applied repeatedly, so a replacement that is itself a
    key of the table (e.g. a typo corrected in two steps) is resolved too.
    """
    cleaned = set()
    for t in terms:
        # remove [SUBSTANCE NAME] string, if any...
        t = t.replace("[SUBSTANCE NAME]", "").strip()
        # remove any blank terms (checked after the marker is stripped)...
        if not t:
            continue
        cleaned.update(_resolve_term(t))
    return sorted(cleaned)


def main():
    """Clean STDIN to STDOUT and write the term count report to STDERR."""
    if len(sys.argv) not in (1, 2):
        sys.stderr.write("usage:python %s [mesh-bin-filepath]? STDOUT\n" % sys.argv[0])
        # NOTE: the exit status is 0 even for a usage error (kept as it was).
        sys.exit(0)

    # read lines from stdin, perform mapping, write to stdout...
    # accumulate statistics for reporting to stderr...
    inputDx = {}
    outputDx = {}
    for lineno, line in enumerate(sys.stdin, 1):
        # strip only the line terminator: stripping tabs would shift columns...
        fields = [s.strip() for s in line.rstrip("\r\n").split('\t')]
        if len(fields) <= MESH_COLUMN:
            # no MeSH column on this line; pass it through untouched...
            sys.stderr.write(
                "warning: line %d has no MeSH column, not cleaned\n" % lineno)
            sys.stdout.write("%s\n" % '\t'.join(fields))
            continue
        terms = parseMeSH(fields[MESH_COLUMN])
        countMeSH(inputDx, terms)
        terms = cleanMeSH(terms)
        countMeSH(outputDx, terms)
        fields[MESH_COLUMN] = formatMeSH(terms)
        sys.stdout.write("%s\n" % '\t'.join(fields))

    # load MeSH terms if filepath provided...
    if len(sys.argv) > 1:
        (official_mesh_terms, entry_mesh_terms) = parse_mesh_bin_file(sys.argv[1])
    else:
        (official_mesh_terms, entry_mesh_terms) = (None, None)

    # display count report...
    sys.stderr.write("+++ Counts of INPUT MeSH terms +++\n")
    for m in sorted(inputDx):
        sys.stderr.write("\t%s\t%d\n" % (m, inputDx[m]))
    sys.stderr.write("+++ Counts of OUTPUT MeSH terms +++\n")
    newEntryMap = {}
    errorList = []
    for m in sorted(outputDx):
        # only check MeSH heading, not the subheadings...
        term = m.split('/')[0]
        if (official_mesh_terms
                and term not in official_mesh_terms
                and term not in CLEANUP_MAPPING):
            sys.stderr.write("\t(?)%s\t%d\n" % (m, outputDx[m]))
            errorList.append(m)
            # check for entry term mapping (also trying a plural "S")...
            plural = "%sS" % term
            if term in entry_mesh_terms:
                newEntryMap[term] = entry_mesh_terms[term]
            elif plural in entry_mesh_terms:
                newEntryMap[term] = entry_mesh_terms[plural]
        else:
            sys.stderr.write("\t%s\t%d\n" % (m, outputDx[m]))

    # output new mappings, if any, ready to paste into CLEANUP_MAPPING...
    if newEntryMap:
        sys.stderr.write("+++ ENTRY TERM MAPPINGS +++\n")
        for t in sorted(newEntryMap):
            sys.stderr.write('"%s": "%s",\n' % (t, newEntryMap[t]))
    # output errors, if any, as templates to be filled in by hand...
    if errorList:
        sys.stderr.write("+++ TERM ERROR TEMPLATES +++\n")
        for t in errorList:
            sys.stderr.write('"%s": "????????",\n' % (t))


if __name__ == "__main__":
    main()