diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..73ae6a6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,5 @@ +# jupyter notebooks meta files +.ipynb_checkpoints/ + +# OS irrelevant files +.DS_Store diff --git a/negex.python/negex.py b/negex.python/negex.py old mode 100755 new mode 100644 index 6166270..caf21e1 --- a/negex.python/negex.py +++ b/negex.python/negex.py @@ -1,13 +1,14 @@ import re -def sortRules (ruleList): + +def sortRules(ruleList): """Return sorted list of rules. - + Rules should be in a tab-delimited format: 'rule\t\t[four letter negation tag]' Sorts list of rules descending based on length of the rule, splits each rule into components, converts pattern to regular expression, and appends it to the end of the rule. """ - ruleList.sort(key = len, reverse = True) + ruleList.sort(key=len, reverse=True) sortedList = [] for rule in ruleList: s = rule.strip().split('\t') @@ -18,61 +19,63 @@ def sortRules (ruleList): sortedList.append(s) return sortedList + class negTagger(object): '''Take a sentence and tag negation terms and negated phrases. - + Keyword arguments: sentence -- string to be tagged phrases -- list of phrases to check for negation rules -- list of negation trigger terms from the sortRules function negP -- tag 'possible' terms as well (default = True) ''' - def __init__(self, sentence = '', phrases = None, rules = None, - negP = True): + + def __init__(self, sentence='', phrases=None, rules=None, + negP=True): self.__sentence = sentence self.__phrases = phrases self.__rules = rules self.__negTaggedSentence = '' self.__scopesToReturn = [] self.__negationFlag = None - + filler = '_' - + for rule in self.__rules: reformatRule = re.sub(r'\s+', filler, rule[0].strip()) - self.__sentence = rule[3].sub (' ' + rule[2].strip() - + reformatRule - + rule[2].strip() + ' ', self.__sentence) + self.__sentence = rule[3].sub(' ' + rule[2].strip() + + reformatRule + + rule[2].strip() + ' ', self.__sentence) for phrase in self.__phrases: phrase = re.sub(r'([.^$*+?{\\|()[\]])', r'\\\1', phrase) splitPhrase = phrase.split() joiner = r'\W+' - joinedPattern = r'\b' + joiner.join(splitPhrase) + r'\b' + joinedPattern = r'\b' + joiner.join(splitPhrase) + r'\b' reP = re.compile(joinedPattern, re.IGNORECASE) m = reP.search(self.__sentence) if m: self.__sentence = self.__sentence.replace(m.group(0), '[PHRASE]' + re.sub(r'\s+', filler, m.group(0).strip()) + '[PHRASE]') - -# Exchanges the [PHRASE] ... [PHRASE] tags for [NEGATED] ... [NEGATED] -# based on PREN, POST rules and if negPoss is set to True then based on + +# Exchanges the [PHRASE] ... [PHRASE] tags for [NEGATED] ... [NEGATED] +# based on PREN, POST rules and if negPoss is set to True then based on # PREP and POSP, as well. # Because PRENEGATION [PREN} is checked first it takes precedent over # POSTNEGATION [POST]. Similarly POSTNEGATION [POST] takes precedent over -# POSSIBLE PRENEGATION [PREP] and [PREP] takes precedent over POSSIBLE +# POSSIBLE PRENEGATION [PREP] and [PREP] takes precedent over POSSIBLE # POSTNEGATION [POSP]. - + overlapFlag = 0 prenFlag = 0 postFlag = 0 prePossibleFlag = 0 postPossibleFlag = 0 - + sentenceTokens = self.__sentence.split() sentencePortion = '' aScopes = [] sb = [] - #check for [PREN] + # check for [PREN] for i in range(len(sentenceTokens)): if sentenceTokens[i][:6] == '[PREN]': prenFlag = 1 @@ -80,23 +83,24 @@ def __init__(self, sentence = '', phrases = None, rules = None, if sentenceTokens[i][:6] in ['[CONJ]', '[PSEU]', '[POST]', '[PREP]', '[POSP]']: overlapFlag = 1 - + if i+1 < len(sentenceTokens): if sentenceTokens[i+1][:6] == '[PREN]': overlapFlag = 1 if sentencePortion.strip(): aScopes.append(sentencePortion.strip()) sentencePortion = '' - + if prenFlag == 1 and overlapFlag == 0: - sentenceTokens[i] = sentenceTokens[i].replace('[PHRASE]', '[NEGATED]') + sentenceTokens[i] = sentenceTokens[i].replace( + '[PHRASE]', '[NEGATED]') sentencePortion = sentencePortion + ' ' + sentenceTokens[i] - + sb.append(sentenceTokens[i]) - + if sentencePortion.strip(): aScopes.append(sentencePortion.strip()) - + sentencePortion = '' sb.reverse() sentenceTokens = sb @@ -109,26 +113,27 @@ def __init__(self, sentence = '', phrases = None, rules = None, if sentenceTokens[i][:6] in ['[CONJ]', '[PSEU]', '[PREN]', '[PREP]', '[POSP]']: overlapFlag = 1 - + if i+1 < len(sentenceTokens): if sentenceTokens[i+1][:6] == '[POST]': overlapFlag = 1 if sentencePortion.strip(): aScopes.append(sentencePortion.strip()) sentencePortion = '' - + if postFlag == 1 and overlapFlag == 0: - sentenceTokens[i] = sentenceTokens[i].replace('[PHRASE]', '[NEGATED]') + sentenceTokens[i] = sentenceTokens[i].replace( + '[PHRASE]', '[NEGATED]') sentencePortion = sentenceTokens[i] + ' ' + sentencePortion - + sb2.insert(0, sentenceTokens[i]) - + if sentencePortion.strip(): aScopes.append(sentencePortion.strip()) - + sentencePortion = '' self.__negTaggedSentence = ' '.join(sb2) - + if negP: sentenceTokens = sb2 sb3 = [] @@ -140,26 +145,28 @@ def __init__(self, sentence = '', phrases = None, rules = None, if sentenceTokens[i][:6] in ['[CONJ]', '[PSEU]', '[POST]', '[PREN]', '[POSP]']: overlapFlag = 1 - + if i+1 < len(sentenceTokens): if sentenceTokens[i+1][:6] == '[PREP]': overlapFlag = 1 if sentencePortion.strip(): aScopes.append(sentencePortion.strip()) sentencePortion = '' - + if prePossibleFlag == 1 and overlapFlag == 0: - sentenceTokens[i] = sentenceTokens[i].replace('[PHRASE]', '[POSSIBLE]') + sentenceTokens[i] = sentenceTokens[i].replace( + '[PHRASE]', '[POSSIBLE]') sentencePortion = sentencePortion + ' ' + sentenceTokens[i] - - sb3 = sb3 + ' ' + sentenceTokens[i] - + + # sb3 = sb3 + ' ' + sentenceTokens[i] + sb3.append(sentenceTokens[i]) + if sentencePortion.strip(): aScopes.append(sentencePortion.strip()) - + sentencePortion = '' sb3.reverse() - sentenceTokens = sb3 + sentenceTokens = sb3 sb4 = [] # Check for [POSP] for i in range(len(sentenceTokens)): @@ -169,34 +176,36 @@ def __init__(self, sentence = '', phrases = None, rules = None, if sentenceTokens[i][:6] in ['[CONJ]', '[PSEU]', '[PREN]', '[PREP]', '[POST]']: overlapFlag = 1 - + if i+1 < len(sentenceTokens): if sentenceTokens[i+1][:6] == '[POSP]': overlapFlag = 1 if sentencePortion.strip(): aScopes.append(sentencePortion.strip()) sentencePortion = '' - + if postPossibleFlag == 1 and overlapFlag == 0: - sentenceTokens[i] = sentenceTokens[i].replace('[PHRASE]', '[POSSIBLE]') + sentenceTokens[i] = sentenceTokens[i].replace( + '[PHRASE]', '[POSSIBLE]') sentencePortion = sentenceTokens[i] + ' ' + sentencePortion - + sb4.insert(0, sentenceTokens[i]) - + if sentencePortion.strip(): aScopes.append(sentencePortion.strip()) - + self.__negTaggedSentence = ' '.join(sb4) - + if '[NEGATED]' in self.__negTaggedSentence: self.__negationFlag = 'negated' elif '[POSSIBLE]' in self.__negTaggedSentence: self.__negationFlag = 'possible' else: self.__negationFlag = 'affirmed' - - self.__negTaggedSentence = self.__negTaggedSentence.replace(filler, ' ') - + + self.__negTaggedSentence = self.__negTaggedSentence.replace( + filler, ' ') + for line in aScopes: tokensToReturn = [] thisLineTokens = line.split() @@ -207,11 +216,13 @@ def __init__(self, sentence = '', phrases = None, rules = None, def getNegTaggedSentence(self): return self.__negTaggedSentence + def getNegationFlag(self): return self.__negationFlag + def getScopes(self): return self.__scopesToReturn - + def __str__(self): text = self.__negTaggedSentence text += '\t' + self.__negationFlag diff --git a/negex.python/negex_triggers.txt b/negex.python/negex_triggers.txt old mode 100755 new mode 100644 index b9f3712..a8a4122 --- a/negex.python/negex_triggers.txt +++ b/negex.python/negex_triggers.txt @@ -114,10 +114,10 @@ can rule her out [PREN] can rule the patient out [PREN] can rule him out for [PREN] can rule her out for [PREN] -can rule the patinet out for [PREN] +can rule the patient out for [PREN] can rule him out against [PREN] can rule her out against [PREN] -can rule the patinet out against [PREN] +can rule the patient out against [PREN] adequate to rule out [PREN] adequate to rule him out [PREN] adequate to rule her out [PREN] @@ -148,7 +148,7 @@ rule the patient out [PREP] rule out for [PREP] rule him out for [PREP] rule her out for [PREP] -rule the patinet out for [PREP] +rule the patient out for [PREP] be ruled out for [PREP] should be ruled out for [PREP] ought to be ruled out for [PREP] diff --git a/negex.python/wrapper.py b/negex.python/wrapper.py old mode 100755 new mode 100644 index cbbeadd..d7996f2 --- a/negex.python/wrapper.py +++ b/negex.python/wrapper.py @@ -5,26 +5,30 @@ def main(): rfile = open(r'negex_triggers.txt') irules = sortRules(rfile.readlines()) - reports = csv.reader(open(r'Annotations-1-120.txt','rb'), delimiter = '\t') + reports = csv.reader(open(r'Annotations-1-120.txt', 'rb'), delimiter='\t') reports.next() reportNum = 0 correctNum = 0 ofile = open(r'negex_output.txt', 'w') output = [] - outputfile = csv.writer(ofile, delimiter = '\t') + outputfile = csv.writer(ofile, delimiter='\t') for report in reports: - tagger = negTagger(sentence = report[2], phrases = [report[1]], rules = irules, negP=False) + tagger = negTagger(sentence=report[2], phrases=[ + report[1]], rules=irules, negP=True) report.append(tagger.getNegTaggedSentence()) report.append(tagger.getNegationFlag()) report = report + tagger.getScopes() reportNum += 1 if report[3].lower() == report[5]: - correctNum +=1 + correctNum += 1 output.append(report) - outputfile.writerow(['Percentage correct:', float(correctNum)/float(reportNum)]) + outputfile.writerow( + ['Percentage correct:', float(correctNum)/float(reportNum)]) for row in output: if row: outputfile.writerow(row) ofile.close() -if __name__ == '__main__': main() \ No newline at end of file + +if __name__ == '__main__': + main()