From 3c66a7b2f684ada9d90b64b8de478a896984f86a Mon Sep 17 00:00:00 2001 From: jianlins Date: Tue, 16 Jul 2019 11:42:06 -0600 Subject: [PATCH] fix ConTextMarkup compiled regex cache issue. extend itemData to take both local and remote csv, yml files, also allows to take in csv or yml strings. --- .gitignore | 2 +- pyConTextNLP/ConTextMarkup.py | 30 +++---- pyConTextNLP/itemData.py | 159 +++++++++++++++++++++++++++------ pyConTextNLP/tests/test_yml.py | 29 ++++++ 4 files changed, 174 insertions(+), 46 deletions(-) create mode 100644 pyConTextNLP/tests/test_yml.py diff --git a/.gitignore b/.gitignore index 27deb28..066eec0 100644 --- a/.gitignore +++ b/.gitignore @@ -1,7 +1,7 @@ # Byte-compiled / optimized / DLL files __pycache__/ *.py[cod] - +.idea # C extensions *.so diff --git a/pyConTextNLP/ConTextMarkup.py b/pyConTextNLP/ConTextMarkup.py index ca8c0b3..7c68163 100644 --- a/pyConTextNLP/ConTextMarkup.py +++ b/pyConTextNLP/ConTextMarkup.py @@ -12,7 +12,7 @@ REG_CLEAN2 = re.compile(r"""\s+""", re.UNICODE) REG_CLEAN3 = re.compile(r"""\d""", re.UNICODE) -COMPILED_REGEXPRS = {} + NODE_XML_SKEL = \ """ @@ -305,22 +305,20 @@ def markItem(self, item, ConTextMode="target", ignoreCase=True): # See if we have already created a regular expression - if not item.getLiteral() in COMPILED_REGEXPRS: - if not item.getRE(): - reg_exp = r"\b{}\b".format(item.getLiteral()) - if self.getVerbose(): - print("generating regular expression", reg_exp) - else: - reg_exp = item.getRE() - if self.getVerbose(): - print("using provided regular expression", reg_exp) - if ignoreCase: - regex = re.compile(reg_exp, re.IGNORECASE|re.UNICODE) - else: - regex = re.compile(reg_exp, re.UNICODE) - COMPILED_REGEXPRS[item.getLiteral()] = regex + + if not item.getRE(): + reg_exp = r"\b{}\b".format(item.getLiteral()) + if self.getVerbose(): + print("generating regular expression", reg_exp) + else: + reg_exp = item.getRE() + if self.getVerbose(): + print("using provided regular expression", reg_exp) + if ignoreCase: + regex = re.compile(reg_exp, re.IGNORECASE|re.UNICODE) else: - regex = COMPILED_REGEXPRS[item.getLiteral()] + regex = re.compile(reg_exp, re.UNICODE) + _iter = regex.finditer(self.getText()) terms = [] for i in _iter: diff --git a/pyConTextNLP/itemData.py b/pyConTextNLP/itemData.py index 8b8495d..35d9f46 100644 --- a/pyConTextNLP/itemData.py +++ b/pyConTextNLP/itemData.py @@ -1,41 +1,62 @@ -#Copyright 2010 Brian E. Chapman +# Copyright 2010 Brian E. Chapman # -#Licensed under the Apache License, Version 2.0 (the "License"); -#you may not use this file except in compliance with the License. -#You may obtain a copy of the License at +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # -#Unless required by applicable law or agreed to in writing, software -#distributed under the License is distributed on an "AS IS" BASIS, -#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -#See the License for the specific language governing permissions and -#limitations under the License. +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. """ A module defining the contextItem class. """ -import yaml -import urllib.request, urllib.error, urllib.parse +import os +import yaml +import urllib +import csv +from io import StringIO def _get_fileobj(_file): - if not urllib.parse.urlparse(_file).scheme: - _file = "file://"+_file - return urllib.request.urlopen(_file, data=None) - -def get_items(_file): - f0 = _get_fileobj(_file) - context_items = [contextItem((d["Lex"], - d["Type"], - r"%s"%d["Regex"], - d["Direction"])) for d in yaml.load_all(f0)] - f0.close() - return context_items + p = urllib.parse.urlparse(_file) + if not p.scheme: + csvFile = "file://" + _file + f0 = urllib.urlopen(csvFile, 'rU') + return csv.reader(StringIO(f0.read().decode(), newline=None), delimiter="\t"), f0 -class contextItem(object): +def get_items(file_str): + file_name = file_str.lower() + if file_name.endswith(".csv") or file_name.endswith(".tsv") or file_name.endswith(".yml"): + if 'http' not in file_str.lower(): + pwd = os.getcwd() + if pwd not in file_str: + file_str = os.path.join(pwd, file_str) + if not os.path.exists(file_str): + return contextItem() + if file_name.endswith('csv') or file_name.endswith('tsv'): + return instantiateFromCSVtoitemData(file_str) + elif file_name.endswith('yml'): + return instantiateFromYMLtoitemData(file_str) + else: + return contextItem() + elif "Comments:" in file_str: + return instantiateFromYMLStr(file_str) + elif ',' in file_str: + return instantiateFromCSVStr(file_str, ',') + elif '\t' in file_str: + return instantiateFromCSVStr(file_str, '\t') + else: + print( + "This input format is not supported. It can be either a path of csv, tsv or yaml file, or a string of corresponding file content.") + +class contextItem(object): def __init__(self, args): self.__literal = args[0] @@ -43,7 +64,7 @@ def __init__(self, args): self.__category = [] for c in cs: self.__category.append(c.lower().strip()) - self.__re = r"%s"%args[2] # I need to figure out how to read this raw string in properly + self.__re = r"%s" % args[2] # I need to figure out how to read this raw string in properly self.__rule = args[3].lower() # generate regex from literal if no regex provided @@ -53,32 +74,112 @@ def __init__(self, args): def getLiteral(self): """return the literal associated with this item""" return self.__literal + def getCategory(self): """return the list of categories associated with this item""" return self.__category[:] + def categoryString(self): """return the categories as a string delimited by '_'""" return '_'.join(self.__category) - - def isA(self,testCategory): + def isA(self, testCategory): """test whether testCategory is one of the categories associated with self""" try: return testCategory.lower().strip() in self.__category except: for tc in testCategory: - if( tc.lower().strip() in self.__category ): + if (tc.lower().strip() in self.__category): return True return False def getRE(self): return self.__re + def getRule(self): return self.__rule + def __str__(self): txt = """literal<<{0}>>; category<<{1}>>; re<<{2}>>; rule<<{3}>>""".format( - self.__literal,self.__category,self.__re, self.__rule) + self.__literal, self.__category, self.__re, self.__rule) return txt + def __repr__(self): return self.__str__() + +def instantiateFromCSVStr(content, splitter): + reader = csv.reader(content.split('\n'), delimiter=splitter) + items = contextItem() + for row in reader: + # print(row) + tmp = read_row(row) + if tmp is None: + continue + # tmp = [row[literalColumn], row[categoryColumn], + # row[regexColumn], row[ruleColumn]] + # tmp[2] = r"{0}".format(tmp[2]) # convert the regular expression string into a raw string + item = contextItem(tmp) + items.append(item) + return items + + +def instantiateFromYMLStr(content): + context_items = [contextItem((d["Lex"], + d["Type"], + r"%s" % d["Regex"], + d["Direction"])) for d in yaml.safe_load_all(content)] + return context_items + + +def instantiateFromYMLtoitemData(_file): + def get_fileobj(_file): + if not urllib.parse.urlparse(_file).scheme: + _file = "file://" + _file + return urllib.request.urlopen(_file, data=None) + + f0 = get_fileobj(_file) + context_items = [contextItem((d["Lex"], + d["Type"], + r"%s" % d["Regex"], + d["Direction"])) for d in yaml.safe_load_all(f0)] + return context_items + + +def instantiateFromCSVtoitemData(csvFile, encoding='utf-8', headerRows=1, literalColumn=0, categoryColumn=1, + regexColumn=2, ruleColumn=3): + items = contextItem() # itemData to be returned to the user + header = [] + reader, f0 = _get_fileobj(csvFile) + # reader = csv.reader(open(csvFile, 'rU')) + # first grab numbe rof specified header rows + for i in range(headerRows): + row = next(reader) + header.append(row) + # now grab each itemData + for row in reader: + # print(row) + tmp = read_row(row) + if tmp is None: + continue + # tmp = [row[literalColumn], row[categoryColumn], + # row[regexColumn], row[ruleColumn]] + # tmp[2] = r"{0}".format(tmp[2]) # convert the regular expression string into a raw string + item = contextItem(tmp) + items.append(item) + f0.close() + return items + + +def read_row(row): + tmp = [] + if len(row) < 2 or row[0].startswith('#'): + return None + tmp.extend([row[0], row[1]]) + if len(row) == 3: + tmp.extend([r"{0}".format(row[2]), '']) + elif len(row) == 2: + tmp.extend(['', '']) + else: + tmp.extend([row[2], row[3]]) + return tmp diff --git a/pyConTextNLP/tests/test_yml.py b/pyConTextNLP/tests/test_yml.py new file mode 100644 index 0000000..4ed69fc --- /dev/null +++ b/pyConTextNLP/tests/test_yml.py @@ -0,0 +1,29 @@ +import os +import unittest + +import pyConTextNLP.itemData as itemData +import pyConTextNLP.pyConText as pyConText + + +class SimpleTestCase(unittest.TestCase): + def test_1(self): + sent1 = 'IMPRESSION: 1. R/O STUDY DEMONSTRATING NO GROSS EVIDENCE OF SIGNIFICANT PULMONARY EMBOLISM.' + print(os.getcwd()) + modifiers = itemData.get_items(os.path.join(os.getcwd(), + "../../KB/pneumonia_modifiers.yml")) + targets = itemData.get_items(os.path.join(os.getcwd(), + "../../KB/pneumonia_targets.yml")) + markup = pyConText.ConTextMarkup() + markup.setRawText(sent1.lower()) + + markup.markItems(modifiers, mode="modifier") + markup.markItems(targets, mode="target") + found = False + for node in markup.nodes(data=True): + if 'r/o' in str(node): + found = True + assert found + + +if __name__ == '__main__': + unittest.main()