From 3c66a7b2f684ada9d90b64b8de478a896984f86a Mon Sep 17 00:00:00 2001
From: jianlins <jianlinshi.cn@gmail.com>
Date: Tue, 16 Jul 2019 11:42:06 -0600
Subject: [PATCH] Fix ConTextMarkup compiled regex cache issue. Extend itemData
 to take both local and remote csv and yml files, and also allow csv or yml
 strings as input.

---
 .gitignore                     |   2 +-
 pyConTextNLP/ConTextMarkup.py  |  30 +++----
 pyConTextNLP/itemData.py       | 159 +++++++++++++++++++++++++++------
 pyConTextNLP/tests/test_yml.py |  29 ++++++
 4 files changed, 174 insertions(+), 46 deletions(-)
 create mode 100644 pyConTextNLP/tests/test_yml.py

diff --git a/.gitignore b/.gitignore
index 27deb28..066eec0 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,7 @@
 # Byte-compiled / optimized / DLL files
 __pycache__/
 *.py[cod]
-
+.idea
 # C extensions
 *.so
 
diff --git a/pyConTextNLP/ConTextMarkup.py b/pyConTextNLP/ConTextMarkup.py
index ca8c0b3..7c68163 100644
--- a/pyConTextNLP/ConTextMarkup.py
+++ b/pyConTextNLP/ConTextMarkup.py
@@ -12,7 +12,7 @@
 REG_CLEAN2 = re.compile(r"""\s+""", re.UNICODE)
 REG_CLEAN3 = re.compile(r"""\d""", re.UNICODE)
 
-COMPILED_REGEXPRS = {}
+
 
 NODE_XML_SKEL = \
 """
@@ -305,22 +305,20 @@ def markItem(self, item, ConTextMode="target", ignoreCase=True):
 
         # See if we have already created a regular expression
 
-        if not item.getLiteral() in COMPILED_REGEXPRS:
-            if not item.getRE():
-                reg_exp = r"\b{}\b".format(item.getLiteral())
-                if self.getVerbose():
-                    print("generating regular expression", reg_exp)
-            else:
-                reg_exp = item.getRE()
-                if self.getVerbose():
-                    print("using provided regular expression", reg_exp)
-            if ignoreCase:
-                regex = re.compile(reg_exp, re.IGNORECASE|re.UNICODE)
-            else:
-                regex = re.compile(reg_exp, re.UNICODE)
-            COMPILED_REGEXPRS[item.getLiteral()] = regex
+
+        if not item.getRE():
+            reg_exp = r"\b{}\b".format(item.getLiteral())
+            if self.getVerbose():
+                print("generating regular expression", reg_exp)
+        else:
+            reg_exp = item.getRE()
+            if self.getVerbose():
+                print("using provided regular expression", reg_exp)
+        if ignoreCase:
+            regex = re.compile(reg_exp, re.IGNORECASE|re.UNICODE)
         else:
-            regex = COMPILED_REGEXPRS[item.getLiteral()]
+            regex = re.compile(reg_exp, re.UNICODE)
+
         _iter = regex.finditer(self.getText())
         terms = []
         for i in _iter:
diff --git a/pyConTextNLP/itemData.py b/pyConTextNLP/itemData.py
index 8b8495d..35d9f46 100644
--- a/pyConTextNLP/itemData.py
+++ b/pyConTextNLP/itemData.py
@@ -1,41 +1,62 @@
-#Copyright 2010 Brian E. Chapman
+# Copyright 2010 Brian E. Chapman
 #
-#Licensed under the Apache License, Version 2.0 (the "License");
-#you may not use this file except in compliance with the License.
-#You may obtain a copy of the License at
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
-#Unless required by applicable law or agreed to in writing, software
-#distributed under the License is distributed on an "AS IS" BASIS,
-#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-#See the License for the specific language governing permissions and
-#limitations under the License.
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 
 """
 A module defining the contextItem class.
 """
-import yaml
-import urllib.request, urllib.error, urllib.parse
+import os
 
+import yaml
+import urllib.request, urllib.error, urllib.parse
+import csv
+from io import StringIO
 
 def _get_fileobj(_file):
-    if not urllib.parse.urlparse(_file).scheme:
-        _file = "file://"+_file
-    return urllib.request.urlopen(_file, data=None)
-
-def get_items(_file):
-    f0 = _get_fileobj(_file)
-    context_items =  [contextItem((d["Lex"],
-                                   d["Type"],
-                                   r"%s"%d["Regex"],
-                                   d["Direction"])) for d in yaml.load_all(f0)]
-    f0.close()
-    return context_items
+    """Open a path or URL; return (tab-delimited csv.reader, handle the caller must close)."""
+    # Build the URL in one name: `csvFile` used to be unbound when _file already had a scheme.
+    url = _file if urllib.parse.urlparse(_file).scheme else "file://" + _file
+    f0 = urllib.request.urlopen(url)  # urllib.urlopen does not exist on Python 3
+    return csv.reader(StringIO(f0.read().decode(), newline=None), delimiter="\t"), f0
 
 
-class contextItem(object):
+def get_items(file_str):
+    """Load context items from a csv/tsv/yml path, an http(s) URL, or raw file content.
+
+    Input ending in .csv/.tsv/.yml is a location (a missing local file yields []);
+    other input is parsed as YAML if it contains "Comments:", else as ','- or
+    tab-delimited text. Unsupported input prints a message and returns [].
+    """
+    file_name = file_str.lower()
+    if file_name.endswith((".csv", ".tsv", ".yml")):
+        if not file_name.startswith(("http://", "https://")):
+            file_str = os.path.abspath(file_str)
+            if not os.path.exists(file_str):
+                return []  # was contextItem(), which raises TypeError without args
+        if file_name.endswith(".yml"):
+            return instantiateFromYMLtoitemData(file_str)
+        return instantiateFromCSVtoitemData(file_str)
+    if "Comments:" in file_str:
+        return instantiateFromYMLStr(file_str)
+    for delimiter in (",", "\t"):
+        if delimiter in file_str:
+            return instantiateFromCSVStr(file_str, delimiter)
+    print(
+        "This input format is not supported. It can be either a path of csv, tsv or yaml file, or a string of corresponding file content.")
+    return []
+
 
+class contextItem(object):
 
     def __init__(self, args):
         self.__literal = args[0]
@@ -43,7 +64,7 @@ def __init__(self, args):
         self.__category = []
         for c in cs:
             self.__category.append(c.lower().strip())
-        self.__re = r"%s"%args[2] # I need to figure out how to read this raw string in properly
+        self.__re = r"%s" % args[2]  # I need to figure out how to read this raw string in properly
         self.__rule = args[3].lower()
 
         # generate regex from literal if no regex provided
@@ -53,32 +74,112 @@ def __init__(self, args):
     def getLiteral(self):
         """return the literal associated with this item"""
         return self.__literal
+
     def getCategory(self):
         """return the list of categories associated with this item"""
         return self.__category[:]
+
     def categoryString(self):
         """return the categories as a string delimited by '_'"""
         return '_'.join(self.__category)
 
-
-    def isA(self,testCategory):
+    def isA(self, testCategory):
         """test whether testCategory is one of the categories associated with self"""
         try:
             return testCategory.lower().strip() in self.__category
         except:
             for tc in testCategory:
-                if( tc.lower().strip() in self.__category ):
+                if (tc.lower().strip() in self.__category):
                     return True
             return False
 
     def getRE(self):
         return self.__re
+
     def getRule(self):
         return self.__rule
+
     def __str__(self):
         txt = """literal<<{0}>>; category<<{1}>>; re<<{2}>>; rule<<{3}>>""".format(
-            self.__literal,self.__category,self.__re, self.__rule)
+            self.__literal, self.__category, self.__re, self.__rule)
         return txt
+
     def __repr__(self):
         return self.__str__()
 
+
+def instantiateFromCSVStr(content, splitter):
+    """Parse delimited text *content* into a list of contextItem objects.
+
+    Each line is split on *splitter*; rows that read_row() rejects (fewer
+    than two fields, or starting with '#') are skipped.
+    """
+    # Collect into a list: contextItem() needs its args tuple, so an empty
+    # contextItem cannot be created to act as the container.
+    items = []
+    for row in csv.reader(content.splitlines(), delimiter=splitter):
+        tmp = read_row(row)
+        if tmp is not None:
+            items.append(contextItem(tmp))
+    return items
+
+
+def instantiateFromYMLStr(content):
+    """Build a contextItem from every YAML document found in *content*."""
+    items = []
+    for d in yaml.safe_load_all(content):
+        items.append(contextItem((d["Lex"], d["Type"], r"%s" % d["Regex"], d["Direction"])))
+    return items
+
+
+def instantiateFromYMLtoitemData(_file):
+    """Read contextItem definitions from a YAML file path or URL.
+
+    A *_file* without a URL scheme is opened as a local file:// resource.
+    """
+    if not urllib.parse.urlparse(_file).scheme:
+        _file = "file://" + _file
+    # The response is a context manager, so the handle is closed even if
+    # parsing raises (it was previously left open).
+    with urllib.request.urlopen(_file, data=None) as f0:
+        return [contextItem((d["Lex"], d["Type"], r"%s" % d["Regex"], d["Direction"]))
+                for d in yaml.safe_load_all(f0)]
+
+
+def instantiateFromCSVtoitemData(csvFile, encoding='utf-8', headerRows=1, literalColumn=0, categoryColumn=1,
+                                 regexColumn=2, ruleColumn=3):
+    """Read contextItem definitions from a delimited file path or URL.
+
+    The first *headerRows* rows are skipped and rows rejected by read_row() are
+    ignored. *encoding* and the *Column arguments are currently unused.
+
+    Returns a list of contextItem objects.
+    """
+    # Collect into a list: contextItem() cannot be constructed without its args.
+    items = []
+    reader, f0 = _get_fileobj(csvFile)
+    try:
+        # Skip the header rows; next(reader, None) tolerates a file shorter than the header.
+        for _ in range(headerRows):
+            next(reader, None)
+        for row in reader:
+            tmp = read_row(row)
+            if tmp is not None:
+                items.append(contextItem(tmp))
+    finally:
+        f0.close()  # release the handle even if a row fails to parse
+    return items
+
+
+def read_row(row):
+    """Normalise a parsed row to [literal, category, regex, rule].
+
+    Returns None for rows with fewer than two fields or starting with '#'.
+    """
+    if len(row) < 2 or row[0].startswith('#'):
+        return None
+    if len(row) == 2:
+        return [row[0], row[1], '', '']
+    if len(row) == 3:
+        return [row[0], row[1], r"{0}".format(row[2]), '']
+    return [row[0], row[1], row[2], row[3]]
diff --git a/pyConTextNLP/tests/test_yml.py b/pyConTextNLP/tests/test_yml.py
new file mode 100644
index 0000000..4ed69fc
--- /dev/null
+++ b/pyConTextNLP/tests/test_yml.py
@@ -0,0 +1,29 @@
+import os
+import unittest
+
+import pyConTextNLP.itemData as itemData
+import pyConTextNLP.pyConText as pyConText
+
+
+class SimpleTestCase(unittest.TestCase):
+    """Checks that a modifier loaded from the YAML knowledge base is marked in a sentence."""
+
+    def test_1(self):
+        """The 'r/o' modifier must appear among the marked nodes."""
+        sent1 = 'IMPRESSION: 1. R/O STUDY DEMONSTRATING NO GROSS EVIDENCE OF SIGNIFICANT PULMONARY EMBOLISM.'
+        # Resolve KB files relative to this test file, not the current working directory.
+        kb_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), "..", "..", "KB")
+        modifiers = itemData.get_items(os.path.join(kb_dir, "pneumonia_modifiers.yml"))
+        targets = itemData.get_items(os.path.join(kb_dir, "pneumonia_targets.yml"))
+        markup = pyConText.ConTextMarkup()
+        markup.setRawText(sent1.lower())
+
+        markup.markItems(modifiers, mode="modifier")
+        markup.markItems(targets, mode="target")
+        # Use a unittest assertion: a bare assert is stripped under `python -O`.
+        found = any('r/o' in str(node) for node in markup.nodes(data=True))
+        self.assertTrue(found)
+
+
+if __name__ == '__main__':
+    unittest.main()