From 776874b81dc68907a54aca25a9005b4a43002e3b Mon Sep 17 00:00:00 2001 From: Narcisse Zekpa Date: Mon, 18 Jul 2022 17:09:04 -0400 Subject: [PATCH 1/2] Adding custom header processor capability --- src-python/tests/test_trp.py | 31 +++++++++++++++++++++++++++++++ src-python/trp/__init__.py | 5 ++++- 2 files changed, 35 insertions(+), 1 deletion(-) diff --git a/src-python/tests/test_trp.py b/src-python/tests/test_trp.py index a9bd029..44c3560 100644 --- a/src-python/tests/test_trp.py +++ b/src-python/tests/test_trp.py @@ -141,3 +141,34 @@ def test_table_with_header(caplog): rows = table.rows_without_header assert len(rows) == 7 + + +def test_table_with_header_get_field_names(caplog): + caplog.set_level(logging.DEBUG) + p = os.path.dirname(os.path.realpath(__file__)) + f = open(os.path.join(p, "data", "response.json")) + j = json.load(f) + doc = Document(j) + + page = doc.pages[0] + table = page.tables[2] + + def process_headers(header_cells): + header_names = [] + for header in header_cells: + s = [] + for cell in header: + if cell._isChildOfMergedCell: + s.append(cell.mergedText.strip()) + else: + s.append(cell.text.strip()) + header_names.append(s) + + t = header_names[0] + b = header_names[1] + header_names = [i + " / " + j for i, j in zip(t, b)] + return header_names + + + headers = table.get_header_field_names(process_headers) + assert len(headers) == 6 diff --git a/src-python/trp/__init__.py b/src-python/trp/__init__.py index 78c74c6..027e784 100644 --- a/src-python/trp/__init__.py +++ b/src-python/trp/__init__.py @@ -460,8 +460,11 @@ def _resolve_merged_cells(self, blockMap): merged_cell = MergedCell(blockMap[cid], blockMap, self._rows) self._merged_cells.append(merged_cell) - def get_header_field_names(self): + def get_header_field_names(self, header_proc_func=None): header_cells = self.header + if header_proc_func != None: + return header_proc_func(header_cells) + header_names = [] for header in header_cells: s = [] From 
2e110fbe056b071a2831850a8a5e3a638a977058 Mon Sep 17 00:00:00 2001 From: Narcisse Zekpa Date: Fri, 29 Jul 2022 13:22:49 -0400 Subject: [PATCH 2/2] Adding documentation to custom header processor capability --- src-python/README.md | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/src-python/README.md b/src-python/README.md index 52cc078..52e49c4 100644 --- a/src-python/README.md +++ b/src-python/README.md @@ -254,6 +254,50 @@ for page in doc.pages: ``` +## Table Headers +Through the Table class you can retrieve header column names by calling the get_header_field_names method. By default it returns an array containing the detected column names. If the table has more than one header, the output is returned in the form of an array of arrays. You may alternatively pass a custom header processor function as an argument to reformat the column list(s) according to your own requirements. Please find a code sample below. + +```python + + from textractcaller.t_call import call_textract, Textract_Features + from trp.trp2 import TDocument, TDocumentSchema + from trp.t_pipeline import order_blocks_by_geo + import trp + import json + + j = call_textract(input_document="path_to_some_document (PDF, JPEG, PNG)", features=[Textract_Features.FORMS, Textract_Features.TABLES]) + t_doc = TDocumentSchema().load(j) + ordered_doc = order_blocks_by_geo(t_doc) + trp_doc = trp.Document(TDocumentSchema().dump(ordered_doc)) + + page = trp_doc.pages[0] + table = page.tables[2] + + def process_headers(header_cells): + header_names = [] + for header in header_cells: + s = [] + for cell in header: + if cell._isChildOfMergedCell: + s.append(cell.mergedText.strip()) + else: + s.append(cell.text.strip()) + header_names.append(s) + + t = header_names[0] + b = header_names[1] + header_names = [i + " / " + j for i, j in zip(t, b)] + return header_names + + + headers = table.get_header_field_names(process_headers) + +``` + + + + + ## Test - Clone the repo and run 
pytest