diff --git a/dedoc/structure_constructors/concrete_structure_constructors/tree_constructor.py b/dedoc/structure_constructors/concrete_structure_constructors/tree_constructor.py index 14e00c80..c49b2388 100644 --- a/dedoc/structure_constructors/concrete_structure_constructors/tree_constructor.py +++ b/dedoc/structure_constructors/concrete_structure_constructors/tree_constructor.py @@ -33,6 +33,7 @@ def construct(self, document: UnstructuredDocument, parameters: Optional[dict] = Build the tree structure representation for the given document intermediate representation. To get the information about the parameters look at the documentation of :class:`~dedoc.structure_constructors.AbstractStructureConstructor`. """ + from dedoc.data_structures.concrete_annotations import AttachAnnotation, TableAnnotation from dedoc.data_structures.document_content import DocumentContent from dedoc.data_structures.document_metadata import DocumentMetadata @@ -46,7 +47,8 @@ def construct(self, document: UnstructuredDocument, parameters: Optional[dict] = # multiline header hl_equal = line.metadata.hierarchy_level == tree.metadata.hierarchy_level line_type_equal = line.metadata.hierarchy_level.line_type == tree.metadata.hierarchy_level.line_type - if line.metadata.hierarchy_level.can_be_multiline and hl_equal and line_type_equal: + has_no_refs = len([ann for ann in tree.annotations if ann.name in (AttachAnnotation.name, TableAnnotation.name)]) == 0 + if line.metadata.hierarchy_level.can_be_multiline and hl_equal and line_type_equal and has_no_refs: tree.add_text(line) # move up and add child diff --git a/tests/api_tests/test_api_doctype_article.py b/tests/api_tests/test_api_doctype_article.py index f19642ff..b8d70911 100644 --- a/tests/api_tests/test_api_doctype_article.py +++ b/tests/api_tests/test_api_doctype_article.py @@ -69,8 +69,8 @@ def test_article(self) -> None: self.assertEqual(self._get_text_of_row(table["cells"][0]), ["Software (8-bit)", "code size", "cycle", "cost", "physical"]) section_with_table_refs = self._get_by_tree_path(tree, "0.7.0") table_refs_in_text = [ann for ann in section_with_table_refs["annotations"] if ann["name"] == "table" and ann["value"] == table["metadata"]["uid"]] - self.assertEqual(len(table_refs_in_text), 2) - self.assertEqual(["1", "1"], [section_with_table_refs["text"][table_refs_in_text[n]["start"]:table_refs_in_text[n]["end"]] for n in range(2)]) + self.assertEqual(len(table_refs_in_text), 1) + self.assertEqual("1", section_with_table_refs["text"][table_refs_in_text[0]["start"]:table_refs_in_text[0]["end"]]) table = result["content"]["tables"][1] # Grobid can't recognize vertical orientation tables self.assertEqual(table["metadata"]["title"], "Table 2 .List of our target implementations.") diff --git a/tests/api_tests/test_api_doctype_tz.py b/tests/api_tests/test_api_doctype_tz.py index f7e11719..753c5244 100644 --- a/tests/api_tests/test_api_doctype_tz.py +++ b/tests/api_tests/test_api_doctype_tz.py @@ -56,20 +56,20 @@ def test_gasu_tz(self) -> None: self.assertEqual("СОДЕРЖАНИЕ", node["text"][:30].strip()) self.assertEqual("toc", node["metadata"]["paragraph_type"]) - node = self._get_by_tree_path(tree, "0.1.5") + node = self._get_by_tree_path(tree, "0.1.6") self.assertEqual("5.\tТехнические требования к ГАС «Управление»", node["text"].strip()) self.assertEqual("part", node["metadata"]["paragraph_type"]) - node = self._get_by_tree_path(tree, "0.1.5.0") + node = self._get_by_tree_path(tree, "0.1.6.0") self.assertEqual("5.1.\tТребования к функционированию", node["text"].strip()) self.assertEqual("named_item", node["metadata"]["paragraph_type"]) - node = self._get_by_tree_path(tree, "0.1.4") + node = self._get_by_tree_path(tree, "0.1.5") self.assertIn("«Управление»", node["text"]) self.assertEqual("4.\tОбщие требования к техническим компонентам ГАС«Управление»", node["text"].strip()) self.assertEqual("part", node["metadata"]["paragraph_type"]) - node = self._get_by_tree_path(tree, "0.1.5.0.0.19.0") + node = self._get_by_tree_path(tree, "0.1.6.0.0.19.2") self.assertIn("10 лет) ", node["text"]) self.assertEqual("raw_text", node["metadata"]["paragraph_type"]) diff --git a/tests/api_tests/test_api_format_pdf.py b/tests/api_tests/test_api_format_pdf.py index ec474b16..765c894f 100644 --- a/tests/api_tests/test_api_format_pdf.py +++ b/tests/api_tests/test_api_format_pdf.py @@ -142,7 +142,7 @@ def test_pdf_with_some_tables(self) -> None: self._test_table_refs(result["content"]) # checks indentations - par = self._get_by_tree_path(tree, "0.4.0.0") + par = self._get_by_tree_path(tree, "0.5.0.0") annotations = par["annotations"] self.assertIn({"end": 170, "value": "600", "name": "indentation", "start": 0}, annotations) self.assertIn("Методика расчета ВВП по доходам характеризуется суммой национального\n", par["text"]) diff --git a/tests/api_tests/test_api_format_pdf_tabby_reader.py b/tests/api_tests/test_api_format_pdf_tabby_reader.py index 959e15ca..815d4f3f 100644 --- a/tests/api_tests/test_api_format_pdf_tabby_reader.py +++ b/tests/api_tests/test_api_format_pdf_tabby_reader.py @@ -200,15 +200,15 @@ def test_pdf_with_tables(self) -> None: self.assertEqual("raw_text", node["metadata"]["paragraph_type"]) self.assertEqual("ВВП (валовой внутренний продук", node["text"].strip()[:30]) - node = self._get_by_tree_path(tree, "0.5.0") + node = self._get_by_tree_path(tree, "0.6.0") self.assertEqual("list_item", node["metadata"]["paragraph_type"]) self.assertEqual("1. В соответствии с доходами.", node["text"].strip()[:30]) - node = self._get_by_tree_path(tree, "0.5.1") + node = self._get_by_tree_path(tree, "0.6.1") self.assertEqual("list_item", node["metadata"]["paragraph_type"]) self.assertEqual("2. В соответствии с расходами.", node["text"].strip()[:30]) - node = self._get_by_tree_path(tree, "0.5.2") + node = self._get_by_tree_path(tree, "0.6.2") self.assertEqual("list_item", node["metadata"]["paragraph_type"]) self.assertEqual("3. В соответствии с полученной", node["text"].strip()[:30]) diff --git a/tests/api_tests/test_api_format_pdf_with_text.py b/tests/api_tests/test_api_format_pdf_with_text.py index fe17ef08..f620bed0 100644 --- a/tests/api_tests/test_api_format_pdf_with_text.py +++ b/tests/api_tests/test_api_format_pdf_with_text.py @@ -138,7 +138,7 @@ def test_pdf_with_some_tables(self) -> None: self._check_tree_sanity(tree) # checks indentations - par = self._get_by_tree_path(tree, "0.4.0.0") + par = self._get_by_tree_path(tree, "0.5.0.0") self.assertIn({"end": 170, "value": "600", "name": "indentation", "start": 0}, par["annotations"]) self.assertIn("Методика расчета ВВП по доходам характеризуется суммой национального\n", par["text"]) diff --git a/tests/api_tests/test_api_misc_nesting_list.py b/tests/api_tests/test_api_misc_nesting_list.py index ecc0c96d..2cdd7e70 100644 --- a/tests/api_tests/test_api_misc_nesting_list.py +++ b/tests/api_tests/test_api_misc_nesting_list.py @@ -8,13 +8,13 @@ def test_list_nesting_content(self) -> None: result = self._send_request(file_name, data={"structure_type": "tree"}) content = result["content"]["structure"] - lst = content["subparagraphs"][2] + lst = content["subparagraphs"][3] self.assertEqual(lst["subparagraphs"][4]["text"], "1.5.\tОснования разработки") self.assertEqual(lst["subparagraphs"][5]["text"], "1.6.\tНормативные документы") self.assertEqual(lst["subparagraphs"][6]["text"], "1.7.\tСведения об источниках и порядке финансирования работ") self.assertEqual(len(lst["subparagraphs"][5]["subparagraphs"][0]["subparagraphs"]), 12) - lst = content["subparagraphs"][5] + lst = content["subparagraphs"][6] lst = lst["subparagraphs"][0]["subparagraphs"][0] self.assertEqual(lst["text"], "4.1.1. Требования к структуре и функционированию") self.assertEqual(lst["subparagraphs"][0]["text"].startswith("Система должна иметь базу хранения"), True)