Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def construct(self, document: UnstructuredDocument, parameters: Optional[dict] =
Build the tree structure representation for the given document intermediate representation.
To get the information about the parameters look at the documentation of :class:`~dedoc.structure_constructors.AbstractStructureConstructor`.
"""
from dedoc.data_structures.concrete_annotations import AttachAnnotation, TableAnnotation
from dedoc.data_structures.document_content import DocumentContent
from dedoc.data_structures.document_metadata import DocumentMetadata

Expand All @@ -46,7 +47,8 @@ def construct(self, document: UnstructuredDocument, parameters: Optional[dict] =
# multiline header
hl_equal = line.metadata.hierarchy_level == tree.metadata.hierarchy_level
line_type_equal = line.metadata.hierarchy_level.line_type == tree.metadata.hierarchy_level.line_type
if line.metadata.hierarchy_level.can_be_multiline and hl_equal and line_type_equal:
has_no_refs = len([ann for ann in tree.annotations if ann.name in (AttachAnnotation.name, TableAnnotation.name)]) == 0
if line.metadata.hierarchy_level.can_be_multiline and hl_equal and line_type_equal and has_no_refs:
tree.add_text(line)
# move up and add child

Expand Down
4 changes: 2 additions & 2 deletions tests/api_tests/test_api_doctype_article.py
Original file line number Diff line number Diff line change
Expand Up @@ -69,8 +69,8 @@ def test_article(self) -> None:
self.assertEqual(self._get_text_of_row(table["cells"][0]), ["Software (8-bit)", "code size", "cycle", "cost", "physical"])
section_with_table_refs = self._get_by_tree_path(tree, "0.7.0")
table_refs_in_text = [ann for ann in section_with_table_refs["annotations"] if ann["name"] == "table" and ann["value"] == table["metadata"]["uid"]]
self.assertEqual(len(table_refs_in_text), 2)
self.assertEqual(["1", "1"], [section_with_table_refs["text"][table_refs_in_text[n]["start"]:table_refs_in_text[n]["end"]] for n in range(2)])
self.assertEqual(len(table_refs_in_text), 1)
self.assertEqual("1", section_with_table_refs["text"][table_refs_in_text[0]["start"]:table_refs_in_text[0]["end"]])

table = result["content"]["tables"][1] # Grobid can't recognize vertical orientation tables
self.assertEqual(table["metadata"]["title"], "Table 2 .List of our target implementations.")
8 changes: 4 additions & 4 deletions tests/api_tests/test_api_doctype_tz.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,20 +56,20 @@ def test_gasu_tz(self) -> None:
self.assertEqual("СОДЕРЖАНИЕ", node["text"][:30].strip())
self.assertEqual("toc", node["metadata"]["paragraph_type"])

node = self._get_by_tree_path(tree, "0.1.5")
node = self._get_by_tree_path(tree, "0.1.6")
self.assertEqual("5.\tТехнические требования к ГАС «Управление»", node["text"].strip())
self.assertEqual("part", node["metadata"]["paragraph_type"])

node = self._get_by_tree_path(tree, "0.1.5.0")
node = self._get_by_tree_path(tree, "0.1.6.0")
self.assertEqual("5.1.\tТребования к функционированию", node["text"].strip())
self.assertEqual("named_item", node["metadata"]["paragraph_type"])

node = self._get_by_tree_path(tree, "0.1.4")
node = self._get_by_tree_path(tree, "0.1.5")
self.assertIn("«Управление»", node["text"])
self.assertEqual("4.\tОбщие требования к техническим компонентам ГАС«Управление»", node["text"].strip())
self.assertEqual("part", node["metadata"]["paragraph_type"])

node = self._get_by_tree_path(tree, "0.1.5.0.0.19.0")
node = self._get_by_tree_path(tree, "0.1.6.0.0.19.2")
self.assertIn("10 лет) ", node["text"])
self.assertEqual("raw_text", node["metadata"]["paragraph_type"])

Expand Down
2 changes: 1 addition & 1 deletion tests/api_tests/test_api_format_pdf.py
Original file line number Diff line number Diff line change
Expand Up @@ -142,7 +142,7 @@ def test_pdf_with_some_tables(self) -> None:
self._test_table_refs(result["content"])

# checks indentations
par = self._get_by_tree_path(tree, "0.4.0.0")
par = self._get_by_tree_path(tree, "0.5.0.0")
annotations = par["annotations"]
self.assertIn({"end": 170, "value": "600", "name": "indentation", "start": 0}, annotations)
self.assertIn("Методика расчета ВВП по доходам характеризуется суммой национального\n", par["text"])
Expand Down
6 changes: 3 additions & 3 deletions tests/api_tests/test_api_format_pdf_tabby_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -200,15 +200,15 @@ def test_pdf_with_tables(self) -> None:
self.assertEqual("raw_text", node["metadata"]["paragraph_type"])
self.assertEqual("ВВП (валовой внутренний продук", node["text"].strip()[:30])

node = self._get_by_tree_path(tree, "0.5.0")
node = self._get_by_tree_path(tree, "0.6.0")
self.assertEqual("list_item", node["metadata"]["paragraph_type"])
self.assertEqual("1. В соответствии с доходами.", node["text"].strip()[:30])

node = self._get_by_tree_path(tree, "0.5.1")
node = self._get_by_tree_path(tree, "0.6.1")
self.assertEqual("list_item", node["metadata"]["paragraph_type"])
self.assertEqual("2. В соответствии с расходами.", node["text"].strip()[:30])

node = self._get_by_tree_path(tree, "0.5.2")
node = self._get_by_tree_path(tree, "0.6.2")
self.assertEqual("list_item", node["metadata"]["paragraph_type"])
self.assertEqual("3. В соответствии с полученной", node["text"].strip()[:30])

Expand Down
2 changes: 1 addition & 1 deletion tests/api_tests/test_api_format_pdf_with_text.py
Original file line number Diff line number Diff line change
Expand Up @@ -138,7 +138,7 @@ def test_pdf_with_some_tables(self) -> None:
self._check_tree_sanity(tree)

# checks indentations
par = self._get_by_tree_path(tree, "0.4.0.0")
par = self._get_by_tree_path(tree, "0.5.0.0")
self.assertIn({"end": 170, "value": "600", "name": "indentation", "start": 0}, par["annotations"])
self.assertIn("Методика расчета ВВП по доходам характеризуется суммой национального\n", par["text"])

Expand Down
4 changes: 2 additions & 2 deletions tests/api_tests/test_api_misc_nesting_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,13 +8,13 @@ def test_list_nesting_content(self) -> None:
result = self._send_request(file_name, data={"structure_type": "tree"})
content = result["content"]["structure"]

lst = content["subparagraphs"][2]
lst = content["subparagraphs"][3]
self.assertEqual(lst["subparagraphs"][4]["text"], "1.5.\tОснования разработки")
self.assertEqual(lst["subparagraphs"][5]["text"], "1.6.\tНормативные документы")
self.assertEqual(lst["subparagraphs"][6]["text"], "1.7.\tСведения об источниках и порядке финансирования работ")
self.assertEqual(len(lst["subparagraphs"][5]["subparagraphs"][0]["subparagraphs"]), 12)

lst = content["subparagraphs"][5]
lst = content["subparagraphs"][6]
lst = lst["subparagraphs"][0]["subparagraphs"][0]
self.assertEqual(lst["text"], "4.1.1. Требования к структуре и функционированию")
self.assertEqual(lst["subparagraphs"][0]["text"].startswith("Система должна иметь базу хранения"), True)
Expand Down
Loading