From a3fde654f558f9542ce2cc1bf03cc6743891cbc3 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Tue, 16 Jun 2020 11:22:49 -0300 Subject: [PATCH 01/17] =?UTF-8?q?Divide=20self.=5Ffix=5Fstyles=20em=20duas?= =?UTF-8?q?=20fun=C3=A7=C3=B5es?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- src/scielo/bin/xml/prodtools/processing/sgmlxml.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py index 9c25054ea..cbc275540 100644 --- a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py +++ b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py @@ -252,14 +252,22 @@ def well_formed_xml_content(self): super().well_formed_xml_content() def _fix_styles(self): - # TODO: corrigir misplaced tags content = self._content + content = self._style_tags_upper_to_lower_case(content) + content = self._fix_mismatched_style_tags(content) + self._content = content + + def _fix_mismatched_style_tags(self, content): + # TODO: corrigir misplaced tags + return content + + def _style_tags_upper_to_lower_case(self, content): for style in ("BOLD", "ITALIC", "SUP", "SUB"): tag_open = "<{}>".format(style) tag_close = "".format(style) content = content.replace(tag_open, tag_open.lower()) content = content.replace(tag_close, tag_close.lower()) - self._content = content + return content def _fix_quotes(self): """ From cefb3e56dbd3bcbc568055b0ed390b0af3cf56e7 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Tue, 16 Jun 2020 11:47:15 -0300 Subject: [PATCH 02/17] =?UTF-8?q?Verifica=20se=20est=C3=A1=20mal=20formado?= =?UTF-8?q?=20antes=20de=20tentar=20consertar?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../bin/xml/prodtools/processing/sgmlxml.py | 33 ++++++++++++++----- 1 file changed, 25 insertions(+), 8 deletions(-) diff --git a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py index cbc275540..51dbcffe6 100644 --- a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py +++ b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py @@ -241,6 +241,11 @@ def __init__(self, src_pkgfiles, sgmlhtml): self._insert_xhtml_tables_in_document() logger.debug("...") + @property + def _is_well_formed(self): + xml, xml_error = xml_utils.load_xml(self._content) + return xml is not None + def well_formed_xml_content(self): self._content = xml_utils.insert_namespaces_in_root( "doc", self._content) @@ -252,28 +257,40 @@ def well_formed_xml_content(self): super().well_formed_xml_content() def _fix_styles(self): + self._style_tags_upper_to_lower_case() + self._fix_mismatched_style_tags() + + def _fix_mismatched_style_tags(self): + """ + No markup as tags de estilos são geradas pela marcação de estilos do + próprio Word, ou seja, bold, itálico, sup e sub não são tags do Markup. + No entanto, o resultado é a mescla de tags do Markup + tags de estilos + podendo acontecer de se mesclarem de forma que o resultado não seja + um XML não bem formado. O propósito deste atributo é consertar isso. + """ + if self._is_well_formed: + return content = self._content - content = self._style_tags_upper_to_lower_case(content) - content = self._fix_mismatched_style_tags(content) self._content = content - def _fix_mismatched_style_tags(self, content): - # TODO: corrigir misplaced tags - return content - - def _style_tags_upper_to_lower_case(self, content): + def _style_tags_upper_to_lower_case(self): + if self._is_well_formed: + return + content = self._content for style in ("BOLD", "ITALIC", "SUP", "SUB"): tag_open = "<{}>".format(style) tag_close = "".format(style) content = content.replace(tag_open, tag_open.lower()) content = content.replace(tag_close, tag_close.lower()) - return content + self._content = content def _fix_quotes(self): """ Às vezes a codificação das aspas vem diferente de ", sendo assim são necessários trocas antes de carregar a árvore de XML """ + if self._is_well_formed: + return content = self._content content = content.replace("<", "FIXQUOTESBREK<") content = content.replace(">", ">FIXQUOTESBREK") From 0b6a0ef601de7d618272adfa913529a8e00bea00 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Tue, 16 Jun 2020 16:02:26 -0300 Subject: [PATCH 03/17] =?UTF-8?q?Cria=20uma=20classe=20para=20tratar=20das?= =?UTF-8?q?=20tags=20de=20estilo=20Para=20primeiro=20passo,=20converte=20a?= =?UTF-8?q?s=20tags=20"xml"=20em=20tags=20"sgml"=20de=20forma=20que=20seja?= =?UTF-8?q?m=20ignoradas=20ao=20carregar=20o=20xml=20em=20formato=20de=20?= =?UTF-8?q?=C3=A1rvore=20Para=20segundo=20passo,=20verifica=20cada=20trech?= =?UTF-8?q?o=20node.text=20e=20tenta=20restaurar=20as=20tags=20de=20estilo?= =?UTF-8?q?,=20ou=20seja,=20convertendo=20de=20"sgml"=20para=20"xml"=20e?= =?UTF-8?q?=20verificando=20se=20o=20XML=20segue=20bem=20formado,=20caso?= =?UTF-8?q?=20contr=C3=A1rio=20n=C3=A3o=20restaura=20as=20tags=20dos=20tre?= =?UTF-8?q?chos.=20As=20tags=20que=20causam=20um=20XML=20mal=20formado=20f?= =?UTF-8?q?icam=20como=20"sgml"=20para=20nos=20pr=C3=B3ximos=20passos=20se?= =?UTF-8?q?rem=20tratadas.=20Cria=20testes=20para=20a=20classe?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../bin/xml/prodtools/processing/sgmlxml.py | 79 ++++++++++++++++++- src/scielo/bin/xml/tests/test_sgmlxml.py | 48 +++++++++++ 2 files changed, 125 insertions(+), 2 deletions(-) create mode 100644 src/scielo/bin/xml/tests/test_sgmlxml.py diff --git a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py index 51dbcffe6..ba4e3ff8d 100644 --- a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py +++ b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py @@ -230,6 +230,7 @@ class SGMLXMLContentEnhancer(xml_utils.SuitableXML): def __init__(self, src_pkgfiles, sgmlhtml): self.sgmlhtml = sgmlhtml self.src_pkgfiles = src_pkgfiles + self.style_tags_fixer = StyleTagsFixer() super().__init__(src_pkgfiles.filename) if self.xml_error: raise Exception(self.xml_error) @@ -270,12 +271,13 @@ def _fix_mismatched_style_tags(self): """ if self._is_well_formed: return - content = self._content - self._content = content + logger.debug("_fix_mismatched_style_tags") + self._content = self.style_tags_fixer.fix(self._content) def _style_tags_upper_to_lower_case(self): if self._is_well_formed: return + logger.debug("_style_tags_upper_to_lower_case") content = self._content for style in ("BOLD", "ITALIC", "SUP", "SUB"): tag_open = "<{}>".format(style) @@ -291,6 +293,7 @@ def _fix_quotes(self): """ if self._is_well_formed: return + logger.debug("_fix_quotes") content = self._content content = content.replace("<", "FIXQUOTESBREK<") content = content.replace(">", ">FIXQUOTESBREK") @@ -427,6 +430,78 @@ def _find_href_file_in_folder(self, elem_name, elem_id, return (new_href, no_parents_img_counter) +class StyleTagsFixer(object): + + def __init__(self): + self.TAGS = [] + for style in ("bold", "italic", "sup", "sub"): + tag_open = "<{}>".format(style) + tag_close = "".format(style) + new_open = tag_open.replace("<", "[").replace(">", "]") + new_close = tag_close.replace("<", "[").replace(">", "]") + self.TAGS.append((tag_open, new_open)) + self.TAGS.append((tag_close, new_close)) + + def fix(self, content): + original = content + content = self._disguise_style_tags(content) + + xml, xml_error = xml_utils.load_xml(content) + if xml is None: + # XML já está mal formado. As tags de estilo não são a causa. + return original + + content = self._restore_matched_style_tags_in_node_texts(xml) + return content + + def _disguise_style_tags(self, content): + """ + Disfarça as tags de estilo, mudando `` para `[tag]` + """ + for tag, new_tag in self.TAGS: + content = content.replace(tag, new_tag) + return content + + def _revert_disguised_style_tags(self, content): + """ + Reverte o disfarce das tags de estilo, mudando `[tag]` para `` + """ + for tag, new_tag in self.TAGS: + content = content.replace(new_tag, tag) + return content + + def _restore_matched_style_tags_in_node_texts(self, xml): + for node in xml.findall(".//*"): + if node.text and "[" in node.text and "]" in node.text: + new_node = self._restore_matched_style_tags_in_node_text(node) + if new_node is not None: + parent = node.getparent() + parent.replace(node, new_node) + return xml_utils.tostring(xml) + + def _restore_matched_style_tags_in_node_tails(self, xml): + for node in xml.findall(".//*"): + if node.tail and "[" in node.tail and "]" in node.tail: + self._restore_matched_style_tags_in_node_tail(node) + return xml_utils.tostring(xml) + + def _restore_matched_style_tags_in_node_text(self, node): + text = node.text + text = self._revert_disguised_style_tags(text) + root = "<{}>{}".format(node.tag, text, node.tag) + xml, xml_error = xml_utils.load_xml(root) + if xml is not None: + return xml.find(".").getchildren()[0] + else: + # TODO: usar alguma estratégia para corrigir + print("\n"*10) + print(text) + + def _restore_matched_style_tags_in_node_tail(self, node): + # TODO: usar alguma estratégia para corrigir + pass + + class PackageNamer(object): def __init__(self, src_pkgfiles, acron, dest_path): diff --git a/src/scielo/bin/xml/tests/test_sgmlxml.py b/src/scielo/bin/xml/tests/test_sgmlxml.py new file mode 100644 index 000000000..5c82dd9d3 --- /dev/null +++ b/src/scielo/bin/xml/tests/test_sgmlxml.py @@ -0,0 +1,48 @@ +# coding=utf-8 +import sys +from unittest import TestCase + +from prodtools.processing import sgmlxml +from prodtools.utils import xml_utils + + +python_version = sys.version_info.major + + +class TestStyleTagsFixer(TestCase): + + def setUp(self): + self.style_tags_fixer = sgmlxml.StyleTagsFixer() + + def test_restore_matched_style_tags_in_node_text_returns_restored_style_tags(self): + text = """

texto 1 [sup][bold]sup bold[/bold][/sup] texto 2

""" + xml = xml_utils.etree.fromstring(text) + node = xml.find(".//p") + new_node = self.style_tags_fixer._restore_matched_style_tags_in_node_text(node) + expected = """

texto 1 sup bold texto 2

""" + self.assertEqual( + expected, xml_utils.tostring(new_node)) + + def test_restore_matched_style_tags_in_node_text_does_not_restore_because_they_are_mismatched(self): + text = """

texto 1 [sup][bold]sup bold[/sup][/bold] texto 2

""" + xml = xml_utils.etree.fromstring(text) + node = xml.find(".//p") + new_node = self.style_tags_fixer._restore_matched_style_tags_in_node_text(node) + self.assertIsNone(xml_utils.tostring(new_node)) + + def test_restore_matched_style_tags_in_node_texts_returns_restored_style_tags(self): + text = """

texto 1 [sup][bold]sup bold[/bold][/sup] texto 2

""" + expected = """

texto 1 sup bold texto 2

""" + xml = xml_utils.etree.fromstring(text) + self.assertEqual( + expected, + self.style_tags_fixer._restore_matched_style_tags_in_node_texts(xml)) + + def test_restore_matched_style_tags_in_node_texts_does_not_restore_because_they_are_mismatched(self): + text = """

texto 1 [sup][bold]sup bold[/sup][/bold] texto 2

""" + expected = """

texto 1 [sup][bold]sup bold[/sup][/bold] texto 2

""" + xml = xml_utils.etree.fromstring(text) + self.assertEqual( + expected, + self.style_tags_fixer._restore_matched_style_tags_in_node_texts(xml)) + From 06edfe45b3dcec67a42a86173aaac545c5bcc876 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Tue, 16 Jun 2020 17:17:34 -0300 Subject: [PATCH 04/17] Cria atributo que restaura as tags de estilo de node.tail --- .../bin/xml/prodtools/processing/sgmlxml.py | 30 ++++++++++++++-- src/scielo/bin/xml/tests/test_sgmlxml.py | 34 +++++++++++++++++++ 2 files changed, 61 insertions(+), 3 deletions(-) diff --git a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py index ba4e3ff8d..6c94fcb32 100644 --- a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py +++ b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py @@ -451,6 +451,7 @@ def fix(self, content): # XML já está mal formado. As tags de estilo não são a causa. return original + content = self._restore_matched_style_tags_in_node_tails(xml) content = self._restore_matched_style_tags_in_node_texts(xml) return content @@ -471,6 +472,9 @@ def _revert_disguised_style_tags(self, content): return content def _restore_matched_style_tags_in_node_texts(self, xml): + """ + Restaura as tags de estilo de todos os node.text + """ for node in xml.findall(".//*"): if node.text and "[" in node.text and "]" in node.text: new_node = self._restore_matched_style_tags_in_node_text(node) @@ -480,26 +484,46 @@ def _restore_matched_style_tags_in_node_texts(self, xml): return xml_utils.tostring(xml) def _restore_matched_style_tags_in_node_tails(self, xml): + """ + Restaura as tags de estilo de todos os node.tail + """ for node in xml.findall(".//*"): if node.tail and "[" in node.tail and "]" in node.tail: self._restore_matched_style_tags_in_node_tail(node) return xml_utils.tostring(xml) def _restore_matched_style_tags_in_node_text(self, node): + """ + Restaura as tags de estilo de um node.text + """ text = node.text text = self._revert_disguised_style_tags(text) root = "<{}>{}".format(node.tag, text, node.tag) xml, xml_error = xml_utils.load_xml(root) if xml is not None: - return xml.find(".").getchildren()[0] + return deepcopy(xml.find(".").getchildren()[0]) else: # TODO: usar alguma estratégia para corrigir print("\n"*10) print(text) def _restore_matched_style_tags_in_node_tail(self, node): - # TODO: usar alguma estratégia para corrigir - pass + """ + Restaura as tags de estilo de um node.tail + """ + tail = node.tail + tail = self._revert_disguised_style_tags(tail) + root = "{}".format(tail) + xml, xml_error = xml_utils.load_xml(root) + if xml is not None: + node.tail = "" + for n in xml.find(".").getchildren(): + node.addnext(deepcopy(n)) + node.tail = xml.find(".").text + else: + # TODO: usar alguma estratégia para corrigir + print("\n"*10) + print(tail) class PackageNamer(object): diff --git a/src/scielo/bin/xml/tests/test_sgmlxml.py b/src/scielo/bin/xml/tests/test_sgmlxml.py index 5c82dd9d3..9d21e67a8 100644 --- a/src/scielo/bin/xml/tests/test_sgmlxml.py +++ b/src/scielo/bin/xml/tests/test_sgmlxml.py @@ -46,3 +46,37 @@ def test_restore_matched_style_tags_in_node_texts_does_not_restore_because_they_ expected, self.style_tags_fixer._restore_matched_style_tags_in_node_texts(xml)) + def test_restore_matched_style_tags_in_node_tail_returns_restored_style_tags(self): + text = """

texto 1 [sup][bold]sup bold[/bold][/sup] texto 2""" + xml = xml_utils.etree.fromstring(text) + node = xml.find(".//p") + self.style_tags_fixer._restore_matched_style_tags_in_node_tail(node) + self.assertEqual(node.tail, "texto 1 ") + self.assertEqual( + xml_utils.tostring(node.getnext()), + "sup bold") + self.assertEqual(node.getnext().tail, " texto 2") + + def test_restore_matched_style_tags_in_node_tail_does_not_restore_because_they_are_mismatched(self): + text = """

texto 1 [sup][bold]sup bold[/sup][/bold] texto 2""" + xml = xml_utils.etree.fromstring(text) + node = xml.find(".//p") + self.style_tags_fixer._restore_matched_style_tags_in_node_tail(node) + self.assertEqual( + node.tail, "texto 1 [sup][bold]sup bold[/sup][/bold] texto 2") + + def test_restore_matched_style_tags_in_node_tails_returns_restored_style_tags(self): + text = """

texto 1 [sup][bold]sup bold[/bold][/sup] texto 2""" + expected = """

texto 1 sup bold texto 2""" + xml = xml_utils.etree.fromstring(text) + self.assertEqual( + expected, + self.style_tags_fixer._restore_matched_style_tags_in_node_tails(xml)) + + def test_restore_matched_style_tags_in_node_tails_does_not_restore_because_they_are_mismatched(self): + text = """

texto 1 [sup][bold]sup bold[/sup][/bold] texto 2""" + expected = """

texto 1 [sup][bold]sup bold[/sup][/bold] texto 2""" + xml = xml_utils.etree.fromstring(text) + self.assertEqual( + expected, + self.style_tags_fixer._restore_matched_style_tags_in_node_tails(xml)) From 5b4abf48b80c55c4db2925ba171b592f48b9bf2a Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Tue, 16 Jun 2020 19:23:52 -0300 Subject: [PATCH 05/17] =?UTF-8?q?Tenta=20uma=20nova=20estrat=C3=A9gia,=20s?= =?UTF-8?q?e=20o=20par=C3=A2metro=20retry=20=C3=A9=20igual=20a=20True.=20T?= =?UTF-8?q?enta=20usar=20uma=20outra=20estrat=C3=A9gia?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../bin/xml/prodtools/processing/sgmlxml.py | 43 ++++++++++++------- src/scielo/bin/xml/tests/test_sgmlxml.py | 36 ++++++++++++++-- 2 files changed, 60 insertions(+), 19 deletions(-) diff --git a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py index 6c94fcb32..260c462ed 100644 --- a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py +++ b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py @@ -433,14 +433,14 @@ def _find_href_file_in_folder(self, elem_name, elem_id, class StyleTagsFixer(object): def __init__(self): - self.TAGS = [] + self.XML_TO_SGML = [] for style in ("bold", "italic", "sup", "sub"): tag_open = "<{}>".format(style) tag_close = "".format(style) new_open = tag_open.replace("<", "[").replace(">", "]") new_close = tag_close.replace("<", "[").replace(">", "]") - self.TAGS.append((tag_open, new_open)) - self.TAGS.append((tag_close, new_close)) + self.XML_TO_SGML.append((tag_open, new_open)) + self.XML_TO_SGML.append((tag_close, new_close)) def fix(self, content): original = content @@ -453,13 +453,14 @@ def fix(self, content): content = self._restore_matched_style_tags_in_node_tails(xml) content = self._restore_matched_style_tags_in_node_texts(xml) + return content def _disguise_style_tags(self, content): """ Disfarça as tags de estilo, mudando `` para `[tag]` """ - for tag, new_tag in self.TAGS: + for tag, new_tag in self.XML_TO_SGML: content = content.replace(tag, new_tag) return content @@ -467,7 +468,7 @@ def _revert_disguised_style_tags(self, content): """ Reverte o disfarce das tags de estilo, mudando `[tag]` para `` """ - for tag, new_tag in self.TAGS: + for tag, new_tag in self.XML_TO_SGML: content = content.replace(new_tag, tag) return content @@ -492,7 +493,7 @@ def _restore_matched_style_tags_in_node_tails(self, xml): self._restore_matched_style_tags_in_node_tail(node) return xml_utils.tostring(xml) - def _restore_matched_style_tags_in_node_text(self, node): + def _restore_matched_style_tags_in_node_text(self, node, retry=False): """ Restaura as tags de estilo de um node.text """ @@ -500,14 +501,12 @@ def _restore_matched_style_tags_in_node_text(self, node): text = self._revert_disguised_style_tags(text) root = "<{}>{}".format(node.tag, text, node.tag) xml, xml_error = xml_utils.load_xml(root) + if xml is None and retry: + xml = self.retry(root, text) if xml is not None: return deepcopy(xml.find(".").getchildren()[0]) - else: - # TODO: usar alguma estratégia para corrigir - print("\n"*10) - print(text) - def _restore_matched_style_tags_in_node_tail(self, node): + def _restore_matched_style_tags_in_node_tail(self, node, retry=False): """ Restaura as tags de estilo de um node.tail """ @@ -515,15 +514,29 @@ def _restore_matched_style_tags_in_node_tail(self, node): tail = self._revert_disguised_style_tags(tail) root = "{}".format(tail) xml, xml_error = xml_utils.load_xml(root) + if xml is None and retry: + xml = self.retry(root, tail) + if xml is not None: node.tail = "" for n in xml.find(".").getchildren(): node.addnext(deepcopy(n)) node.tail = xml.find(".").text - else: - # TODO: usar alguma estratégia para corrigir - print("\n"*10) - print(tail) + return + + def loss(self, xml, tail): + _xml = xml and "".join(xml.find(".").itertext()) + _tail = tail + for tag, sgml in self.XML_TO_SGML: + _tail = _tail.replace(tag, "") + return _xml != _tail + + def retry(self, root, text): + # tenta carregar o xml, usando o parâmetro "recover=True" + # para tentar resolver mismatched tags ou tags não fechadas + xml, xml_error = xml_utils.load_xml(root, recover=True) + if not self.loss(xml, text): + return xml class PackageNamer(object): diff --git a/src/scielo/bin/xml/tests/test_sgmlxml.py b/src/scielo/bin/xml/tests/test_sgmlxml.py index 9d21e67a8..eccce1711 100644 --- a/src/scielo/bin/xml/tests/test_sgmlxml.py +++ b/src/scielo/bin/xml/tests/test_sgmlxml.py @@ -36,7 +36,8 @@ def test_restore_matched_style_tags_in_node_texts_returns_restored_style_tags(se xml = xml_utils.etree.fromstring(text) self.assertEqual( expected, - self.style_tags_fixer._restore_matched_style_tags_in_node_texts(xml)) + self.style_tags_fixer._restore_matched_style_tags_in_node_texts( + xml)) def test_restore_matched_style_tags_in_node_texts_does_not_restore_because_they_are_mismatched(self): text = """

texto 1 [sup][bold]sup bold[/sup][/bold] texto 2

""" @@ -44,7 +45,8 @@ def test_restore_matched_style_tags_in_node_texts_does_not_restore_because_they_ xml = xml_utils.etree.fromstring(text) self.assertEqual( expected, - self.style_tags_fixer._restore_matched_style_tags_in_node_texts(xml)) + self.style_tags_fixer._restore_matched_style_tags_in_node_texts( + xml)) def test_restore_matched_style_tags_in_node_tail_returns_restored_style_tags(self): text = """

texto 1 [sup][bold]sup bold[/bold][/sup] texto 2""" @@ -65,13 +67,38 @@ def test_restore_matched_style_tags_in_node_tail_does_not_restore_because_they_a self.assertEqual( node.tail, "texto 1 [sup][bold]sup bold[/sup][/bold] texto 2") + def test_restore_matched_style_tags_in_node_tail_with_retry_true_restores_them_although_they_are_mismatched(self): + text = """

texto 1 [sup][bold]sup bold[/sup][/bold] texto 2""" + xml = xml_utils.etree.fromstring(text) + node = xml.find(".//p") + self.style_tags_fixer._restore_matched_style_tags_in_node_tail( + node, retry=True) + self.assertEqual(node.tail, "texto 1 ") + self.assertEqual( + xml_utils.tostring(node.getnext()), + "sup bold") + self.assertEqual(node.getnext().tail, " texto 2") + + def test_restore_matched_style_tags_in_node_tail_with_retry_true_restores_sup_although_it_is_not_closed(self): + text = """

texto 1 [sup]sup bold texto 2""" + xml = xml_utils.etree.fromstring(text) + node = xml.find(".//p") + self.style_tags_fixer._restore_matched_style_tags_in_node_tail( + node, retry=True) + self.assertEqual(node.tail, "texto 1 ") + self.assertEqual( + xml_utils.tostring(node.getnext()), + "sup bold texto 2") + self.assertEqual(node.getnext().tail, None or "") + def test_restore_matched_style_tags_in_node_tails_returns_restored_style_tags(self): text = """

texto 1 [sup][bold]sup bold[/bold][/sup] texto 2""" expected = """

texto 1 sup bold texto 2""" xml = xml_utils.etree.fromstring(text) self.assertEqual( expected, - self.style_tags_fixer._restore_matched_style_tags_in_node_tails(xml)) + self.style_tags_fixer._restore_matched_style_tags_in_node_tails( + xml)) def test_restore_matched_style_tags_in_node_tails_does_not_restore_because_they_are_mismatched(self): text = """

texto 1 [sup][bold]sup bold[/sup][/bold] texto 2""" @@ -79,4 +106,5 @@ def test_restore_matched_style_tags_in_node_tails_does_not_restore_because_they_ xml = xml_utils.etree.fromstring(text) self.assertEqual( expected, - self.style_tags_fixer._restore_matched_style_tags_in_node_tails(xml)) + self.style_tags_fixer._restore_matched_style_tags_in_node_tails( + xml)) From ec06cc1f841f0b21afee0c6c7c7fcd401fd8b734 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Wed, 17 Jun 2020 07:36:13 -0300 Subject: [PATCH 06/17] =?UTF-8?q?Cria=20atributos=20para=20inserir=20tags?= =?UTF-8?q?=20de=20"fecha"=20e=20"abre"=20que=20est=C3=A3o=20ausentes=20na?= =?UTF-8?q?s=20extremidades?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../bin/xml/prodtools/processing/sgmlxml.py | 107 +++++++++++++++--- src/scielo/bin/xml/tests/test_sgmlxml.py | 14 +++ 2 files changed, 103 insertions(+), 18 deletions(-) diff --git a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py index 260c462ed..32f6503a3 100644 --- a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py +++ b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py @@ -430,17 +430,27 @@ def _find_href_file_in_folder(self, elem_name, elem_id, return (new_href, no_parents_img_counter) +class StyleTag(object): + + def __init__(self, name): + self.name = name + self.xml_open = "<{}>".format(name) + self.sgml_open = "[{}]".format(name) + self.xml_close = "".format(name) + self.sgml_close = "[/{}]".format(name) + + class StyleTagsFixer(object): def __init__(self): self.XML_TO_SGML = [] + self.style_tags = {} for style in ("bold", "italic", "sup", "sub"): - tag_open = "<{}>".format(style) - tag_close = "".format(style) - new_open = tag_open.replace("<", "[").replace(">", "]") - new_close = tag_close.replace("<", "[").replace(">", "]") - self.XML_TO_SGML.append((tag_open, new_open)) - self.XML_TO_SGML.append((tag_close, new_close)) + self.style_tags[style] = StyleTag(style) + style_tag = self.style_tags[style] + self.XML_TO_SGML.append((style_tag.xml_open, style_tag.sgml_open)) + self.XML_TO_SGML.append( + (style_tag.xml_close, style_tag.sgml_close)) def fix(self, content): original = content @@ -502,7 +512,7 @@ def _restore_matched_style_tags_in_node_text(self, node, retry=False): root = "<{}>{}".format(node.tag, text, node.tag) xml, xml_error = xml_utils.load_xml(root) if xml is None and retry: - xml = self.retry(root, text) + xml = self._retry(text, root, node.tag) if xml is not None: return deepcopy(xml.find(".").getchildren()[0]) @@ -515,7 +525,7 @@ def _restore_matched_style_tags_in_node_tail(self, node, retry=False): root = "{}".format(tail) xml, xml_error = xml_utils.load_xml(root) if xml is None and retry: - xml = self.retry(root, tail) + xml = self._retry(tail, root) if xml is not None: node.tail = "" @@ -524,20 +534,81 @@ def _restore_matched_style_tags_in_node_tail(self, node, retry=False): node.tail = xml.find(".").text return - def loss(self, xml, tail): + def _loss(self, xml, text): + logger.debug("StyleTagsFixer._loss: %s", text) _xml = xml and "".join(xml.find(".").itertext()) - _tail = tail + _text = text + logger.debug("StyleTagsFixer._loss: _xml=%s", _xml) + logger.debug("StyleTagsFixer._loss: _text=%s", _text) for tag, sgml in self.XML_TO_SGML: - _tail = _tail.replace(tag, "") - return _xml != _tail - - def retry(self, root, text): - # tenta carregar o xml, usando o parâmetro "recover=True" - # para tentar resolver mismatched tags ou tags não fechadas - xml, xml_error = xml_utils.load_xml(root, recover=True) - if not self.loss(xml, text): + _text = _text.replace(tag, "") + return _xml != _text + + def _retry(self, text, wrapped_text, wrap_tag=None): + logger.debug("StyleTagsFixer._retry: %s", text) + # text = self._retry_inserting_tags_at_the_extremities(text) + xml = self._retry_loading_xml_with_recover_true(wrapped_text, text) + return xml + + def _retry_loading_xml_with_recover_true(self, wrapped_text, text): + """ + Tenta carregar o xml, usando o parâmetro "recover=True" + para tentar resolver mismatched tags ou tags não fechadas + """ + logger.debug( + "StyleTagsFixer._retry_loading_xml_with_recover_true: %s", text) + xml, xml_error = xml_utils.load_xml(wrapped_text, recover=True) + if not self._loss(xml, text): + logger.debug( + "StyleTagsFixer._retry_loading_xml_with_recover_true: %s", + xml_utils.tostring(xml)) return xml + def _retry_inserting_tags_at_the_extremities(self, text): + """ + Tenta resolver mismatched tags inserindo tag de abre no início e + tag de fecha no fim, se ausentes + """ + logger.debug("StyleTagsFixer.retry_checking_tags: %s", text) + text = self._disguise_style_tags(text) + found_tags = self._find_style_tags(text) + if len(found_tags) > 0: + text = self._insert_open_tag_at_the_start(found_tags[0], text) + text = self._insert_close_tag_at_the_end(found_tags[-1], text) + text = self._revert_disguised_style_tags(text) + return text + + def _insert_open_tag_at_the_start(self, first_tag, text): + """ + Se a primeira tag é "fecha", então insere tag "abre" no início + """ + if first_tag.startswith("[/"): + style_tag = self.style_tags.get(first_tag[2:-1]) + text = style_tag.sgml_open + text + return text + + def _insert_close_tag_at_the_end(self, last_tag, text): + """ + Se a última tag "abre", então insere tag "fecha" no fim + """ + if not last_tag.startswith("[/"): + style_tag = self.style_tags.get(last_tag[1:-1]) + text += style_tag.sgml_close + return text + + def _find_style_tags(self, text): + """ + Identifica as tags de estilo em text + """ + items = text.replace( + "[", "BREAKSTYLETAGS[").replace( + "]", "]BREAKSTYLETAGS").split("BREAKSTYLETAGS") + return [ + item + for item in items + if item.startswith("[") and item.endswith("]") + ] + class PackageNamer(object): diff --git a/src/scielo/bin/xml/tests/test_sgmlxml.py b/src/scielo/bin/xml/tests/test_sgmlxml.py index eccce1711..fde715bc3 100644 --- a/src/scielo/bin/xml/tests/test_sgmlxml.py +++ b/src/scielo/bin/xml/tests/test_sgmlxml.py @@ -108,3 +108,17 @@ def test_restore_matched_style_tags_in_node_tails_does_not_restore_because_they_ expected, self.style_tags_fixer._restore_matched_style_tags_in_node_tails( xml)) + + def test_retry_inserting_tags_at_the_extremities_insert_at_the_start(self): + text = """texto 1 [/sup] texto 2""" + expected = """texto 1 texto 2""" + result = self.style_tags_fixer._retry_inserting_tags_at_the_extremities( + text) + self.assertEqual(expected, result) + + def test_retry_inserting_tags_at_the_extremities_insert_at_the_end(self): + text = """texto 1 [sup] texto 2""" + expected = """texto 1 texto 2""" + result = self.style_tags_fixer._retry_inserting_tags_at_the_extremities( + text) + self.assertEqual(expected, result) From 69e4eba5b6503531aeb3320cb9f3563c69d1c160 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Wed, 17 Jun 2020 08:32:42 -0300 Subject: [PATCH 07/17] Cria atributos para envelopar node.text e node.tail --- .../bin/xml/prodtools/processing/sgmlxml.py | 88 ++++++++++--------- 1 file changed, 47 insertions(+), 41 deletions(-) diff --git a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py index 32f6503a3..13c95100c 100644 --- a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py +++ b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py @@ -509,98 +509,104 @@ def _restore_matched_style_tags_in_node_text(self, node, retry=False): """ text = node.text text = self._revert_disguised_style_tags(text) - root = "<{}>{}".format(node.tag, text, node.tag) - xml, xml_error = xml_utils.load_xml(root) + wrapped_node_text = self._wrapped_content(text, node.tag) + xml, xml_error = xml_utils.load_xml(wrapped_node_text) if xml is None and retry: - xml = self._retry(text, root, node.tag) + xml = self._retry(text, wrapped_node_text, node.tag) if xml is not None: return deepcopy(xml.find(".").getchildren()[0]) + def _wrapped_content(self, content, node_tag=None): + if not node_tag: + return "{}".format(content) + return "<{}>{}".format(node_tag, content, node_tag) + def _restore_matched_style_tags_in_node_tail(self, node, retry=False): """ Restaura as tags de estilo de um node.tail """ tail = node.tail tail = self._revert_disguised_style_tags(tail) - root = "{}".format(tail) - xml, xml_error = xml_utils.load_xml(root) + wrapped_node_tail = self._wrapped_content(tail) + xml, xml_error = xml_utils.load_xml(wrapped_node_tail) if xml is None and retry: - xml = self._retry(tail, root) + xml = self._retry(tail, wrapped_node_tail) if xml is not None: node.tail = "" for n in xml.find(".").getchildren(): node.addnext(deepcopy(n)) node.tail = xml.find(".").text - return - def _loss(self, xml, text): - logger.debug("StyleTagsFixer._loss: %s", text) + def _loss(self, xml, content): _xml = xml and "".join(xml.find(".").itertext()) - _text = text - logger.debug("StyleTagsFixer._loss: _xml=%s", _xml) - logger.debug("StyleTagsFixer._loss: _text=%s", _text) + _content = content for tag, sgml in self.XML_TO_SGML: - _text = _text.replace(tag, "") - return _xml != _text - - def _retry(self, text, wrapped_text, wrap_tag=None): - logger.debug("StyleTagsFixer._retry: %s", text) - # text = self._retry_inserting_tags_at_the_extremities(text) - xml = self._retry_loading_xml_with_recover_true(wrapped_text, text) + _content = _content.replace(tag, "") + logger.debug("StyleTagsFixer._loss: content=%s", content) + logger.debug("StyleTagsFixer._loss: _xml=%s", _xml) + logger.debug("StyleTagsFixer._loss: _content=%s", _content) + return _xml != _content + + def _retry(self, content, wrapped_content, node_tag=None): + logger.debug("StyleTagsFixer._retry: %s", content) + # content = self._retry_inserting_tags_at_the_extremities(content) + xml = self._retry_loading_xml_with_recover_true( + wrapped_content, content) return xml - def _retry_loading_xml_with_recover_true(self, wrapped_text, text): + def _retry_loading_xml_with_recover_true(self, wrapped_content, content): """ Tenta carregar o xml, usando o parâmetro "recover=True" para tentar resolver mismatched tags ou tags não fechadas + Esta estratégia não é excelente pois não é previsível """ logger.debug( - "StyleTagsFixer._retry_loading_xml_with_recover_true: %s", text) - xml, xml_error = xml_utils.load_xml(wrapped_text, recover=True) - if not self._loss(xml, text): + "StyleTagsFixer._retry_loading_xml_with_recover_true: %s", content) + xml, xml_error = xml_utils.load_xml(wrapped_content, recover=True) + if not self._loss(xml, content): logger.debug( "StyleTagsFixer._retry_loading_xml_with_recover_true: %s", xml_utils.tostring(xml)) return xml - def _retry_inserting_tags_at_the_extremities(self, text): + def _retry_inserting_tags_at_the_extremities(self, content): """ Tenta resolver mismatched tags inserindo tag de abre no início e tag de fecha no fim, se ausentes """ - logger.debug("StyleTagsFixer.retry_checking_tags: %s", text) - text = self._disguise_style_tags(text) - found_tags = self._find_style_tags(text) - if len(found_tags) > 0: - text = self._insert_open_tag_at_the_start(found_tags[0], text) - text = self._insert_close_tag_at_the_end(found_tags[-1], text) - text = self._revert_disguised_style_tags(text) - return text + logger.debug("StyleTagsFixer.retry_checking_tags: %s", content) + content = self._disguise_style_tags(content) + found = self._find_style_tags(content) + if len(found) > 0: + content = self._insert_open_tag_at_the_start(found[0], content) + content = self._insert_close_tag_at_the_end(found[-1], content) + content = self._revert_disguised_style_tags(content) + return content - def _insert_open_tag_at_the_start(self, first_tag, text): + def _insert_open_tag_at_the_start(self, first_tag, content): """ Se a primeira tag é "fecha", então insere tag "abre" no início """ if first_tag.startswith("[/"): style_tag = self.style_tags.get(first_tag[2:-1]) - text = style_tag.sgml_open + text - return text + content = style_tag.sgml_open + content + return content - def _insert_close_tag_at_the_end(self, last_tag, text): + def _insert_close_tag_at_the_end(self, last_tag, content): """ Se a última tag "abre", então insere tag "fecha" no fim """ if not last_tag.startswith("[/"): style_tag = self.style_tags.get(last_tag[1:-1]) - text += style_tag.sgml_close - return text + content += style_tag.sgml_close + return content - def _find_style_tags(self, text): + def _find_style_tags(self, content): """ - Identifica as tags de estilo em text + Identifica as tags de estilo em content """ - items = text.replace( + items = content.replace( "[", "BREAKSTYLETAGS[").replace( "]", "]BREAKSTYLETAGS").split("BREAKSTYLETAGS") return [ From 204cb369e98e98c650b06d7dffa73fcf5db169fa Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Wed, 17 Jun 2020 09:40:01 -0300 Subject: [PATCH 08/17] Faz uns ajustes --- src/scielo/bin/xml/prodtools/processing/sgmlxml.py | 10 ++++++---- src/scielo/bin/xml/tests/test_sgmlxml.py | 4 ++-- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py index 13c95100c..cf6dbdd4a 100644 --- a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py +++ b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py @@ -512,7 +512,7 @@ def _restore_matched_style_tags_in_node_text(self, node, retry=False): wrapped_node_text = self._wrapped_content(text, node.tag) xml, xml_error = xml_utils.load_xml(wrapped_node_text) if xml is None and retry: - xml = self._retry(text, wrapped_node_text, node.tag) + xml = self._retry(text, node.tag) if xml is not None: return deepcopy(xml.find(".").getchildren()[0]) @@ -530,7 +530,7 @@ def _restore_matched_style_tags_in_node_tail(self, node, retry=False): wrapped_node_tail = self._wrapped_content(tail) xml, xml_error = xml_utils.load_xml(wrapped_node_tail) if xml is None and retry: - xml = self._retry(tail, wrapped_node_tail) + xml = self._retry(tail) if xml is not None: node.tail = "" @@ -548,9 +548,11 @@ def _loss(self, xml, content): logger.debug("StyleTagsFixer._loss: _content=%s", _content) return _xml != _content - def _retry(self, content, wrapped_content, node_tag=None): + def _retry(self, content, node_tag=None): logger.debug("StyleTagsFixer._retry: %s", content) - # content = self._retry_inserting_tags_at_the_extremities(content) + content = self._retry_inserting_tags_at_the_extremities(content) + + wrapped_content = self._wrapped_content(content, node_tag) xml = self._retry_loading_xml_with_recover_true( wrapped_content, content) return xml diff --git a/src/scielo/bin/xml/tests/test_sgmlxml.py b/src/scielo/bin/xml/tests/test_sgmlxml.py index fde715bc3..365753747 100644 --- a/src/scielo/bin/xml/tests/test_sgmlxml.py +++ b/src/scielo/bin/xml/tests/test_sgmlxml.py @@ -110,14 +110,14 @@ def test_restore_matched_style_tags_in_node_tails_does_not_restore_because_they_ xml)) def test_retry_inserting_tags_at_the_extremities_insert_at_the_start(self): - text = """texto 1 [/sup] texto 2""" + text = """texto 1 texto 2""" expected = """texto 1 texto 2""" result = self.style_tags_fixer._retry_inserting_tags_at_the_extremities( text) self.assertEqual(expected, result) def test_retry_inserting_tags_at_the_extremities_insert_at_the_end(self): - text = """texto 1 [sup] texto 2""" + text = """texto 1 texto 2""" expected = """texto 1 texto 2""" result = self.style_tags_fixer._retry_inserting_tags_at_the_extremities( text) From 0deb7701e7f50ebfb87874f069e3cb1bda73144a Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Wed, 17 Jun 2020 12:59:58 -0300 Subject: [PATCH 09/17] =?UTF-8?q?Cria=20atributos=20para=20inserir=20tags?= =?UTF-8?q?=20de=20"fecha"=20e=20"abre"=20que=20est=C3=A3o=20ausentes=20na?= =?UTF-8?q?s=20extremidades=20at=C3=A9=20que=20n=C3=A3o=20exista=20necessi?= =?UTF-8?q?dade?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../bin/xml/prodtools/processing/sgmlxml.py | 26 +++++++++++++------ src/scielo/bin/xml/tests/test_sgmlxml.py | 14 ++++++++++ 2 files changed, 32 insertions(+), 8 deletions(-) diff --git a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py index cf6dbdd4a..345c6509d 100644 --- a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py +++ b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py @@ -551,11 +551,10 @@ def _loss(self, xml, content): def _retry(self, content, node_tag=None): logger.debug("StyleTagsFixer._retry: %s", content) content = self._retry_inserting_tags_at_the_extremities(content) - wrapped_content = self._wrapped_content(content, node_tag) - xml = self._retry_loading_xml_with_recover_true( + xml, xml_error = xml_utils.load_xml(wrapped_content) + return xml or self._retry_loading_xml_with_recover_true( wrapped_content, content) - return xml def _retry_loading_xml_with_recover_true(self, wrapped_content, content): """ @@ -579,10 +578,16 @@ def _retry_inserting_tags_at_the_extremities(self, content): """ logger.debug("StyleTagsFixer.retry_checking_tags: %s", content) content = self._disguise_style_tags(content) - found = self._find_style_tags(content) - if len(found) > 0: + while True: + found = self._find_style_tags(content) + if len(found) == 0: + break + old_content = content content = self._insert_open_tag_at_the_start(found[0], content) content = self._insert_close_tag_at_the_end(found[-1], content) + if old_content == content: + # acabaram as potenciais mudanças + break content = self._revert_disguised_style_tags(content) return content @@ -592,7 +597,8 @@ def _insert_open_tag_at_the_start(self, first_tag, content): """ if first_tag.startswith("[/"): style_tag = self.style_tags.get(first_tag[2:-1]) - content = style_tag.sgml_open + content + content = (style_tag.xml_open + + content.replace(first_tag, style_tag.xml_close, 1)) return content def _insert_close_tag_at_the_end(self, last_tag, content): @@ -601,20 +607,24 @@ def _insert_close_tag_at_the_end(self, last_tag, content): """ if not last_tag.startswith("[/"): style_tag = self.style_tags.get(last_tag[1:-1]) - content += style_tag.sgml_close + start = content[:content.find(last_tag)] + end = content[content.find(last_tag):].replace( + last_tag, style_tag.xml_open, 1) + content = start + end + style_tag.xml_close return content def _find_style_tags(self, content): """ Identifica as tags de estilo em content """ + sgml_tags = dict(self.XML_TO_SGML).values() items = content.replace( "[", "BREAKSTYLETAGS[").replace( "]", "]BREAKSTYLETAGS").split("BREAKSTYLETAGS") return [ item for item in items - if item.startswith("[") and item.endswith("]") + if item in sgml_tags ] diff --git a/src/scielo/bin/xml/tests/test_sgmlxml.py b/src/scielo/bin/xml/tests/test_sgmlxml.py index 365753747..fa770727f 100644 --- a/src/scielo/bin/xml/tests/test_sgmlxml.py +++ b/src/scielo/bin/xml/tests/test_sgmlxml.py @@ -122,3 +122,17 @@ def test_retry_inserting_tags_at_the_extremities_insert_at_the_end(self): result = self.style_tags_fixer._retry_inserting_tags_at_the_extremities( text) self.assertEqual(expected, result) + + def test_retry_inserting_tags_at_the_extremities_insert_at_the_start_repeatly(self): + text = """texto 1 texto 2""" + expected = """texto 1 texto 2""" + result = self.style_tags_fixer._retry_inserting_tags_at_the_extremities( + text) + self.assertEqual(expected, result) + + def test_retry_inserting_tags_at_the_extremities_insert_at_the_end_repeatly(self): + text = """texto 1 texto 2""" + expected = """texto 1 texto 2""" + result = self.style_tags_fixer._retry_inserting_tags_at_the_extremities( + text) + self.assertEqual(expected, result) From 37ad883e08742f8e6a408ef3b617ce8defee310a Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Wed, 17 Jun 2020 13:27:13 -0300 Subject: [PATCH 10/17] =?UTF-8?q?Distinguir=20entre=20as=20tags=20de=20est?= =?UTF-8?q?ilos=20j=C3=A1=20verificadas=20vs=20n=C3=A3o=20verificadas=20Tr?= =?UTF-8?q?ocar=20"retry"=20por=20"fix"?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../bin/xml/prodtools/processing/sgmlxml.py | 69 ++++++++++++------- src/scielo/bin/xml/tests/test_sgmlxml.py | 24 +++---- 2 files changed, 55 insertions(+), 38 deletions(-) diff --git a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py index 345c6509d..fe8b21122 100644 --- a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py +++ b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py @@ -435,8 +435,10 @@ class StyleTag(object): def __init__(self, name): self.name = name self.xml_open = "<{}>".format(name) - self.sgml_open = "[{}]".format(name) self.xml_close = "".format(name) + self.fixed_xml_open = "<{}>".format(name.upper()) + self.fixed_xml_close = "".format(name.upper()) + self.sgml_open = "[{}]".format(name) self.sgml_close = "[/{}]".format(name) @@ -464,6 +466,13 @@ def fix(self, content): content = self._restore_matched_style_tags_in_node_tails(xml) content = self._restore_matched_style_tags_in_node_texts(xml) + # testar o xml + xml, xml_error = xml_utils.load_xml(content) + if xml is None: + content = self._disguise_style_tags(content) + content = self._restore_matched_style_tags_in_node_tails(xml) + content = self._restore_matched_style_tags_in_node_texts(xml) + return content def _disguise_style_tags(self, content): @@ -482,13 +491,22 @@ def _revert_disguised_style_tags(self, content): content = content.replace(new_tag, tag) return content + def _mark_fixed_style_tags(self, content): + """ + Marca as tags de estilo convertidas sem erro + """ + for tag, new_tag in self.XML_TO_SGML: + content = content.replace(new_tag, tag.upper()) + return content + def _restore_matched_style_tags_in_node_texts(self, xml): """ Restaura as tags de estilo de todos os node.text """ for node in xml.findall(".//*"): if node.text and "[" in node.text and "]" in node.text: - new_node = self._restore_matched_style_tags_in_node_text(node) + new_node = self._restore_matched_style_tags_in_node_text( + node, True) if new_node is not None: parent = node.getparent() parent.replace(node, new_node) @@ -500,19 +518,18 @@ def _restore_matched_style_tags_in_node_tails(self, xml): """ for node in xml.findall(".//*"): if node.tail and "[" in node.tail and "]" in node.tail: - self._restore_matched_style_tags_in_node_tail(node) + self._restore_matched_style_tags_in_node_tail(node, True) return xml_utils.tostring(xml) - def _restore_matched_style_tags_in_node_text(self, node, retry=False): + def _restore_matched_style_tags_in_node_text(self, node, fix): """ Restaura as tags de estilo de um node.text """ - text = node.text - text = self._revert_disguised_style_tags(text) + text = self._mark_fixed_style_tags(node.text) wrapped_node_text = self._wrapped_content(text, node.tag) xml, xml_error = xml_utils.load_xml(wrapped_node_text) - if xml is None and retry: - xml = self._retry(text, node.tag) + if xml is None and fix: + xml = self._fix(node.text, node.tag) if xml is not None: return deepcopy(xml.find(".").getchildren()[0]) @@ -521,16 +538,15 @@ def _wrapped_content(self, content, node_tag=None): return "{}".format(content) return "<{}>{}".format(node_tag, content, node_tag) - def _restore_matched_style_tags_in_node_tail(self, node, retry=False): + def _restore_matched_style_tags_in_node_tail(self, node, fix): """ Restaura as tags de estilo de um node.tail """ - tail = node.tail - tail = self._revert_disguised_style_tags(tail) + tail = self._mark_fixed_style_tags(node.tail) wrapped_node_tail = self._wrapped_content(tail) xml, xml_error = xml_utils.load_xml(wrapped_node_tail) - if xml is None and retry: - xml = self._retry(tail) + if xml is None and fix: + xml = self._fix(node.tail) if xml is not None: node.tail = "" @@ -548,35 +564,35 @@ def _loss(self, xml, content): logger.debug("StyleTagsFixer._loss: _content=%s", _content) return _xml != _content - def _retry(self, content, node_tag=None): - logger.debug("StyleTagsFixer._retry: %s", content) - content = self._retry_inserting_tags_at_the_extremities(content) + def _fix(self, content, node_tag=None): + logger.debug("StyleTagsFixer._fix: %s", content) + content = self._fix_inserting_tags_at_the_extremities(content) wrapped_content = self._wrapped_content(content, node_tag) xml, xml_error = xml_utils.load_xml(wrapped_content) - return xml or self._retry_loading_xml_with_recover_true( + return xml or self._fix_loading_xml_with_recover_true( wrapped_content, content) - def _retry_loading_xml_with_recover_true(self, wrapped_content, content): + def _fix_loading_xml_with_recover_true(self, wrapped_content, content): """ Tenta carregar o xml, usando o parâmetro "recover=True" para tentar resolver mismatched tags ou tags não fechadas Esta estratégia não é excelente pois não é previsível """ logger.debug( - "StyleTagsFixer._retry_loading_xml_with_recover_true: %s", content) + "StyleTagsFixer._fix_loading_xml_with_recover_true: %s", content) xml, xml_error = xml_utils.load_xml(wrapped_content, recover=True) if not self._loss(xml, content): logger.debug( - "StyleTagsFixer._retry_loading_xml_with_recover_true: %s", + "StyleTagsFixer._fix_loading_xml_with_recover_true: %s", xml_utils.tostring(xml)) return xml - def _retry_inserting_tags_at_the_extremities(self, content): + def _fix_inserting_tags_at_the_extremities(self, content): """ Tenta resolver mismatched tags inserindo tag de abre no início e tag de fecha no fim, se ausentes """ - logger.debug("StyleTagsFixer.retry_checking_tags: %s", content) + logger.debug("StyleTagsFixer.fix_checking_tags: %s", content) content = self._disguise_style_tags(content) while True: found = self._find_style_tags(content) @@ -597,8 +613,9 @@ def _insert_open_tag_at_the_start(self, first_tag, content): """ if first_tag.startswith("[/"): style_tag = self.style_tags.get(first_tag[2:-1]) - content = (style_tag.xml_open + - content.replace(first_tag, style_tag.xml_close, 1)) + content = (style_tag.fixed_xml_open + + content.replace( + first_tag, style_tag.fixed_xml_close, 1)) return content def _insert_close_tag_at_the_end(self, last_tag, content): @@ -609,8 +626,8 @@ def _insert_close_tag_at_the_end(self, last_tag, content): style_tag = self.style_tags.get(last_tag[1:-1]) start = content[:content.find(last_tag)] end = content[content.find(last_tag):].replace( - last_tag, style_tag.xml_open, 1) - content = start + end + style_tag.xml_close + last_tag, style_tag.fixed_xml_open, 1) + content = start + end + style_tag.fixed_xml_close return content def _find_style_tags(self, content): diff --git a/src/scielo/bin/xml/tests/test_sgmlxml.py b/src/scielo/bin/xml/tests/test_sgmlxml.py index fa770727f..7c8fae3bf 100644 --- a/src/scielo/bin/xml/tests/test_sgmlxml.py +++ b/src/scielo/bin/xml/tests/test_sgmlxml.py @@ -67,24 +67,24 @@ def test_restore_matched_style_tags_in_node_tail_does_not_restore_because_they_a self.assertEqual( node.tail, "texto 1 [sup][bold]sup bold[/sup][/bold] texto 2") - def test_restore_matched_style_tags_in_node_tail_with_retry_true_restores_them_although_they_are_mismatched(self): + def test_restore_matched_style_tags_in_node_tail_with_fix_true_restores_them_although_they_are_mismatched(self): text = """

texto 1 [sup][bold]sup bold[/sup][/bold] texto 2""" xml = xml_utils.etree.fromstring(text) node = xml.find(".//p") self.style_tags_fixer._restore_matched_style_tags_in_node_tail( - node, retry=True) + node, fix=True) self.assertEqual(node.tail, "texto 1 ") self.assertEqual( xml_utils.tostring(node.getnext()), "sup bold") self.assertEqual(node.getnext().tail, " texto 2") - def test_restore_matched_style_tags_in_node_tail_with_retry_true_restores_sup_although_it_is_not_closed(self): + def test_restore_matched_style_tags_in_node_tail_with_fix_true_restores_sup_although_it_is_not_closed(self): text = """

texto 1 [sup]sup bold texto 2""" xml = xml_utils.etree.fromstring(text) node = xml.find(".//p") self.style_tags_fixer._restore_matched_style_tags_in_node_tail( - node, retry=True) + node, fix=True) self.assertEqual(node.tail, "texto 1 ") self.assertEqual( xml_utils.tostring(node.getnext()), @@ -109,30 +109,30 @@ def test_restore_matched_style_tags_in_node_tails_does_not_restore_because_they_ self.style_tags_fixer._restore_matched_style_tags_in_node_tails( xml)) - def test_retry_inserting_tags_at_the_extremities_insert_at_the_start(self): + def test_fix_inserting_tags_at_the_extremities_insert_at_the_start(self): text = """texto 1 texto 2""" expected = """texto 1 texto 2""" - result = self.style_tags_fixer._retry_inserting_tags_at_the_extremities( + result = self.style_tags_fixer._fix_inserting_tags_at_the_extremities( text) self.assertEqual(expected, result) - def test_retry_inserting_tags_at_the_extremities_insert_at_the_end(self): + def test_fix_inserting_tags_at_the_extremities_insert_at_the_end(self): text = """texto 1 texto 2""" expected = """texto 1 texto 2""" - result = self.style_tags_fixer._retry_inserting_tags_at_the_extremities( + result = self.style_tags_fixer._fix_inserting_tags_at_the_extremities( text) self.assertEqual(expected, result) - def test_retry_inserting_tags_at_the_extremities_insert_at_the_start_repeatly(self): + def test_fix_inserting_tags_at_the_extremities_insert_at_the_start_repeatly(self): text = """texto 1 texto 2""" expected = """texto 1 texto 2""" - result = self.style_tags_fixer._retry_inserting_tags_at_the_extremities( + result = self.style_tags_fixer._fix_inserting_tags_at_the_extremities( text) self.assertEqual(expected, result) - def test_retry_inserting_tags_at_the_extremities_insert_at_the_end_repeatly(self): + def test_fix_inserting_tags_at_the_extremities_insert_at_the_end_repeatly(self): text = """texto 1 texto 2""" expected = """texto 1 texto 2""" - result = self.style_tags_fixer._retry_inserting_tags_at_the_extremities( + result = self.style_tags_fixer._fix_inserting_tags_at_the_extremities( text) self.assertEqual(expected, result) From 444d4d08b39456c0b6861b17d319bf01abe179ad Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Wed, 17 Jun 2020 14:14:02 -0300 Subject: [PATCH 11/17] =?UTF-8?q?Cria=20atributos=20para=20atualizar=20nod?= =?UTF-8?q?e.text=20e=20node.tail=20com=20as=20corre=C3=A7=C3=B5es=20das?= =?UTF-8?q?=20tags=20de=20estilo?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../bin/xml/prodtools/processing/sgmlxml.py | 32 ++++++++++++++++--- 1 file changed, 28 insertions(+), 4 deletions(-) diff --git a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py index fe8b21122..c58c70838 100644 --- a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py +++ b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py @@ -533,6 +533,18 @@ def _restore_matched_style_tags_in_node_text(self, node, fix): if xml is not None: return deepcopy(xml.find(".").getchildren()[0]) + def _update_node_text(self, node, node_text): + """ + Atualiza node.text com o valor de node_text + """ + text = self._mark_fixed_style_tags(node.text) + wrapped_node_text = self._wrapped_content(text, node.tag) + xml, xml_error = xml_utils.load_xml(wrapped_node_text) + if xml is not None: + new_node = deepcopy(xml.find(".").getchildren()[0]) + parent = node.getparent() + parent.replace(node, new_node) + def _wrapped_content(self, content, node_tag=None): if not node_tag: return "{}".format(content) @@ -554,6 +566,18 @@ def _restore_matched_style_tags_in_node_tail(self, node, fix): node.addnext(deepcopy(n)) node.tail = xml.find(".").text + def _update_node_tail(self, node, new_tail): + """ + Atualiza node.tail com o valor new_tail + """ + wrapped_node_tail = self._wrapped_content(new_tail) + xml, xml_error = xml_utils.load_xml(wrapped_node_tail) + if xml is not None: + node.tail = "" + for n in xml.find(".").getchildren(): + node.addnext(deepcopy(n)) + node.tail = xml.find(".").text + def _loss(self, xml, content): _xml = xml and "".join(xml.find(".").itertext()) _content = content @@ -568,9 +592,10 @@ def _fix(self, content, node_tag=None): logger.debug("StyleTagsFixer._fix: %s", content) content = self._fix_inserting_tags_at_the_extremities(content) wrapped_content = self._wrapped_content(content, node_tag) - xml, xml_error = xml_utils.load_xml(wrapped_content) - return xml or self._fix_loading_xml_with_recover_true( + xml1, xml_error = xml_utils.load_xml(wrapped_content) + xml2 = self._fix_loading_xml_with_recover_true( wrapped_content, content) + return xml2 or xml1 def _fix_loading_xml_with_recover_true(self, wrapped_content, content): """ @@ -580,6 +605,7 @@ def _fix_loading_xml_with_recover_true(self, wrapped_content, content): """ logger.debug( "StyleTagsFixer._fix_loading_xml_with_recover_true: %s", content) + wrapped_content = self._revert_disguised_style_tags(wrapped_content) xml, xml_error = xml_utils.load_xml(wrapped_content, recover=True) if not self._loss(xml, content): logger.debug( @@ -593,7 +619,6 @@ def _fix_inserting_tags_at_the_extremities(self, content): tag de fecha no fim, se ausentes """ logger.debug("StyleTagsFixer.fix_checking_tags: %s", content) - content = self._disguise_style_tags(content) while True: found = self._find_style_tags(content) if len(found) == 0: @@ -604,7 +629,6 @@ def _fix_inserting_tags_at_the_extremities(self, content): if old_content == content: # acabaram as potenciais mudanças break - content = self._revert_disguised_style_tags(content) return content def _insert_open_tag_at_the_start(self, first_tag, content): From 54465a6cf00113d492538b103b8fb0eb6f5be3a8 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Wed, 17 Jun 2020 14:31:42 -0300 Subject: [PATCH 12/17] Usa self._update_node_text e self._update_node_tail --- .../bin/xml/prodtools/processing/sgmlxml.py | 40 ++++++++----------- 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py index c58c70838..fae5a39cf 100644 --- a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py +++ b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py @@ -505,11 +505,7 @@ def _restore_matched_style_tags_in_node_texts(self, xml): """ for node in xml.findall(".//*"): if node.text and "[" in node.text and "]" in node.text: - new_node = self._restore_matched_style_tags_in_node_text( - node, True) - if new_node is not None: - parent = node.getparent() - parent.replace(node, new_node) + self._restore_matched_style_tags_in_node_text(node, True) return xml_utils.tostring(xml) def _restore_matched_style_tags_in_node_tails(self, xml): @@ -526,24 +522,21 @@ def _restore_matched_style_tags_in_node_text(self, node, fix): Restaura as tags de estilo de um node.text """ text = self._mark_fixed_style_tags(node.text) - wrapped_node_text = self._wrapped_content(text, node.tag) - xml, xml_error = xml_utils.load_xml(wrapped_node_text) - if xml is None and fix: - xml = self._fix(node.text, node.tag) - if xml is not None: - return deepcopy(xml.find(".").getchildren()[0]) + updated = self._update_node_text(node, text) + if not updated and fix: + self._fix(node, node.tag) def _update_node_text(self, node, node_text): """ Atualiza node.text com o valor de node_text """ - text = self._mark_fixed_style_tags(node.text) - wrapped_node_text = self._wrapped_content(text, node.tag) + wrapped_node_text = self._wrapped_content(node_text, node.tag) xml, xml_error = xml_utils.load_xml(wrapped_node_text) if xml is not None: new_node = deepcopy(xml.find(".").getchildren()[0]) parent = node.getparent() parent.replace(node, new_node) + return True def _wrapped_content(self, content, node_tag=None): if not node_tag: @@ -555,16 +548,9 @@ def _restore_matched_style_tags_in_node_tail(self, node, fix): Restaura as tags de estilo de um node.tail """ tail = self._mark_fixed_style_tags(node.tail) - wrapped_node_tail = self._wrapped_content(tail) - xml, xml_error = xml_utils.load_xml(wrapped_node_tail) - if xml is None and fix: - xml = self._fix(node.tail) - - if xml is not None: - node.tail = "" - for n in xml.find(".").getchildren(): - node.addnext(deepcopy(n)) - node.tail = xml.find(".").text + updated = self._update_node_tail(node, tail) + if not updated and fix: + self._fix(node) def _update_node_tail(self, node, new_tail): """ @@ -577,6 +563,7 @@ def _update_node_tail(self, node, new_tail): for n in xml.find(".").getchildren(): node.addnext(deepcopy(n)) node.tail = xml.find(".").text + return True def _loss(self, xml, content): _xml = xml and "".join(xml.find(".").itertext()) @@ -588,9 +575,14 @@ def _loss(self, xml, content): logger.debug("StyleTagsFixer._loss: _content=%s", _content) return _xml != _content - def _fix(self, content, node_tag=None): + def _fix(self, node, node_tag=None): + if node_tag: + content = node.text + else: + content = node.tail logger.debug("StyleTagsFixer._fix: %s", content) content = self._fix_inserting_tags_at_the_extremities(content) + wrapped_content = self._wrapped_content(content, node_tag) xml1, xml_error = xml_utils.load_xml(wrapped_content) xml2 = self._fix_loading_xml_with_recover_true( From def48465f0478661b69ac534cae7406249297417 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Wed, 17 Jun 2020 14:37:29 -0300 Subject: [PATCH 13/17] Remover a chamada de `self._fix()` --- src/scielo/bin/xml/prodtools/processing/sgmlxml.py | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py index fae5a39cf..26b5db41c 100644 --- a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py +++ b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py @@ -505,7 +505,7 @@ def _restore_matched_style_tags_in_node_texts(self, xml): """ for node in xml.findall(".//*"): if node.text and "[" in node.text and "]" in node.text: - self._restore_matched_style_tags_in_node_text(node, True) + self._restore_matched_style_tags_in_node_text(node) return xml_utils.tostring(xml) def _restore_matched_style_tags_in_node_tails(self, xml): @@ -514,7 +514,7 @@ def _restore_matched_style_tags_in_node_tails(self, xml): """ for node in xml.findall(".//*"): if node.tail and "[" in node.tail and "]" in node.tail: - self._restore_matched_style_tags_in_node_tail(node, True) + self._restore_matched_style_tags_in_node_tail(node) return xml_utils.tostring(xml) def _restore_matched_style_tags_in_node_text(self, node, fix): @@ -522,9 +522,7 @@ def _restore_matched_style_tags_in_node_text(self, node, fix): Restaura as tags de estilo de um node.text """ text = self._mark_fixed_style_tags(node.text) - updated = self._update_node_text(node, text) - if not updated and fix: - self._fix(node, node.tag) + self._update_node_text(node, text) def _update_node_text(self, node, node_text): """ @@ -548,9 +546,7 @@ def _restore_matched_style_tags_in_node_tail(self, node, fix): Restaura as tags de estilo de um node.tail """ tail = self._mark_fixed_style_tags(node.tail) - updated = self._update_node_tail(node, tail) - if not updated and fix: - self._fix(node) + self._update_node_tail(node, tail) def _update_node_tail(self, node, new_tail): """ @@ -582,7 +578,7 @@ def _fix(self, node, node_tag=None): content = node.tail logger.debug("StyleTagsFixer._fix: %s", content) content = self._fix_inserting_tags_at_the_extremities(content) - + wrapped_content = self._wrapped_content(content, node_tag) xml1, xml_error = xml_utils.load_xml(wrapped_content) xml2 = self._fix_loading_xml_with_recover_true( From e2b30ebafb6c3770e5f347c52be74e3e245d5bc0 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Wed, 17 Jun 2020 14:46:53 -0300 Subject: [PATCH 14/17] =?UTF-8?q?Remove=20=5Frestore=5Fmatched=5Fstyle=5Ft?= =?UTF-8?q?ags=5Fin=5Fnode=5Ftext=20e=20=5Frestore=5Fmatched=5Fstyle=5Ftag?= =?UTF-8?q?s=5Fin=5Fnode=5Ftail=20e=20seus=20respectivos=20conte=C3=BAdo?= =?UTF-8?q?=20substituem=20as=20suas=20respectivas=20chamadas?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../bin/xml/prodtools/processing/sgmlxml.py | 20 ++++--------------- 1 file changed, 4 insertions(+), 16 deletions(-) diff --git a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py index 26b5db41c..9b703e50c 100644 --- a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py +++ b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py @@ -505,7 +505,8 @@ def _restore_matched_style_tags_in_node_texts(self, xml): """ for node in xml.findall(".//*"): if node.text and "[" in node.text and "]" in node.text: - self._restore_matched_style_tags_in_node_text(node) + text = self._mark_fixed_style_tags(node.text) + self._update_node_text(node, text) return xml_utils.tostring(xml) def _restore_matched_style_tags_in_node_tails(self, xml): @@ -514,16 +515,10 @@ def _restore_matched_style_tags_in_node_tails(self, xml): """ for node in xml.findall(".//*"): if node.tail and "[" in node.tail and "]" in node.tail: - self._restore_matched_style_tags_in_node_tail(node) + tail = self._mark_fixed_style_tags(node.tail) + self._update_node_tail(node, tail) return xml_utils.tostring(xml) - def _restore_matched_style_tags_in_node_text(self, node, fix): - """ - Restaura as tags de estilo de um node.text - """ - text = self._mark_fixed_style_tags(node.text) - self._update_node_text(node, text) - def _update_node_text(self, node, node_text): """ Atualiza node.text com o valor de node_text @@ -541,13 +536,6 @@ def _wrapped_content(self, content, node_tag=None): return "{}".format(content) return "<{}>{}".format(node_tag, content, node_tag) - def _restore_matched_style_tags_in_node_tail(self, node, fix): - """ - Restaura as tags de estilo de um node.tail - """ - tail = self._mark_fixed_style_tags(node.tail) - self._update_node_tail(node, tail) - def _update_node_tail(self, node, new_tail): """ Atualiza node.tail com o valor new_tail From 9ba3cb5896ba0267af35a7f58b90e7aa1c3abe62 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Wed, 17 Jun 2020 15:22:07 -0300 Subject: [PATCH 15/17] =?UTF-8?q?Remove=20trecho=20de=20`=5Fupdate=5Fnode?= =?UTF-8?q?=5Ftext`=20e=20de=20`=5Fupdate=5Fnode=5Ftail`=20para=20criar=20?= =?UTF-8?q?`=5Fcheck=5Fcontent`=20e=20faz=20outros=20ajustes=20em=20`=5Fup?= =?UTF-8?q?date=5Fnode=5Ftext`=20e=20`=5Fupdate=5Fnode=5Ftail`=20Elimina?= =?UTF-8?q?=20`=5Ffix()`=20e=20seu=20conte=C3=BAdo,=20ou=20seja,=20as=20ch?= =?UTF-8?q?amadas=20a=20`=5Ffix=5Finserting=5Ftags=5Fat=5Fthe=5Fextremitie?= =?UTF-8?q?s`=20e=20`=5Ffix=5Floading=5Fxml=5Fwith=5Frecover=5Ftrue`,=20pa?= =?UTF-8?q?ssam=20a=20ser=20feitas=20dentro=20de=20`=5Frestore=5Fmatched?= =?UTF-8?q?=5Fstyle=5Ftags=5Fin=5Fnode=5Ftails`=20e=20`=5Frestore=5Fmatche?= =?UTF-8?q?d=5Fstyle=5Ftags=5Fin=5Fnode=5Ftexts`?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../bin/xml/prodtools/processing/sgmlxml.py | 66 +++++++++++-------- 1 file changed, 39 insertions(+), 27 deletions(-) diff --git a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py index 9b703e50c..2c1724596 100644 --- a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py +++ b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py @@ -506,7 +506,19 @@ def _restore_matched_style_tags_in_node_texts(self, xml): for node in xml.findall(".//*"): if node.text and "[" in node.text and "]" in node.text: text = self._mark_fixed_style_tags(node.text) - self._update_node_text(node, text) + new_xml = self._check_content(node, text, node.tag) + self._update_node_text(node, new_xml) + + for node in xml.findall(".//*"): + if node.text and "[" in node.text and "]" in node.text: + text = self._fix_inserting_tags_at_the_extremities(node.text) + new_xml = self._check_content(node, text, node.tag) + self._update_node_text(node, new_xml) + + for node in xml.findall(".//*"): + if node.text and "[" in node.text and "]" in node.text: + new_xml = self._fix_loading_xml_with_recover_true(node.text) + self._update_node_text(node, new_xml) return xml_utils.tostring(xml) def _restore_matched_style_tags_in_node_tails(self, xml): @@ -516,15 +528,33 @@ def _restore_matched_style_tags_in_node_tails(self, xml): for node in xml.findall(".//*"): if node.tail and "[" in node.tail and "]" in node.tail: tail = self._mark_fixed_style_tags(node.tail) - self._update_node_tail(node, tail) + new_xml = self._check_content(node, tail) + self._update_node_tail(node, new_xml) + + for node in xml.findall(".//*"): + if node.tail and "[" in node.tail and "]" in node.tail: + tail = self._fix_inserting_tags_at_the_extremities(node.tail) + new_xml = self._check_content(node, tail) + self._update_node_tail(node, new_xml) + + for node in xml.findall(".//*"): + if node.tail and "[" in node.tail and "]" in node.tail: + new_xml = self._fix_loading_xml_with_recover_true(node.tail) + self._update_node_tail(node, new_xml) return xml_utils.tostring(xml) - def _update_node_text(self, node, node_text): + def _check_content(self, node, content, node_tag=None): + """ + Atualiza node.text com o valor de content + """ + wrapped_content = self._wrapped_content(content, node_tag) + xml, xml_error = xml_utils.load_xml(wrapped_content) + return xml + + def _update_node_text(self, node, xml): """ Atualiza node.text com o valor de node_text """ - wrapped_node_text = self._wrapped_content(node_text, node.tag) - xml, xml_error = xml_utils.load_xml(wrapped_node_text) if xml is not None: new_node = deepcopy(xml.find(".").getchildren()[0]) parent = node.getparent() @@ -536,12 +566,10 @@ def _wrapped_content(self, content, node_tag=None): return "{}".format(content) return "<{}>{}".format(node_tag, content, node_tag) - def _update_node_tail(self, node, new_tail): + def _update_node_tail(self, node, xml): """ Atualiza node.tail com o valor new_tail """ - wrapped_node_tail = self._wrapped_content(new_tail) - xml, xml_error = xml_utils.load_xml(wrapped_node_tail) if xml is not None: node.tail = "" for n in xml.find(".").getchildren(): @@ -559,21 +587,7 @@ def _loss(self, xml, content): logger.debug("StyleTagsFixer._loss: _content=%s", _content) return _xml != _content - def _fix(self, node, node_tag=None): - if node_tag: - content = node.text - else: - content = node.tail - logger.debug("StyleTagsFixer._fix: %s", content) - content = self._fix_inserting_tags_at_the_extremities(content) - - wrapped_content = self._wrapped_content(content, node_tag) - xml1, xml_error = xml_utils.load_xml(wrapped_content) - xml2 = self._fix_loading_xml_with_recover_true( - wrapped_content, content) - return xml2 or xml1 - - def _fix_loading_xml_with_recover_true(self, wrapped_content, content): + def _fix_loading_xml_with_recover_true(self, content, node_tag): """ Tenta carregar o xml, usando o parâmetro "recover=True" para tentar resolver mismatched tags ou tags não fechadas @@ -581,12 +595,10 @@ def _fix_loading_xml_with_recover_true(self, wrapped_content, content): """ logger.debug( "StyleTagsFixer._fix_loading_xml_with_recover_true: %s", content) - wrapped_content = self._revert_disguised_style_tags(wrapped_content) + content = self._mark_fixed_style_tags(content) + wrapped_content = self._wrapped_content(content, node_tag) xml, xml_error = xml_utils.load_xml(wrapped_content, recover=True) if not self._loss(xml, content): - logger.debug( - "StyleTagsFixer._fix_loading_xml_with_recover_true: %s", - xml_utils.tostring(xml)) return xml def _fix_inserting_tags_at_the_extremities(self, content): From fe2494e24faee9494c64c604457899c74c994a95 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Wed, 17 Jun 2020 15:46:44 -0300 Subject: [PATCH 16/17] =?UTF-8?q?Remove=20as=20tags=20de=20estilo=20que=20?= =?UTF-8?q?n=C3=A3o=20puderam=20ser=20correspondidas=20Desmarca=20as=20tag?= =?UTF-8?q?s=20de=20estilos=20corrigidas?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../bin/xml/prodtools/processing/sgmlxml.py | 43 +++++++++++-------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py index 2c1724596..bae28dea2 100644 --- a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py +++ b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py @@ -465,38 +465,39 @@ def fix(self, content): content = self._restore_matched_style_tags_in_node_tails(xml) content = self._restore_matched_style_tags_in_node_texts(xml) - - # testar o xml - xml, xml_error = xml_utils.load_xml(content) - if xml is None: - content = self._disguise_style_tags(content) - content = self._restore_matched_style_tags_in_node_tails(xml) - content = self._restore_matched_style_tags_in_node_texts(xml) - + content = self._unmark_fixed_style_tags(content) return content def _disguise_style_tags(self, content): """ Disfarça as tags de estilo, mudando `` para `[tag]` """ - for tag, new_tag in self.XML_TO_SGML: - content = content.replace(tag, new_tag) + for xml_tag, sgml_tag in self.XML_TO_SGML: + content = content.replace(xml_tag, sgml_tag) return content - def _revert_disguised_style_tags(self, content): + def _unmark_fixed_style_tags(self, content): """ Reverte o disfarce das tags de estilo, mudando `[tag]` para `` """ - for tag, new_tag in self.XML_TO_SGML: - content = content.replace(new_tag, tag) + for xml_tag, sgml_tag in self.XML_TO_SGML: + content = content.replace(xml_tag.upper(), xml_tag) + return content + + def _delete_sgml_style_tags(self, content): + """ + Apaga as tags de estilo sgml, mudando `[tag]` para `''` + """ + for xml_tag, sgml_tag in self.XML_TO_SGML: + content = content.replace(sgml_tag, "") return content def _mark_fixed_style_tags(self, content): """ Marca as tags de estilo convertidas sem erro """ - for tag, new_tag in self.XML_TO_SGML: - content = content.replace(new_tag, tag.upper()) + for xml_tag, sgml_tag in self.XML_TO_SGML: + content = content.replace(sgml_tag, xml_tag.upper()) return content def _restore_matched_style_tags_in_node_texts(self, xml): @@ -519,6 +520,10 @@ def _restore_matched_style_tags_in_node_texts(self, xml): if node.text and "[" in node.text and "]" in node.text: new_xml = self._fix_loading_xml_with_recover_true(node.text) self._update_node_text(node, new_xml) + + for node in xml.findall(".//*"): + if node.text and "[" in node.text and "]" in node.text: + node.text = self._delete_sgml_style_tags(node.text) return xml_utils.tostring(xml) def _restore_matched_style_tags_in_node_tails(self, xml): @@ -541,6 +546,10 @@ def _restore_matched_style_tags_in_node_tails(self, xml): if node.tail and "[" in node.tail and "]" in node.tail: new_xml = self._fix_loading_xml_with_recover_true(node.tail) self._update_node_tail(node, new_xml) + + for node in xml.findall(".//*"): + if node.tail and "[" in node.tail and "]" in node.tail: + node.tail = self._delete_sgml_style_tags(node.tail) return xml_utils.tostring(xml) def _check_content(self, node, content, node_tag=None): @@ -580,8 +589,8 @@ def _update_node_tail(self, node, xml): def _loss(self, xml, content): _xml = xml and "".join(xml.find(".").itertext()) _content = content - for tag, sgml in self.XML_TO_SGML: - _content = _content.replace(tag, "") + for xml_tag, sgml_tag in self.XML_TO_SGML: + _content = _content.replace(xml_tag, "") logger.debug("StyleTagsFixer._loss: content=%s", content) logger.debug("StyleTagsFixer._loss: _xml=%s", _xml) logger.debug("StyleTagsFixer._loss: _content=%s", _content) From 3024c1fba266abf30d08d2a2b37f944378a68425 Mon Sep 17 00:00:00 2001 From: Roberta Takenaka Date: Wed, 17 Jun 2020 18:17:40 -0300 Subject: [PATCH 17/17] =?UTF-8?q?Atualiza=20os=20testes=20que=20quebraram?= =?UTF-8?q?=20ap=C3=B3s=20remover=20alguns=20atributos?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../bin/xml/prodtools/processing/sgmlxml.py | 8 +- src/scielo/bin/xml/tests/test_sgmlxml.py | 121 ++++++------------ 2 files changed, 42 insertions(+), 87 deletions(-) diff --git a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py index bae28dea2..8cd362615 100644 --- a/src/scielo/bin/xml/prodtools/processing/sgmlxml.py +++ b/src/scielo/bin/xml/prodtools/processing/sgmlxml.py @@ -518,7 +518,8 @@ def _restore_matched_style_tags_in_node_texts(self, xml): for node in xml.findall(".//*"): if node.text and "[" in node.text and "]" in node.text: - new_xml = self._fix_loading_xml_with_recover_true(node.text) + new_xml = self._fix_loading_xml_with_recover_true( + node.text, node.tag) self._update_node_text(node, new_xml) for node in xml.findall(".//*"): @@ -544,7 +545,8 @@ def _restore_matched_style_tags_in_node_tails(self, xml): for node in xml.findall(".//*"): if node.tail and "[" in node.tail and "]" in node.tail: - new_xml = self._fix_loading_xml_with_recover_true(node.tail) + new_xml = self._fix_loading_xml_with_recover_true( + node.tail, None) self._update_node_tail(node, new_xml) for node in xml.findall(".//*"): @@ -590,7 +592,7 @@ def _loss(self, xml, content): _xml = xml and "".join(xml.find(".").itertext()) _content = content for xml_tag, sgml_tag in self.XML_TO_SGML: - _content = _content.replace(xml_tag, "") + _content = _content.replace(xml_tag.upper(), "") logger.debug("StyleTagsFixer._loss: content=%s", content) logger.debug("StyleTagsFixer._loss: _xml=%s", _xml) logger.debug("StyleTagsFixer._loss: _content=%s", _content) diff --git a/src/scielo/bin/xml/tests/test_sgmlxml.py b/src/scielo/bin/xml/tests/test_sgmlxml.py index 7c8fae3bf..96c95ed3c 100644 --- a/src/scielo/bin/xml/tests/test_sgmlxml.py +++ b/src/scielo/bin/xml/tests/test_sgmlxml.py @@ -14,125 +14,78 @@ class TestStyleTagsFixer(TestCase): def setUp(self): self.style_tags_fixer = sgmlxml.StyleTagsFixer() - def test_restore_matched_style_tags_in_node_text_returns_restored_style_tags(self): + def test_restore_matched_style_tags_in_node_texts_returns_style_tags_in_upper_case(self): text = """

texto 1 [sup][bold]sup bold[/bold][/sup] texto 2

""" - xml = xml_utils.etree.fromstring(text) - node = xml.find(".//p") - new_node = self.style_tags_fixer._restore_matched_style_tags_in_node_text(node) - expected = """

texto 1 sup bold texto 2

""" - self.assertEqual( - expected, xml_utils.tostring(new_node)) - - def test_restore_matched_style_tags_in_node_text_does_not_restore_because_they_are_mismatched(self): - text = """

texto 1 [sup][bold]sup bold[/sup][/bold] texto 2

""" - xml = xml_utils.etree.fromstring(text) - node = xml.find(".//p") - new_node = self.style_tags_fixer._restore_matched_style_tags_in_node_text(node) - self.assertIsNone(xml_utils.tostring(new_node)) - - def test_restore_matched_style_tags_in_node_texts_returns_restored_style_tags(self): - text = """

texto 1 [sup][bold]sup bold[/bold][/sup] texto 2

""" - expected = """

texto 1 sup bold texto 2

""" + expected = """

texto 1 sup bold texto 2

""" xml = xml_utils.etree.fromstring(text) self.assertEqual( expected, self.style_tags_fixer._restore_matched_style_tags_in_node_texts( xml)) - def test_restore_matched_style_tags_in_node_texts_does_not_restore_because_they_are_mismatched(self): + def test_restore_matched_style_tags_in_node_texts_fixes_mismatched_tags(self): text = """

texto 1 [sup][bold]sup bold[/sup][/bold] texto 2

""" - expected = """

texto 1 [sup][bold]sup bold[/sup][/bold] texto 2

""" xml = xml_utils.etree.fromstring(text) - self.assertEqual( - expected, - self.style_tags_fixer._restore_matched_style_tags_in_node_texts( - xml)) - - def test_restore_matched_style_tags_in_node_tail_returns_restored_style_tags(self): + result = self.style_tags_fixer._restore_matched_style_tags_in_node_texts( + xml) + self.assertIn("", result) + self.assertIn("", result) + self.assertIn("", result) + self.assertIn("", result) + + def test_fix_loading_xml_with_recover_true_fixes_mismatched_tags(self): + text = """texto 1 [sup][bold]sup bold[/sup][/bold] texto 2""" + xml = self.style_tags_fixer._fix_loading_xml_with_recover_true( + text, None) + result = xml_utils.tostring(xml) + self.assertIn("", result) + self.assertIn("", result) + self.assertIn("", result) + self.assertIn("", result) + + def test_restore_matched_style_tags_in_node_tails_returns_style_tags_in_upper_case(self): text = """

texto 1 [sup][bold]sup bold[/bold][/sup] texto 2""" - xml = xml_utils.etree.fromstring(text) - node = xml.find(".//p") - self.style_tags_fixer._restore_matched_style_tags_in_node_tail(node) - self.assertEqual(node.tail, "texto 1 ") - self.assertEqual( - xml_utils.tostring(node.getnext()), - "sup bold") - self.assertEqual(node.getnext().tail, " texto 2") - - def test_restore_matched_style_tags_in_node_tail_does_not_restore_because_they_are_mismatched(self): - text = """

texto 1 [sup][bold]sup bold[/sup][/bold] texto 2""" - xml = xml_utils.etree.fromstring(text) - node = xml.find(".//p") - self.style_tags_fixer._restore_matched_style_tags_in_node_tail(node) - self.assertEqual( - node.tail, "texto 1 [sup][bold]sup bold[/sup][/bold] texto 2") - - def test_restore_matched_style_tags_in_node_tail_with_fix_true_restores_them_although_they_are_mismatched(self): - text = """

texto 1 [sup][bold]sup bold[/sup][/bold] texto 2""" - xml = xml_utils.etree.fromstring(text) - node = xml.find(".//p") - self.style_tags_fixer._restore_matched_style_tags_in_node_tail( - node, fix=True) - self.assertEqual(node.tail, "texto 1 ") - self.assertEqual( - xml_utils.tostring(node.getnext()), - "sup bold") - self.assertEqual(node.getnext().tail, " texto 2") - - def test_restore_matched_style_tags_in_node_tail_with_fix_true_restores_sup_although_it_is_not_closed(self): - text = """

texto 1 [sup]sup bold texto 2""" - xml = xml_utils.etree.fromstring(text) - node = xml.find(".//p") - self.style_tags_fixer._restore_matched_style_tags_in_node_tail( - node, fix=True) - self.assertEqual(node.tail, "texto 1 ") - self.assertEqual( - xml_utils.tostring(node.getnext()), - "sup bold texto 2") - self.assertEqual(node.getnext().tail, None or "") - - def test_restore_matched_style_tags_in_node_tails_returns_restored_style_tags(self): - text = """

texto 1 [sup][bold]sup bold[/bold][/sup] texto 2""" - expected = """

texto 1 sup bold texto 2""" + expected = """

texto 1 sup bold texto 2""" xml = xml_utils.etree.fromstring(text) self.assertEqual( expected, self.style_tags_fixer._restore_matched_style_tags_in_node_tails( xml)) - def test_restore_matched_style_tags_in_node_tails_does_not_restore_because_they_are_mismatched(self): + def test_restore_matched_style_tags_in_node_tails_fixes_mismatched_tags(self): text = """

texto 1 [sup][bold]sup bold[/sup][/bold] texto 2""" - expected = """

texto 1 [sup][bold]sup bold[/sup][/bold] texto 2""" xml = xml_utils.etree.fromstring(text) - self.assertEqual( - expected, - self.style_tags_fixer._restore_matched_style_tags_in_node_tails( - xml)) + result = self.style_tags_fixer._restore_matched_style_tags_in_node_tails( + xml) + self.assertIn("", result) + self.assertIn("", result) + self.assertIn("", result) + self.assertIn("", result) def test_fix_inserting_tags_at_the_extremities_insert_at_the_start(self): - text = """texto 1 texto 2""" - expected = """texto 1 texto 2""" + text = """texto 1 [/sup] texto 2""" + expected = """texto 1 texto 2""" result = self.style_tags_fixer._fix_inserting_tags_at_the_extremities( text) self.assertEqual(expected, result) def test_fix_inserting_tags_at_the_extremities_insert_at_the_end(self): - text = """texto 1 texto 2""" - expected = """texto 1 texto 2""" + text = """texto 1 [sup] texto 2""" + expected = """texto 1 texto 2""" result = self.style_tags_fixer._fix_inserting_tags_at_the_extremities( text) self.assertEqual(expected, result) def test_fix_inserting_tags_at_the_extremities_insert_at_the_start_repeatly(self): - text = """texto 1 texto 2""" - expected = """texto 1 texto 2""" + text = """texto 1 [/sup][/bold][/italic][/sub] texto 2""" + expected = """texto 1 texto 2""" result = self.style_tags_fixer._fix_inserting_tags_at_the_extremities( text) self.assertEqual(expected, result) def test_fix_inserting_tags_at_the_extremities_insert_at_the_end_repeatly(self): - text = """texto 1 texto 2""" - expected = """texto 1 texto 2""" + text = """texto 1 [sub][italic][bold][sup] texto 2""" + expected = """texto 1 texto 2""" result = self.style_tags_fixer._fix_inserting_tags_at_the_extremities( text) self.assertEqual(expected, result)