Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
17 commits
Select commit Hold shift + click to select a range
a3fde65
Divide self._fix_styles em duas funções
robertatakenaka Jun 16, 2020
cefb3e5
Verifica se está mal formado antes de tentar consertar
robertatakenaka Jun 16, 2020
0b6a0ef
Cria uma classe para tratar das tags de estilo
robertatakenaka Jun 16, 2020
06edfe4
Cria atributo que restaura as tags de estilo de node.tail
robertatakenaka Jun 16, 2020
5b4abf4
Tenta uma nova estratégia, se o parâmetro retry é igual a True.
robertatakenaka Jun 16, 2020
ec06cc1
Cria atributos para inserir tags de "fecha" e "abre" que estão ausent…
robertatakenaka Jun 17, 2020
69e4eba
Cria atributos para envelopar node.text e node.tail
robertatakenaka Jun 17, 2020
204cb36
Faz uns ajustes
robertatakenaka Jun 17, 2020
0deb770
Cria atributos para inserir tags de "fecha" e "abre" que estão ausent…
robertatakenaka Jun 17, 2020
37ad883
Distinguir entre as tags de estilos já verificadas vs não verificadas
robertatakenaka Jun 17, 2020
444d4d0
Cria atributos para atualizar node.text e node.tail com as correções …
robertatakenaka Jun 17, 2020
54465a6
Usa self._update_node_text e self._update_node_tail
robertatakenaka Jun 17, 2020
def4846
Remover a chamada de `self._fix()`
robertatakenaka Jun 17, 2020
e2b30eb
Remove _restore_matched_style_tags_in_node_text e _restore_matched_st…
robertatakenaka Jun 17, 2020
9ba3cb5
Remove trecho de `_update_node_text` e de `_update_node_tail` para cr…
robertatakenaka Jun 17, 2020
fe2494e
Remove as tags de estilo que não puderam ser correspondidas
robertatakenaka Jun 17, 2020
3024c1f
Atualiza os testes que quebraram após remover alguns atributos
robertatakenaka Jun 17, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
268 changes: 267 additions & 1 deletion src/scielo/bin/xml/prodtools/processing/sgmlxml.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,6 +230,7 @@ class SGMLXMLContentEnhancer(xml_utils.SuitableXML):
def __init__(self, src_pkgfiles, sgmlhtml):
self.sgmlhtml = sgmlhtml
self.src_pkgfiles = src_pkgfiles
self.style_tags_fixer = StyleTagsFixer()
super().__init__(src_pkgfiles.filename)
if self.xml_error:
raise Exception(self.xml_error)
Expand All @@ -241,6 +242,11 @@ def __init__(self, src_pkgfiles, sgmlhtml):
self._insert_xhtml_tables_in_document()
logger.debug("...")

@property
def _is_well_formed(self):
    """
    Tell whether ``self._content`` can currently be loaded as XML.

    The content is handed to ``xml_utils.load_xml`` and the answer is
    ``True`` whenever a tree comes back (i.e. the first item of the
    returned pair is not ``None``).
    """
    # Only the parsed tree matters here; the error item is discarded.
    tree, _ = xml_utils.load_xml(self._content)
    return tree is not None

def well_formed_xml_content(self):
self._content = xml_utils.insert_namespaces_in_root(
"doc", self._content)
Expand All @@ -252,7 +258,26 @@ def well_formed_xml_content(self):
super().well_formed_xml_content()

def _fix_styles(self):
    """
    Run the style-tag repairs, in order: first normalise the case of the
    style tags, then fix the mismatched ones.
    """
    # TODO: fix misplaced tags
    steps = (
        self._style_tags_upper_to_lower_case,
        self._fix_mismatched_style_tags,
    )
    for step in steps:
        step()

def _fix_mismatched_style_tags(self):
    """
    Repair style tags that make the document ill-formed.

    Style tags (bold, italic, sup and sub) are not Markup tags: they are
    produced by Word's own style formatting. The resulting document mixes
    Markup tags with style tags, and they can interleave in a way that
    yields XML that is not well formed. This method delegates the repair to
    ``self.style_tags_fixer`` and stores the result in ``self._content``.

    Nothing is done when the content is already well formed.
    """
    if not self._is_well_formed:
        logger.debug("_fix_mismatched_style_tags")
        self._content = self.style_tags_fixer.fix(self._content)

def _style_tags_upper_to_lower_case(self):
if self._is_well_formed:
return
logger.debug("_style_tags_upper_to_lower_case")
content = self._content
for style in ("BOLD", "ITALIC", "SUP", "SUB"):
tag_open = "<{}>".format(style)
Expand All @@ -266,6 +291,9 @@ def _fix_quotes(self):
Às vezes a codificação das aspas vem diferente de ", sendo assim
são necessários trocas antes de carregar a árvore de XML
"""
if self._is_well_formed:
return
logger.debug("_fix_quotes")
content = self._content
content = content.replace("<", "FIXQUOTESBREK<")
content = content.replace(">", ">FIXQUOTESBREK")
Expand Down Expand Up @@ -402,6 +430,244 @@ def _find_href_file_in_folder(self, elem_name, elem_id,
return (new_href, no_parents_img_counter)


class StyleTag(object):
    """
    Every textual form of one style tag used while repairing a document.

    Attributes:
        name: tag name as given (e.g. ``"bold"``).
        xml_open / xml_close: the XML form, ``<name>`` / ``</name>``.
        fixed_xml_open / fixed_xml_close: the upper-case XML form,
            ``<NAME>`` / ``</NAME>``.
        sgml_open / sgml_close: the bracketed form, ``[name]`` / ``[/name]``.
    """

    def __init__(self, name):
        upper_name = name.upper()
        self.name = name
        # XML form
        self.xml_open = "<%s>" % name
        self.xml_close = "</%s>" % name
        # upper-case XML form
        self.fixed_xml_open = "<%s>" % upper_name
        self.fixed_xml_close = "</%s>" % upper_name
        # bracketed form
        self.sgml_open = "[%s]" % name
        self.sgml_close = "[/%s]" % name


class StyleTagsFixer(object):
    """
    Repair style tags (bold, italic, sup, sub) that make a document
    ill-formed.

    Strategy used by :meth:`fix`:

    1. "Disguise" every style tag, turning ``<tag>`` into ``[tag]``, so the
       style tags become plain text.
    2. Load the disguised document. If it still does not load, the style
       tags are not the cause and the original content is returned.
    3. For every text and tail that contains disguised tags, try to turn
       them back into elements (written in upper case, ``<TAG>``, to tell
       verified tags from unverified ones). Disguised tags that cannot be
       matched are removed.
    4. Put the verified tags back in lower case.
    """

    def __init__(self):
        # list of (xml form, bracketed form) pairs, open and close tags
        self.XML_TO_SGML = []
        # style name -> StyleTag
        self.style_tags = {}
        for style in ("bold", "italic", "sup", "sub"):
            style_tag = StyleTag(style)
            self.style_tags[style] = style_tag
            self.XML_TO_SGML.append((style_tag.xml_open, style_tag.sgml_open))
            self.XML_TO_SGML.append(
                (style_tag.xml_close, style_tag.sgml_close))

    def fix(self, content):
        """
        Return `content` with its mismatched style tags repaired.

        If the document does not load even with the style tags disguised,
        `content` is returned unchanged.
        """
        disguised = self._disguise_style_tags(content)
        xml, xml_error = xml_utils.load_xml(disguised)
        if xml is None:
            # The XML is ill-formed for some reason other than style tags.
            return content

        # Fixing a text can push disguised tags into the tail of a newly
        # created element (and vice versa), so repeat both passes while
        # they keep reducing the number of disguised tags. The count never
        # grows, therefore the loop always ends.
        remaining = None
        while True:
            self._restore_matched_style_tags_in_node_tails(xml)
            fixed = self._restore_matched_style_tags_in_node_texts(xml)
            count = len(self._find_style_tags(fixed))
            if count == 0 or count == remaining:
                break
            remaining = count
        return self._unmark_fixed_style_tags(fixed)

    def _disguise_style_tags(self, content):
        """
        Disguise the style tags, changing ``<tag>`` into ``[tag]``.
        """
        for xml_tag, sgml_tag in self.XML_TO_SGML:
            content = content.replace(xml_tag, sgml_tag)
        return content

    def _unmark_fixed_style_tags(self, content):
        """
        Put the verified style tags back in lower case, changing ``<TAG>``
        into ``<tag>``.
        """
        for xml_tag, _ in self.XML_TO_SGML:
            content = content.replace(xml_tag.upper(), xml_tag)
        return content

    def _delete_sgml_style_tags(self, content):
        """
        Delete the disguised style tags, changing ``[tag]`` into ``''``.
        """
        for _, sgml_tag in self.XML_TO_SGML:
            content = content.replace(sgml_tag, "")
        return content

    def _mark_fixed_style_tags(self, content):
        """
        Mark the disguised style tags as verified, changing ``[tag]`` into
        ``<TAG>``.
        """
        for xml_tag, sgml_tag in self.XML_TO_SGML:
            content = content.replace(sgml_tag, xml_tag.upper())
        return content

    @staticmethod
    def _escape_text(text):
        """
        Escape ``&``, ``<`` and ``>`` so that plain node text can be pasted
        into an XML string. Disguised tags (``[tag]``) are not affected.
        """
        return (
            text.replace("&", "&amp;")
            .replace("<", "&lt;")
            .replace(">", "&gt;")
        )

    def _has_sgml_style_tags(self, text):
        """
        Tell whether `text` (possibly ``None``) has any disguised style tag.
        """
        if not text:
            return False
        return any(sgml_tag in text for _, sgml_tag in self.XML_TO_SGML)

    def _nodes_with_sgml_style_tags(self, xml, attr_name):
        """
        List the descendant nodes whose `attr_name` (``"text"`` or
        ``"tail"``) contains disguised style tags.
        """
        return [
            node
            for node in xml.findall(".//*")
            if self._has_sgml_style_tags(getattr(node, attr_name))
        ]

    def _restore_matched_style_tags_in_node_texts(self, xml):
        """
        Restore the style tags found in every ``node.text`` and return the
        serialized document.

        Strategies are tried in sequence; each one only acts on the texts
        that still contain disguised tags:

        1. convert all the tags at once;
        2. insert the missing open/close tags at the extremities;
        3. reload with ``recover=True``;
        4. delete whatever could not be matched.
        """
        for node in self._nodes_with_sgml_style_tags(xml, "text"):
            text = self._mark_fixed_style_tags(self._escape_text(node.text))
            new_xml = self._check_content(node, text, node.tag)
            self._update_node_text(node, new_xml)

        for node in self._nodes_with_sgml_style_tags(xml, "text"):
            text = self._fix_inserting_tags_at_the_extremities(
                self._escape_text(node.text))
            new_xml = self._check_content(node, text, node.tag)
            self._update_node_text(node, new_xml)

        for node in self._nodes_with_sgml_style_tags(xml, "text"):
            new_xml = self._fix_loading_xml_with_recover_true(
                node.text, node.tag)
            self._update_node_text(node, new_xml)

        for node in self._nodes_with_sgml_style_tags(xml, "text"):
            node.text = self._delete_sgml_style_tags(node.text)
        return xml_utils.tostring(xml)

    def _restore_matched_style_tags_in_node_tails(self, xml):
        """
        Restore the style tags found in every ``node.tail`` and return the
        serialized document.

        The same sequence of strategies of
        ``_restore_matched_style_tags_in_node_texts`` is applied.
        """
        for node in self._nodes_with_sgml_style_tags(xml, "tail"):
            tail = self._mark_fixed_style_tags(self._escape_text(node.tail))
            new_xml = self._check_content(node, tail)
            self._update_node_tail(node, new_xml)

        for node in self._nodes_with_sgml_style_tags(xml, "tail"):
            tail = self._fix_inserting_tags_at_the_extremities(
                self._escape_text(node.tail))
            new_xml = self._check_content(node, tail)
            self._update_node_tail(node, new_xml)

        for node in self._nodes_with_sgml_style_tags(xml, "tail"):
            new_xml = self._fix_loading_xml_with_recover_true(
                node.tail, None)
            self._update_node_tail(node, new_xml)

        for node in self._nodes_with_sgml_style_tags(xml, "tail"):
            node.tail = self._delete_sgml_style_tags(node.tail)
        return xml_utils.tostring(xml)

    def _check_content(self, node, content, node_tag=None):
        """
        Wrap `content` (see ``_wrapped_content``) and try to load it.

        Returns what ``xml_utils.load_xml`` gives as parsed document, which
        is ``None`` when the wrapped content is not well formed.
        `node` is not used; it is kept for interface compatibility.
        """
        wrapped_content = self._wrapped_content(content, node_tag)
        xml, xml_error = xml_utils.load_xml(wrapped_content)
        return xml

    def _update_node_text(self, node, xml):
        """
        Replace ``node.text`` by the text and elements parsed in `xml`.

        `xml` is the parsed form of ``<root><tag>...</tag></root>``. Only the
        leading text of `node` is replaced: its attributes, its existing
        children and its tail are preserved.

        Returns ``True`` if `node` was updated, ``False`` otherwise.
        """
        if xml is None:
            return False
        root = xml.find(".")
        # Anything other than exactly one wrapper element means part of the
        # content ended up outside the wrapper; using it would lose text.
        if len(root) != 1 or root.text or root[0].tail:
            return False
        wrapper = root[0]
        node.text = wrapper.text
        # Insert at the start, before the children `node` already has.
        for position, child in enumerate(wrapper):
            node.insert(position, deepcopy(child))
        return True

    def _wrapped_content(self, content, node_tag=None):
        """
        Return `content` inside ``<root>``, and additionally inside
        ``<node_tag>`` when `node_tag` is given.
        """
        if not node_tag:
            return "<root>{}</root>".format(content)
        return "<root><{}>{}</{}></root>".format(node_tag, content, node_tag)

    def _update_node_tail(self, node, xml):
        """
        Replace ``node.tail`` by the text and elements parsed in `xml`.

        `xml` is the parsed form of ``<root>...</root>``. Its leading text
        becomes the new tail and its elements are inserted right after
        `node`, keeping their original order.

        Returns ``True`` if `node` was updated, ``False`` otherwise.
        """
        if xml is None:
            return False
        root = xml.find(".")
        parent = node.getparent()
        position = parent.index(node) + 1
        node.tail = root.text
        # Insert by index so that the siblings keep the parsed order.
        for offset, child in enumerate(root):
            parent.insert(position + offset, deepcopy(child))
        return True

    def _loss(self, xml, content):
        """
        Tell whether the text of `xml` differs from the text of `content`.

        `content` is expected to have its style tags in the verified form
        (``<TAG>``); they are removed before the comparison.
        ``True`` is also returned if `xml` is ``None``.
        """
        # `is not None` on purpose: the truth value of an element depends
        # on its number of children.
        _xml = None
        if xml is not None:
            _xml = "".join(xml.find(".").itertext())
        _content = content
        for xml_tag, _ in self.XML_TO_SGML:
            _content = _content.replace(xml_tag.upper(), "")
        logger.debug("StyleTagsFixer._loss: content=%s", content)
        logger.debug("StyleTagsFixer._loss: _xml=%s", _xml)
        logger.debug("StyleTagsFixer._loss: _content=%s", _content)
        return _xml != _content

    def _fix_loading_xml_with_recover_true(self, content, node_tag):
        """
        Try to load the content using ``recover=True`` to work around
        mismatched or unclosed tags.

        This strategy is not great because its outcome is not predictable,
        so the result is only accepted if no text was lost.

        Returns the parsed document, or ``None`` if text was lost.
        """
        logger.debug(
            "StyleTagsFixer._fix_loading_xml_with_recover_true: %s", content)
        # Unescaped version, to compare with the text read from the tree.
        marked = self._mark_fixed_style_tags(content)
        wrapped_content = self._wrapped_content(
            self._mark_fixed_style_tags(self._escape_text(content)),
            node_tag)
        xml, xml_error = xml_utils.load_xml(wrapped_content, recover=True)
        if self._loss(xml, marked):
            return None
        return xml

    def _fix_inserting_tags_at_the_extremities(self, content):
        """
        Try to fix mismatched tags by inserting an open tag at the start
        and a close tag at the end, where they are missing.

        Repeats until there are no more changes to make.
        """
        logger.debug("StyleTagsFixer.fix_checking_tags: %s", content)
        while True:
            found = self._find_style_tags(content)
            if not found:
                break
            old_content = content
            content = self._insert_open_tag_at_the_start(found[0], content)
            content = self._insert_close_tag_at_the_end(found[-1], content)
            if old_content == content:
                # no potential changes left
                break
        return content

    def _insert_open_tag_at_the_start(self, first_tag, content):
        """
        If the first tag is a close tag, insert the open tag at the start.

        Both tags are written in the verified form (``<TAG>``).
        """
        if first_tag.startswith("[/"):
            style_tag = self.style_tags[first_tag[2:-1]]
            content = (
                style_tag.fixed_xml_open +
                content.replace(first_tag, style_tag.fixed_xml_close, 1))
        return content

    def _insert_close_tag_at_the_end(self, last_tag, content):
        """
        If the last tag is an open tag, insert the close tag at the end.

        Both tags are written in the verified form (``<TAG>``).
        """
        if not last_tag.startswith("[/"):
            style_tag = self.style_tags[last_tag[1:-1]]
            # `last_tag` is the last tag of `content`, so it is its last
            # occurrence (not the first one) that has to be converted.
            before, _, after = content.rpartition(last_tag)
            content = (
                before + style_tag.fixed_xml_open + after +
                style_tag.fixed_xml_close)
        return content

    def _find_style_tags(self, content):
        """
        List the disguised style tags of `content`, in order of appearance.
        """
        sgml_tags = {sgml_tag for _, sgml_tag in self.XML_TO_SGML}
        items = content.replace(
            "[", "BREAKSTYLETAGS[").replace(
            "]", "]BREAKSTYLETAGS").split("BREAKSTYLETAGS")
        return [item for item in items if item in sgml_tags]


class PackageNamer(object):

def __init__(self, src_pkgfiles, acron, dest_path):
Expand Down
91 changes: 91 additions & 0 deletions src/scielo/bin/xml/tests/test_sgmlxml.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
# coding=utf-8
import sys
from unittest import TestCase

from prodtools.processing import sgmlxml
from prodtools.utils import xml_utils


python_version = sys.version_info.major


class TestStyleTagsFixer(TestCase):
    # Exercises sgmlxml.StyleTagsFixer: restoring "[tag]" marks found in
    # node texts/tails as upper-case elements and completing missing tags.

    def setUp(self):
        self.style_tags_fixer = sgmlxml.StyleTagsFixer()

    def _assert_sup_and_bold_restored(self, result):
        # Open and close tags of both styles must be present, in upper case.
        for tag in ("<SUP>", "</SUP>", "<BOLD>", "</BOLD>"):
            self.assertIn(tag, result)

    def test_restore_matched_style_tags_in_node_texts_returns_style_tags_in_upper_case(self):
        xml = xml_utils.etree.fromstring(
            "<root><p>texto 1 [sup][bold]sup bold[/bold][/sup] texto 2</p></root>")
        result = self.style_tags_fixer._restore_matched_style_tags_in_node_texts(
            xml)
        self.assertEqual(
            "<root><p>texto 1 <SUP><BOLD>sup bold</BOLD></SUP> texto 2</p></root>",
            result)

    def test_restore_matched_style_tags_in_node_texts_fixes_mismatched_tags(self):
        xml = xml_utils.etree.fromstring(
            "<root><p>texto 1 [sup][bold]sup bold[/sup][/bold] texto 2</p></root>")
        result = self.style_tags_fixer._restore_matched_style_tags_in_node_texts(
            xml)
        self._assert_sup_and_bold_restored(result)

    def test_fix_loading_xml_with_recover_true_fixes_mismatched_tags(self):
        xml = self.style_tags_fixer._fix_loading_xml_with_recover_true(
            "texto 1 [sup][bold]sup bold[/sup][/bold] texto 2", None)
        self._assert_sup_and_bold_restored(xml_utils.tostring(xml))

    def test_restore_matched_style_tags_in_node_tails_returns_style_tags_in_upper_case(self):
        xml = xml_utils.etree.fromstring(
            "<root><p/>texto 1 [sup][bold]sup bold[/bold][/sup] texto 2</root>")
        result = self.style_tags_fixer._restore_matched_style_tags_in_node_tails(
            xml)
        self.assertEqual(
            "<root><p/>texto 1 <SUP><BOLD>sup bold</BOLD></SUP> texto 2</root>",
            result)

    def test_restore_matched_style_tags_in_node_tails_fixes_mismatched_tags(self):
        xml = xml_utils.etree.fromstring(
            "<root><p/>texto 1 [sup][bold]sup bold[/sup][/bold] texto 2</root>")
        result = self.style_tags_fixer._restore_matched_style_tags_in_node_tails(
            xml)
        self._assert_sup_and_bold_restored(result)

    def test_fix_inserting_tags_at_the_extremities_insert_at_the_start(self):
        result = self.style_tags_fixer._fix_inserting_tags_at_the_extremities(
            "texto 1 [/sup] texto 2")
        self.assertEqual("<SUP>texto 1 </SUP> texto 2", result)

    def test_fix_inserting_tags_at_the_extremities_insert_at_the_end(self):
        result = self.style_tags_fixer._fix_inserting_tags_at_the_extremities(
            "texto 1 [sup] texto 2")
        self.assertEqual("texto 1 <SUP> texto 2</SUP>", result)

    def test_fix_inserting_tags_at_the_extremities_insert_at_the_start_repeatly(self):
        result = self.style_tags_fixer._fix_inserting_tags_at_the_extremities(
            "texto 1 [/sup][/bold][/italic][/sub] texto 2")
        self.assertEqual(
            "<SUB><ITALIC><BOLD><SUP>texto 1 </SUP></BOLD></ITALIC></SUB> texto 2",
            result)

    def test_fix_inserting_tags_at_the_extremities_insert_at_the_end_repeatly(self):
        result = self.style_tags_fixer._fix_inserting_tags_at_the_extremities(
            "texto 1 [sub][italic][bold][sup] texto 2")
        self.assertEqual(
            "texto 1 <SUB><ITALIC><BOLD><SUP> texto 2</SUP></BOLD></ITALIC></SUB>",
            result)