diff --git a/HISTORY.rst b/HISTORY.rst index ce2d8cf..cc4f794 100644 --- a/HISTORY.rst +++ b/HISTORY.rst @@ -3,6 +3,25 @@ Release History --------------- +1.1.5 (2026-04-17) +++++++++++++++++++ + +**Updates** + +- None + +**Fixes** + +- Fixes `#70 `_: malformed table overflow. | `dfop02 `_ +- Fixes `#73 `_: Error parsing styles with spaces. | `dfop02 `_ +- Fixes `#71 `_: Error applying color to table cells. | `vvalchev `_ +- Fixes `#76 `_: Invalid image width and height. | `dfop02 `_ + +**New Features** + +- None + + 1.1.4 (2026-02-27) ++++++++++++++++++ diff --git a/html4docx/constants.py b/html4docx/constants.py index 3ef9de5..de06b3a 100644 --- a/html4docx/constants.py +++ b/html4docx/constants.py @@ -161,3 +161,5 @@ def default_borders(): re.compile(r'page-break-after\s*:\s*always\s*(?:!important)?\s*(?:;|$)'), re.compile(r'break-after\s*:\s*page\s*(?:!important)?\s*(?:;|$)'), ) + +RGB_SPACES_REGEX = re.compile(r'(rgba?\()([^)]+)(\))', re.IGNORECASE) diff --git a/html4docx/h4d.py b/html4docx/h4d.py index 85b992a..6b1ea6a 100644 --- a/html4docx/h4d.py +++ b/html4docx/h4d.py @@ -2,30 +2,30 @@ import logging import os import re -from io import BytesIO +from functools import lru_cache from html.parser import HTMLParser -from typing import Dict, Any +from io import BytesIO +from typing import Any, Dict import docx from bs4 import BeautifulSoup from docx import Document -from docx.enum.text import WD_ALIGN_PARAGRAPH from docx.enum.table import WD_ALIGN_VERTICAL +from docx.enum.text import WD_ALIGN_PARAGRAPH from docx.oxml import OxmlElement from docx.oxml.ns import qn from docx.shared import RGBColor -from functools import lru_cache - -from html4docx import constants -from html4docx import utils +from html4docx import constants, utils from html4docx.metadata import Metadata + class HtmlToDocx(HTMLParser): """ - Class to convert HTML to Docx - source: https://docs.python.org/3/library/html.parser.html + Class to convert HTML to Docx + source: https://docs.python.org/3/library/html.parser.html """ + def __init__(self, style_map=None, tag_style_overrides=None, default_paragraph_style="Normal"): super().__init__() self.options = dict(constants.DEFAULT_OPTIONS) @@ -38,11 +38,11 @@ def __init__(self, style_map=None, tag_style_overrides=None, default_paragraph_s def set_initial_attrs(self, document=None): self.tags = { - 'span': [], - 'list': [], + "span": [], + "list": [], } self.doc = document if document else Document() - self.bs = self.options['fix-html'] # whether or not to clean with BeautifulSoup + self.bs = self.options["fix-html"] # whether or not to clean with BeautifulSoup self.paragraph = None self.run = None self.skip = False @@ -66,25 +66,25 @@ def set_initial_attrs(self, document=None): @property def metadata(self) -> Dict[str, Any]: - if not hasattr(self, '_metadata'): + if not hasattr(self, "_metadata"): self._metadata = Metadata(self.doc) return self._metadata @property def include_tables(self) -> bool: - return self.options.get('tables', True) + return self.options.get("tables", True) @property def include_images(self) -> bool: - return self.options.get('images', True) + return self.options.get("images", True) @property def include_styles(self) -> bool: - return self.options.get('styles', True) + return self.options.get("styles", True) @property def include_html_comments(self) -> bool: - return self.options.get('html-comments', False) + return self.options.get("html-comments", False) @property def include_stylemap(self) -> bool: @@ -98,11 +98,11 @@ def save(self, destination) -> None: """Save the document to a file path or BytesIO object.""" if isinstance(destination, str): destination, _ = os.path.splitext(destination) - self.doc.save(f'{destination}.docx') + self.doc.save(f"{destination}.docx") elif isinstance(destination, BytesIO): self.doc.save(destination) else: - raise TypeError('destination must be a str path or BytesIO object') + raise TypeError("destination must be a str path or BytesIO object") def copy_settings_from(self, other): """Copy settings from another instance of HtmlToDocx""" @@ -160,9 +160,7 @@ def apply_style_to_paragraph(self, paragraph, style_name): return True except KeyError: # Style doesn't exist in document - print( - f"Warning: Style '{style_name}' not found in document. Using default." - ) + print(f"Warning: Style '{style_name}' not found in document. Using default.") return False def apply_style_to_run(self, style_name): @@ -184,12 +182,8 @@ def apply_style_to_run(self, style_name): return False except ValueError as e: if "need type CHARACTER" in str(e): - print( - f"Warning: '{style_name}' is a paragraph style, not a character style." - ) - print( - "For inline elements like , please create a character style in Word." - ) + print(f"Warning: '{style_name}' is a paragraph style, not a character style.") + print("For inline elements like , please create a character style in Word.") return False def parse_inline_styles(self, style_string): @@ -271,9 +265,7 @@ def apply_inline_styles_to_run(self, styles_dict): # Apply font-family if "font-family" in styles_dict: - font_family = ( - styles_dict["font-family"].split(",")[0].strip().strip('"').strip("'") - ) + font_family = styles_dict["font-family"].split(",")[0].strip().strip('"').strip("'") self.run.font.name = font_family def get_cell_html(self, soup): @@ -282,14 +274,14 @@ def get_cell_html(self, soup): Cannot use find_all as it only finds element tags and does not find text which is not inside an element """ - return ' '.join([str(i) for i in soup.contents]) + return " ".join([str(i) for i in soup.contents]) def set_cell_background(self, cell, color): """Set the background color of a table cell.""" tc = cell._tc tcPr = tc.get_or_add_tcPr() - shd = OxmlElement('w:shd') - shd.set(qn('w:fill'), color.lstrip('#')) + shd = OxmlElement("w:shd") + shd.set(qn("w:fill"), color.lstrip("#")) tcPr.append(shd) def set_cell_borders(self, cell, styles): @@ -314,11 +306,11 @@ def set_cell_borders(self, cell, styles): border_styles = constants.BORDER_STYLES keywords = constants.BORDER_KEYWORDS border_sides = ("top", "right", "bottom", "left") - border_width_pattern = re.compile(r'^[0-9]*\.?[0-9]+(px|pt|cm|in|rem|em|%)$') + border_width_pattern = re.compile(r"^[0-9]*\.?[0-9]+(px|pt|cm|in|rem|em|%)$") def parse_border_style(value: str) -> str: """Parses border styles to match word standart""" - return constants.BORDER_STYLES[value] if value in constants.BORDER_STYLES.keys() else 'none' + return constants.BORDER_STYLES[value] if value in constants.BORDER_STYLES.keys() else "none" def check_unit_keywords(value: str) -> str: """Convert medium, thin, thick keywords to numeric values (px)""" @@ -332,23 +324,23 @@ def border_unit_converter(unit_value: str): unit_value = check_unit_keywords(unit_value) # Return default if no value or empty - if not unit_value or unit_value == '': + if not unit_value or unit_value == "": return default_size - unit = re.sub(r'[0-9\.]+', '', unit_value) - value = float(re.sub(r'[a-zA-Z\!\%]+', '', unit_value)) # Allow float values + unit = re.sub(r"[0-9\.]+", "", unit_value) + value = float(re.sub(r"[a-zA-Z\!\%]+", "", unit_value)) # Allow float values - if unit == 'px': + if unit == "px": result = value * 0.75 # 1 px = 0.75 pt - elif unit == 'cm': + elif unit == "cm": result = value * 28.35 # 1 cm = 28.35 pt - elif unit == 'in': + elif unit == "in": result = value * 72 # 1 inch = 72 pt - elif unit == 'pt': - result = value # default is pt - elif unit == 'rem' or unit == 'em': + elif unit == "pt": + result = value # default is pt + elif unit == "rem" or unit == "em": result = value * 12 # Assuming 1rem/em = 16px, converted to pt - elif unit == '%': + elif unit == "%": result = constants.MAX_INDENT * (value / 100) else: return None # Unsupported units return None @@ -360,10 +352,11 @@ def parse_border_value(value: str): Parses a border value like: '1px solid #000000', 'solid 1px red', or '#000000 medium dashed' in any order. """ - parts = value.split() + value = value.strip() + parts = utils.normalize_rgb_spaces(value).split() # Return all default if there is only 'none' or empty - if (len(parts) == 1 and parts[0] == 'none') or (not value or value.strip() == ''): + if (len(parts) == 1 and parts[0].lower() == "none") or (not value or value.strip() == ""): return default_size, default_style, default_color size = None @@ -452,9 +445,9 @@ def parse_border_value(value: str): borders[side].update({"size": size, "style": style, "color": color}) # Check if w:tcBorders exists, otherwise create it - tcBorders = tcPr.first_child_found_in('w:tcBorders') + tcBorders = tcPr.first_child_found_in("w:tcBorders") if tcBorders is None: - tcBorders = OxmlElement('w:tcBorders') + tcBorders = OxmlElement("w:tcBorders") tcPr.append(tcBorders) # Apply borders to the cell @@ -463,17 +456,17 @@ def parse_border_value(value: str): border = OxmlElement(f"w:{side}") border.set(qn("w:val"), border_info["style"]) # Set border style border.set(qn("w:sz"), str(border_info["size"] * 8)) # Word uses eighths of a point - border.set(qn("w:color"), border_info["color"].replace('#', '')) # Set border color + border.set(qn("w:color"), border_info["color"].replace("#", "")) # Set border color tcBorders.append(border) def add_bookmark(self, bookmark_name): """Adds a word bookmark to an existing paragraph""" - bookmark_start = OxmlElement('w:bookmarkStart') - bookmark_start.set(qn('w:id'), str(self.bookmark_id)) - bookmark_start.set(qn('w:name'), bookmark_name) + bookmark_start = OxmlElement("w:bookmarkStart") + bookmark_start.set(qn("w:id"), str(self.bookmark_id)) + bookmark_start.set(qn("w:name"), bookmark_name) - bookmark_end = OxmlElement('w:bookmarkEnd') - bookmark_end.set(qn('w:id'), str(self.bookmark_id)) + bookmark_end = OxmlElement("w:bookmarkEnd") + bookmark_end.set(qn("w:id"), str(self.bookmark_id)) if not self.paragraph: self.paragraph = self.doc.add_paragraph() @@ -492,18 +485,14 @@ def apply_styles_to_run(self, run, style, isCustom=False): return except ValueError as e: if "need type CHARACTER" in str(e): - print( - f"Warning: '{style}' is a paragraph style, not a character style." - ) - print( - "For inline elements like , please create a character style in Word." - ) - - if not style or not hasattr(run, 'font'): + print(f"Warning: '{style}' is a paragraph style, not a character style.") + print("For inline elements like , please create a character style in Word.") + + if not style or not hasattr(run, "font"): return # Find current paragraph and run position - if not hasattr(self, 'paragraph') or self.paragraph is None: + if not hasattr(self, "paragraph") or self.paragraph is None: return paragraph_id = id(self.paragraph) @@ -518,17 +507,17 @@ def apply_styles_to_run(self, run, style, isCustom=False): for style_name, style_value in style.items(): if style_name in constants.RUN_STYLES: - if style_name.startswith('background-color') and style_value in ('inherit', 'initial'): + if style_name.startswith("background-color") and style_value in ("inherit", "initial"): continue self.paragraph_span_styles[paragraph_id][run_index].add(style_name) - if style_name == 'text-decoration': + if style_name == "text-decoration": # If span sets text-decoration shorthand, it conflicts with all text-decoration-* properties - self.paragraph_span_styles[paragraph_id][run_index].add('text-decoration-line') - self.paragraph_span_styles[paragraph_id][run_index].add('text-decoration-style') - self.paragraph_span_styles[paragraph_id][run_index].add('text-decoration-color') - elif style_name.startswith('text-decoration-'): + self.paragraph_span_styles[paragraph_id][run_index].add("text-decoration-line") + self.paragraph_span_styles[paragraph_id][run_index].add("text-decoration-style") + self.paragraph_span_styles[paragraph_id][run_index].add("text-decoration-color") + elif style_name.startswith("text-decoration-"): pass for style_name, style_value in style.items(): @@ -536,7 +525,7 @@ def apply_styles_to_run(self, run, style, isCustom=False): continue elif style_name in constants.RUN_STYLES: handler = getattr(self, constants.RUN_STYLES[style_name]) - param_name = style_name.replace('-', '_') + param_name = style_name.replace("-", "_") handler(run=run, **{param_name: style_value}) else: logging.warning(f"Warning: Unrecognized style '{style_name}', will be skipped.") @@ -550,7 +539,7 @@ def apply_styles_to_paragraph(self, paragraph, style, isCustom=False): print(f"Warning: Style '{style}' not found in document. Using default.") return - if not style or not hasattr(paragraph, 'paragraph_format'): + if not style or not hasattr(paragraph, "paragraph_format"): return for style_name, style_value in style.items(): @@ -562,40 +551,35 @@ def apply_styles_to_paragraph(self, paragraph, style, isCustom=False): logging.warning(f"Warning: Unrecognized paragraph style '{style_name}', will be skipped.") continue - handler( - paragraph=paragraph, - style_name=style_name, - value=style_value, - all_styles=style - ) + handler(paragraph=paragraph, style_name=style_name, value=style_value, all_styles=style) def _apply_alignment_paragraph(self, **kwargs): - paragraph = kwargs['paragraph'] - value = kwargs['value'] + paragraph = kwargs["paragraph"] + value = kwargs["value"] align = utils.remove_important_from_style(value) - if 'center' in align: + if "center" in align: paragraph.alignment = WD_ALIGN_PARAGRAPH.CENTER - elif 'left' in align: + elif "left" in align: paragraph.alignment = WD_ALIGN_PARAGRAPH.LEFT - elif 'right' in align: + elif "right" in align: paragraph.alignment = WD_ALIGN_PARAGRAPH.RIGHT - elif 'justify' in align: + elif "justify" in align: paragraph.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY def _apply_line_height_paragraph(self, **kwargs): - paragraph = kwargs['paragraph'] - value = kwargs['value'] + paragraph = kwargs["paragraph"] + value = kwargs["value"] line_height = utils.remove_important_from_style(value) - if line_height in ('normal', 'inherit'): + if line_height in ("normal", "inherit"): paragraph.paragraph_format.line_spacing = None else: try: - if line_height.replace('.', '').replace('%', '').isdigit(): - multiplier = float(line_height[:-1]) / 100.0 if line_height.endswith('%') else float(line_height) + if line_height.replace(".", "").replace("%", "").isdigit(): + multiplier = float(line_height[:-1]) / 100.0 if line_height.endswith("%") else float(line_height) paragraph.paragraph_format.line_spacing = multiplier else: converted = utils.unit_converter(line_height, target_unit="pt") @@ -605,35 +589,35 @@ def _apply_line_height_paragraph(self, **kwargs): paragraph.paragraph_format.line_spacing = None def _apply_margins_paragraph(self, **kwargs): - paragraph = kwargs['paragraph'] - style_name = kwargs['style_name'] - all_styles = kwargs['all_styles'] + paragraph = kwargs["paragraph"] + style_name = kwargs["style_name"] + all_styles = kwargs["all_styles"] - margin_left = all_styles.get('margin-left') - margin_right = all_styles.get('margin-right') + margin_left = all_styles.get("margin-left") + margin_right = all_styles.get("margin-right") if margin_left and margin_right: - if 'auto' in margin_left and 'auto' in margin_right: + if "auto" in margin_left and "auto" in margin_right: paragraph.paragraph_format.alignment = WD_ALIGN_PARAGRAPH.CENTER return - if style_name == 'margin-left' and margin_left and 'auto' not in margin_left: + if style_name == "margin-left" and margin_left and "auto" not in margin_left: paragraph.paragraph_format.left_indent = utils.unit_converter(margin_left) - if style_name == 'margin-right' and margin_right and 'auto' not in margin_right: + if style_name == "margin-right" and margin_right and "auto" not in margin_right: paragraph.paragraph_format.right_indent = utils.unit_converter(margin_right) def _apply_text_indent_paragraph(self, **kwargs): - paragraph = kwargs['paragraph'] - value = kwargs['value'] + paragraph = kwargs["paragraph"] + value = kwargs["value"] indent_value = utils.remove_important_from_style(value) paragraph.paragraph_format.first_line_indent = utils.unit_converter(indent_value, target_unit="pt") def _apply_font_weight_paragraph(self, **kwargs): - paragraph = kwargs['paragraph'] - value = kwargs['value'] + paragraph = kwargs["paragraph"] + value = kwargs["value"] font_weight = utils.remove_important_from_style(value).lower() @@ -641,7 +625,7 @@ def _apply_font_weight_paragraph(self, **kwargs): paragraph_spans = self.paragraph_span_styles.get(paragraph_id, {}) for i, run in enumerate(paragraph.runs): - if i in paragraph_spans and 'font-weight' in paragraph_spans[i]: + if i in paragraph_spans and "font-weight" in paragraph_spans[i]: continue self._apply_font_weight_to_run( @@ -650,11 +634,11 @@ def _apply_font_weight_paragraph(self, **kwargs): ) def _apply_font_weight_to_run(self, **kwargs): - font_weight = kwargs['font_weight'] - run = kwargs['run'] - if font_weight in ('bold', 'bolder', '700', '800', '900'): + font_weight = kwargs["font_weight"] + run = kwargs["run"] + if font_weight in ("bold", "bolder", "700", "800", "900"): run.font.bold = True - elif font_weight in ('normal', 'lighter', '400', '300', '100'): + elif font_weight in ("normal", "lighter", "400", "300", "100"): run.font.bold = False # Note: Decide what to do for values between 400-700 elif font_weight.isdigit(): @@ -662,8 +646,8 @@ def _apply_font_weight_to_run(self, **kwargs): run.font.bold = weight >= 700 def _apply_font_style_paragraph(self, **kwargs): - paragraph = kwargs['paragraph'] - value = kwargs['value'] + paragraph = kwargs["paragraph"] + value = kwargs["value"] font_style = utils.remove_important_from_style(value).lower() @@ -671,7 +655,7 @@ def _apply_font_style_paragraph(self, **kwargs): paragraph_spans = self.paragraph_span_styles.get(paragraph_id, {}) for i, run in enumerate(paragraph.runs): - if i in paragraph_spans and 'font-style' in paragraph_spans[i]: + if i in paragraph_spans and "font-style" in paragraph_spans[i]: continue self._apply_font_style_to_run( @@ -680,17 +664,17 @@ def _apply_font_style_paragraph(self, **kwargs): ) def _apply_font_style_to_run(self, **kwargs): - font_style = kwargs['font_style'] - run = kwargs['run'] + font_style = kwargs["font_style"] + run = kwargs["run"] - if font_style in ('italic', 'oblique'): + if font_style in ("italic", "oblique"): run.font.italic = True - elif font_style == 'normal': + elif font_style == "normal": run.font.italic = False def _apply_font_size_paragraph(self, **kwargs): - paragraph = kwargs['paragraph'] - value = kwargs['value'] + paragraph = kwargs["paragraph"] + value = kwargs["value"] font_size = utils.remove_important_from_style(value).lower() @@ -701,7 +685,7 @@ def _apply_font_size_paragraph(self, **kwargs): paragraph_spans = self.paragraph_span_styles.get(paragraph_id, {}) for i, run in enumerate(paragraph.runs): - if i in paragraph_spans and 'font-size' in paragraph_spans[i]: + if i in paragraph_spans and "font-size" in paragraph_spans[i]: continue self._apply_font_size_to_run( @@ -710,14 +694,14 @@ def _apply_font_size_paragraph(self, **kwargs): ) def _apply_font_size_to_run(self, **kwargs): - run = kwargs['run'] - font_size = kwargs['font_size'] + run = kwargs["run"] + font_size = kwargs["font_size"] font_size = utils.remove_important_from_style(font_size).lower() font_size = utils.adapt_font_size(font_size) try: - if font_size in ('normal', 'initial', 'inherit'): + if font_size in ("normal", "initial", "inherit"): run.font.size = None else: converted_size = utils.unit_converter(font_size, target_unit="pt") @@ -728,8 +712,8 @@ def _apply_font_size_to_run(self, **kwargs): logging.warning(f"Warning: Could not parse font-size '{font_size}': {e}") def _apply_font_family_paragraph(self, **kwargs): - paragraph = kwargs['paragraph'] - value = kwargs['value'] + paragraph = kwargs["paragraph"] + value = kwargs["value"] font_family = utils.remove_important_from_style(value).strip() @@ -737,7 +721,7 @@ def _apply_font_family_paragraph(self, **kwargs): paragraph_spans = self.paragraph_span_styles.get(paragraph_id, {}) for i, run in enumerate(paragraph.runs): - if i in paragraph_spans and 'font-family' in paragraph_spans[i]: + if i in paragraph_spans and "font-family" in paragraph_spans[i]: continue self._apply_font_family_to_run( @@ -746,21 +730,30 @@ def _apply_font_family_paragraph(self, **kwargs): ) def _apply_font_family_to_run(self, **kwargs): - run = kwargs['run'] - font_family = kwargs['font_family'] + run = kwargs["run"] + font_family = kwargs["font_family"] - if not font_family or font_family in ('inherit', 'initial', 'unset'): + if not font_family or font_family in ("inherit", "initial", "unset"): return try: - font_families = [f.strip().strip('"\'') for f in font_family.split(',')] + font_families = [f.strip().strip("\"'") for f in font_family.split(",")] for font_name in font_families: - if font_name and font_name not in ('inherit', 'initial', 'unset', 'serif', 'sans-serif', 'monospace', - 'cursive', 'fantasy', 'system-ui'): + if font_name and font_name not in ( + "inherit", + "initial", + "unset", + "serif", + "sans-serif", + "monospace", + "cursive", + "fantasy", + "system-ui", + ): run.font.name = font_name break - elif font_name in ('serif', 'sans-serif', 'monospace'): + elif font_name in ("serif", "sans-serif", "monospace"): run.font.name = constants.GENERIC_FONT_STYLES[font_name] break @@ -768,17 +761,17 @@ def _apply_font_family_to_run(self, **kwargs): logging.warning(f"Warning: Could not apply font-family '{font_family}': {e}") def _apply_color_paragraph(self, **kwargs): - paragraph = kwargs['paragraph'] - all_styles = kwargs['all_styles'] - color_value = utils.remove_important_from_style(all_styles.get('color', '')).lower().strip() - if color_value in ('inherit', 'initial', 'transparent', 'currentcolor'): + paragraph = kwargs["paragraph"] + all_styles = kwargs["all_styles"] + color_value = utils.remove_important_from_style(all_styles.get("color", "")).lower().strip() + if color_value in ("inherit", "initial", "transparent", "currentcolor"): return paragraph_id = id(paragraph) paragraph_spans = self.paragraph_span_styles.get(paragraph_id, {}) for i, run in enumerate(paragraph.runs): - if i in paragraph_spans and 'color' in paragraph_spans[i]: + if i in paragraph_spans and "color" in paragraph_spans[i]: continue self._apply_color_to_run( run=run, @@ -786,8 +779,8 @@ def _apply_color_paragraph(self, **kwargs): ) def _apply_color_to_run(self, **kwargs): - run = kwargs['run'] - color_value = kwargs['color'] + run = kwargs["run"] + color_value = kwargs["color"] try: colors = utils.parse_color(color_value) run.font.color.rgb = RGBColor(*colors) @@ -795,8 +788,8 @@ def _apply_color_to_run(self, **kwargs): logging.warning(f"Could not apply color '{color_value}': {e}") def _apply_text_transform_paragraph(self, **kwargs): - paragraph = kwargs['paragraph'] - value = kwargs['value'] + paragraph = kwargs["paragraph"] + value = kwargs["value"] text_transform = utils.remove_important_from_style(value).lower() @@ -804,7 +797,7 @@ def _apply_text_transform_paragraph(self, **kwargs): paragraph_spans = self.paragraph_span_styles.get(paragraph_id, {}) for i, run in enumerate(paragraph.runs): - if i in paragraph_spans and 'text-transform' in paragraph_spans[i]: + if i in paragraph_spans and "text-transform" in paragraph_spans[i]: continue self._apply_text_transform_to_run( @@ -813,54 +806,50 @@ def _apply_text_transform_paragraph(self, **kwargs): ) def _apply_text_transform_to_run(self, **kwargs): - run = kwargs['run'] - text_transform = kwargs['text_transform'] + run = kwargs["run"] + text_transform = kwargs["text_transform"] if not run.text: return try: - if text_transform == 'uppercase': + if text_transform == "uppercase": run.text = run.text.upper() - elif text_transform == 'lowercase': + elif text_transform == "lowercase": run.text = run.text.lower() - elif text_transform == 'capitalize': + elif text_transform == "capitalize": run.text = run.text.title() - elif text_transform in ('none', 'initial', 'inherit'): + elif text_transform in ("none", "initial", "inherit"): # No transformation needed pass - elif text_transform in ('full-width', 'math-auto', 'full-size-kana'): + elif text_transform in ("full-width", "math-auto", "full-size-kana"): logging.warning(f"Warning: Unsupported text transform '{text_transform}'") except (AttributeError, Exception) as e: logging.warning(f"Warning: Could not apply text-transform '{text_transform}': {e}") def _apply_text_decoration_paragraph(self, **kwargs): - paragraph = kwargs['paragraph'] - all_styles = kwargs['all_styles'] + paragraph = kwargs["paragraph"] + all_styles = kwargs["all_styles"] # Initialize decorations - decorations = { - 'line_type': None, - 'line_style': None, - 'color': None - } + decorations = {"line_type": None, "line_style": None, "color": None} - if 'text-decoration' in all_styles: - text_decoration_value = utils.remove_important_from_style(all_styles['text-decoration']).lower() + if "text-decoration" in all_styles: + text_decoration_value = utils.remove_important_from_style(all_styles["text-decoration"]).lower() decorations = utils.parse_text_decoration(text_decoration_value) - if 'text-decoration-line' in all_styles: - line_value = utils.remove_important_from_style(all_styles['text-decoration-line']).lower() - decorations['line_type'] = line_value + if "text-decoration-line" in all_styles: + line_value = utils.remove_important_from_style(all_styles["text-decoration-line"]).lower() + decorations["line_type"] = line_value - if 'text-decoration-style' in all_styles: - style_value = utils.remove_important_from_style(all_styles['text-decoration-style']).lower() - decorations['line_style'] = style_value + if "text-decoration-style" in all_styles: + style_value = utils.remove_important_from_style(all_styles["text-decoration-style"]).lower() + decorations["line_style"] = style_value - if 'text-decoration-color' in all_styles: - color_value = utils.remove_important_from_style(all_styles['text-decoration-color']).lower() - decorations['color'] = color_value + if "text-decoration-color" in all_styles: + color_value = utils.remove_important_from_style(all_styles["text-decoration-color"]).lower() + decorations["color"] = color_value paragraph_id = id(paragraph) paragraph_spans = self.paragraph_span_styles.get(paragraph_id, {}) @@ -869,82 +858,82 @@ def _apply_text_decoration_paragraph(self, **kwargs): span_styles = paragraph_spans.get(i, set()) # If span has text-decoration shorthand, skip entirely - if 'text-decoration' in span_styles: + if "text-decoration" in span_styles: continue - if decorations['line_type'] and 'text-decoration-line' not in span_styles: + if decorations["line_type"] and "text-decoration-line" not in span_styles: self._apply_text_decoration_line_to_run( run=run, - text_decoration_line=decorations['line_type'], + text_decoration_line=decorations["line_type"], ) - if decorations['line_style'] and 'text-decoration-style' not in span_styles: + if decorations["line_style"] and "text-decoration-style" not in span_styles: self._apply_text_decoration_style_to_run( run=run, - text_decoration_style=decorations['line_style'], + text_decoration_style=decorations["line_style"], ) - if decorations['color'] and 'text-decoration-color' not in span_styles: + if decorations["color"] and "text-decoration-color" not in span_styles: self._apply_text_decoration_color_to_run( run=run, - text_decoration_color=decorations['color'], + text_decoration_color=decorations["color"], ) def _apply_text_decoration_to_run(self, **kwargs): - run = kwargs['run'] - text_decoration = kwargs['text_decoration'] + run = kwargs["run"] + text_decoration = kwargs["text_decoration"] if not text_decoration: return decorations = utils.parse_text_decoration(text_decoration) - if decorations['line_type']: + if decorations["line_type"]: self._apply_text_decoration_line_to_run( run=run, - text_decoration_line=decorations['line_type'], + text_decoration_line=decorations["line_type"], ) - if decorations['line_style']: + if decorations["line_style"]: self._apply_text_decoration_style_to_run( run=run, - text_decoration_style=decorations['line_style'], + text_decoration_style=decorations["line_style"], ) - if decorations['color']: + if decorations["color"]: self._apply_text_decoration_color_to_run( run=run, - text_decoration_color=decorations['color'], + text_decoration_color=decorations["color"], ) def _apply_text_decoration_line_to_run(self, **kwargs): - run = kwargs['run'] - text_decoration_line = kwargs['text_decoration_line'] + run = kwargs["run"] + text_decoration_line = kwargs["text_decoration_line"] if text_decoration_line in constants.FONT_UNDERLINE: - if text_decoration_line == 'underline': + if text_decoration_line == "underline": run.font.underline = True run.font.strike = False - elif text_decoration_line == 'line-through': + elif text_decoration_line == "line-through": run.font.strike = True run.font.underline = False - elif text_decoration_line == 'none': + elif text_decoration_line == "none": run.font.underline = False run.font.strike = False else: logging.warning(f"Warning: Unsupported text decoration '{text_decoration_line}'") def _apply_text_decoration_style_to_run(self, **kwargs): - run = kwargs['run'] - text_decoration_style = kwargs['text_decoration_style'] + run = kwargs["run"] + text_decoration_style = kwargs["text_decoration_style"] if not text_decoration_style or run.font.underline is False: return False should_apply = False if run.font.underline: should_apply = True - elif hasattr(self.paragraph, '_pending_styles'): + elif hasattr(self.paragraph, "_pending_styles"): for pending_style in self.paragraph._pending_styles: - if 'text-decoration' in pending_style or 'text-decoration-line' in pending_style: + if "text-decoration" in pending_style or "text-decoration-line" in pending_style: should_apply = True break @@ -959,31 +948,31 @@ def _apply_text_decoration_style_to_run(self, **kwargs): paragraph_id = id(self.paragraph) run_index = len(self.paragraph.runs) - 1 if paragraph_id in self.paragraph_span_styles and run_index in self.paragraph_span_styles[paragraph_id]: - self.paragraph_span_styles[paragraph_id][run_index].add('text-decoration-line') + self.paragraph_span_styles[paragraph_id][run_index].add("text-decoration-line") return True def _apply_text_decoration_color_to_run(self, **kwargs): - run = kwargs['run'] - text_decoration_color = kwargs['text_decoration_color'] + run = kwargs["run"] + text_decoration_color = kwargs["text_decoration_color"] if not text_decoration_color or not utils.is_color(text_decoration_color): return color_hex = utils.parse_color(text_decoration_color, return_hex=True) rPr = run._r.get_or_add_rPr() - u = rPr.find(qn('w:u')) + u = rPr.find(qn("w:u")) if u is not None: - u.set(qn('w:color'), color_hex.upper().lstrip('#')) + u.set(qn("w:color"), color_hex.upper().lstrip("#")) def _apply_background_color_paragraph(self, **kwargs): - paragraph = kwargs['paragraph'] - value = kwargs['value'] + paragraph = kwargs["paragraph"] + value = kwargs["value"] background_color = utils.remove_important_from_style(value).lower().strip() - if background_color in ('inherit', 'initial'): + if background_color in ("inherit", "initial"): return - elif background_color in ('transparent', 'none'): + elif background_color in ("transparent", "none"): logging.warning(f"Warning: Unsupported background color '{background_color}'") return @@ -996,7 +985,7 @@ def _apply_background_color_paragraph(self, **kwargs): paragraph_spans = self.paragraph_span_styles.get(paragraph_id, {}) for i, run in enumerate(paragraph.runs): - if i in paragraph_spans and 'background-color' in paragraph_spans[i]: + if i in paragraph_spans and "background-color" in paragraph_spans[i]: continue self._apply_background_color_to_run( run=run, @@ -1007,12 +996,12 @@ def _apply_background_color_paragraph(self, **kwargs): logging.warning(f"Could not apply background-color to paragraph: {e}") def _apply_background_color_to_run(self, **kwargs): - run = kwargs['run'] - background_color = kwargs['background_color'] + run = kwargs["run"] + background_color = kwargs["background_color"] try: - if background_color in ('inherit', 'initial'): + if background_color in ("inherit", "initial"): return - elif background_color in ('transparent', 'none'): + elif background_color in ("transparent", "none"): logging.warning(f"Warning: Unsupported background color '{background_color}'") return @@ -1020,15 +1009,15 @@ def _apply_background_color_to_run(self, **kwargs): if not color_hex: return - shd = OxmlElement('w:shd') - shd.set(qn('w:val'), 'clear') - shd.set(qn('w:color'), 'auto') - shd.set(qn('w:fill'), color_hex.lstrip('#')) + shd = OxmlElement("w:shd") + shd.set(qn("w:val"), "clear") + shd.set(qn("w:color"), "auto") + shd.set(qn("w:fill"), color_hex.lstrip("#")) r_pr = run._element.get_or_add_rPr() # Remove existing shading - existing_shd = r_pr.find(qn('w:shd')) + existing_shd = r_pr.find(qn("w:shd")) if existing_shd is not None: r_pr.remove(existing_shd) @@ -1039,66 +1028,68 @@ def _apply_background_color_to_run(self, **kwargs): def add_text_align_or_margin_to(self, obj, style): """Styles that can be applied on multiple objects""" - if 'text-align' in style: - align = utils.remove_important_from_style(style['text-align']) + if "text-align" in style: + align = utils.remove_important_from_style(style["text-align"]) - if 'center' in align: + if "center" in align: obj.alignment = WD_ALIGN_PARAGRAPH.CENTER - elif 'left' in align: + elif "left" in align: obj.alignment = WD_ALIGN_PARAGRAPH.LEFT - elif 'right' in align: + elif "right" in align: obj.alignment = WD_ALIGN_PARAGRAPH.RIGHT - elif 'justify' in align: + elif "justify" in align: obj.alignment = WD_ALIGN_PARAGRAPH.JUSTIFY - if 'margin-left' in style and 'margin-right' in style: - if 'auto' in style['margin-left'] and 'auto' in style['margin-right']: + if "margin-left" in style and "margin-right" in style: + if "auto" in style["margin-left"] and "auto" in style["margin-right"]: obj.alignment = WD_ALIGN_PARAGRAPH.CENTER - elif 'margin-left' in style: - obj.left_indent = utils.unit_converter(style['margin-left']) + elif "margin-left" in style: + obj.left_indent = utils.unit_converter(style["margin-left"]) def add_styles_to_table_cell(self, styles, doc_cell, cell_row): """Styles that must be applied specifically in a _Cell object""" # Set background color - if 'background-color' in styles: - self.set_cell_background(doc_cell, styles['background-color']) + + if "background-color" in styles: + color = utils.parse_color(styles["background-color"], return_hex=True) + self.set_cell_background(doc_cell, color) # Set width (approximate, since DOCX uses different units) - if 'width' in styles: - doc_cell.width = utils.unit_converter(styles['width']) + if "width" in styles: + doc_cell.width = utils.unit_converter(styles["width"]) # Set height (due word limitations, cannot set individually cell height, only whole row) - if 'height' in styles: - cell_row.height = utils.unit_converter(styles['height']) + if "height" in styles: + cell_row.height = utils.unit_converter(styles["height"]) # Set text color - if 'color' in styles: - color = utils.parse_color(styles['color']) + if "color" in styles: + color = utils.parse_color(styles["color"]) if color: for paragraph in doc_cell.paragraphs: for run in paragraph.runs: - run.font.color.rgb = color + run.font.color.rgb = RGBColor(*color) # Set vertical align (for individual cells) - if 'vertical-align' in styles: - align = utils.remove_important_from_style(styles['vertical-align']) + if "vertical-align" in styles: + align = utils.remove_important_from_style(styles["vertical-align"]) - if 'top' in align: + if "top" in align: doc_cell.vertical_alignment = WD_ALIGN_VERTICAL.TOP - elif 'middle' in align: + elif "middle" in align: doc_cell.vertical_alignment = WD_ALIGN_VERTICAL.CENTER - elif 'bottom' in align: + elif "bottom" in align: doc_cell.vertical_alignment = WD_ALIGN_VERTICAL.BOTTOM # Set borders - if any('border' in style for style in styles.keys()): + if any("border" in style for style in styles.keys()): self.set_cell_borders(doc_cell, styles) self.add_text_align_or_margin_to(doc_cell.paragraphs[0], styles) def add_styles_to_run(self, style): - if 'font-size' in style: - font_size = utils.remove_important_from_style(style['font-size']) + if "font-size" in style: + font_size = utils.remove_important_from_style(style["font-size"]) # Adapt font_size when text, ex.: small, medium, etc. font_size = utils.adapt_font_size(font_size) @@ -1106,21 +1097,21 @@ def add_styles_to_run(self, style): for run in self.paragraph.runs: run.font.size = utils.unit_converter(font_size) - if 'color' in style: - colors = utils.parse_color(style['color']) + if "color" in style: + colors = utils.parse_color(style["color"]) self.run.font.color.rgb = RGBColor(*colors) - if 'background-color' in style: + if "background-color" in style: # This should stay here for div. # Little trick to apply background-color to paragraph # because `self.run.font.highlight_color` # has a very limited amount of colors - color = utils.parse_color(style['background-color'], return_hex=True) + color = utils.parse_color(style["background-color"], return_hex=True) - shd = OxmlElement('w:shd') - shd.set(qn('w:val'), 'clear') - shd.set(qn('w:color'), 'auto') - shd.set(qn('w:fill'), color.lstrip('#')) + shd = OxmlElement("w:shd") + shd.set(qn("w:val"), "clear") + shd.set(qn("w:color"), "auto") + shd.set(qn("w:fill"), color.lstrip("#")) # Make sure the paragraph styling element exists self.paragraph.paragraph_format.element.get_or_add_pPr() @@ -1129,15 +1120,15 @@ def add_styles_to_run(self, style): self.paragraph.paragraph_format.element.pPr.append(shd) def handle_li(self): - ''' - Handle li tags - source: https://stackoverflow.com/a/78685353/17274446 - ''' - list_depth = len(self.tags['list']) or 1 - list_type = self.tags['list'][-1] if self.tags['list'] else 'ul' + """ + Handle li tags + source: https://stackoverflow.com/a/78685353/17274446 + """ + list_depth = len(self.tags["list"]) or 1 + list_type = self.tags["list"][-1] if self.tags["list"] else "ul" level = min(list_depth, 3) style_key = list_type if level <= 1 else f"{list_type}{level}" - list_style = constants.STYLES.get(style_key, 'List Number' if list_type == 'ol' else 'List Bullet') + list_style = constants.STYLES.get(style_key, "List Number" if list_type == "ol" else "List Bullet") self.paragraph = self.doc.add_paragraph(style=list_style) self.in_li = True @@ -1152,7 +1143,7 @@ def handle_li(self): style_obj = self.paragraph.style num_id_style = None - if hasattr(style_obj._element.pPr, 'numPr'): + if hasattr(style_obj._element.pPr, "numPr"): num_id_style = style_obj._element.pPr.numPr.numId.val if num_id_style is not None: @@ -1176,13 +1167,13 @@ def handle_li(self): # Assign this numId to the paragraph pPr = self.paragraph._p.get_or_add_pPr() - numPr = OxmlElement('w:numPr') + numPr = OxmlElement("w:numPr") - numId_elem = OxmlElement('w:numId') - numId_elem.set(qn('w:val'), str(new_num_id)) + numId_elem = OxmlElement("w:numId") + numId_elem.set(qn("w:val"), str(new_num_id)) - ilvl = OxmlElement('w:ilvl') - ilvl.set(qn('w:val'), str(level - 1)) + ilvl = OxmlElement("w:ilvl") + ilvl.set(qn("w:val"), str(level - 1)) numPr.append(ilvl) numPr.append(numId_elem) @@ -1197,18 +1188,18 @@ def add_image_to_cell(self, cell, image, width=None, height=None): def handle_img(self, current_attrs): if not self.include_images: self.skip = True - self.skip_tag = 'img' + self.skip_tag = "img" return - if 'src' not in current_attrs: + if "src" not in current_attrs: self.doc.add_paragraph("") return - src = current_attrs['src'] + src = current_attrs["src"] # added image dimension, interpreting values as pixel only - height = utils.unit_converter(current_attrs['height']) if 'height' in current_attrs else None - width = utils.unit_converter(current_attrs['width']) if 'width' in current_attrs else None + height = utils.unit_converter(utils.add_px(current_attrs["height"])) if "height" in current_attrs else None + width = utils.unit_converter(utils.add_px(current_attrs["width"])) if "width" in current_attrs else None # fetch image image = utils.fetch_image_data(src) @@ -1235,14 +1226,14 @@ def handle_img(self, current_attrs): # avoid exposing filepaths in document self.doc.add_paragraph("" % utils.get_filename_from_url(src)) - ''' + """ #adding style For right-alignment: `'float: right;'` For center-alignment: `'display: block; margin-left: auto; margin-right: auto;'` Everything else would be Left aligned - ''' - if 'style' in current_attrs: - style = current_attrs['style'] + """ + if "style" in current_attrs: + style = current_attrs["style"] image_alignment = utils.get_image_alignment(style) last_paragraph = self.doc.paragraphs[-1] if image_alignment == utils.ImageAlignment.RIGHT: @@ -1268,7 +1259,7 @@ def handle_table(self, current_attrs): try: # Fixed 'style lookup by style_id is deprecated.' # https://stackoverflow.com/a/29567907/17274446 - self.table_style = ' '.join(re.findall(r'[A-Z][a-z]*|[0-9]', self.table_style)) + self.table_style = " ".join(re.findall(r"[A-Z][a-z]*|[0-9]", self.table_style)) # Available Table Styles # https://python-docx.readthedocs.io/en/latest/user/styles-understanding.html#table-styles-in-default-template self.table.style = self.table_style @@ -1281,14 +1272,17 @@ def handle_table(self, current_attrs): for cell_row, row in enumerate(self.get_table_rows(table_soup)): col_offset = 0 # Shift index if some columns are occupied for col in self.get_table_columns(row): - while used_cells[cell_row][col_offset]: + while col_offset < cols and used_cells[cell_row][col_offset]: col_offset += 1 + if col_offset >= cols: + raise ValueError(f"Table layout mismatch: exceeded column count ({cols}) at row {cell_row}") + current_row = cell_row current_col = col_offset cell_html = self.get_cell_html(col) - if col.name == 'th': + if col.name == "th": cell_html = "%s" % cell_html # Get _Cell object from table based on cell_row and cell_col @@ -1296,16 +1290,11 @@ def handle_table(self, current_attrs): # Reference: # https://python-docx.readthedocs.io/en/latest/dev/analysis/features/table/cell-merge.html - rowspan = utils.safe_int(col.get('rowspan', 1)) - colspan = utils.safe_int(col.get('colspan', 1)) + rowspan = utils.safe_int(col.get("rowspan", 1)) + colspan = utils.safe_int(col.get("colspan", 1)) if rowspan > 1 or colspan > 1: - docx_cell = docx_cell.merge( - self.table.cell( - current_row + (rowspan - 1), - current_col + (colspan - 1) - ) - ) + docx_cell = docx_cell.merge(self.table.cell(current_row + (rowspan - 1), current_col + (colspan - 1))) # Mark all merged cells as used for r in range(current_row, current_row + rowspan): @@ -1315,9 +1304,9 @@ def handle_table(self, current_attrs): used_cells[current_row][current_col] = True # Parse cell styles - cell_styles = utils.parse_dict_string(col.get('style', '')) + cell_styles = utils.parse_dict_string(col.get("style", "")) - if 'width' in cell_styles or 'height' in cell_styles: + if "width" in cell_styles or "height" in cell_styles: self.table.autofit = False self.table.allow_autofit = False @@ -1328,13 +1317,13 @@ def handle_table(self, current_attrs): col_offset += colspan # Move to the next real column - if 'style' in current_attrs and self.table: - style = utils.parse_dict_string(current_attrs['style']) + if "style" in current_attrs and self.table: + style = utils.parse_dict_string(current_attrs["style"]) self.add_text_align_or_margin_to(self.table, style) # skip all tags until corresponding closing tag - self.instances_to_skip = len(table_soup.find_all('table')) - self.skip_tag = 'table' + self.instances_to_skip = len(table_soup.find_all("table")) + self.skip_tag = "table" self.skip = True self.table = None @@ -1342,8 +1331,8 @@ def handle_div(self, current_attrs): self.paragraph = self.doc.add_paragraph() # handle page break - if 'style' in current_attrs: - style = current_attrs['style'] + if "style" in current_attrs: + style = current_attrs["style"] # Match CSS2 page-break-after: always or CSS3 break-after: page # Using regex to ensure we match the exact property-value pairs # Also handles optional !important flag @@ -1355,21 +1344,42 @@ def handle_hr(self): # https://github.com/python-openxml/python-docx/issues/105#issuecomment-62806373 self.paragraph = self.doc.add_paragraph() pPr = self.paragraph._p.get_or_add_pPr() - pBdr = OxmlElement('w:pBdr') + pBdr = OxmlElement("w:pBdr") pPr.insert_element_before( pBdr, - 'w:shd', 'w:tabs', 'w:suppressAutoHyphens', 'w:kinsoku', 'w:wordWrap', - 'w:overflowPunct', 'w:topLinePunct', 'w:autoSpaceDE', 'w:autoSpaceDN', - 'w:bidi', 'w:adjustRightInd', 'w:snapToGrid', 'w:spacing', 'w:ind', - 'w:contextualSpacing', 'w:mirrorIndents', 'w:suppressOverlap', 'w:jc', - 'w:textDirection', 'w:textAlignment', 'w:textboxTightWrap', - 'w:outlineLvl', 'w:divId', 'w:cnfStyle', 'w:rPr', 'w:sectPr', 'w:pPrChange' + "w:shd", + "w:tabs", + "w:suppressAutoHyphens", + "w:kinsoku", + "w:wordWrap", + "w:overflowPunct", + "w:topLinePunct", + "w:autoSpaceDE", + "w:autoSpaceDN", + "w:bidi", + "w:adjustRightInd", + "w:snapToGrid", + "w:spacing", + "w:ind", + "w:contextualSpacing", + "w:mirrorIndents", + "w:suppressOverlap", + "w:jc", + "w:textDirection", + "w:textAlignment", + "w:textboxTightWrap", + "w:outlineLvl", + "w:divId", + "w:cnfStyle", + "w:rPr", + "w:sectPr", + "w:pPrChange", ) - bottom = OxmlElement('w:bottom') - bottom.set(qn('w:val'), 'single') - bottom.set(qn('w:sz'), '6') - bottom.set(qn('w:space'), '1') - bottom.set(qn('w:color'), 'auto') + bottom = OxmlElement("w:bottom") + bottom.set(qn("w:val"), "single") + bottom.set(qn("w:sz"), "6") + bottom.set(qn("w:space"), "1") + bottom.set(qn("w:color"), "auto") pBdr.append(bottom) def handle_custom_tag_styles(self, tag): @@ -1386,13 +1396,13 @@ def handle_mark(self): """ Apply default styling using Word shading (yellow highlight). """ - shd = OxmlElement('w:shd') - shd.set(qn('w:val'), 'clear') - shd.set(qn('w:color'), 'auto') - shd.set(qn('w:fill'), 'FFFF00') # Yellow - default color + shd = OxmlElement("w:shd") + shd.set(qn("w:val"), "clear") + shd.set(qn("w:color"), "auto") + shd.set(qn("w:fill"), "FFFF00") # Yellow - default color r_pr = self.run._element.get_or_add_rPr() # Remove existing shading if present - existing_shd = r_pr.find(qn('w:shd')) + existing_shd = r_pr.find(qn("w:shd")) if existing_shd is not None: r_pr.remove(existing_shd) r_pr.append(shd) @@ -1406,39 +1416,35 @@ def handle_link(self, href, text, tooltip=None): text: The text displayed for the url. tooltip: The text displayed when holder link. """ - is_external = href.startswith('http') if href else False - hyperlink = OxmlElement('w:hyperlink') + is_external = href.startswith("http") if href else False + hyperlink = OxmlElement("w:hyperlink") if is_external: # Create external hyperlink - rel_id = self.paragraph.part.relate_to( - href, - docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, - is_external=True - ) + rel_id = self.paragraph.part.relate_to(href, docx.opc.constants.RELATIONSHIP_TYPE.HYPERLINK, is_external=True) # Create the w:hyperlink tag and add needed values - hyperlink.set(qn('r:id'), rel_id) + hyperlink.set(qn("r:id"), rel_id) else: # Create internal hyperlink (anchor) - hyperlink.set(qn('w:anchor'), href.replace('#', '')) + hyperlink.set(qn("w:anchor"), href.replace("#", "")) if tooltip is not None: # set tooltip to hyperlink - hyperlink.set(qn('w:tooltip'), tooltip) + hyperlink.set(qn("w:tooltip"), tooltip) # Create sub-run subrun = self.paragraph.add_run() - rPr = OxmlElement('w:rPr') + rPr = OxmlElement("w:rPr") # add default color - c = OxmlElement('w:color') - c.set(qn('w:val'), "0000EE") + c = OxmlElement("w:color") + c.set(qn("w:val"), "0000EE") rPr.append(c) # add underline - u = OxmlElement('w:u') - u.set(qn('w:val'), 'single') + u = OxmlElement("w:u") + u.set(qn("w:val"), "single") rPr.append(u) subrun._r.append(rPr) @@ -1453,41 +1459,39 @@ def handle_link(self, href, text, tooltip=None): def handle_starttag(self, tag, attrs): if self.skip: return - if tag == 'head': + if tag == "head": self.skip = True self.skip_tag = tag self.instances_to_skip = 0 return - elif tag == 'body': + elif tag == "body": return current_attrs = dict(attrs) - if tag == 'span': + if tag == "span": # Parse inline styles if present to check for !important if "style" in current_attrs: - normal_styles, important_styles = utils.parse_inline_styles( - current_attrs["style"] - ) + normal_styles, important_styles = utils.parse_inline_styles(current_attrs["style"]) # Store normal styles to apply to runs if normal_styles: self.pending_inline_styles = normal_styles # Store important styles to apply after parent's processing if important_styles: self.pending_important_styles = important_styles - self.tags['span'].append(current_attrs) + self.tags["span"].append(current_attrs) return - elif tag in ['ol', 'ul']: - if tag == 'ol': + elif tag in ["ol", "ul"]: + if tag == "ol": # Assign new ID if it's a fresh top-level list self.list_restart_counter += 1 self.current_ol_num_id = self.list_restart_counter else: self.current_ol_num_id = None # unordered list - self.tags['list'].append(tag) - return # don't apply styles for now - elif tag == 'br': + self.tags["list"].append(tag) + return # don't apply styles for now + elif tag == "br": try: self.run.add_break() except AttributeError: @@ -1500,9 +1504,7 @@ def handle_starttag(self, tag, attrs): # Control custom_style based on the Options. Default is True on both. custom_style = ( - self.get_word_style_for_element(tag, current_attrs) - if (self.use_styles or self.use_tag_overrides) - else None + self.get_word_style_for_element(tag, current_attrs) if (self.use_styles or self.use_tag_overrides) else None ) if custom_style: @@ -1522,9 +1524,7 @@ def handle_starttag(self, tag, attrs): # Parse inline styles on the paragraph itself to apply to runs within if "style" in current_attrs: - normal_styles, important_styles = utils.parse_inline_styles( - current_attrs["style"] - ) + normal_styles, important_styles = utils.parse_inline_styles(current_attrs["style"]) if normal_styles: self.pending_inline_styles = normal_styles if important_styles: @@ -1534,15 +1534,15 @@ def handle_starttag(self, tag, attrs): self.pending_div_style = custom_style else: self.handle_div(current_attrs) - elif tag == 'li': + elif tag == "li": self.handle_li() if custom_style and self.paragraph: self.apply_styles_to_paragraph(self.paragraph, custom_style, True) - elif tag == 'hr': + elif tag == "hr": self.handle_hr() - elif re.match('h[1-9]', tag): + elif re.match("h[1-9]", tag): if isinstance(self.doc, docx.document.Document): if custom_style: self.paragraph = self.doc.add_paragraph() @@ -1555,44 +1555,42 @@ def handle_starttag(self, tag, attrs): if custom_style: self.apply_styles_to_paragraph(self.paragraph, custom_style, True) - elif tag == 'img': + elif tag == "img": self.handle_img(current_attrs) self.paragraph = self.doc.paragraphs[-1] - elif tag == 'table': + elif tag == "table": if self.include_tables: self.handle_table(current_attrs) return - elif tag == 'code': + elif tag == "code": # Character style for inline code (pre uses paragraph style in the ["p", "pre"] branch) if custom_style: self.pending_character_style = custom_style if "style" in current_attrs: - normal_styles, important_styles = utils.parse_inline_styles( - current_attrs["style"] - ) + normal_styles, important_styles = utils.parse_inline_styles(current_attrs["style"]) if normal_styles: self.pending_inline_styles = normal_styles if important_styles: self.pending_important_styles = important_styles return - if 'id' in current_attrs: - self.add_bookmark(current_attrs['id']) + if "id" in current_attrs: + self.add_bookmark(current_attrs["id"]) # add style if not self.include_styles: return - if 'style' in current_attrs and self.paragraph and (tag in ['p'] or re.match(r'h[1-9]', tag)): - if not hasattr(self.paragraph, '_pending_styles'): + if "style" in current_attrs and self.paragraph and (tag in ["p"] or re.match(r"h[1-9]", tag)): + if not hasattr(self.paragraph, "_pending_styles"): self.paragraph._pending_styles = [] - style = utils.parse_dict_string(current_attrs['style']) + style = utils.parse_dict_string(current_attrs["style"]) self.paragraph._pending_styles.append(style) - elif 'style' in current_attrs and self.paragraph: - style = utils.parse_dict_string(current_attrs['style']) + elif "style" in current_attrs and self.paragraph: + style = utils.parse_dict_string(current_attrs["style"]) self.add_text_align_or_margin_to(self.paragraph.paragraph_format, style) def handle_endtag(self, tag): @@ -1615,7 +1613,7 @@ def handle_endtag(self, tag): self.pending_inline_styles = None self.pending_important_styles = None - if re.match('h[1-9]', tag): + if re.match("h[1-9]", tag): self.pending_inline_styles = None self.pending_important_styles = None @@ -1631,26 +1629,26 @@ def handle_endtag(self, tag): self.skip_tag = None self.paragraph = None - if tag == 'span': - if self.tags['span']: - self.tags['span'].pop() + if tag == "span": + if self.tags["span"]: + self.tags["span"].pop() return - elif tag in ['ol', 'ul']: - utils.remove_last_occurence(self.tags['list'], tag) - if tag == 'ol': + elif tag in ["ol", "ul"]: + utils.remove_last_occurence(self.tags["list"], tag) + if tag == "ol": self._list_num_ids.pop(self.current_ol_num_id, None) self.current_ol_num_id = None return - elif tag == 'table': + elif tag == "table": if self.include_tables: self.table_no += 1 self.table = None self.paragraph = None - elif tag == 'li': + elif tag == "li": self.in_li = False - if tag in ['p', 'pre'] or re.match(r'h[1-9]', tag): - if hasattr(self.paragraph, '_pending_styles'): + if tag in ["p", "pre"] or re.match(r"h[1-9]", tag): + if hasattr(self.paragraph, "_pending_styles"): for style in self.paragraph._pending_styles: self.apply_styles_to_paragraph(self.paragraph, style) # Clear the pending styles @@ -1666,7 +1664,7 @@ def handle_data(self, data): return # Only remove white space if we're not in a pre block. - if 'pre' not in self.tags: + if "pre" not in self.tags: # remove leading and trailing whitespace in all instances data = utils.remove_whitespace(data, True, True) @@ -1676,9 +1674,9 @@ def handle_data(self, data): # There can only be one nested link in a valid html document # You cannot have interactive content in an A tag, this includes links # https://html.spec.whatwg.org/#interactive-content - link = self.tags.get('a', {}) - href = link.get('href', None) - title = link.get('title', None) + link = self.tags.get("a", {}) + href = link.get("href", None) + title = link.get("title", None) if link and href: self.handle_link(href, data, title) @@ -1691,20 +1689,20 @@ def handle_data(self, data): if self.pending_character_style: self.apply_styles_to_run(self.run, self.pending_character_style, isCustom=True) - for span in self.tags['span']: - if 'style' in span: - span_style = utils.parse_dict_string(span['style']) + for span in self.tags["span"]: + if "style" in span: + span_style = utils.parse_dict_string(span["style"]) self.apply_styles_to_run(self.run, span_style) for tag, attrs in self.tags.items(): - if tag == 'div' and 'style' in attrs: - div_style = utils.parse_dict_string(attrs['style']) + if tag == "div" and "style" in attrs: + div_style = utils.parse_dict_string(attrs["style"]) for span_style_name in span_style.keys(): if span_style_name in div_style: del div_style[span_style_name] - self.tags[tag]['style'] = utils.dict_to_style_string(div_style) + self.tags[tag]["style"] = utils.dict_to_style_string(div_style) for tag, attrs in self.tags.items(): if self.use_tag_overrides and tag in self.tag_style_overrides: @@ -1712,7 +1710,7 @@ def handle_data(self, data): if tag in constants.FONT_STYLES: font_style = constants.FONT_STYLES[tag] - if font_style == 'custom': + if font_style == "custom": self.handle_custom_tag_styles(tag) else: setattr(self.run.font, font_style, True) @@ -1721,8 +1719,8 @@ def handle_data(self, data): font_name = constants.FONT_NAMES[tag] self.run.font.name = font_name - if 'style' in attrs and (tag in ['div', 'li', 'pre']): - style = utils.parse_dict_string(attrs['style']) + if "style" in attrs and (tag in ["div", "li", "pre"]): + style = utils.parse_dict_string(attrs["style"]) self.add_styles_to_run(style) def handle_comment(self, data): @@ -1741,7 +1739,8 @@ def handle_comment(self, data): # Style: Green color to mimic HTML comment styling dark_ish_green = "#008000" - run.font.color.rgb = utils.parse_color(dark_ish_green) + dark_ish_green_color = utils.parse_color(dark_ish_green) + run.font.color.rgb = RGBColor(*dark_ish_green_color) run.italic = True # makes it feel more like a comment def ignore_nested_tables(self, tables_soup): @@ -1759,16 +1758,16 @@ def ignore_nested_tables(self, tables_soup): nest -= 1 continue new_tables.append(table) - nest = len(table.find_all('table')) + nest = len(table.find_all("table")) return new_tables def get_table_rows(self, table_soup): # If there's a header, body, footer or direct child tr tags, add row dimensions from there - return table_soup.select(', '.join(self.table_row_selectors), recursive=False) + return table_soup.select(", ".join(self.table_row_selectors), recursive=False) def get_table_columns(self, row): # Get all columns for the specified row tag. - return row.find_all(['th', 'td'], recursive=False) if row else [] + return row.find_all(["th", "td"], recursive=False) if row else [] def get_table_dimensions(self, table_soup): # Get rows for the table @@ -1782,33 +1781,59 @@ def get_table_dimensions(self, table_soup): default_span = 1 max_cols = 0 - max_rows = len(rows) + + # Track occupied cells caused by rowspan + used_cells = [] for row_idx, row in enumerate(rows): cols = self.get_table_columns(row) - # Handle colspan - row_col_count = sum(utils.safe_int(col.get('colspan', default_span)) for col in cols) - max_cols = max(max_cols, row_col_count) - # Handle rowspan + # Ensure used_cells has current row + while len(used_cells) <= row_idx: + used_cells.append([]) + + col_offset = 0 + for col in cols: - rowspan = utils.safe_int(col.get('rowspan', default_span)) - if rowspan > default_span: - max_rows = max(max_rows, row_idx + rowspan) + # Expand row if needed + while len(used_cells[row_idx]) <= col_offset: + used_cells[row_idx].append(False) + + # Skip already occupied cells + while col_offset < len(used_cells[row_idx]) and used_cells[row_idx][col_offset]: + col_offset += 1 + + rowspan = utils.safe_int(col.get("rowspan", default_span)) + colspan = utils.safe_int(col.get("colspan", default_span)) + + # Mark occupied cells + for r in range(row_idx, row_idx + rowspan): + while len(used_cells) <= r: + used_cells.append([]) + for c in range(col_offset, col_offset + colspan): + while len(used_cells[r]) <= c: + used_cells[r].append(False) + used_cells[r][c] = True + + col_offset += colspan + + max_cols = max(max_cols, len(used_cells[row_idx])) + + max_rows = len(used_cells) return max_rows, max_cols def get_tables(self) -> None: - if not hasattr(self, 'soup'): - self.options['tables'] = False + if not hasattr(self, "soup"): + self.options["tables"] = False return - self.tables = self.ignore_nested_tables(self.soup.find_all('table')) + self.tables = self.ignore_nested_tables(self.soup.find_all("table")) self.table_no = 0 def run_process(self, html: str) -> None: if self.bs and BeautifulSoup: - self.soup = BeautifulSoup(html, 'html.parser') + self.soup = BeautifulSoup(html, "html.parser") html = str(self.soup) if self.include_tables: @@ -1817,16 +1842,16 @@ def run_process(self, html: str) -> None: def add_html_to_document(self, html: str, document) -> None: if not isinstance(html, str): - raise ValueError(f'First argument needs to be a {str}') + raise ValueError(f"First argument needs to be a {str}") elif not isinstance(document, docx.document.Document) and not isinstance(document, docx.table._Cell): - raise ValueError(f'Second argument needs to be a {docx.document.Document}') + raise ValueError(f"Second argument needs to be a {docx.document.Document}") self.set_initial_attrs(document) self.run_process(html) def add_html_to_cell(self, html: str, cell: docx.table._Cell) -> None: if not isinstance(cell, docx.table._Cell): - raise ValueError(f'Second argument needs to be a {docx.table._Cell}') + raise ValueError(f"Second argument needs to be a {docx.table._Cell}") unwanted_paragraph = cell.paragraphs[0] utils.delete_paragraph(unwanted_paragraph) @@ -1835,10 +1860,10 @@ def add_html_to_cell(self, html: str, cell: docx.table._Cell) -> None: # cells must end with a paragraph or will get message about corrupt file # https://stackoverflow.com/a/29287121 if not self.doc.paragraphs: - self.doc.add_paragraph('') + self.doc.add_paragraph("") - def parse_html_file(self, filename_html: str, filename_docx, encoding: str = 'utf-8') -> None: - with open(filename_html, 'r', encoding=encoding) as infile: + def parse_html_file(self, filename_html: str, filename_docx, encoding: str = "utf-8") -> None: + with open(filename_html, "r", encoding=encoding) as infile: html = infile.read() self.set_initial_attrs() @@ -1846,7 +1871,7 @@ def parse_html_file(self, filename_html: str, filename_docx, encoding: str = 'ut if not filename_docx: path, filename = os.path.split(filename_html) - filename_docx = f'{path}/new_docx_file_{filename}' + filename_docx = f"{path}/new_docx_file_{filename}" self.save(filename_docx) @@ -1855,19 +1880,19 @@ def parse_html_string(self, html: str) -> docx.document.Document: self.run_process(html) return self.doc -if __name__ == '__main__': - arg_parser = argparse.ArgumentParser(description='Convert .html file into .docx file with formatting') - arg_parser.add_argument('filename_html', help='The .html file to be parsed') + +if __name__ == "__main__": + arg_parser = argparse.ArgumentParser(description="Convert .html file into .docx file with formatting") + arg_parser.add_argument("filename_html", help="The .html file to be parsed") arg_parser.add_argument( - 'filename_docx', - nargs='?', - help='The name of the .docx file to be saved. Default new_docx_file_[filename_html]', - default=None + "filename_docx", + nargs="?", + help="The name of the .docx file to be saved. Default new_docx_file_[filename_html]", + default=None, ) - arg_parser.add_argument('--bs', action='store_true', - help='Attempt to fix html before parsing. Requires bs4. Default True') + arg_parser.add_argument("--bs", action="store_true", help="Attempt to fix html before parsing. Requires bs4. Default True") args = vars(arg_parser.parse_args()) - file_html = args.pop('filename_html') + file_html = args.pop("filename_html") html_parser = HtmlToDocx() html_parser.parse_html_file(file_html, **args) diff --git a/html4docx/utils.py b/html4docx/utils.py index 1f0cac4..9633e27 100644 --- a/html4docx/utils.py +++ b/html4docx/utils.py @@ -23,9 +23,11 @@ class ImageAlignment(Enum): def get_filename_from_url(url: str): return os.path.basename(urlparse(url).path) + def dict_to_style_string(style_dict): """Convert style dictionary back to CSS string""" - return '; '.join([f'{k}: {v}' for k, v in style_dict.items()]) + return "; ".join([f"{k}: {v}" for k, v in style_dict.items()]) + def is_url(url: str): """ @@ -87,18 +89,33 @@ def parse_dict_string(string: str, separator: str = ";"): return dict() new_string = re.sub(r"\s+", " ", string.replace("\n", "")).split(separator) - string_dict = dict( - (k.strip(), v.strip()) - for x in new_string - if ":" in x - for k, v in [x.split(":", 1)] - ) + string_dict = dict((k.strip(), v.strip()) for x in new_string if ":" in x for k, v in [x.split(":", 1)]) return string_dict +def add_px(value): + """ + Append 'px' only if the value is purely numeric (int, float, or numeric string). + If there is any non-numeric character, return the original value unchanged. + """ + if isinstance(value, (int, float)): + return f"{value}px" + + if isinstance(value, str): + stripped = value.strip() + + # Check if it's a valid number (integer or float) + if stripped.replace(".", "", 1).isdigit(): + return f"{stripped}px" + + return value + + return value + + def unit_converter(unit_value: str, target_unit: str = "pt"): """ - Converts a CSS unit value to a target unit (default is 'pt'). + Converts a CSS unit value (default px) to a target unit (default is 'pt'). Supported input units: px, pt, in, pc, cm, mm, em, rem, %. Supported target units: pt, px, in, cm, mm. @@ -141,9 +158,7 @@ def unit_converter(unit_value: str, target_unit: str = "pt"): return None # Clamp the value to MAX_INDENT (in points) - value_in_pt = min( - value_in_pt, constants.MAX_INDENT * 72.0 - ) # MAX_INDENT is in inches + value_in_pt = min(value_in_pt, constants.MAX_INDENT * 72.0) # Convert from points (pt) to the target unit conversion_from_pt = { @@ -181,9 +196,9 @@ def is_color(color: str) -> bool: >>> is_color("000000") False """ - is_rgb = 'rgb' in color - is_hex = color.startswith('#') - is_keyword = color == 'currentcolor' + is_rgb = "rgb" in color + is_hex = color.startswith("#") + is_keyword = color == "currentcolor" is_color_name = color in Color.__members__ return is_rgb or is_hex or is_keyword or is_color_name @@ -200,7 +215,7 @@ def parse_color(original_color: str, return_hex: bool = False): if "rgba" in color: color = re.sub(r"[^0-9,]", "", color) colors = [int(x) for x in color.split(",")] - colors = colors[:3] # remove opacity because it's not supported by python-docx + colors = colors[:3] # remove opacity because it's not supported by python-docx logging.warning("RGBA color is not supported by python-docx. Opacity will be ignored.") elif "rgb" in color: color = re.sub(r"[^0-9,]", "", color) @@ -209,7 +224,7 @@ def parse_color(original_color: str, return_hex: bool = False): raise ValueError(f"Invalid RGB color: {original_color}") elif color.startswith("#"): color = color.lstrip("#") - color = ("".join([x + x for x in color]) if len(color) == 3 else color) # convert short hex to full hex + color = "".join([x + x for x in color]) if len(color) == 3 else color # convert short hex to full hex colors = RGBColor.from_string(color) elif color in Color.__members__: colors = Color[color].value @@ -217,12 +232,28 @@ def parse_color(original_color: str, return_hex: bool = False): colors = [0, 0, 0] # Default to black for unexpected colors logging.warning(f"Could not parse color '{original_color}': Invalid color value. Fallback to black.") except Exception: - colors = [0, 0, 0] # Default to black for errors + colors = [0, 0, 0] # Default to black for errors logging.warning(f"Could not parse color '{original_color}': Invalid color value. Fallback to black.") return rgb_to_hex(colors) if return_hex else colors +def normalize_rgb_spaces(value: str) -> str: + """ + Removes spaces inside rgb()/rgba() so it can be safely split. + Example: + rgb(222, 222, 222) -> rgb(222,222,222) + """ + + def _replace(match): + prefix, content, suffix = match.groups() + # remove spaces only inside the function + content = content.replace(" ", "") + return f"{prefix}{content}{suffix}" + + return constants.RGB_SPACES_REGEX.sub(_replace, value) + + def remove_last_occurence(ls, x): ls.pop(len(ls) - ls[::-1].index(x) - 1) diff --git a/tests/assets/htmls/tables3.html b/tests/assets/htmls/tables3.html index b7ff21d..4ec19c0 100644 --- a/tests/assets/htmls/tables3.html +++ b/tests/assets/htmls/tables3.html @@ -4,26 +4,26 @@ width="641"> -

CATEGORY

+ rgba(250, 195, 42, 1);border-left-style:none;border-right:1em solid #fac32a;border-top:1.0pt solid + rgb(250, 195, 42);height:23.75pt;padding:0in;width:222.2pt;" width="296">

OBSERVATIONS/COMMENTS

- NETHERLANDS COURTS   - GERMANY COURTS ). Returns hex string like 'FFFF00' or None if no shading. """ - r_pr = run._element.find(qn('w:rPr')) + r_pr = run._element.find(qn("w:rPr")) if r_pr is None: return None - shd = r_pr.find(qn('w:shd')) + shd = r_pr.find(qn("w:shd")) if shd is None: return None - return (shd.get(qn('w:fill')) or "").upper() + return (shd.get(qn("w:fill")) or "").upper() @staticmethod def save_document_on_buffer(document: Document) -> BytesIO: @@ -76,19 +78,19 @@ def save_document_on_buffer(document: Document) -> BytesIO: def setUpClass(cls): cls.clean_up_docx() cls.document = Document() - cls.text1 = cls.get_html_from_file('text1.html') - cls.paragraph_line_height = cls.get_html_from_file('paragraph_line_height.html') - cls.paragraph_first_line_indent = cls.get_html_from_file('paragraph_first_line_indent.html') - cls.text_decoration = cls.get_html_from_file('text_decoration.html') - cls.css_properties = cls.get_html_from_file('css_properties.html') - cls.css_properties_header = cls.get_html_from_file('header.html') - cls.table_html = cls.get_html_from_file('tables1.html') - cls.table2_html = cls.get_html_from_file('tables2.html') - cls.table3_html = cls.get_html_from_file('tables3.html') + cls.text1 = cls.get_html_from_file("text1.html") + cls.paragraph_line_height = cls.get_html_from_file("paragraph_line_height.html") + cls.paragraph_first_line_indent = cls.get_html_from_file("paragraph_first_line_indent.html") + cls.text_decoration = cls.get_html_from_file("text_decoration.html") + cls.css_properties = cls.get_html_from_file("css_properties.html") + cls.css_properties_header = cls.get_html_from_file("header.html") + cls.table_html = cls.get_html_from_file("tables1.html") + cls.table2_html = cls.get_html_from_file("tables2.html") + cls.table3_html = cls.get_html_from_file("tables3.html") @classmethod def tearDownClass(cls): - outputpath = os.path.join(test_dir, 'test.docx') + outputpath = os.path.join(test_dir, "test.docx") cls.document.save(outputpath) def setUp(self): @@ -96,7 +98,7 @@ def setUp(self): # ============================== Tests ============================== # def test_save_docx_by_filename(self): - filename = os.path.join(test_dir, 'new_test.docx') + filename = os.path.join(test_dir, "new_test.docx") self.parser.set_initial_attrs(self.document) self.parser.save(filename) self.assertTrue(os.path.exists(filename)) @@ -110,59 +112,41 @@ def test_save_docx_by_buffer(self): self.assertTrue(buffer.getvalue()) def test_html_with_images_links_style(self): - self.document.add_heading( - 'Test: add regular html with images, links and some formatting to document', - level=1 - ) + self.document.add_heading("Test: add regular html with images, links and some formatting to document", level=1) self.parser.add_html_to_document(self.text1, self.document) def test_html_with_default_paragraph_style(self): - self.document.add_heading( - 'Test: add regular html with a default paragraph style defined', - level=1 - ) - self.parser.paragraph_style = 'Quote' + self.document.add_heading("Test: add regular html with a default paragraph style defined", level=1) + self.parser.paragraph_style = "Quote" self.parser.add_html_to_document(self.text1, self.document) def test_add_html_to_table_cell_with_default_paragraph_style(self): - self.document.add_heading( - 'Test: regular html to table cell with a default paragraph style defined', - level=1 - ) - self.parser.paragraph_style = 'Quote' - table = self.document.add_table(1, 2, style='Table Grid') + self.document.add_heading("Test: regular html to table cell with a default paragraph style defined", level=1) + self.parser.paragraph_style = "Quote" + table = self.document.add_table(1, 2, style="Table Grid") cell = table.cell(0, 1) self.parser.add_html_to_document(self.text1, cell) def test_add_html_to_table_cell(self): - self.document.add_heading( - 'Test: regular html with images, links, some formatting to table cell', - level=1 - ) - table = self.document.add_table(1, 2, style='Table Grid') + self.document.add_heading("Test: regular html with images, links, some formatting to table cell", level=1) + table = self.document.add_table(1, 2, style="Table Grid") cell = table.cell(0, 1) self.parser.add_html_to_document(self.text1, cell) def test_add_html_skip_images(self): - self.document.add_heading( - 'Test: regular html with images, but skip adding images', - level=1 - ) - self.parser.options['images'] = False + self.document.add_heading("Test: regular html with images, but skip adding images", level=1) + self.parser.options["images"] = False self.parser.add_html_to_document(self.text1, self.document) document = self.parser.parse_html_string(self.text1) - assert any(['Graphic' in paragraph._p.xml for paragraph in document.paragraphs]) is False + assert any(["Graphic" in paragraph._p.xml for paragraph in document.paragraphs]) is False def test_add_html_with_tables(self): - self.document.add_heading( - 'Test: add html with tables (by default no borders)', - level=1 - ) + self.document.add_heading("Test: add html with tables (by default no borders)", level=1) self.parser.add_html_to_document(self.table_html, self.document) # When no table style is set, use Normal Table as default - table_style = 'Normal Table' + table_style = "Normal Table" # Find the last table added to the document last_table = self.document.tables[-1] # Assumes the table was added at the end @@ -171,9 +155,9 @@ def test_add_html_with_tables(self): self.assertEqual(last_table.style.name, table_style, f"Table style does not match expected '{table_style}'") def test_add_html_with_tables_accent_style(self): - table_style = 'Light Grid Accent 6' + table_style = "Light Grid Accent 6" self.document.add_heading( - 'Test: add html with tables with accent', + "Test: add html with tables with accent", ) self.parser.table_style = table_style self.parser.add_html_to_document(self.table_html, self.document) @@ -185,9 +169,9 @@ def test_add_html_with_tables_accent_style(self): self.assertEqual(last_table.style.name, table_style, f"Table style does not match expected '{table_style}'") def test_add_html_with_tables_basic_style(self): - table_style = 'Table Grid' + table_style = "Table Grid" self.document.add_heading( - 'Test: add html with tables with basic style', + "Test: add html with tables with basic style", ) self.parser.table_style = table_style self.parser.add_html_to_document(self.table_html, self.document) @@ -200,31 +184,28 @@ def test_add_html_with_tables_basic_style(self): def test_add_nested_tables(self): self.document.add_heading( - 'Test: add nested tables', + "Test: add nested tables", ) self.parser.add_html_to_document(self.table2_html, self.document) def test_add_nested_tables_basic_style(self): self.document.add_heading( - 'Test: add nested tables with basic style', + "Test: add nested tables with basic style", ) - self.parser.table_style = 'Table Grid' + self.parser.table_style = "Table Grid" self.parser.add_html_to_document(self.table2_html, self.document) def test_add_nested_tables_accent_style(self): self.document.add_heading( - 'Test: add nested tables with accent style', + "Test: add nested tables with accent style", ) - self.parser.table_style = 'Light Grid Accent 6' + self.parser.table_style = "Light Grid Accent 6" self.parser.add_html_to_document(self.table2_html, self.document) def test_add_html_skip_tables(self): # broken until feature readded - self.document.add_heading( - 'Test: add html with tables, but skip adding tables', - level=1 - ) - self.parser.options['tables'] = False + self.document.add_heading("Test: add html with tables, but skip adding tables", level=1) + self.parser.options["tables"] = False self.parser.add_html_to_document(self.table_html, self.document) def test_wrong_argument_type_raises_error(self): @@ -246,39 +227,29 @@ def test_wrong_argument_type_raises_error(self): assert False, "Error not raised as expected" def test_add_html_to_cells_method(self): - self.document.add_heading( - 'Test: add_html_to_cells method', - level=1 - ) - table = self.document.add_table(2, 3, style='Table Grid') + self.document.add_heading("Test: add_html_to_cells method", level=1) + table = self.document.add_table(2, 3, style="Table Grid") cell = table.cell(0, 0) - html = '''Line 0 without p tags

Line 1 with P tags

''' + html = """Line 0 without p tags

Line 1 with P tags

""" self.parser.add_html_to_cell(html, cell) cell = table.cell(0, 1) - html = '''

Line 0 with p tags

Line 1 without p tags''' + html = """

Line 0 with p tags

Line 1 without p tags""" self.parser.add_html_to_cell(html, cell) cell = table.cell(0, 2) cell.text = "Pre-defined text that shouldn't be removed." - html = '''

Add HTML to non-empty cell.

''' + html = """

Add HTML to non-empty cell.

""" self.parser.add_html_to_cell(html, cell) def test_inline_code(self): - self.document.add_heading( - 'Test: inline code block', - level=1 - ) + self.document.add_heading("Test: inline code block", level=1) - html = "

This is a sentence that contains some code elements that " \ - "should appear as code.

" + html = "

This is a sentence that contains some code elements that should appear as code.

" self.parser.add_html_to_document(html, self.document) def test_code_block(self): - self.document.add_heading( - 'Test: code block', - level=1 - ) + self.document.add_heading("Test: code block", level=1) html = """

This is a code block. @@ -290,10 +261,7 @@ def test_code_block(self): self.parser.add_html_to_document(html, self.document) def test_pre_block(self): - self.document.add_heading( - 'Test: pre block', - level=1 - ) + self.document.add_heading("Test: pre block", level=1) html = """

 This is a pre-formatted block.
@@ -306,25 +274,19 @@ def test_pre_block(self):
         self.parser.add_html_to_document(html, self.document)
 
     def test_handling_hr(self):
-        hr_html_example = '

paragraph


paragraph

' + hr_html_example = "

paragraph


paragraph

" - self.document.add_heading( - 'Test: Handling of hr', - level=1 - ) + self.document.add_heading("Test: Handling of hr", level=1) # Add on document for human validation self.parser.add_html_to_document(hr_html_example, self.document) document = self.parser.parse_html_string(hr_html_example) - assert '' in document._body._body.xml + assert "" in document._body._body.xml def test_external_hyperlink(self): - hyperlink_html_example = "Google External Link" + hyperlink_html_example = 'Google External Link' - self.document.add_heading( - 'Test: Handling external hyperlink', - level=1 - ) + self.document.add_heading("Test: Handling external hyperlink", level=1) self.parser.add_html_to_document(hyperlink_html_example, self.document) document = self.parser.parse_html_string(hyperlink_html_example) @@ -335,19 +297,16 @@ def test_external_hyperlink(self): if "hyperlink" in rel.reltype: external_hyperlinks.append(rel.target_ref) - assert 'https://www.google.com' in external_hyperlinks - assert '

Introduction Header

" - "

Click here: Link to intro

" + '

Introduction Header

' + '

Click here: Link to intro

' ) - self.document.add_heading( - 'Test: Handling internal hyperlink', - level=1 - ) + self.document.add_heading("Test: Handling internal hyperlink", level=1) self.parser.add_html_to_document(hyperlink_html_example, self.document) document = self.parser.parse_html_string(hyperlink_html_example) @@ -358,8 +317,8 @@ def test_internal_hyperlink(self): def test_internal_hyperlink_without_paragraph(self): hyperlink_html_example = ( - "

Introduction Header

" - "

Click here: Link to intro

" + '

Introduction Header

' + '

Click here: Link to intro

' ) document = self.parser.parse_html_string(hyperlink_html_example) @@ -370,9 +329,7 @@ def test_internal_hyperlink_without_paragraph(self): assert '' in document_body def test_internal_hyperlink_without_anchor(self): - hyperlink_html_example = ( - "

Click here: Link to intro

" - ) + hyperlink_html_example = '

Click here: Link to intro

' document = self.parser.parse_html_string(hyperlink_html_example) document_body = document._body._body.xml @@ -382,18 +339,15 @@ def test_internal_hyperlink_without_anchor(self): assert '' in document_body def test_image_no_src(self): - self.document.add_heading( - 'Test: Handling img without src', - level=1 - ) - self.parser.add_html_to_document('', self.document) + self.document.add_heading("Test: Handling img without src", level=1) + self.parser.add_html_to_document("", self.document) - document = self.parser.parse_html_string('') - assert '' in document.paragraphs[0].text + document = self.parser.parse_html_string("") + assert "" in document.paragraphs[0].text def test_local_img(self): # A table with more td elements in latter rows than in the first - self.document.add_heading('Test: Local Image', level=1) + self.document.add_heading("Test: Local Image", level=1) html_local_img = '' self.parser.add_html_to_document(html_local_img, self.document) document = self.parser.parse_html_string(html_local_img) @@ -411,33 +365,50 @@ def test_local_img(self): assert image_found, "No image was found in the document" - def test_inline_images(self): - self.document.add_heading( - 'Test: Handling inline images', - level=1 + def test_img_with_dimensions(self): + self.document.add_heading("Test: Image With Dimensions", level=1) + + html = '' + self.parser.add_html_to_document(html, self.document) + document = self.parser.parse_html_string(html) + + # Ensure at least one image exists + assert len(document.inline_shapes) > 0, "No image was found in the document" + + shape = document.inline_shapes[-1] + + # Convert expected px → inches (assuming 96 DPI) + expected_width_in = 520 / 96 + expected_height_in = 306 / 96 + + actual_width_in = shape.width.inches + actual_height_in = shape.height.inches + + # Allow tolerance (floating point + conversion differences) + tolerance = 0.05 # ~1.27 mm + + assert abs(actual_width_in - expected_width_in) < tolerance, ( + f"Width mismatch: expected ~{expected_width_in:.2f}in, got {actual_width_in:.2f}in" ) - test_img_src = 'https://github.com/dfop02/html4docx/blob/main/tests/assets/images/test_img.png?raw=true' - html_example = ( - f"

" - f"" - f"

" + + assert abs(actual_height_in - expected_height_in) < tolerance, ( + f"Height mismatch: expected ~{expected_height_in:.2f}in, got {actual_height_in:.2f}in" ) + + def test_inline_images(self): + self.document.add_heading("Test: Handling inline images", level=1) + test_img_src = "https://github.com/dfop02/html4docx/blob/main/tests/assets/images/test_img.png?raw=true" + html_example = f"

" self.parser.add_html_to_document(html_example, self.document) document = self.parser.parse_html_string(html_example) # Find paragraphs containing inline pictures - img_paragraphs = [ - p for p in document.paragraphs - if any(r._element.xpath(".//pic:pic") for r in p.runs) - ] + img_paragraphs = [p for p in document.paragraphs if any(r._element.xpath(".//pic:pic") for r in p.runs)] assert img_paragraphs, "Expected at least one paragraph with inline images" first_img_para = img_paragraphs[0] - inline_img_runs = [ - r for r in first_img_para.runs - if r._element.xpath(".//pic:pic") - ] + inline_img_runs = [r for r in first_img_para.runs if r._element.xpath(".//pic:pic")] assert len(inline_img_runs) == 3, "Expected 3 inline image runs in a single paragraph" def test_single_image_without_paragraph(self): @@ -445,24 +416,15 @@ def test_single_image_without_paragraph(self): document = self.parser.parse_html_string(html_example) # Find paragraphs containing inline pictures - img_paragraphs = [ - p for p in document.paragraphs - if any(r._element.xpath(".//pic:pic") for r in p.runs) - ] + img_paragraphs = [p for p in document.paragraphs if any(r._element.xpath(".//pic:pic") for r in p.runs)] assert img_paragraphs, "Expected at least one paragraph with inline images" first_img_para = img_paragraphs[0] - inline_img_runs = [ - r for r in first_img_para.runs - if r._element.xpath(".//pic:pic") - ] + inline_img_runs = [r for r in first_img_para.runs if r._element.xpath(".//pic:pic")] assert len(inline_img_runs) == 1, "Expected 1 inline image runs in a single paragraph" def test_bold_italic_underline_and_strike(self): - self.document.add_heading( - 'Test: Bold, Italic, Underline, Inserted, Strike, Deleted and Marked tags', - level=1 - ) + self.document.add_heading("Test: Bold, Italic, Underline, Inserted, Strike, Deleted and Marked tags", level=1) html_example = ( "

This text has Bold Words.

" @@ -500,8 +462,7 @@ def test_bold_italic_underline_and_strike(self): self.assertIn("Marked Words", paragraphs[6].text) self.assertEqual( - self.get_run_shading_fill(paragraphs[6].runs[1]), 'FFFF00', - " should apply yellow shading (FFFF00)" + self.get_run_shading_fill(paragraphs[6].runs[1]), "FFFF00", " should apply yellow shading (FFFF00)" ) self.assertIn("Bold, Italic, Underline and Strike Words", paragraphs[7].text) @@ -513,64 +474,58 @@ def test_bold_italic_underline_and_strike(self): def test_font_size(self): font_size_html_example = ( - "

paragraph 8px

" - "

paragraph 1cm

" - "

paragraph 6em

" - "

paragraph 12cm

" - "

paragraph 12vh not supported

" - "

paragraph 5pc

" - "

paragraph 14pt

" - "

paragraph 16pt

" - "

paragraph 2mm

" - "

paragraph small

" - ) - - self.document.add_heading( - 'Test: Font-Size', - level=1 - ) + '

paragraph 8px

' + '

paragraph 1cm

' + '

paragraph 6em

' + '

paragraph 12cm

' + '

paragraph 12vh not supported

' + '

paragraph 5pc

' + '

paragraph 14pt

' + '

paragraph 16pt

' + '

paragraph 2mm

' + '

paragraph small

' + ) + + self.document.add_heading("Test: Font-Size", level=1) # Add on document for human validation self.parser.add_html_to_document(font_size_html_example, self.document) document = self.parser.parse_html_string(font_size_html_example) font_sizes = [str(p.runs[0].font.size) for p in document.paragraphs] - assert ['76200', '355600', '914400', '431800', 'None', '762000', '177800', '203200', '69850', '120650'] == font_sizes + assert ["76200", "355600", "914400", "431800", "None", "762000", "177800", "203200", "69850", "120650"] == font_sizes def test_font_size_paragraph(self): font_size_html_example = ( - "

paragraph 8px

" - "

paragraph 1cm

" - "

paragraph 6em

" - "

paragraph 12cm

" - "

paragraph 12vh not supported

" - "

paragraph 5pc

" - "

paragraph 14pt

" - "

paragraph 16pt

" - "

paragraph 2mm

" - "

paragraph small

" - ) - - self.document.add_heading( - 'Test: Font-Size on

', - level=1 - ) + '

paragraph 8px

' + '

paragraph 1cm

' + '

paragraph 6em

' + '

paragraph 12cm

' + '

paragraph 12vh not supported

' + '

paragraph 5pc

' + '

paragraph 14pt

' + '

paragraph 16pt

' + '

paragraph 2mm

' + '

paragraph small

' + ) + + self.document.add_heading("Test: Font-Size on

", level=1) self.parser.add_html_to_document(font_size_html_example, self.document) document = self.parser.parse_html_string(font_size_html_example) font_sizes = [str(p.runs[0].font.size) for p in document.paragraphs] - assert ['76200', '355600', '914400', '431800', 'None', '762000', '177800', '203200', '69850', '120650'] == font_sizes + assert ["76200", "355600", "914400", "431800", "None", "762000", "177800", "203200", "69850", "120650"] == font_sizes def test_font_weight_paragraph(self): - self.document.add_heading('Test: font weight on

', level=1) + self.document.add_heading("Test: font weight on

", level=1) font_weight_html_example = ( - "

bold text

" - "

bolder text

" - "

700 weight

" - "

900 weight

" - "

normal text

" - "

lighter text

" - "

400 weight

" - "

100 weight

" + '

bold text

' + '

bolder text

' + '

700 weight

' + '

900 weight

' + '

normal text

' + '

lighter text

' + '

400 weight

' + '

100 weight

' ) self.parser.add_html_to_document(font_weight_html_example, self.document) @@ -580,10 +535,10 @@ def test_font_weight_paragraph(self): font_weights = [p.runs[0].font.bold for p in document.paragraphs] expected_weights = [ - True, # bold - True, # bolder - True, # 700 - True, # 900 + True, # bold + True, # bolder + True, # 700 + True, # 900 False, # normal False, # lighter False, # 400 @@ -593,11 +548,11 @@ def test_font_weight_paragraph(self): self.assertEqual(font_weights, expected_weights) def test_font_style_paragraph(self): - self.document.add_heading('Test: font style on

', level=1) + self.document.add_heading("Test: font style on

", level=1) font_style_html_example = ( - "

italic text

" - "

oblique text

" - "

normal text

" + '

italic text

' + '

oblique text

' + '

normal text

' ) self.parser.add_html_to_document(font_style_html_example, self.document) @@ -607,76 +562,70 @@ def test_font_style_paragraph(self): font_styles = [p.runs[0].font.italic for p in document.paragraphs] expected_styles = [ - True, # italic - True, # oblique (should be treated as italic) + True, # italic + True, # oblique (should be treated as italic) False, # normal ] self.assertEqual(font_styles, expected_styles) def test_font_family_paragraph(self): - self.document.add_heading('Test: font family on

', level=1) + self.document.add_heading("Test: font family on

", level=1) font_family_html_example = ( - "

Arial font text

" + '

Arial font text

' "

Helvetica font text

" "

Noto Sans font text

" "

Times New Roman font text

" - "

Generic serif font text

" - "

Generic sans-serif font text

" - "

Generic monospace font text

" + '

Generic serif font text

' + '

Generic sans-serif font text

' + '

Generic monospace font text

' "

Courier New font text

" - "

Inherit font text

" + '

Inherit font text

' ) self.parser.add_html_to_document(font_family_html_example, self.document) def test_text_transform_paragraph(self): - self.document.add_heading('Test: text-transform on

', level=1) + self.document.add_heading("Test: text-transform on

", level=1) text_transform_html_example = ( - "

uppercase text

" - "

LOWERCASE TEXT

" - "

capitalize each word

" - "

normal text

" + '

uppercase text

' + '

LOWERCASE TEXT

' + '

capitalize each word

' + '

normal text

' "

default text

" ) self.parser.add_html_to_document(text_transform_html_example, self.document) def test_text_decoration_span(self): - self.document.add_heading('Test: text-decoration on ', level=1) + self.document.add_heading("Test: text-decoration on ", level=1) text_decoration_html_example = ( # Standalone spans - "underlined span (red)" - "no decoration span (rgb(0, 0, 0))" - "strikethrough span (gray) (not supported)" - "underline+line-through span (orange)\ - (should be strike)" - + 'underlined span (red)' + 'no decoration span (rgb(0, 0, 0))' + 'strikethrough span (gray) (not supported)' + 'underline+line-through span (orange)\ + (should be strike)' # Spans inside paragraphs - "

Normal text wavy underlined span (blue) continues

" - "

Normal text dotted underlined span (purple) continues

" - "

Normal text strikethrough span (red) continues

" - + '

Normal text wavy underlined span (blue) continues

' + '

Normal text dotted underlined span (purple) continues

' + '

Normal text strikethrough span (red) continues

' # Multiple spans with different decorations in same paragraph - "

Start underlined " - "strikethrough " - "dashed underline end

" - + '

Start underlined ' + 'strikethrough ' + 'dashed underline end

' # Span with no decoration inside decorated paragraph - "

Underlined paragraph with " - "normal span inside

" - + '

Underlined paragraph with ' + 'normal span inside

' # Span with decoration inside decorated paragraph (should override) - "

Strikethrough paragraph with " - "underlined red span inside

" - + '

Strikethrough paragraph with ' + 'underlined red span inside

' # Override behavior with individual properties - "

Blue underlined paragraph with " - "strikethrough span inside

" - + '

Blue underlined paragraph with ' + 'strikethrough span inside

' # Check if equal - shorthand vs individual properties - "

Blue underlined paragraph

" - "

Blue underlined paragraph

" + '

Blue underlined paragraph

' + '

Blue underlined paragraph

' ) self.parser.add_html_to_document(text_decoration_html_example, self.document) @@ -802,24 +751,24 @@ def test_text_decoration_span(self): assert self.get_underline_color(p9.runs[0]) == self.hexcolor("blue") def test_text_decoration_paragraph(self): - self.document.add_heading('Test: text-decoration on

', level=1) + self.document.add_heading("Test: text-decoration on

", level=1) text_decoration_html_example = ( - "

underlined text (red)

" - "

no decoration text (rgb(0, 0, 0))

" - "

strikethrough text (gray) (color not supported)

" - "

underline+line-through (orange)\ - (should be strike)

" - "

wavy underline (blue)

" - "

dotted underline (rgb(0, 128, 0))

" - "

dotted underline (rgb(0, 255, 0))

" - "

dashed underline (purple)

" - "

double underline (rgb(255, 69, 0))

" - "

overline text (hotpink) (not supported)

" - "

blink text (hotpink) (not supported)

" + '

underlined text (red)

' + '

no decoration text (rgb(0, 0, 0))

' + '

strikethrough text (gray) (color not supported)

' + '

underline+line-through (orange)\ + (should be strike)

' + '

wavy underline (blue)

' + '

dotted underline (rgb(0, 128, 0))

' + '

dotted underline (rgb(0, 255, 0))

' + '

dashed underline (purple)

' + '

double underline (rgb(255, 69, 0))

' + '

overline text (hotpink) (not supported)

' + '

blink text (hotpink) (not supported)

' ) self.parser.add_html_to_document(text_decoration_html_example, self.document) - with self.assertLogs(level='WARNING') as log: + with self.assertLogs(level="WARNING") as log: document = self.parser.parse_html_string(text_decoration_html_example) underline_states = [] @@ -856,45 +805,45 @@ def test_text_decoration_paragraph(self): strike_states.append(strike) expected_underline_states = [ - True, # underline (default single) - explicitly True + True, # underline (default single) - explicitly True False, # none - explicitly False for both underline and strike False, # line-through - explicitly False for underline when strike is True False, # underline + line-through - line-through wins, underline explicitly False - WD_UNDERLINE.WAVY, # wavy underline - explicitly set to wavy - WD_UNDERLINE.DOTTED, # dotted underline - explicitly set to dotted - WD_UNDERLINE.DOTTED, # dotted underline - explicitly set to dotted - WD_UNDERLINE.DASH, # dashed underline - explicitly set to dash - WD_UNDERLINE.DOUBLE, # double underline - explicitly set to double + WD_UNDERLINE.WAVY, # wavy underline - explicitly set to wavy + WD_UNDERLINE.DOTTED, # dotted underline - explicitly set to dotted + WD_UNDERLINE.DOTTED, # dotted underline - explicitly set to dotted + WD_UNDERLINE.DASH, # dashed underline - explicitly set to dash + WD_UNDERLINE.DOUBLE, # double underline - explicitly set to double None, # overline (not supported) - remains None/unchanged None, # blink (not supported) - remains None/unchanged ] expected_underline_colors = [ - self.hexcolor("red"), # underline red - None, # none rgb(0,0,0) - None, # line-through gray (strike only, but color captured) - None, # underline + line-through (color should be orange) - self.hexcolor("blue"), # wavy underline blue - self.hexcolor("rgb(0,128,0)"), # dotted underline rgb(0,128,0) - self.hexcolor("rgb(0,255,0)"), # dotted underline rgb(0,255,0) - self.hexcolor("purple"), # dashed underline purple - self.hexcolor("rgb(255,69,0)"), # double underline rgb(255,69,0) - None, # overline hotpink (unsupported → underline None, but color still parsed) - None, # blink hotpink (unsupported) + self.hexcolor("red"), # underline red + None, # none rgb(0,0,0) + None, # line-through gray (strike only, but color captured) + None, # underline + line-through (color should be orange) + self.hexcolor("blue"), # wavy underline blue + self.hexcolor("rgb(0,128,0)"), # dotted underline rgb(0,128,0) + self.hexcolor("rgb(0,255,0)"), # dotted underline rgb(0,255,0) + self.hexcolor("purple"), # dashed underline purple + self.hexcolor("rgb(255,69,0)"), # double underline rgb(255,69,0) + None, # overline hotpink (unsupported → underline None, but color still parsed) + None, # blink hotpink (unsupported) ] expected_strike_states = [ False, # underline only - explicitly False for strike when underline is True False, # none - explicitly False for both underline and strike - True, # line-through - explicitly True - True, # underline + line-through - line-through wins, strike explicitly True + True, # line-through - explicitly True + True, # underline + line-through - line-through wins, strike explicitly True False, # wavy underline only - explicitly False for strike when underline is set False, # dotted underline only - explicitly False for strike when underline is set False, # dotted underline only - explicitly False for strike when underline is set False, # dashed underline only - explicitly False for strike when underline is set False, # double underline only - explicitly False for strike when underline is set - None, # overline (not supported) - remains None/unchanged - None, # blink (not supported) - remains None/unchanged + None, # overline (not supported) - remains None/unchanged + None, # blink (not supported) - remains None/unchanged ] # Test that the underline states, colors, and strike states are correct @@ -904,13 +853,17 @@ def test_text_decoration_paragraph(self): # Test that the correct warnings are logged self.assertEqual(len(log.records), 4) - self.assertIn('Word does not support colored strike-through. Color \'gray\' will be ignored for line-through.', log.output[0]) - self.assertIn('Word does not support colored strike-through. Color \'orange\' will be ignored for line-through.', log.output[1]) - self.assertIn('Blink or overline not supported.', log.output[2]) - self.assertIn('Blink or overline not supported.', log.output[3]) + self.assertIn( + "Word does not support colored strike-through. Color 'gray' will be ignored for line-through.", log.output[0] + ) + self.assertIn( + "Word does not support colored strike-through. Color 'orange' will be ignored for line-through.", log.output[1] + ) + self.assertIn("Blink or overline not supported.", log.output[2]) + self.assertIn("Blink or overline not supported.", log.output[3]) def test_first_line_paragraph(self): - self.document.add_heading('Test text-indent on

tags', level=1) + self.document.add_heading("Test text-indent on

tags", level=1) self.parser.add_html_to_document(self.paragraph_first_line_indent, self.document) document = self.parser.parse_html_string(self.paragraph_first_line_indent) @@ -922,26 +875,26 @@ def test_first_line_paragraph(self): indent_values.append(indent_pt) expected_values = [ - 1080000, # 3cm - 254000, # 20pt - 381000, # 40px - 1260000, # 35mm - None, # Word does not support negative values here + 1080000, # 3cm + 254000, # 20pt + 381000, # 40px + 1260000, # 35mm + None, # Word does not support negative values here ] for actual, expected in zip(indent_values, expected_values): self.assertAlmostEqual(actual, expected, delta=634) def test_color_paragraph(self): - self.document.add_heading('Test: color on p tags', level=1) + self.document.add_heading("Test: color on p tags", level=1) color_html_example = ( - "

red text

" - "

green hex text

" - "

blue rgb text

" - "

inherit color text

" - "

transparent color text

" - "

current color text

" - "

red with other styles

" + '

red text

' + '

green hex text

' + '

blue rgb text

' + '

inherit color text

' + '

transparent color text

' + '

current color text

' + '

red with other styles

' "

default text

" ) @@ -974,7 +927,7 @@ def test_color_paragraph(self): self.assertEqual(color_states, expected_colors) def test_line_height_paragraph(self): - self.document.add_heading('Test: line-height on

', level=1) + self.document.add_heading("Test: line-height on

", level=1) self.parser.add_html_to_document(self.paragraph_line_height, self.document) document = self.parser.parse_html_string(self.paragraph_line_height) @@ -984,64 +937,67 @@ def test_line_height_paragraph(self): for p in document.paragraphs: line_spacing = p.paragraph_format.line_spacing line_rule = p.paragraph_format.line_spacing_rule - line_heights.append(str(line_spacing) if line_spacing is not None else 'None') - line_rules.append(str(line_rule) if line_rule is not None else 'None') + line_heights.append(str(line_spacing) if line_spacing is not None else "None") + line_rules.append(str(line_rule) if line_rule is not None else "None") expected_line_heights = [ - '1.0', - '1.15', - '1.5', - '2.0', - '190500', # line-height: 20px - '182880', # line-height: 1.2em - '228600', # line-height: 1.5em - '304800', # line-height: 2em - '182880', # line-height: 1.2rem - '228600', # line-height: 1.5rem - '304800', # line-height: 2rem - '1.5', # line-height: 150% - '2.0', # line-height: 200% + "1.0", + "1.15", + "1.5", + "2.0", + "190500", # line-height: 20px + "182880", # line-height: 1.2em + "228600", # line-height: 1.5em + "304800", # line-height: 2em + "182880", # line-height: 1.2rem + "228600", # line-height: 1.5rem + "304800", # line-height: 2rem + "1.5", # line-height: 150% + "2.0", # line-height: 200% ] - self.assertEqual(line_heights, expected_line_heights, - f"Line heights don't match expected values. Got {line_heights}, expected {expected_line_heights}") + self.assertEqual( + line_heights, + expected_line_heights, + f"Line heights don't match expected values. Got {line_heights}, expected {expected_line_heights}", + ) def test_margins_paragraph(self): margins_html_example = ( - "

centered paragraph

" - "

left margin 20px

" - "

right margin 1.5cm

" - "

left margin 1cm

" - "

both margins set

" - "

only left auto

" - "

only right auto

" - "

zero margins

" - "

left margin 2in

" - ) - - self.document.add_heading('Test margins on

', level=1) + '

centered paragraph

' + '

left margin 20px

' + '

right margin 1.5cm

' + '

left margin 1cm

' + '

both margins set

' + '

only left auto

' + '

only right auto

' + '

zero margins

' + '

left margin 2in

' + ) + + self.document.add_heading("Test margins on

", level=1) self.parser.add_html_to_document(margins_html_example, self.document) document = self.parser.parse_html_string(margins_html_example) expected_margins = [ # Paragraph 1: "centered paragraph" - auto margins (None values) - {'left': None, 'right': None}, + {"left": None, "right": None}, # Paragraph 2: "left margin 20px" - 20px = 20 * 9525 = 190500 EMU - {'left': 190500, 'right': None}, + {"left": 190500, "right": None}, # Paragraph 3: "right margin 1.5cm" - 1.5cm = 1.5 * 360000 = 540000 EMU - {'left': None, 'right': 540000}, + {"left": None, "right": 540000}, # Paragraph 4: "left margin 1cm" - 1cm = 360000 EMU - {'left': 360000, 'right': None}, + {"left": 360000, "right": None}, # Paragraph 5: "both margins set" - 10px=95250 EMU, 15px=142875 EMU - {'left': 95250, 'right': 142875}, + {"left": 95250, "right": 142875}, # Paragraph 6: "only left auto" - auto margin - {'left': None, 'right': None}, + {"left": None, "right": None}, # Paragraph 7: "only right auto" - auto margin - {'left': None, 'right': None}, + {"left": None, "right": None}, # Paragraph 8: "zero margins" - 0px = 0 EMU - {'left': 0, 'right': 0}, + {"left": 0, "right": 0}, # Paragraph 9: "left margin 2in" - 2in = 2 * 914400 = 1828800 EMU - {'left': 1828800, 'right': None}, + {"left": 1828800, "right": None}, ] self.assertEqual(len(document.paragraphs), len(expected_margins)) @@ -1052,23 +1008,27 @@ def test_margins_paragraph(self): actual_right = paragraph.paragraph_format.right_indent # Check left margin - if expected['left'] is None: + if expected["left"] is None: self.assertIsNone(actual_left, f"Paragraph {i} left margin should be None") else: self.assertIsNotNone(actual_left, f"Paragraph {i} left margin should not be None") - self.assertTrue(abs(actual_left - expected['left']) <= 634, - f"Paragraph {i} left margin: expected {expected['left']} EMU, got {actual_left} EMU") + self.assertTrue( + abs(actual_left - expected["left"]) <= 634, + f"Paragraph {i} left margin: expected {expected['left']} EMU, got {actual_left} EMU", + ) # Check right margin - if expected['right'] is None: + if expected["right"] is None: self.assertIsNone(actual_right, f"Paragraph {i} right margin should be None") else: self.assertIsNotNone(actual_right, f"Paragraph {i} right margin should not be None") - self.assertTrue(abs(actual_right - expected['right']) <= 634, - f"Paragraph {i} right margin: expected {expected['right']} EMU, got {actual_right} EMU") + self.assertTrue( + abs(actual_right - expected["right"]) <= 634, + f"Paragraph {i} right margin: expected {expected['right']} EMU, got {actual_right} EMU", + ) def test_background_color_styles(self): - self.document.add_heading('Test background color on

, multiple cases', level=1) + self.document.add_heading("Test background color on

, multiple cases", level=1) html_example2 = """

Start of paragraph @@ -1185,61 +1145,61 @@ def test_background_color_styles(self): self.parser.add_html_to_document(html_example9, self.document) def test_headers_with_css(self): - self.document.add_heading('Test: headers with css', level=1) + self.document.add_heading("Test: headers with css", level=1) self.parser.add_html_to_document(self.css_properties_header, self.document) document = self.parser.parse_html_string(self.css_properties_header) # Test H1 - Large and Centered h1 = document.paragraphs[0] - assert h1.style.name.startswith('Heading 1') - assert str(h1.runs[0].font.color.rgb) == '2C3E50' + assert h1.style.name.startswith("Heading 1") + assert str(h1.runs[0].font.color.rgb) == "2C3E50" assert h1.runs[0].font.bold is True assert h1.runs[0].font.size == 342900 assert h1.alignment == WD_ALIGN_PARAGRAPH.CENTER - assert h1.runs[0].text == 'MAIN HEADING H1 - LARGE AND CENTERED' # uppercase due to text-transform + assert h1.runs[0].text == "MAIN HEADING H1 - LARGE AND CENTERED" # uppercase due to text-transform # Test H2 - Underlined with Background (no span in this one) h2 = document.paragraphs[1] - assert h2.style.name.startswith('Heading 2') - assert str(h2.runs[0].font.color.rgb) == '34495E' + assert h2.style.name.startswith("Heading 2") + assert str(h2.runs[0].font.color.rgb) == "34495E" assert h2.runs[0].font.underline is True - assert h2.runs[0].font.name == 'Arial' + assert h2.runs[0].font.name == "Arial" assert h2.runs[0].font.size == 266700 # Test H3 - Italic and Right Aligned h3 = document.paragraphs[2] - assert h3.style.name.startswith('Heading 3') - assert str(h3.runs[0].font.color.rgb) == '7F8C8D' + assert h3.style.name.startswith("Heading 3") + assert str(h3.runs[0].font.color.rgb) == "7F8C8D" assert h3.runs[0].font.italic is True assert h3.runs[0].font.size == 209550 assert h3.alignment == WD_ALIGN_PARAGRAPH.RIGHT # Test H4 - Normal Weight and Capitalized h4 = document.paragraphs[3] - assert h4.style.name.startswith('Heading 4') - assert str(h4.runs[0].font.color.rgb) == '95A5A6' + assert h4.style.name.startswith("Heading 4") + assert str(h4.runs[0].font.color.rgb) == "95A5A6" assert h4.runs[0].font.bold is False # font-weight: normal - assert h4.runs[0].font.name == 'Georgia' + assert h4.runs[0].font.name == "Georgia" assert h4.runs[0].font.size == 171450 - assert h4.runs[0].text == 'Quaternary Heading H4 - Normal Weight And Capitalized' # capitalized + assert h4.runs[0].text == "Quaternary Heading H4 - Normal Weight And Capitalized" # capitalized # Test H1 with Complex Text Decoration and Span h1_complex = document.paragraphs[4] assert h1_complex.runs[0].font.strike is True # line-through - assert str(h1_complex.runs[0].font.color.rgb) == '8E44AD' + assert str(h1_complex.runs[0].font.color.rgb) == "8E44AD" assert h1_complex.runs[0].font.size == 381000 # Test span in complex H1 assert len(h1_complex.runs) >= 2 span_in_h1 = h1_complex.runs[1] assert span_in_h1.font.underline is True # underline in span - assert str(span_in_h1.font.color.rgb) == '2980B9' + assert str(span_in_h1.font.color.rgb) == "2980B9" # Test H3 with Light Weight and Span h3_light = document.paragraphs[5] assert h3_light.runs[0].font.bold is False # font-weight: 100 - assert str(h3_light.runs[0].font.color.rgb) == 'D35400' + assert str(h3_light.runs[0].font.color.rgb) == "D35400" assert h3_light.runs[0].font.size == 190500 # Test bold span in light H3 @@ -1249,21 +1209,21 @@ def test_headers_with_css(self): # Test H3 with Text Transform h3_transform = document.paragraphs[6] - assert h3_transform.runs[0].text == 'h3 forced to lowercase with text-transform ' + assert h3_transform.runs[0].text == "h3 forced to lowercase with text-transform " assert len(h3_transform.runs) >= 2 uppercase_span = h3_transform.runs[1] - assert uppercase_span.text == 'SPAN FORCED TO UPPERCASE' + assert uppercase_span.text == "SPAN FORCED TO UPPERCASE" # Test H4 with Serif Font h4_serif = document.paragraphs[7] - assert h4_serif.runs[0].font.name == 'Times New Roman' - assert str(h4_serif.runs[0].font.color.rgb) == '7D3C98' + assert h4_serif.runs[0].font.name == "Times New Roman" + assert str(h4_serif.runs[0].font.color.rgb) == "7D3C98" assert h4_serif.alignment == WD_ALIGN_PARAGRAPH.CENTER # Test H1 with Auto Margins and Background h1_centered = document.paragraphs[8] assert h1_centered.alignment == WD_ALIGN_PARAGRAPH.CENTER - assert str(h1_centered.runs[0].font.color.rgb) == 'FFFFFF' + assert str(h1_centered.runs[0].font.color.rgb) == "FFFFFF" # Test H2 with Lighter Weight and Span h2_lighter = document.paragraphs[9] @@ -1278,13 +1238,13 @@ def test_headers_with_css(self): # Test H3 with RGB Colors and Span h3_rgb = document.paragraphs[10] - assert str(h3_rgb.runs[0].font.color.rgb) == '3498DB' # rgb(52, 152, 219) + assert str(h3_rgb.runs[0].font.color.rgb) == "3498DB" # rgb(52, 152, 219) assert h3_rgb.runs[0].font.size == 177800 # Test RGB span assert len(h3_rgb.runs) >= 2 rgb_span = h3_rgb.runs[1] - assert str(rgb_span.font.color.rgb) == 'E74C3C' # rgb(231, 76, 60) + assert str(rgb_span.font.color.rgb) == "E74C3C" # rgb(231, 76, 60) # Test H4 with Strike-through and Span h4_strike = document.paragraphs[11] @@ -1295,17 +1255,17 @@ def test_headers_with_css(self): assert len(h4_strike.runs) >= 2 no_strike_span = h4_strike.runs[1] assert no_strike_span.font.strike is False - assert str(no_strike_span.font.color.rgb) == 'E74C3C' + assert str(no_strike_span.font.color.rgb) == "E74C3C" # Test H3 with Unsupported Transform and Span h3_unsupported = document.paragraphs[12] - assert str(h3_unsupported.runs[0].font.color.rgb) == 'F39C12' + assert str(h3_unsupported.runs[0].font.color.rgb) == "F39C12" assert h3_unsupported.runs[0].font.size == 196850 # Test supported transform in span assert len(h3_unsupported.runs) >= 2 supported_span = h3_unsupported.runs[1] - assert supported_span.text == 'Supported Transform In Span' # capitalize + assert supported_span.text == "Supported Transform In Span" # capitalize # Test H4 with Reset Styles and Span h4_reset = document.paragraphs[13] @@ -1324,7 +1284,7 @@ def test_headers_with_css(self): h1_transparent = document.paragraphs[14] assert h1_transparent.runs[0].font.size == 361950 visible_span = h1_transparent.runs[1] - assert str(visible_span.font.color.rgb) == 'ECF0F1' + assert str(visible_span.font.color.rgb) == "ECF0F1" # Test H3 with All Three Decorations and Span h3_all_decorations = document.paragraphs[15] @@ -1352,40 +1312,34 @@ def test_headers_with_css(self): def test_color_by_name(self): color_html_example = ( - "

paragraph red

" - "

paragraph yellow

" - "

paragraph blue

" - "

paragraph green

" - "

paragraph darkgray

" - "

paragraph magenta

" - "

paragraph has default black because of invalid color name

" + '

paragraph red

' + '

paragraph yellow

' + '

paragraph blue

' + '

paragraph green

' + '

paragraph darkgray

' + '

paragraph magenta

' + '

paragraph has default black because of invalid color name

' ) - self.document.add_heading( - 'Test: Color by name', - level=1 - ) + self.document.add_heading("Test: Color by name", level=1) # Add on document for human validation self.parser.add_html_to_document(color_html_example, self.document) document = self.parser.parse_html_string(color_html_example) colors = [str(p.runs[0].font.color.rgb) for p in document.paragraphs] - assert 'FF0000' in colors # Red - assert 'FFFF00' in colors # Yellow - assert '0000FF' in colors # Blue - assert '008000' in colors # Green - assert 'A9A9A9' in colors # Darkgray - assert '000000' in colors # Black - assert 'FF00FF' in colors # Magenta + assert "FF0000" in colors # Red + assert "FFFF00" in colors # Yellow + assert "0000FF" in colors # Blue + assert "008000" in colors # Green + assert "A9A9A9" in colors # Darkgray + assert "000000" in colors # Black + assert "FF00FF" in colors # Magenta def test_table_cell_border_properties(self): """Validates that all table cells have the expected border size, style, and color.""" - self.document.add_heading( - 'Test: Table Cell Border Properties', - level=1 - ) + self.document.add_heading("Test: Table Cell Border Properties", level=1) # Add on document for human validation self.parser.add_html_to_document(self.table3_html, self.document) document = self.parser.parse_html_string(self.table3_html) @@ -1396,38 +1350,38 @@ def test_table_cell_border_properties(self): "top": {"color": "D95B48", "style": "single", "size": "1.0"}, "bottom": {"color": "D95B48", "style": "single", "size": "1.0"}, "left": {"color": "FF0000", "style": "single", "size": "1.0"}, - "right": {"color": "8B0000", "style": "single", "size": "1.0"} + "right": {"color": "8B0000", "style": "single", "size": "1.0"}, }, { "top": {"color": "FAC32A", "style": "single", "size": "1.0"}, "bottom": {"color": "FAC32A", "style": "single", "size": "1.125"}, "left": {"color": "none", "style": "none", "size": "none"}, - "right": {"color": "FAC32A", "style": "single", "size": "12.0"} + "right": {"color": "FAC32A", "style": "single", "size": "12.0"}, }, { "top": {"color": "30E667", "style": "none", "size": "5.67"}, "bottom": {"color": "30E667", "style": "single", "size": "5.67"}, "left": {"color": "30E667", "style": "single", "size": "5.67"}, - "right": {"color": "30E667", "style": "single", "size": "5.67"} + "right": {"color": "30E667", "style": "single", "size": "5.67"}, }, { "top": {"color": "none", "style": "none", "size": "none"}, "bottom": {"color": "D948CF", "style": "single", "size": "1.5"}, "left": {"color": "none", "style": "none", "size": "none"}, - "right": {"color": "D948CF", "style": "single", "size": "5.67"} + "right": {"color": "D948CF", "style": "single", "size": "5.67"}, }, { "top": {"color": "EAAAA7", "style": "single", "size": "1.1"}, "bottom": {"color": "EAAAA7", "style": "single", "size": "1.1"}, "left": {"color": "EAAAA7", "style": "single", "size": "1.1"}, - "right": {"color": "EAAAA7", "style": "single", "size": "1.1"} + "right": {"color": "EAAAA7", "style": "single", "size": "1.1"}, }, { "top": {"color": "none", "style": "none", "size": "none"}, "bottom": {"color": "ACC4AA", "style": "dashed", "size": "7.2"}, "left": {"color": "none", "style": "none", "size": "none"}, - "right": {"color": "ACC4AA", "style": "dotted", "size": "4.8"} - } + "right": {"color": "ACC4AA", "style": "dotted", "size": "4.8"}, + }, ] # Validate border properties for each cell @@ -1437,21 +1391,21 @@ def test_table_cell_border_properties(self): # Get the table cell element and properties tc = cell._tc tcPr = tc.get_or_add_tcPr() - tcBorders = tcPr.find(qn('w:tcBorders')) + tcBorders = tcPr.find(qn("w:tcBorders")) # Extract border properties border_sides = { - 'top': tcBorders.find(qn('w:top')) if tcBorders is not None else None, - 'bottom': tcBorders.find(qn('w:bottom')) if tcBorders is not None else None, - 'left': tcBorders.find(qn('w:left')) if tcBorders is not None else None, - 'right': tcBorders.find(qn('w:right')) if tcBorders is not None else None, + "top": tcBorders.find(qn("w:top")) if tcBorders is not None else None, + "bottom": tcBorders.find(qn("w:bottom")) if tcBorders is not None else None, + "left": tcBorders.find(qn("w:left")) if tcBorders is not None else None, + "right": tcBorders.find(qn("w:right")) if tcBorders is not None else None, } for side, border in border_sides.items(): if border is not None: - color = border.get(qn('w:color'), "").upper() # Ensure uppercase and no # - size = border.get(qn('w:sz')) - style = border.get(qn('w:val')) + color = border.get(qn("w:color"), "").upper() # Ensure uppercase and no # + size = border.get(qn("w:sz")) + style = border.get(qn("w:val")) else: color, size, style = "none", "none", "none" @@ -1480,22 +1434,19 @@ def test_table_cell_border_properties(self): def test_table_cell_background_color(self): """Validates that all table cells have the expected background color.""" - self.document.add_heading( - 'Test: Table Cell Background Color', - level=1 - ) + self.document.add_heading("Test: Table Cell Background Color", level=1) # Add on document for human validation self.parser.add_html_to_document(self.table3_html, self.document) document = self.parser.parse_html_string(self.table3_html) # Define expected background colors for each cell expected_background_colors = [ - "3749EF", # Row 1 Column 1 - "33b32e", # Row 1 Column 2 - "BFBFBF", # Row 2 Column 1 - "2eaab3", # Row 2 Column 2 - "99fffa", # Row 3 Column 1 - "2eaab3" # Row 3 Column 2 + "3749EF", # Row 1 Column 1 + "33b32e", # Row 1 Column 2 + "BFBFBF", # Row 2 Column 1 + "2eaab3", # Row 2 Column 2 + "99fffa", # Row 3 Column 1 + "2eaab3", # Row 3 Column 2 ] # Validate background colors for each cell @@ -1507,9 +1458,9 @@ def test_table_cell_background_color(self): tcPr = tc.get_or_add_tcPr() # Get the background color (shading) if it exists - shading = tcPr.find(qn('w:shd')) + shading = tcPr.find(qn("w:shd")) if shading is not None: - background_color = shading.get(qn('w:fill'), "").upper() # Ensure uppercase and no # + background_color = shading.get(qn("w:fill"), "").upper() # Ensure uppercase and no # else: background_color = "None" @@ -1525,10 +1476,7 @@ def test_table_cell_background_color(self): def test_table_cell_dimensions(self): """Validates that all table cells have the expected width and height.""" - self.document.add_heading( - 'Test: Table Cell Dimensions', - level=1 - ) + self.document.add_heading("Test: Table Cell Dimensions", level=1) # Add on document for human validation self.parser.add_html_to_document(self.table3_html, self.document) document = self.parser.parse_html_string(self.table3_html) @@ -1539,35 +1487,35 @@ def test_table_cell_dimensions(self): [ { "width": "258.35px", # Width for the first cell - "height": "23.75pt" # Height for the first cell + "height": "23.75pt", # Height for the first cell }, { - "width": "222.2pt", # Width for the second cell - "height": "23.75pt" # Height for the second cell - } + "width": "222.2pt", # Width for the second cell + "height": "23.75pt", # Height for the second cell + }, ], # Second row [ { "width": "258.35in", # Width for the first cell - "height": "15.5pt" # Height for the first cell + "height": "15.5pt", # Height for the first cell }, { - "width": "6cm", # Width for the second cell - "height": "15.5pt" # Height for the second cell - } + "width": "6cm", # Width for the second cell + "height": "15.5pt", # Height for the second cell + }, ], # Third row [ { "width": "258.35pt", # Width for the first cell - "height": "2rem" # Height for the first cell + "height": "2rem", # Height for the first cell }, { - "width": "6cm", # Width for the second cell - "height": "2em" # Height for the second cell - } - ] + "width": "6cm", # Width for the second cell + "height": "2em", # Height for the second cell + }, + ], ] # Validate dimensions for each cell @@ -1589,8 +1537,7 @@ def test_table_cell_dimensions(self): expected_height_px = unit_converter(expected_height, "px") assert round(abs(cell_width_px - expected_width_px), 2) <= 0.03, ( - f"Width mismatch for cell ({row_idx}, {cell_idx}): " - f"expected {expected_width_px}px, got {cell_width_px}px" + f"Width mismatch for cell ({row_idx}, {cell_idx}): expected {expected_width_px}px, got {cell_width_px}px" ) assert round(abs(cell_height_px - expected_height_px), 2) <= 0.03, ( f"Height mismatch for cell ({row_idx}, {cell_idx}): " @@ -1624,18 +1571,18 @@ def test_border_with_keywords(self): # Get the table cell element and properties tc = cell._tc tcPr = tc.get_or_add_tcPr() - tcBorders = tcPr.find(qn('w:tcBorders')) + tcBorders = tcPr.find(qn("w:tcBorders")) # Extract border properties border_sides = { - 'top': tcBorders.find(qn('w:top')) if tcBorders is not None else None, - 'bottom': tcBorders.find(qn('w:bottom')) if tcBorders is not None else None, - 'left': tcBorders.find(qn('w:left')) if tcBorders is not None else None, - 'right': tcBorders.find(qn('w:right')) if tcBorders is not None else None, + "top": tcBorders.find(qn("w:top")) if tcBorders is not None else None, + "bottom": tcBorders.find(qn("w:bottom")) if tcBorders is not None else None, + "left": tcBorders.find(qn("w:left")) if tcBorders is not None else None, + "right": tcBorders.find(qn("w:right")) if tcBorders is not None else None, } for side, border in border_sides.items(): - size = border.get(qn('w:sz')) if border is not None else "none" + size = border.get(qn("w:sz")) if border is not None else "none" # Convert size from eighths of a point to points size_in_pt = str(float(size) / 8) if size and size != "none" else "none" @@ -1667,20 +1614,20 @@ def test_border_style_with_diff_formats(self): "top": {"color": "ADD8E6", "style": "single", "size": "1.0"}, "bottom": {"color": "none", "style": "none", "size": "none"}, "left": {"color": "000000", "style": "none", "size": "2.25"}, - "right": {"color": "000000", "style": "single", "size": "1.0"} + "right": {"color": "000000", "style": "single", "size": "1.0"}, }, { "top": {"color": "000000", "style": "single", "size": "3.75"}, "bottom": {"color": "none", "style": "none", "size": "none"}, "left": {"color": "000000", "style": "single", "size": "0.75"}, - "right": {"color": "773366", "style": "single", "size": "0.75"} + "right": {"color": "773366", "style": "single", "size": "0.75"}, }, { "top": {"color": "FFA500", "style": "single", "size": "1.0"}, "bottom": {"color": "FF00FF", "style": "single", "size": "3.75"}, "left": {"color": "000000", "style": "dashed", "size": "2.25"}, - "right": {"color": "none", "style": "none", "size": "none"} - } + "right": {"color": "none", "style": "none", "size": "none"}, + }, ] cell_idx = 0 @@ -1689,21 +1636,21 @@ def test_border_style_with_diff_formats(self): # Get the table cell element and properties tc = cell._tc tcPr = tc.get_or_add_tcPr() - tcBorders = tcPr.find(qn('w:tcBorders')) + tcBorders = tcPr.find(qn("w:tcBorders")) # Extract border properties border_sides = { - 'top': tcBorders.find(qn('w:top')) if tcBorders is not None else None, - 'bottom': tcBorders.find(qn('w:bottom')) if tcBorders is not None else None, - 'left': tcBorders.find(qn('w:left')) if tcBorders is not None else None, - 'right': tcBorders.find(qn('w:right')) if tcBorders is not None else None, + "top": tcBorders.find(qn("w:top")) if tcBorders is not None else None, + "bottom": tcBorders.find(qn("w:bottom")) if tcBorders is not None else None, + "left": tcBorders.find(qn("w:left")) if tcBorders is not None else None, + "right": tcBorders.find(qn("w:right")) if tcBorders is not None else None, } for side, border in border_sides.items(): if border is not None: - color = border.get(qn('w:color'), "").upper() # Ensure uppercase and no # - size = border.get(qn('w:sz')) - style = border.get(qn('w:val')) + color = border.get(qn("w:color"), "").upper() # Ensure uppercase and no # + size = border.get(qn("w:sz")) + style = border.get(qn("w:val")) else: color, size, style = "none", "none", "none" @@ -1731,7 +1678,7 @@ def test_border_style_with_diff_formats(self): def test_unbalanced_table(self): # A table with more td elements in latter rows than in the first - self.document.add_heading('Test: Handling unbalanced tables', level=1) + self.document.add_heading("Test: Handling unbalanced tables", level=1) html_unbalanced_table = """ @@ -1766,7 +1713,7 @@ def test_html_comment_rendering(self): """ # Process document using parser - self.parser.options['html-comments'] = True + self.parser.options["html-comments"] = True self.parser.add_html_to_document(html_with_comment, self.document) document = self.parser.parse_html_string(html_with_comment) @@ -1777,15 +1724,12 @@ def test_html_comment_rendering(self): expected_comment = "# This is a comment" # Assert the comment paragraph exists - assert any( - expected_comment == text for text in paragraph_texts - ), f"Expected comment '{expected_comment}' to appear in the document, but it was not found." + assert any(expected_comment == text for text in paragraph_texts), ( + f"Expected comment '{expected_comment}' to appear in the document, but it was not found." + ) # (Optional) Check styling if needed: green color or italic - comment_paragraph = next( - (p for p in document.paragraphs if p.text.strip() == expected_comment), - None - ) + comment_paragraph = next((p for p in document.paragraphs if p.text.strip() == expected_comment), None) assert comment_paragraph is not None, "Comment paragraph not found for style checks." comment_run = comment_paragraph.runs[0] @@ -1795,10 +1739,9 @@ def test_html_comment_rendering(self): # color assertion (dark-ish green #008000) expected_rgb = parse_color("#008000") - assert ( - comment_run.font.color.rgb is not None - and comment_run.font.color.rgb == RGBColor(*expected_rgb) - ), f"Comment run color should be green ({expected_rgb})." + assert comment_run.font.color.rgb is not None and comment_run.font.color.rgb == RGBColor(*expected_rgb), ( + f"Comment run color should be green ({expected_rgb})." + ) def test_emojis_and_special_characters(self): emojis_and_special_chars_html_example = """ @@ -1811,10 +1754,7 @@ def test_emojis_and_special_characters(self): """ - self.document.add_heading( - 'Test: Emojis and Special Characters', - level=1 - ) + self.document.add_heading("Test: Emojis and Special Characters", level=1) # Add on document for human validation self.parser.add_html_to_document(emojis_and_special_chars_html_example, self.document) @@ -1849,9 +1789,7 @@ def test_ordered_list(self): document = self.parser.parse_html_string(ordered_list_html_example) # Extract paragraphs with 'ListNumber' style (ordered list) - ordered_list_paragraphs = [ - p for p in document.paragraphs if p.style.name == "List Number" - ] + ordered_list_paragraphs = [p for p in document.paragraphs if p.style.name == "List Number"] # Expected items in order expected_items = [ @@ -1860,15 +1798,13 @@ def test_ordered_list(self): "first list, item 3 within a paragraph", ] - assert len(ordered_list_paragraphs) >= len( - expected_items - ), f"Expected at least {len(expected_items)} ordered list items, found {len(ordered_list_paragraphs)}" + assert len(ordered_list_paragraphs) >= len(expected_items), ( + f"Expected at least {len(expected_items)} ordered list items, found {len(ordered_list_paragraphs)}" + ) for i, expected_text in enumerate(expected_items): actual_text = ordered_list_paragraphs[i].text.strip() - assert ( - actual_text == expected_text - ), f"Expected ordered list item '{expected_text}', but got '{actual_text}'" + assert actual_text == expected_text, f"Expected ordered list item '{expected_text}', but got '{actual_text}'" def test_unordered_list(self): self.document.add_heading("Test: Unordered List", level=1) @@ -1885,21 +1821,19 @@ def test_unordered_list(self): document = self.parser.parse_html_string(unordered_list_html_example) # Extract paragraphs with 'ListBullet' style (unordered list) - unordered_list_paragraphs = [ - p for p in document.paragraphs if p.style.name == "List Bullet" - ] + unordered_list_paragraphs = [p for p in document.paragraphs if p.style.name == "List Bullet"] # Expected unordered items expected_items = ["Unorderd list", "with circle markers", "last option"] - assert len(unordered_list_paragraphs) >= len( - expected_items - ), f"Expected at least {len(expected_items)} unordered list items, found {len(unordered_list_paragraphs)}" + assert len(unordered_list_paragraphs) >= len(expected_items), ( + f"Expected at least {len(expected_items)} unordered list items, found {len(unordered_list_paragraphs)}" + ) for expected_text in expected_items: - assert any( - expected_text == p.text.strip() for p in unordered_list_paragraphs - ), f"Unordered list item '{expected_text}' not found in List Bullet paragraphs" + assert any(expected_text == p.text.strip() for p in unordered_list_paragraphs), ( + f"Unordered list item '{expected_text}' not found in List Bullet paragraphs" + ) def test_table_rowspan_and_colspan(self): self.document.add_heading("Test: Table rowspan and colspan", level=1) @@ -1925,7 +1859,7 @@ def test_table_rowspan_and_colspan(self):
7,634
""" - self.parser.table_style = 'Table Grid' + self.parser.table_style = "Table Grid" self.parser.add_html_to_document(rowspan_and_colspan_html_example, self.document) document = self.parser.parse_html_string(rowspan_and_colspan_html_example) @@ -1958,7 +1892,7 @@ def test_table_rowspan_and_colspan(self): assert "7,634" in table.cell(3, 2).text def test_complex_colspan_rowspan_combinations(self): - self.document.add_heading('Test: Complex Colspan and Rowspan Combinations', level=1) + self.document.add_heading("Test: Complex Colspan and Rowspan Combinations", level=1) complex_table_html = """ @@ -1985,7 +1919,7 @@ def test_complex_colspan_rowspan_combinations(self): """ try: - self.parser.table_style = 'Table Grid' + self.parser.table_style = "Table Grid" self.parser.add_html_to_document(complex_table_html, self.document) document = self.parser.parse_html_string(complex_table_html) @@ -2005,8 +1939,8 @@ def test_complex_colspan_rowspan_combinations(self): self.fail(f"Processing complex table failed with unexpected error: {e}") def test_extreme_colspan_rowspan_cases(self): - """ Test extreme colspan and rowspan cases """ - self.document.add_heading('Test: Extreme Colspan and Rowspan Cases', level=1) + """Test extreme colspan and rowspan cases""" + self.document.add_heading("Test: Extreme Colspan and Rowspan Cases", level=1) extreme_table_html = """
@@ -2035,7 +1969,7 @@ def test_extreme_colspan_rowspan_cases(self): """ try: - self.parser.table_style = 'Table Grid' + self.parser.table_style = "Table Grid" self.parser.add_html_to_document(extreme_table_html, self.document) document = self.parser.parse_html_string(extreme_table_html) @@ -2055,9 +1989,59 @@ def test_extreme_colspan_rowspan_cases(self): except Exception as e: self.fail(f"Processing extreme table failed with unexpected error: {e}") + def test_malformed_table_overflow(self): + """Test table where rowspan or colspan causes column overflow beyond initial dimension calculation""" + self.document.add_heading('Test: Malformed Table Overflow', level=1) + + malformed_html = """ +
+ + + + + + + + +
spans downB1
A2B2
+ """ + + try: + self.parser.table_style = 'Table Grid' + self.parser.add_html_to_document(malformed_html, self.document) + document = self.parser.parse_html_string(malformed_html) + + tables = document.tables + assert len(tables) == 1, "Should create exactly one table" + + table = tables[0] + + assert len(table.columns) == 3, ( + f"Expected 3 columns due to rowspan shift, but got {len(table.columns)}" + ) + + assert len(table.rows) == 2, ( + f"Expected 2 rows, but got {len(table.rows)}" + ) + + # Validate content placement + assert "spans down" in table.cell(0, 0).text, "Rowspan cell not in correct position" + assert "B1" in table.cell(0, 1).text, "B1 should be in row 0, col 1" + + # Second row: + # col 0 is occupied by rowspan + # so A2 → col 1, B2 → col 2 + assert "A2" in table.cell(1, 1).text, "A2 should be shifted to column 1" + assert "B2" in table.cell(1, 2).text, "B2 should be in column 2" + + except IndexError as e: + self.fail(f"Malformed table caused IndexError (regression): {e}") + except Exception as e: + self.fail(f"Malformed table failed with unexpected error: {e}") + def test_nested_styles_on_multiple_tags(self): - """ Test nested styles on multiple tags """ - self.document.add_heading('Test: Test nested styles on multiple tags', level=1) + """Test nested styles on multiple tags""" + self.document.add_heading("Test: Test nested styles on multiple tags", level=1) nested_styles_html = """

Title Text

@@ -2080,42 +2064,42 @@ def test_nested_styles_on_multiple_tags(self): document = self.parser.parse_html_string(nested_styles_html) # -------- H3 ---------- - h3_paragraphs = [p for p in document.paragraphs if 'Title Text' in p.text] + h3_paragraphs = [p for p in document.paragraphs if "Title Text" in p.text] assert len(h3_paragraphs) == 1 h3_run = h3_paragraphs[0].runs[0] - assert h3_run.text == 'Title Text' - assert h3_run.font.color.rgb == Color['red'].value + assert h3_run.text == "Title Text" + assert h3_run.font.color.rgb == Color["red"].value assert h3_run.font.size is not None # -------- Div ---------- - div_paragraphs = [p for p in document.paragraphs if 'Div Text' in p.text] + div_paragraphs = [p for p in document.paragraphs if "Div Text" in p.text] assert len(div_paragraphs) == 1 div_run = div_paragraphs[0].runs[0] - assert div_run.text.strip() == 'Div Text' - assert div_run.font.color.rgb == Color['white'].value + assert div_run.text.strip() == "Div Text" + assert div_run.font.color.rgb == Color["white"].value assert div_paragraphs[0].alignment == WD_ALIGN_PARAGRAPH.CENTER # -------- P inside div ---------- - p_paragraphs = [p for p in document.paragraphs if 'P Text' in p.text] + p_paragraphs = [p for p in document.paragraphs if "P Text" in p.text] assert len(p_paragraphs) == 1 p_run = p_paragraphs[0].runs[0] - assert p_run.text.strip() == 'P Text' - assert p_run.font.color.rgb == Color['lightgreen'].value + assert p_run.text.strip() == "P Text" + assert p_run.font.color.rgb == Color["lightgreen"].value assert p_paragraphs[0].alignment == WD_ALIGN_PARAGRAPH.CENTER # -------- List items ---------- - li1_paragraphs = [p for p in document.paragraphs if 'Li Text 1' in p.text] + li1_paragraphs = [p for p in document.paragraphs if "Li Text 1" in p.text] assert len(li1_paragraphs) == 1 li1_run = li1_paragraphs[0].runs[0] - assert li1_run.text.strip() == 'Li Text 1' - assert li1_run.font.color.rgb == Color['lightblue'].value + assert li1_run.text.strip() == "Li Text 1" + assert li1_run.font.color.rgb == Color["lightblue"].value assert li1_run.font.size is not None - li2_paragraphs = [p for p in document.paragraphs if 'Li Text 2' in p.text] + li2_paragraphs = [p for p in document.paragraphs if "Li Text 2" in p.text] assert len(li2_paragraphs) == 1 li2_run = li2_paragraphs[0].runs[0] - assert li2_run.text.strip() == 'Li Text 2' - assert li2_run.font.color.rgb == Color['lightyellow'].value + assert li2_run.text.strip() == "Li Text 2" + assert li2_run.font.color.rgb == Color["lightyellow"].value assert li2_run.font.size is not None def test_basic_class_mapping(self): @@ -2138,9 +2122,7 @@ def test_basic_class_mapping(self): def test_multiple_classes(self): """Test that first matching class in style_map wins""" - self.document.add_heading( - "Test: Test that first matching class in style_map wins", level=1 - ) + self.document.add_heading("Test: Test that first matching class in style_map wins", level=1) style_map = { "first": "Heading 2", "second": "Heading 3", @@ -2159,9 +2141,7 @@ def test_multiple_classes(self): def test_unmapped_class_uses_default(self): """Test that unmapped classes fall back to default behavior""" - self.document.add_heading( - "Test: Test that unmapped classes fall back to default behavior", level=1 - ) + self.document.add_heading("Test: Test that unmapped classes fall back to default behavior", level=1) style_map = { "mapped": "Heading 450", } @@ -2197,9 +2177,7 @@ def test_h1_override(self): def test_class_overrides_tag_override(self): """Test that class mapping has priority over tag override""" - self.document.add_heading( - "Test: Test class mapping priority over tag override", level=1 - ) + self.document.add_heading("Test: Test class mapping priority over tag override", level=1) style_map = {"custom": "Heading 3"} tag_overrides = {"h1": "Heading 2"} @@ -2217,9 +2195,7 @@ def test_class_overrides_tag_override(self): def test_code_and_pre_tag_overrides(self): """Test that code and pre tag_style_overrides apply Word styles when they exist in the document.""" - self.document.add_heading( - "Test: Test code and pre tag style overrides", level=1 - ) + self.document.add_heading("Test: Test code and pre tag style overrides", level=1) tag_overrides = { "code": "Inline Code", "pre": "Code Block", @@ -2267,9 +2243,7 @@ def test_code_and_pre_tag_overrides_from_template(self): """Test that code and pre use custom styles from an imported Word template (template.docx). Saves the template-based document to tests/assets/template_output.docx so custom styles are preserved; open that file in Word to verify (test.docx is the default doc, not the template).""" - self.document.add_heading( - "Test: Test code and pre from template.docx custom styles", level=1 - ) + self.document.add_heading("Test: Test code and pre from template.docx custom styles", level=1) doc = Document(os.path.join(test_dir, "assets", "templates", "template.docx")) markdown_style = "Custom Markdown" code_block_style = "Code Block" @@ -2327,9 +2301,7 @@ def test_code_and_pre_tag_overrides_from_template(self): def test_normal_default(self): """Test that Normal is used as default by default""" - self.document.add_heading( - "Test: Test that Normal style is used as default", level=1 - ) + self.document.add_heading("Test: Test that Normal style is used as default", level=1) html = "

Test paragraph

" doc = Document() @@ -2353,9 +2325,7 @@ def test_custom_default(self): def test_none_default_uses_normal(self): """Test that None uses Word's default Normal style""" - self.document.add_heading( - "Test: Test default of None will use 'Normal' as default style", level=1 - ) + self.document.add_heading("Test: Test default of None will use 'Normal' as default style", level=1) html = "

Test paragraph

" doc = Document() @@ -2527,9 +2497,7 @@ def test_important_conflict_last_wins(self): def test_important_on_paragraph(self): """Test !important on paragraph inline style""" - self.document.add_heading( - "Test: Test !important override for paragraph", level=1 - ) + self.document.add_heading("Test: Test !important override for paragraph", level=1) html = '

Blue important

' doc = Document() @@ -2593,9 +2561,7 @@ def test_numbered_headings(self): def test_basic_html_still_works(self): """Test that basic HTML conversion works without new features""" - self.document.add_heading( - "Test: Test Basic HTML still works after changes", level=1 - ) + self.document.add_heading("Test: Test Basic HTML still works after changes", level=1) html = "

Simple paragraph

and here we have heading 3

" doc = Document() @@ -2621,9 +2587,7 @@ def test_existing_span_styles_work(self): def test_bold_italic_tags_work(self): """Test that , , tags still work""" - self.document.add_heading( - "Test: bold, itatlic, and underline tags to ensure they still work", level=1 - ) + self.document.add_heading("Test: bold, itatlic, and underline tags to ensure they still work", level=1) html = "

Bold Italic Underline

" doc = Document() @@ -2640,15 +2604,11 @@ def test_bold_italic_tags_work(self): self.assertTrue(len(bold_runs) > 0, "Should have at least one bold run") self.assertTrue(len(italic_runs) > 0, "Should have at least one italic run") - self.assertTrue( - len(underline_runs) > 0, "Should have at least one underline run" - ) + self.assertTrue(len(underline_runs) > 0, "Should have at least one underline run") def test_nonexistent_style_graceful_failure(self): """Test that non-existent styles don't crash""" - self.document.add_heading( - "Test: Test crash protection when style doesn't exist", level=1 - ) + self.document.add_heading("Test: Test crash protection when style doesn't exist", level=1) style_map = { "custom": "NonExistentStyle", } @@ -2708,7 +2668,9 @@ def test_page_break_css2_and_css3(self): html_whitespace = '
Content before break

Content after break

' # Test with !important (should still work) - html_important = '
Content before break

Content after break

' + html_important = ( + '
Content before break

Content after break

' + ) # Test both properties in separate documents doc_css2 = Document() @@ -2732,9 +2694,9 @@ def test_page_break_css2_and_css3(self): def has_page_break(doc): for paragraph in doc.paragraphs: for run in paragraph.runs: - br_elements = run._element.findall('.//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}br') + br_elements = run._element.findall(".//{http://schemas.openxmlformats.org/wordprocessingml/2006/main}br") for br in br_elements: - if br.get('{http://schemas.openxmlformats.org/wordprocessingml/2006/main}type') == 'page': + if br.get("{http://schemas.openxmlformats.org/wordprocessingml/2006/main}type") == "page": return True return False @@ -2767,18 +2729,18 @@ def has_page_break(doc): def test_invalid_color_fallback_to_black(self): """Test with invalid color fallback to black""" self.document.add_heading("Test: Test invalid color fallback to black", level=1) - html = ''' + html = """

Test Unsupported RGBA Color with opacity Fallback to Black

Test Invalid RGBA Color with letters Fallback to Black

Test Invalid RGB Color with extra value Fallback to Black

Test Invalid Color Name Fallback to Black

Test Invalid Hex Color with extra value Fallback to Black

- ''' + """ doc = Document() parser = HtmlToDocx() parser.add_html_to_document(html, self.document) - with self.assertLogs(level='WARNING') as log: + with self.assertLogs(level="WARNING") as log: parser.add_html_to_document(html, doc) self.assertEqual(doc.paragraphs[1].runs[0].font.color.rgb, RGBColor(*Color["red"].value)) @@ -2786,11 +2748,11 @@ def test_invalid_color_fallback_to_black(self): self.assertEqual(paragraph.runs[0].font.color.rgb, RGBColor(*Color["black"].value)) self.assertEqual(len(log.records), 5) - self.assertIn('RGBA color is not supported by python-docx. Opacity will be ignored.', log.output[0]) - self.assertIn('Could not parse color \'rgba(a, b, c, d, e)\': Invalid color value. Fallback to black.', log.output[1]) - self.assertIn('Could not parse color \'rgb(255, 0, 0, 0)\': Invalid color value. Fallback to black.', log.output[2]) - self.assertIn('Could not parse color \'invalidcolorname\': Invalid color value. Fallback to black.', log.output[3]) - self.assertIn('Could not parse color \'#f7272626161\': Invalid color value. Fallback to black.', log.output[4]) + self.assertIn("RGBA color is not supported by python-docx. Opacity will be ignored.", log.output[0]) + self.assertIn("Could not parse color 'rgba(a, b, c, d, e)': Invalid color value. Fallback to black.", log.output[1]) + self.assertIn("Could not parse color 'rgb(255, 0, 0, 0)': Invalid color value. Fallback to black.", log.output[2]) + self.assertIn("Could not parse color 'invalidcolorname': Invalid color value. Fallback to black.", log.output[3]) + self.assertIn("Could not parse color '#f7272626161': Invalid color value. Fallback to black.", log.output[4]) def test_invalid_rowspan_and_colspan(self): """Test with invalid rowspan and colspan"""