From f44eb81475ebbea6d293d0c506a22f957a6a2f48 Mon Sep 17 00:00:00 2001 From: Andy Zhang Date: Fri, 8 May 2026 00:02:02 -0700 Subject: [PATCH] Lazy-load document conversion dependencies --- docs_agent/tools/ConvertDocument.py | 53 ++++-- docs_agent/tools/CreateDocument.py | 10 +- docs_agent/tools/ModifyDocument.py | 6 +- docs_agent/tools/RestoreDocument.py | 57 ++++--- docs_agent/tools/ViewDocument.py | 8 +- docs_agent/tools/utils/doc_file_utils.py | 44 ++++- tests/test_convert_document_pdf_dependency.py | 156 ++++++++++++++++++ 7 files changed, 271 insertions(+), 63 deletions(-) create mode 100644 tests/test_convert_document_pdf_dependency.py diff --git a/docs_agent/tools/ConvertDocument.py b/docs_agent/tools/ConvertDocument.py index 6c943fd7..67e87d60 100644 --- a/docs_agent/tools/ConvertDocument.py +++ b/docs_agent/tools/ConvertDocument.py @@ -3,19 +3,11 @@ from pathlib import Path from typing import Literal -import html2text from agency_swarm.tools import BaseTool, ToolOutputText, tool_output_file_from_path -from bs4 import BeautifulSoup from pydantic import Field -from weasyprint import HTML - -from .CreateDocument import CreateDocument -from .utils.html_docx_core import html_to_docx -from .utils.html_docx_images import embed_local_images -from .utils.html_docx_playwright import auto_page_breaks # Base directory for all document files -from .utils.doc_file_utils import get_project_dir, next_docx_version +from .utils.doc_file_utils import get_project_dir, next_docx_version, normalize_document_name # Characters that PDF fonts commonly lack glyphs for. # Includes both proper Unicode typographic chars and ASCII control-char @@ -44,6 +36,30 @@ def _normalize_unicode(html: str) -> str: return html.translate(_UNICODE_TO_ASCII) +def _load_weasyprint_html(): + try: + from weasyprint import HTML + except (ImportError, OSError) as exc: + raise RuntimeError( + "PDF export requires WeasyPrint and its native system libraries. " + "Install the WeasyPrint platform dependencies, then retry PDF export. " + f"Original error: {exc}" + ) from exc + return HTML + + +def _embed_local_images(html_content: str, project_dir: Path) -> str: + from .utils.html_docx_images import embed_local_images + + return embed_local_images(html_content, project_dir) + + +def _auto_page_breaks(html_content: str) -> str: + from .utils.html_docx_playwright import auto_page_breaks + + return auto_page_breaks(html_content) + + class ConvertDocument(BaseTool): """ Convert a document to different formats. @@ -95,11 +111,7 @@ def run(self): if not project_dir.exists(): return f"Error: Project '{self.project_name}' not found." - doc_name = ( - self.document_name.replace(".html", "") - .replace(".docx", "") - .replace(".md", "") - ) + doc_name = normalize_document_name(self.document_name) source_path = project_dir / f"{doc_name}.source.html" if not source_path.exists(): @@ -124,9 +136,9 @@ def run(self): ) html_content = source_path.read_text(encoding="utf-8") - html_content = embed_local_images(html_content, project_dir) if self.output_format in ("pdf", "docx"): - html_content = auto_page_breaks(html_content) + html_content = _embed_local_images(html_content, project_dir) + html_content = _auto_page_breaks(html_content) if self.output_format == "pdf": self._convert_to_pdf(html_content, output_path) @@ -169,14 +181,19 @@ def run(self): def _convert_to_pdf(self, html_content: str, output_path: Path): """Convert HTML to PDF using weasyprint.""" + HTML = _load_weasyprint_html() HTML(string=_normalize_unicode(html_content)).write_pdf(output_path) def _convert_to_docx(self, html_content: str, output_path: Path): """Convert HTML to DOCX using the internal converter.""" + from .utils.html_docx_core import html_to_docx + html_to_docx(html_content, output_path) def _convert_to_markdown(self, html_content: str, output_path: Path): """Convert HTML to Markdown.""" + import html2text + converter = html2text.HTML2Text() converter.body_width = 0 # Don't wrap text markdown = converter.handle(html_content) @@ -184,12 +201,16 @@ def _convert_to_markdown(self, html_content: str, output_path: Path): def _convert_to_txt(self, html_content: str, output_path: Path): """Convert HTML to plain text.""" + from bs4 import BeautifulSoup + soup = BeautifulSoup(html_content, "html.parser") text = soup.get_text(separator="\n", strip=True) output_path.write_text(text, encoding="utf-8") if __name__ == "__main__": + from .CreateDocument import CreateDocument + print("=" * 70) print("TEST: ConvertDocument Tool") print("=" * 70) diff --git a/docs_agent/tools/CreateDocument.py b/docs_agent/tools/CreateDocument.py index 1c7f9e71..19cf23c2 100644 --- a/docs_agent/tools/CreateDocument.py +++ b/docs_agent/tools/CreateDocument.py @@ -10,7 +10,7 @@ from .utils.html_validation import build_unsupported_error, find_unsupported_html from .utils.html_docx_playwright import _launch_chromium_with_install from .utils.html_docx_constants import _UA_RESET_STYLE -from .utils.doc_file_utils import get_project_dir +from .utils.doc_file_utils import get_project_dir, normalize_document_name from .utils.html_docx_images import embed_local_images @@ -87,11 +87,7 @@ def run(self): (project_dir / "assets").mkdir(exist_ok=True) # Strip extension if the caller included one - doc_name = ( - self.document_name.replace(".html", "") - .replace(".docx", "") - .replace(".md", "") - ) + doc_name = normalize_document_name(self.document_name) content_value = self.content.value if not content_value: @@ -284,4 +280,4 @@ def _ensure_ua_reset(html_content: str) -> str: content={"type": "html", "value": html_simple}, ) print(tool.run()) - print() \ No newline at end of file + print() diff --git a/docs_agent/tools/ModifyDocument.py b/docs_agent/tools/ModifyDocument.py index 74d36bf2..e17729f7 100644 --- a/docs_agent/tools/ModifyDocument.py +++ b/docs_agent/tools/ModifyDocument.py @@ -7,7 +7,7 @@ from agency_swarm.tools import BaseTool from pydantic import Field -from .utils.doc_file_utils import get_project_dir +from .utils.doc_file_utils import get_project_dir, normalize_document_name from .utils.html_validation import build_unsupported_error, find_unsupported_html @@ -94,9 +94,7 @@ def run(self) -> str: if not project_dir.exists(): return f"Error: Project '{self.project_name}' not found." - doc_name = ( - self.document_name.replace(".html", "").replace(".docx", "").replace(".md", "") - ) + doc_name = normalize_document_name(self.document_name) source_path = project_dir / f"{doc_name}.source.html" md_path = project_dir / f"{doc_name}.md" diff --git a/docs_agent/tools/RestoreDocument.py b/docs_agent/tools/RestoreDocument.py index 30c2a105..40907280 100644 --- a/docs_agent/tools/RestoreDocument.py +++ b/docs_agent/tools/RestoreDocument.py @@ -5,7 +5,7 @@ from agency_swarm.tools import BaseTool from pydantic import Field -from .utils.doc_file_utils import get_project_dir +from .utils.doc_file_utils import get_project_dir, normalize_docx_filename class RestoreDocument(BaseTool): @@ -42,35 +42,34 @@ class RestoreDocument(BaseTool): ) def run(self) -> str: - project_dir = get_project_dir(self.project_name) - docx_name = ( - self.docx_filename - if self.docx_filename.endswith(".docx") - else f"{self.docx_filename}.docx" - ) - snapshot_path = project_dir / f"{docx_name}.snapshot.html" - - if not snapshot_path.exists(): - available = sorted( - p.name for p in project_dir.glob("*.docx.snapshot.html") + try: + project_dir = get_project_dir(self.project_name) + docx_name = normalize_docx_filename(self.docx_filename) + snapshot_path = project_dir / f"{docx_name}.snapshot.html" + + if not snapshot_path.exists(): + available = sorted( + p.name for p in project_dir.glob("*.docx.snapshot.html") + ) + hint = ( + "\nAvailable snapshots:\n" + "\n".join(f" {s}" for s in available) + if available + else "\nNo snapshots found in this project." + ) + return f"Error: No snapshot found for '{docx_name}'.{hint}" + + doc_name = Path(docx_name).stem + doc_name = _strip_version(doc_name) + source_path = project_dir / f"{doc_name}.source.html" + + source_path.write_text(snapshot_path.read_text(encoding="utf-8"), encoding="utf-8") + + return ( + f"Restored '{doc_name}' to the version captured in '{docx_name}'.\n" + f"Working source: {source_path}" ) - hint = ( - "\nAvailable snapshots:\n" + "\n".join(f" {s}" for s in available) - if available - else "\nNo snapshots found in this project." - ) - return f"Error: No snapshot found for '{docx_name}'.{hint}" - - doc_name = Path(docx_name).stem - doc_name = _strip_version(doc_name) - source_path = project_dir / f"{doc_name}.source.html" - - source_path.write_text(snapshot_path.read_text(encoding="utf-8"), encoding="utf-8") - - return ( - f"Restored '{doc_name}' to the version captured in '{docx_name}'.\n" - f"Working source: {source_path}" - ) + except Exception as e: + return f"Error restoring document: {str(e)}" def _strip_version(stem: str) -> str: diff --git a/docs_agent/tools/ViewDocument.py b/docs_agent/tools/ViewDocument.py index 2a6b9efa..79946101 100644 --- a/docs_agent/tools/ViewDocument.py +++ b/docs_agent/tools/ViewDocument.py @@ -4,7 +4,7 @@ from agency_swarm.tools import BaseTool from pydantic import Field -from .utils.doc_file_utils import get_project_dir +from .utils.doc_file_utils import get_project_dir, normalize_document_name class ViewDocument(BaseTool): @@ -43,11 +43,7 @@ def run(self): if not project_dir.exists(): return f"Error: Project '{self.project_name}' not found." - doc_name = ( - self.document_name.replace(".html", "") - .replace(".docx", "") - .replace(".md", "") - ) + doc_name = normalize_document_name(self.document_name) source_path = project_dir / f"{doc_name}.source.html" docx_path = project_dir / f"{doc_name}.docx" md_path = project_dir / f"{doc_name}.md" diff --git a/docs_agent/tools/utils/doc_file_utils.py b/docs_agent/tools/utils/doc_file_utils.py index e26f2085..bddffb68 100644 --- a/docs_agent/tools/utils/doc_file_utils.py +++ b/docs_agent/tools/utils/doc_file_utils.py @@ -4,12 +4,54 @@ from pathlib import Path +_DOCUMENT_SUFFIXES = (".source.html", ".html", ".docx", ".md", ".pdf", ".txt") + + def get_mnt_dir() -> Path: return Path("/app/mnt") if Path("/.dockerenv").is_file() else Path(__file__).parents[3] / "mnt" def get_project_dir(project_name: str) -> Path: - return get_mnt_dir() / project_name / "documents" + return get_mnt_dir() / validate_path_component(project_name, "project_name") / "documents" + + +def validate_path_component(value: str, field_name: str) -> str: + """Return a safe single path component or raise ValueError. + + Document tools accept user-provided project and file names. Keep those names + inside the managed mnt tree by rejecting separators, absolute paths, parent + traversal, and NUL bytes before composing paths. + """ + component = str(value or "").strip() + if not component: + raise ValueError(f"{field_name} must not be empty.") + if "\x00" in component: + raise ValueError(f"{field_name} must not contain NUL bytes.") + if "/" in component or "\\" in component: + raise ValueError(f"{field_name} must be a name, not a path.") + if component in {".", ".."}: + raise ValueError(f"{field_name} must not be a relative path marker.") + if Path(component).is_absolute(): + raise ValueError(f"{field_name} must not be an absolute path.") + return component + + +def normalize_document_name(document_name: str) -> str: + name = validate_path_component(document_name, "document_name") + for suffix in _DOCUMENT_SUFFIXES: + if name.endswith(suffix): + name = name[: -len(suffix)] + break + return validate_path_component(name, "document_name") + + +def normalize_docx_filename(docx_filename: str) -> str: + name = validate_path_component(docx_filename, "docx_filename") + if not name.endswith(".docx"): + name = f"{name}.docx" + stem = name[: -len(".docx")] + validate_path_component(stem, "docx_filename") + return name def next_docx_version(desired: Path) -> Path: diff --git a/tests/test_convert_document_pdf_dependency.py b/tests/test_convert_document_pdf_dependency.py new file mode 100644 index 00000000..2cbe7562 --- /dev/null +++ b/tests/test_convert_document_pdf_dependency.py @@ -0,0 +1,156 @@ +import builtins + +import docs_agent.tools.ConvertDocument as convert_module +from docs_agent.tools.ConvertDocument import ConvertDocument +from docs_agent.tools.RestoreDocument import RestoreDocument +from docs_agent.tools.utils.doc_file_utils import ( + get_project_dir, + normalize_document_name, + normalize_docx_filename, +) + + +def test_convert_document_import_does_not_require_weasyprint(): + assert ConvertDocument.__name__ == "ConvertDocument" + + +def test_pdf_conversion_reports_missing_weasyprint_native_dependency(monkeypatch, tmp_path): + project_dir = tmp_path / "demo_project" / "documents" + project_dir.mkdir(parents=True) + (project_dir / "report.source.html").write_text( + "

Report

", + encoding="utf-8", + ) + + real_import = builtins.__import__ + + def guarded_import(name, *args, **kwargs): + if name == "weasyprint": + raise OSError("libpango-1.0-0: cannot open shared object file") + return real_import(name, *args, **kwargs) + + monkeypatch.setattr(builtins, "__import__", guarded_import) + monkeypatch.setattr(convert_module, "get_project_dir", lambda _project_name: project_dir) + monkeypatch.setattr(convert_module, "_embed_local_images", lambda html, _project_dir: html) + monkeypatch.setattr(convert_module, "_auto_page_breaks", lambda html: html) + + result = ConvertDocument( + project_name="demo_project", + document_name="report", + output_format="pdf", + ).run() + + assert "PDF export requires WeasyPrint" in result + assert "libpango-1.0-0" in result + assert not (project_dir / "report.pdf").exists() + + +def test_markdown_conversion_still_works_without_weasyprint(monkeypatch, tmp_path): + project_dir = tmp_path / "demo_project" / "documents" + project_dir.mkdir(parents=True) + (project_dir / "report.source.html").write_text( + "

Report

Hello

", + encoding="utf-8", + ) + + real_import = builtins.__import__ + + def guarded_import(name, *args, **kwargs): + if name == "weasyprint": + raise OSError("libpango-1.0-0: cannot open shared object file") + return real_import(name, *args, **kwargs) + + monkeypatch.setattr(builtins, "__import__", guarded_import) + monkeypatch.setattr(convert_module, "get_project_dir", lambda _project_name: project_dir) + def fail_if_embedding_runs(_html, _project_dir): + raise AssertionError("Markdown conversion should not embed local images") + + monkeypatch.setattr(convert_module, "_embed_local_images", fail_if_embedding_runs) + + result = ConvertDocument( + project_name="demo_project", + document_name="report", + output_format="markdown", + ).run() + + assert "Successfully converted document to MARKDOWN" in result + markdown = (project_dir / "report.md").read_text(encoding="utf-8") + assert "# Report" in markdown + assert "Hello" in markdown + + +def test_txt_conversion_skips_render_only_image_embedding(monkeypatch, tmp_path): + project_dir = tmp_path / "demo_project" / "documents" + project_dir.mkdir(parents=True) + (project_dir / "report.source.html").write_text( + "

Report

Hello

", + encoding="utf-8", + ) + + def fail_if_embedding_runs(_html, _project_dir): + raise AssertionError("TXT conversion should not embed local images") + + monkeypatch.setattr(convert_module, "get_project_dir", lambda _project_name: project_dir) + monkeypatch.setattr(convert_module, "_embed_local_images", fail_if_embedding_runs) + + result = ConvertDocument( + project_name="demo_project", + document_name="report", + output_format="txt", + ).run() + + assert "Successfully converted document to TXT" in result + text = (project_dir / "report.txt").read_text(encoding="utf-8") + assert "Report" in text + assert "Hello" in text + + +def test_document_path_components_reject_traversal(): + invalid_values = [ + "../escape", + "nested/project", + "nested\\project", + "/tmp/project", + ".", + "..", + "", + "safe\x00name", + ] + + for value in invalid_values: + try: + get_project_dir(value) + except ValueError as exc: + assert "project_name" in str(exc) + else: + raise AssertionError(f"project_name accepted unsafe value: {value!r}") + + try: + normalize_document_name(value) + except ValueError as exc: + assert "document_name" in str(exc) + else: + raise AssertionError(f"document_name accepted unsafe value: {value!r}") + + try: + normalize_docx_filename(value) + except ValueError as exc: + assert "docx_filename" in str(exc) + else: + raise AssertionError(f"docx_filename accepted unsafe value: {value!r}") + + +def test_document_name_normalization_keeps_single_safe_stem(): + assert normalize_document_name("report.source.html") == "report" + assert normalize_document_name("report.html") == "report" + assert normalize_document_name("report.docx") == "report" + assert normalize_document_name("report.md") == "report" + assert normalize_docx_filename("report") == "report.docx" + assert normalize_docx_filename("report_v2.docx") == "report_v2.docx" + + +def test_invalid_restore_filename_returns_tool_error(): + result = RestoreDocument(project_name="safe_project", docx_filename="../report.docx").run() + + assert result.startswith("Error restoring document:") + assert "docx_filename" in result