From f44eb81475ebbea6d293d0c506a22f957a6a2f48 Mon Sep 17 00:00:00 2001
From: Andy Zhang <fortybyte@gmail.com>
Date: Fri, 8 May 2026 00:02:02 -0700
Subject: [PATCH] Lazy-load document conversion dependencies

---
 docs_agent/tools/ConvertDocument.py           |  53 ++++--
 docs_agent/tools/CreateDocument.py            |  10 +-
 docs_agent/tools/ModifyDocument.py            |   6 +-
 docs_agent/tools/RestoreDocument.py           |  57 ++++---
 docs_agent/tools/ViewDocument.py              |   8 +-
 docs_agent/tools/utils/doc_file_utils.py      |  44 ++++-
 tests/test_convert_document_pdf_dependency.py | 156 ++++++++++++++++++
 7 files changed, 271 insertions(+), 63 deletions(-)
 create mode 100644 tests/test_convert_document_pdf_dependency.py

diff --git a/docs_agent/tools/ConvertDocument.py b/docs_agent/tools/ConvertDocument.py
index 6c943fd7..67e87d60 100644
--- a/docs_agent/tools/ConvertDocument.py
+++ b/docs_agent/tools/ConvertDocument.py
@@ -3,19 +3,11 @@
 from pathlib import Path
 from typing import Literal
 
-import html2text
 from agency_swarm.tools import BaseTool, ToolOutputText, tool_output_file_from_path
-from bs4 import BeautifulSoup
 from pydantic import Field
-from weasyprint import HTML
-
-from .CreateDocument import CreateDocument
-from .utils.html_docx_core import html_to_docx
-from .utils.html_docx_images import embed_local_images
-from .utils.html_docx_playwright import auto_page_breaks
 
 # Base directory for all document files
-from .utils.doc_file_utils import get_project_dir, next_docx_version
+from .utils.doc_file_utils import get_project_dir, next_docx_version, normalize_document_name
 
 # Characters that PDF fonts commonly lack glyphs for.
 # Includes both proper Unicode typographic chars and ASCII control-char
@@ -44,6 +36,30 @@ def _normalize_unicode(html: str) -> str:
     return html.translate(_UNICODE_TO_ASCII)
 
 
+def _load_weasyprint_html():
+    try:
+        from weasyprint import HTML
+    except (ImportError, OSError) as exc:
+        raise RuntimeError(
+            "PDF export requires WeasyPrint and its native system libraries. "
+            "Install the WeasyPrint platform dependencies, then retry PDF export. "
+            f"Original error: {exc}"
+        ) from exc
+    return HTML
+
+
+def _embed_local_images(html_content: str, project_dir: Path) -> str:
+    from .utils.html_docx_images import embed_local_images
+
+    return embed_local_images(html_content, project_dir)
+
+
+def _auto_page_breaks(html_content: str) -> str:
+    from .utils.html_docx_playwright import auto_page_breaks
+
+    return auto_page_breaks(html_content)
+
+
 class ConvertDocument(BaseTool):
     """
     Convert a document to different formats.
@@ -95,11 +111,7 @@ def run(self):
             if not project_dir.exists():
                 return f"Error: Project '{self.project_name}' not found."
 
-            doc_name = (
-                self.document_name.replace(".html", "")
-                .replace(".docx", "")
-                .replace(".md", "")
-            )
+            doc_name = normalize_document_name(self.document_name)
             source_path = project_dir / f"{doc_name}.source.html"
 
             if not source_path.exists():
@@ -124,9 +136,9 @@ def run(self):
                 )
 
             html_content = source_path.read_text(encoding="utf-8")
-            html_content = embed_local_images(html_content, project_dir)
             if self.output_format in ("pdf", "docx"):
-                html_content = auto_page_breaks(html_content)
+                html_content = _embed_local_images(html_content, project_dir)
+                html_content = _auto_page_breaks(html_content)
 
             if self.output_format == "pdf":
                 self._convert_to_pdf(html_content, output_path)
@@ -169,14 +181,19 @@ def run(self):
     
     def _convert_to_pdf(self, html_content: str, output_path: Path):
         """Convert HTML to PDF using weasyprint."""
+        HTML = _load_weasyprint_html()
         HTML(string=_normalize_unicode(html_content)).write_pdf(output_path)
 
     def _convert_to_docx(self, html_content: str, output_path: Path):
         """Convert HTML to DOCX using the internal converter."""
+        from .utils.html_docx_core import html_to_docx
+
         html_to_docx(html_content, output_path)
     
     def _convert_to_markdown(self, html_content: str, output_path: Path):
         """Convert HTML to Markdown."""
+        import html2text
+
         converter = html2text.HTML2Text()
         converter.body_width = 0  # Don't wrap text
         markdown = converter.handle(html_content)
@@ -184,12 +201,16 @@ def _convert_to_markdown(self, html_content: str, output_path: Path):
     
     def _convert_to_txt(self, html_content: str, output_path: Path):
         """Convert HTML to plain text."""
+        from bs4 import BeautifulSoup
+
         soup = BeautifulSoup(html_content, "html.parser")
         text = soup.get_text(separator="\n", strip=True)
         output_path.write_text(text, encoding="utf-8")
 
 
 if __name__ == "__main__":
+    from .CreateDocument import CreateDocument
+
     print("=" * 70)
     print("TEST: ConvertDocument Tool")
     print("=" * 70)
diff --git a/docs_agent/tools/CreateDocument.py b/docs_agent/tools/CreateDocument.py
index 1c7f9e71..19cf23c2 100644
--- a/docs_agent/tools/CreateDocument.py
+++ b/docs_agent/tools/CreateDocument.py
@@ -10,7 +10,7 @@
 from .utils.html_validation import build_unsupported_error, find_unsupported_html
 from .utils.html_docx_playwright import _launch_chromium_with_install
 from .utils.html_docx_constants import _UA_RESET_STYLE
-from .utils.doc_file_utils import get_project_dir
+from .utils.doc_file_utils import get_project_dir, normalize_document_name
 from .utils.html_docx_images import embed_local_images
 
 
@@ -87,11 +87,7 @@ def run(self):
             (project_dir / "assets").mkdir(exist_ok=True)
 
             # Strip extension if the caller included one
-            doc_name = (
-                self.document_name.replace(".html", "")
-                .replace(".docx", "")
-                .replace(".md", "")
-            )
+            doc_name = normalize_document_name(self.document_name)
 
             content_value = self.content.value
             if not content_value:
@@ -284,4 +280,4 @@ def _ensure_ua_reset(html_content: str) -> str:
         content={"type": "html", "value": html_simple},
     )
     print(tool.run())
-    print()
\ No newline at end of file
+    print()
diff --git a/docs_agent/tools/ModifyDocument.py b/docs_agent/tools/ModifyDocument.py
index 74d36bf2..e17729f7 100644
--- a/docs_agent/tools/ModifyDocument.py
+++ b/docs_agent/tools/ModifyDocument.py
@@ -7,7 +7,7 @@
 from agency_swarm.tools import BaseTool
 from pydantic import Field
 
-from .utils.doc_file_utils import get_project_dir
+from .utils.doc_file_utils import get_project_dir, normalize_document_name
 from .utils.html_validation import build_unsupported_error, find_unsupported_html
 
 
@@ -94,9 +94,7 @@ def run(self) -> str:
             if not project_dir.exists():
                 return f"Error: Project '{self.project_name}' not found."
 
-            doc_name = (
-                self.document_name.replace(".html", "").replace(".docx", "").replace(".md", "")
-            )
+            doc_name = normalize_document_name(self.document_name)
             source_path = project_dir / f"{doc_name}.source.html"
             md_path = project_dir / f"{doc_name}.md"
 
diff --git a/docs_agent/tools/RestoreDocument.py b/docs_agent/tools/RestoreDocument.py
index 30c2a105..40907280 100644
--- a/docs_agent/tools/RestoreDocument.py
+++ b/docs_agent/tools/RestoreDocument.py
@@ -5,7 +5,7 @@
 from agency_swarm.tools import BaseTool
 from pydantic import Field
 
-from .utils.doc_file_utils import get_project_dir
+from .utils.doc_file_utils import get_project_dir, normalize_docx_filename
 
 
 class RestoreDocument(BaseTool):
@@ -42,35 +42,34 @@ class RestoreDocument(BaseTool):
     )
 
     def run(self) -> str:
-        project_dir = get_project_dir(self.project_name)
-        docx_name = (
-            self.docx_filename
-            if self.docx_filename.endswith(".docx")
-            else f"{self.docx_filename}.docx"
-        )
-        snapshot_path = project_dir / f"{docx_name}.snapshot.html"
-
-        if not snapshot_path.exists():
-            available = sorted(
-                p.name for p in project_dir.glob("*.docx.snapshot.html")
+        try:
+            project_dir = get_project_dir(self.project_name)
+            docx_name = normalize_docx_filename(self.docx_filename)
+            snapshot_path = project_dir / f"{docx_name}.snapshot.html"
+
+            if not snapshot_path.exists():
+                available = sorted(
+                    p.name for p in project_dir.glob("*.docx.snapshot.html")
+                )
+                hint = (
+                    "\nAvailable snapshots:\n" + "\n".join(f"  {s}" for s in available)
+                    if available
+                    else "\nNo snapshots found in this project."
+                )
+                return f"Error: No snapshot found for '{docx_name}'.{hint}"
+
+            doc_name = Path(docx_name).stem
+            doc_name = _strip_version(doc_name)
+            source_path = project_dir / f"{doc_name}.source.html"
+
+            source_path.write_text(snapshot_path.read_text(encoding="utf-8"), encoding="utf-8")
+
+            return (
+                f"Restored '{doc_name}' to the version captured in '{docx_name}'.\n"
+                f"Working source: {source_path}"
             )
-            hint = (
-                "\nAvailable snapshots:\n" + "\n".join(f"  {s}" for s in available)
-                if available
-                else "\nNo snapshots found in this project."
-            )
-            return f"Error: No snapshot found for '{docx_name}'.{hint}"
-
-        doc_name = Path(docx_name).stem
-        doc_name = _strip_version(doc_name)
-        source_path = project_dir / f"{doc_name}.source.html"
-
-        source_path.write_text(snapshot_path.read_text(encoding="utf-8"), encoding="utf-8")
-
-        return (
-            f"Restored '{doc_name}' to the version captured in '{docx_name}'.\n"
-            f"Working source: {source_path}"
-        )
+        except Exception as e:
+            return f"Error restoring document: {str(e)}"
 
 
 def _strip_version(stem: str) -> str:
diff --git a/docs_agent/tools/ViewDocument.py b/docs_agent/tools/ViewDocument.py
index 2a6b9efa..79946101 100644
--- a/docs_agent/tools/ViewDocument.py
+++ b/docs_agent/tools/ViewDocument.py
@@ -4,7 +4,7 @@
 from agency_swarm.tools import BaseTool
 from pydantic import Field
 
-from .utils.doc_file_utils import get_project_dir
+from .utils.doc_file_utils import get_project_dir, normalize_document_name
 
 
 class ViewDocument(BaseTool):
@@ -43,11 +43,7 @@ def run(self):
             if not project_dir.exists():
                 return f"Error: Project '{self.project_name}' not found."
 
-            doc_name = (
-                self.document_name.replace(".html", "")
-                .replace(".docx", "")
-                .replace(".md", "")
-            )
+            doc_name = normalize_document_name(self.document_name)
             source_path = project_dir / f"{doc_name}.source.html"
             docx_path = project_dir / f"{doc_name}.docx"
             md_path = project_dir / f"{doc_name}.md"
diff --git a/docs_agent/tools/utils/doc_file_utils.py b/docs_agent/tools/utils/doc_file_utils.py
index e26f2085..bddffb68 100644
--- a/docs_agent/tools/utils/doc_file_utils.py
+++ b/docs_agent/tools/utils/doc_file_utils.py
@@ -4,12 +4,54 @@
 from pathlib import Path
 
 
+_DOCUMENT_SUFFIXES = (".source.html", ".html", ".docx", ".md", ".pdf", ".txt")
+
+
 def get_mnt_dir() -> Path:
     return Path("/app/mnt") if Path("/.dockerenv").is_file() else Path(__file__).parents[3] / "mnt"
 
 
 def get_project_dir(project_name: str) -> Path:
-    return get_mnt_dir() / project_name / "documents"
+    return get_mnt_dir() / validate_path_component(project_name, "project_name") / "documents"
+
+
+def validate_path_component(value: str, field_name: str) -> str:
+    """Return a safe single path component or raise ValueError.
+
+    Document tools accept user-provided project and file names. Keep those names
+    inside the managed mnt tree by rejecting separators, absolute paths, parent
+    traversal, and NUL bytes before composing paths.
+    """
+    component = str(value or "").strip()
+    if not component:
+        raise ValueError(f"{field_name} must not be empty.")
+    if "\x00" in component:
+        raise ValueError(f"{field_name} must not contain NUL bytes.")
+    if "/" in component or "\\" in component:
+        raise ValueError(f"{field_name} must be a name, not a path.")
+    if component in {".", ".."}:
+        raise ValueError(f"{field_name} must not be a relative path marker.")
+    if Path(component).is_absolute():
+        raise ValueError(f"{field_name} must not be an absolute path.")
+    return component
+
+
+def normalize_document_name(document_name: str) -> str:
+    name = validate_path_component(document_name, "document_name")
+    for suffix in _DOCUMENT_SUFFIXES:
+        if name.endswith(suffix):
+            name = name[: -len(suffix)]
+            break
+    return validate_path_component(name, "document_name")
+
+
+def normalize_docx_filename(docx_filename: str) -> str:
+    name = validate_path_component(docx_filename, "docx_filename")
+    if not name.endswith(".docx"):
+        name = f"{name}.docx"
+    stem = name[: -len(".docx")]
+    validate_path_component(stem, "docx_filename")
+    return name
 
 
 def next_docx_version(desired: Path) -> Path:
diff --git a/tests/test_convert_document_pdf_dependency.py b/tests/test_convert_document_pdf_dependency.py
new file mode 100644
index 00000000..2cbe7562
--- /dev/null
+++ b/tests/test_convert_document_pdf_dependency.py
@@ -0,0 +1,156 @@
+import builtins
+
+import docs_agent.tools.ConvertDocument as convert_module
+from docs_agent.tools.ConvertDocument import ConvertDocument
+from docs_agent.tools.RestoreDocument import RestoreDocument
+from docs_agent.tools.utils.doc_file_utils import (
+    get_project_dir,
+    normalize_document_name,
+    normalize_docx_filename,
+)
+
+
+def test_convert_document_import_does_not_require_weasyprint():
+    assert ConvertDocument.__name__ == "ConvertDocument"
+
+
+def test_pdf_conversion_reports_missing_weasyprint_native_dependency(monkeypatch, tmp_path):
+    project_dir = tmp_path / "demo_project" / "documents"
+    project_dir.mkdir(parents=True)
+    (project_dir / "report.source.html").write_text(
+        "<!doctype html><html><body><h1>Report</h1></body></html>",
+        encoding="utf-8",
+    )
+
+    real_import = builtins.__import__
+
+    def guarded_import(name, *args, **kwargs):
+        if name == "weasyprint":
+            raise OSError("libpango-1.0-0: cannot open shared object file")
+        return real_import(name, *args, **kwargs)
+
+    monkeypatch.setattr(builtins, "__import__", guarded_import)
+    monkeypatch.setattr(convert_module, "get_project_dir", lambda _project_name: project_dir)
+    monkeypatch.setattr(convert_module, "_embed_local_images", lambda html, _project_dir: html)
+    monkeypatch.setattr(convert_module, "_auto_page_breaks", lambda html: html)
+
+    result = ConvertDocument(
+        project_name="demo_project",
+        document_name="report",
+        output_format="pdf",
+    ).run()
+
+    assert "PDF export requires WeasyPrint" in result
+    assert "libpango-1.0-0" in result
+    assert not (project_dir / "report.pdf").exists()
+
+
+def test_markdown_conversion_still_works_without_weasyprint(monkeypatch, tmp_path):
+    project_dir = tmp_path / "demo_project" / "documents"
+    project_dir.mkdir(parents=True)
+    (project_dir / "report.source.html").write_text(
+        "<!doctype html><html><body><h1>Report</h1><p>Hello</p></body></html>",
+        encoding="utf-8",
+    )
+
+    real_import = builtins.__import__
+
+    def guarded_import(name, *args, **kwargs):
+        if name == "weasyprint":
+            raise OSError("libpango-1.0-0: cannot open shared object file")
+        return real_import(name, *args, **kwargs)
+
+    monkeypatch.setattr(builtins, "__import__", guarded_import)
+    monkeypatch.setattr(convert_module, "get_project_dir", lambda _project_name: project_dir)
+    def fail_if_embedding_runs(_html, _project_dir):
+        raise AssertionError("Markdown conversion should not embed local images")
+
+    monkeypatch.setattr(convert_module, "_embed_local_images", fail_if_embedding_runs)
+
+    result = ConvertDocument(
+        project_name="demo_project",
+        document_name="report",
+        output_format="markdown",
+    ).run()
+
+    assert "Successfully converted document to MARKDOWN" in result
+    markdown = (project_dir / "report.md").read_text(encoding="utf-8")
+    assert "# Report" in markdown
+    assert "Hello" in markdown
+
+
+def test_txt_conversion_skips_render_only_image_embedding(monkeypatch, tmp_path):
+    project_dir = tmp_path / "demo_project" / "documents"
+    project_dir.mkdir(parents=True)
+    (project_dir / "report.source.html").write_text(
+        "<!doctype html><html><body><h1>Report</h1><img src='assets/chart.png'><p>Hello</p></body></html>",
+        encoding="utf-8",
+    )
+
+    def fail_if_embedding_runs(_html, _project_dir):
+        raise AssertionError("TXT conversion should not embed local images")
+
+    monkeypatch.setattr(convert_module, "get_project_dir", lambda _project_name: project_dir)
+    monkeypatch.setattr(convert_module, "_embed_local_images", fail_if_embedding_runs)
+
+    result = ConvertDocument(
+        project_name="demo_project",
+        document_name="report",
+        output_format="txt",
+    ).run()
+
+    assert "Successfully converted document to TXT" in result
+    text = (project_dir / "report.txt").read_text(encoding="utf-8")
+    assert "Report" in text
+    assert "Hello" in text
+
+
+def test_document_path_components_reject_traversal():
+    invalid_values = [
+        "../escape",
+        "nested/project",
+        "nested\\project",
+        "/tmp/project",
+        ".",
+        "..",
+        "",
+        "safe\x00name",
+    ]
+
+    for value in invalid_values:
+        try:
+            get_project_dir(value)
+        except ValueError as exc:
+            assert "project_name" in str(exc)
+        else:
+            raise AssertionError(f"project_name accepted unsafe value: {value!r}")
+
+        try:
+            normalize_document_name(value)
+        except ValueError as exc:
+            assert "document_name" in str(exc)
+        else:
+            raise AssertionError(f"document_name accepted unsafe value: {value!r}")
+
+        try:
+            normalize_docx_filename(value)
+        except ValueError as exc:
+            assert "docx_filename" in str(exc)
+        else:
+            raise AssertionError(f"docx_filename accepted unsafe value: {value!r}")
+
+
+def test_document_name_normalization_keeps_single_safe_stem():
+    assert normalize_document_name("report.source.html") == "report"
+    assert normalize_document_name("report.html") == "report"
+    assert normalize_document_name("report.docx") == "report"
+    assert normalize_document_name("report.md") == "report"
+    assert normalize_docx_filename("report") == "report.docx"
+    assert normalize_docx_filename("report_v2.docx") == "report_v2.docx"
+
+
+def test_invalid_restore_filename_returns_tool_error():
+    result = RestoreDocument(project_name="safe_project", docx_filename="../report.docx").run()
+
+    assert result.startswith("Error restoring document:")
+    assert "docx_filename" in result