VRSEN · fortybyte · May 8, 2026
diff --git a/docs_agent/tools/ConvertDocument.py b/docs_agent/tools/ConvertDocument.py
@@ -3,19 +3,11 @@
 from pathlib import Path
 from typing import Literal
 
-import html2text
 from agency_swarm.tools import BaseTool, ToolOutputText, tool_output_file_from_path
-from bs4 import BeautifulSoup
 from pydantic import Field
-from weasyprint import HTML
-
-from .CreateDocument import CreateDocument
-from .utils.html_docx_core import html_to_docx
-from .utils.html_docx_images import embed_local_images
-from .utils.html_docx_playwright import auto_page_breaks
 
 # Base directory for all document files
-from .utils.doc_file_utils import get_project_dir, next_docx_version
+from .utils.doc_file_utils import get_project_dir, next_docx_version, normalize_document_name
 
 # Characters that PDF fonts commonly lack glyphs for.
 # Includes both proper Unicode typographic chars and ASCII control-char
@@ -44,6 +36,30 @@ def _normalize_unicode(html: str) -> str:
     return html.translate(_UNICODE_TO_ASCII)
 
 
+def _load_weasyprint_html():
+    try:  # resolved at call time, so importing this module does not import WeasyPrint
+        from weasyprint import HTML as weasyprint_html
+        return weasyprint_html
+    except (ImportError, OSError) as load_error:
+        raise RuntimeError(
+            "PDF export requires WeasyPrint and its native system libraries. "
+            "Install the WeasyPrint platform dependencies, then retry PDF export. "
+            f"Original error: {load_error}"
+        ) from load_error
+
+
+def _embed_local_images(html_content: str, project_dir: Path) -> str:
+    # Deferred import: the image helper is resolved at call time, not at module import.
+    from .utils.html_docx_images import embed_local_images as inline_local_images
+    return inline_local_images(html_content, project_dir)
+
+
+def _auto_page_breaks(html_content: str) -> str:
+    # Deferred import: the page-break helper is resolved at call time, not at module import.
+    from .utils.html_docx_playwright import auto_page_breaks as insert_page_breaks
+    return insert_page_breaks(html_content)
+
+
 class ConvertDocument(BaseTool):
     """
     Convert a document to different formats.
@@ -95,11 +111,7 @@ def run(self):
             if not project_dir.exists():
                 return f"Error: Project '{self.project_name}' not found."
 
-            doc_name = (
-                self.document_name.replace(".html", "")
-                .replace(".docx", "")
-                .replace(".md", "")
-            )
+            doc_name = normalize_document_name(self.document_name)
             source_path = project_dir / f"{doc_name}.source.html"
 
             if not source_path.exists():
@@ -124,9 +136,9 @@ def run(self):
                 )
 
             html_content = source_path.read_text(encoding="utf-8")
-            html_content = embed_local_images(html_content, project_dir)
             if self.output_format in ("pdf", "docx"):
-                html_content = auto_page_breaks(html_content)
+                html_content = _embed_local_images(html_content, project_dir)
+                html_content = _auto_page_breaks(html_content)
 
             if self.output_format == "pdf":
                 self._convert_to_pdf(html_content, output_path)
@@ -169,27 +181,36 @@ def run(self):
 
     def _convert_to_pdf(self, html_content: str, output_path: Path):
         """Convert HTML to PDF using weasyprint."""
+        HTML = _load_weasyprint_html()  # imported at call time; raises RuntimeError if WeasyPrint cannot load
         HTML(string=_normalize_unicode(html_content)).write_pdf(output_path)
 
     def _convert_to_docx(self, html_content: str, output_path: Path):
         """Convert HTML to DOCX using the internal converter."""
+        from .utils.html_docx_core import html_to_docx  # deferred: imported when this method runs, not at module import
+
         html_to_docx(html_content, output_path)
 
     def _convert_to_markdown(self, html_content: str, output_path: Path):
         """Convert HTML to Markdown."""
+        import html2text  # deferred: imported when this method runs, not at module import
+
         converter = html2text.HTML2Text()
         converter.body_width = 0  # Don't wrap text
         markdown = converter.handle(html_content)
         output_path.write_text(markdown, encoding="utf-8")
 
     def _convert_to_txt(self, html_content: str, output_path: Path):
         """Convert HTML to plain text."""
+        from bs4 import BeautifulSoup  # deferred: imported when this method runs, not at module import
+
         soup = BeautifulSoup(html_content, "html.parser")
         text = soup.get_text(separator="\n", strip=True)
         output_path.write_text(text, encoding="utf-8")
 
 
 if __name__ == "__main__":
+    from .CreateDocument import CreateDocument
+
     print("=" * 70)
     print("TEST: ConvertDocument Tool")
     print("=" * 70)

diff --git a/docs_agent/tools/CreateDocument.py b/docs_agent/tools/CreateDocument.py
@@ -10,7 +10,7 @@
 from .utils.html_validation import build_unsupported_error, find_unsupported_html
 from .utils.html_docx_playwright import _launch_chromium_with_install
 from .utils.html_docx_constants import _UA_RESET_STYLE
-from .utils.doc_file_utils import get_project_dir
+from .utils.doc_file_utils import get_project_dir, normalize_document_name
 from .utils.html_docx_images import embed_local_images
 
 
@@ -87,11 +87,7 @@ def run(self):
             (project_dir / "assets").mkdir(exist_ok=True)
 
             # Strip extension if the caller included one
-            doc_name = (
-                self.document_name.replace(".html", "")
-                .replace(".docx", "")
-                .replace(".md", "")
-            )
+            doc_name = normalize_document_name(self.document_name)
 
             content_value = self.content.value
             if not content_value:
@@ -284,4 +280,4 @@ def _ensure_ua_reset(html_content: str) -> str:
         content={"type": "html", "value": html_simple},
     )
     print(tool.run())
-    print()
+    print()
diff --git a/docs_agent/tools/ModifyDocument.py b/docs_agent/tools/ModifyDocument.py
@@ -7,7 +7,7 @@
 from agency_swarm.tools import BaseTool
 from pydantic import Field
 
-from .utils.doc_file_utils import get_project_dir
+from .utils.doc_file_utils import get_project_dir, normalize_document_name
 from .utils.html_validation import build_unsupported_error, find_unsupported_html
 
 
@@ -94,9 +94,7 @@ def run(self) -> str:
             if not project_dir.exists():
                 return f"Error: Project '{self.project_name}' not found."
 
-            doc_name = (
-                self.document_name.replace(".html", "").replace(".docx", "").replace(".md", "")
-            )
+            doc_name = normalize_document_name(self.document_name)
             source_path = project_dir / f"{doc_name}.source.html"
             md_path = project_dir / f"{doc_name}.md"
 

diff --git a/docs_agent/tools/RestoreDocument.py b/docs_agent/tools/RestoreDocument.py
@@ -5,7 +5,7 @@
 from agency_swarm.tools import BaseTool
 from pydantic import Field
 
-from .utils.doc_file_utils import get_project_dir
+from .utils.doc_file_utils import get_project_dir, normalize_docx_filename
 
 
 class RestoreDocument(BaseTool):
@@ -42,35 +42,34 @@ class RestoreDocument(BaseTool):
     )
 
     def run(self) -> str:
-        project_dir = get_project_dir(self.project_name)
-        docx_name = (
-            self.docx_filename
-            if self.docx_filename.endswith(".docx")
-            else f"{self.docx_filename}.docx"
-        )
-        snapshot_path = project_dir / f"{docx_name}.snapshot.html"
-
-        if not snapshot_path.exists():
-            available = sorted(
-                p.name for p in project_dir.glob("*.docx.snapshot.html")
+        try:  # any exception raised below is returned as an error string instead of propagating
+            project_dir = get_project_dir(self.project_name)  # raises ValueError for an unsafe project name
+            docx_name = normalize_docx_filename(self.docx_filename)  # validates the name and appends ".docx" if missing
+            snapshot_path = project_dir / f"{docx_name}.snapshot.html"
+
+            if not snapshot_path.exists():
+                available = sorted(  # names of the snapshots that do exist, listed in the error hint
+                    p.name for p in project_dir.glob("*.docx.snapshot.html")
+                )
+                hint = (
+                    "\nAvailable snapshots:\n" + "\n".join(f"  {s}" for s in available)
+                    if available
+                    else "\nNo snapshots found in this project."
+                )
+                return f"Error: No snapshot found for '{docx_name}'.{hint}"
+
+            doc_name = Path(docx_name).stem
+            doc_name = _strip_version(doc_name)  # presumably drops a version marker from the stem; confirm in _strip_version
+            source_path = project_dir / f"{doc_name}.source.html"
+
+            source_path.write_text(snapshot_path.read_text(encoding="utf-8"), encoding="utf-8")  # overwrite the working source with the snapshot text
+
+            return (
+                f"Restored '{doc_name}' to the version captured in '{docx_name}'.\n"
+                f"Working source: {source_path}"
             )
-            hint = (
-                "\nAvailable snapshots:\n" + "\n".join(f"  {s}" for s in available)
-                if available
-                else "\nNo snapshots found in this project."
-            )
-            return f"Error: No snapshot found for '{docx_name}'.{hint}"
-
-        doc_name = Path(docx_name).stem
-        doc_name = _strip_version(doc_name)
-        source_path = project_dir / f"{doc_name}.source.html"
-
-        source_path.write_text(snapshot_path.read_text(encoding="utf-8"), encoding="utf-8")
-
-        return (
-            f"Restored '{doc_name}' to the version captured in '{docx_name}'.\n"
-            f"Working source: {source_path}"
-        )
+        except Exception as e:  # includes the ValueError raised by the name validators
+            return f"Error restoring document: {str(e)}"
 
 
 def _strip_version(stem: str) -> str:

diff --git a/docs_agent/tools/ViewDocument.py b/docs_agent/tools/ViewDocument.py
@@ -4,7 +4,7 @@
 from agency_swarm.tools import BaseTool
 from pydantic import Field
 
-from .utils.doc_file_utils import get_project_dir
+from .utils.doc_file_utils import get_project_dir, normalize_document_name
 
 
 class ViewDocument(BaseTool):
@@ -43,11 +43,7 @@ def run(self):
             if not project_dir.exists():
                 return f"Error: Project '{self.project_name}' not found."
 
-            doc_name = (
-                self.document_name.replace(".html", "")
-                .replace(".docx", "")
-                .replace(".md", "")
-            )
+            doc_name = normalize_document_name(self.document_name)
             source_path = project_dir / f"{doc_name}.source.html"
             docx_path = project_dir / f"{doc_name}.docx"
             md_path = project_dir / f"{doc_name}.md"

diff --git a/docs_agent/tools/utils/doc_file_utils.py b/docs_agent/tools/utils/doc_file_utils.py
@@ -4,12 +4,54 @@
 from pathlib import Path
 
 
+_DOCUMENT_SUFFIXES = (".source.html", ".html", ".docx", ".md", ".pdf", ".txt")
+
+
 def get_mnt_dir() -> Path:
     return Path("/app/mnt") if Path("/.dockerenv").is_file() else Path(__file__).parents[3] / "mnt"
 
 
 def get_project_dir(project_name: str) -> Path:
-    return get_mnt_dir() / project_name / "documents"
+    return get_mnt_dir() / validate_path_component(project_name, "project_name") / "documents"  # ValueError on unsafe names
+
+
+def validate_path_component(value: str, field_name: str) -> str:
+    """Return *value* stripped, as a safe single path component, or raise ValueError.
+
+    Document tools accept user-provided project and file names. Keep those names
+    inside the managed mnt tree by rejecting separators, drive or root anchors,
+    parent traversal, and NUL bytes before composing paths.
+    """
+    component = str(value or "").strip()
+    if not component:
+        raise ValueError(f"{field_name} must not be empty.")
+    if "\x00" in component:
+        raise ValueError(f"{field_name} must not contain NUL bytes.")
+    if "/" in component or "\\" in component:
+        raise ValueError(f"{field_name} must be a name, not a path.")
+    if component in {".", ".."}:
+        raise ValueError(f"{field_name} must not be a relative path marker.")
+    if Path(component).anchor:  # a Windows drive prefix ("C:name") would replace the base when joined
+        raise ValueError(f"{field_name} must not be an absolute or drive-qualified path.")
+    return component
+
+
+def normalize_document_name(document_name: str) -> str:
+    """Validate *document_name* and drop at most one known document suffix."""
+    stem = validate_path_component(document_name, "document_name")
+    matched = next((s for s in _DOCUMENT_SUFFIXES if stem.endswith(s)), "")
+    if matched:  # first match in tuple order wins; only that one suffix is removed
+        stem = stem[: len(stem) - len(matched)]
+    return validate_path_component(stem, "document_name")
+
+
+def normalize_docx_filename(docx_filename: str) -> str:
+    """Validate *docx_filename* and return it with a guaranteed ``.docx`` suffix."""
+    checked = validate_path_component(docx_filename, "docx_filename")
+    filename = checked if checked.endswith(".docx") else f"{checked}.docx"
+    # Validate the stem too: a bare ".docx" has an empty stem and is rejected here.
+    validate_path_component(filename[: -len(".docx")], "docx_filename")
+    return filename
 
 
 def next_docx_version(desired: Path) -> Path: