Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
53 changes: 37 additions & 16 deletions docs_agent/tools/ConvertDocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,19 +3,11 @@
from pathlib import Path
from typing import Literal

import html2text
from agency_swarm.tools import BaseTool, ToolOutputText, tool_output_file_from_path
from bs4 import BeautifulSoup
from pydantic import Field
from weasyprint import HTML

from .CreateDocument import CreateDocument
from .utils.html_docx_core import html_to_docx
from .utils.html_docx_images import embed_local_images
from .utils.html_docx_playwright import auto_page_breaks

# Base directory for all document files
from .utils.doc_file_utils import get_project_dir, next_docx_version
from .utils.doc_file_utils import get_project_dir, next_docx_version, normalize_document_name

# Characters that PDF fonts commonly lack glyphs for.
# Includes both proper Unicode typographic chars and ASCII control-char
Expand Down Expand Up @@ -44,6 +36,30 @@ def _normalize_unicode(html: str) -> str:
return html.translate(_UNICODE_TO_ASCII)


def _load_weasyprint_html():
try:
from weasyprint import HTML
except (ImportError, OSError) as exc:
raise RuntimeError(
"PDF export requires WeasyPrint and its native system libraries. "
"Install the WeasyPrint platform dependencies, then retry PDF export. "
f"Original error: {exc}"
) from exc
return HTML


def _embed_local_images(html_content: str, project_dir: Path) -> str:
    """Delegate to ``embed_local_images``, importing it only when called.

    The helper module is imported lazily so that loading this file does not
    pull in the image-handling dependencies up front.
    """
    from .utils.html_docx_images import embed_local_images as embed

    embedded_html = embed(html_content, project_dir)
    return embedded_html


def _auto_page_breaks(html_content: str) -> str:
    """Delegate to ``auto_page_breaks``, importing it only when called.

    The helper lives in the Playwright-backed utility module, so the import
    is deferred until a conversion actually needs it.
    """
    from .utils.html_docx_playwright import auto_page_breaks as insert_breaks

    paginated_html = insert_breaks(html_content)
    return paginated_html


class ConvertDocument(BaseTool):
"""
Convert a document to different formats.
Expand Down Expand Up @@ -95,11 +111,7 @@ def run(self):
if not project_dir.exists():
return f"Error: Project '{self.project_name}' not found."

doc_name = (
self.document_name.replace(".html", "")
.replace(".docx", "")
.replace(".md", "")
)
doc_name = normalize_document_name(self.document_name)
source_path = project_dir / f"{doc_name}.source.html"

if not source_path.exists():
Expand All @@ -124,9 +136,9 @@ def run(self):
)

html_content = source_path.read_text(encoding="utf-8")
html_content = embed_local_images(html_content, project_dir)
if self.output_format in ("pdf", "docx"):
html_content = auto_page_breaks(html_content)
html_content = _embed_local_images(html_content, project_dir)
html_content = _auto_page_breaks(html_content)

if self.output_format == "pdf":
self._convert_to_pdf(html_content, output_path)
Expand Down Expand Up @@ -169,27 +181,36 @@ def run(self):

def _convert_to_pdf(self, html_content: str, output_path: Path):
    """Write *html_content* to *output_path* as a PDF using WeasyPrint."""
    # Resolve WeasyPrint first so a missing install fails before any work.
    html_cls = _load_weasyprint_html()
    # Characters that PDF fonts commonly lack are mapped to ASCII first.
    printable_html = _normalize_unicode(html_content)
    document = html_cls(string=printable_html)
    document.write_pdf(output_path)

def _convert_to_docx(self, html_content: str, output_path: Path):
    """Write *html_content* to *output_path* as DOCX via the internal converter."""
    # Imported lazily so the DOCX stack loads only when this format is requested.
    from .utils.html_docx_core import html_to_docx as write_docx

    write_docx(html_content, output_path)

def _convert_to_markdown(self, html_content: str, output_path: Path):
    """Write *html_content* to *output_path* as UTF-8 Markdown using html2text."""
    from html2text import HTML2Text

    md_converter = HTML2Text()
    # A body width of 0 disables html2text's hard line wrapping.
    md_converter.body_width = 0
    output_path.write_text(md_converter.handle(html_content), encoding="utf-8")

def _convert_to_txt(self, html_content: str, output_path: Path):
    """Write the visible text of *html_content* to *output_path* as UTF-8."""
    from bs4 import BeautifulSoup

    parsed = BeautifulSoup(html_content, "html.parser")
    # One text node per line, each stripped of surrounding whitespace.
    plain_text = parsed.get_text(separator="\n", strip=True)
    output_path.write_text(plain_text, encoding="utf-8")


if __name__ == "__main__":
from .CreateDocument import CreateDocument

print("=" * 70)
print("TEST: ConvertDocument Tool")
print("=" * 70)
Expand Down
10 changes: 3 additions & 7 deletions docs_agent/tools/CreateDocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
from .utils.html_validation import build_unsupported_error, find_unsupported_html
from .utils.html_docx_playwright import _launch_chromium_with_install
from .utils.html_docx_constants import _UA_RESET_STYLE
from .utils.doc_file_utils import get_project_dir
from .utils.doc_file_utils import get_project_dir, normalize_document_name
from .utils.html_docx_images import embed_local_images


Expand Down Expand Up @@ -87,11 +87,7 @@ def run(self):
(project_dir / "assets").mkdir(exist_ok=True)

# Strip extension if the caller included one
doc_name = (
self.document_name.replace(".html", "")
.replace(".docx", "")
.replace(".md", "")
)
doc_name = normalize_document_name(self.document_name)

content_value = self.content.value
if not content_value:
Expand Down Expand Up @@ -284,4 +280,4 @@ def _ensure_ua_reset(html_content: str) -> str:
content={"type": "html", "value": html_simple},
)
print(tool.run())
print()
print()
6 changes: 2 additions & 4 deletions docs_agent/tools/ModifyDocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from agency_swarm.tools import BaseTool
from pydantic import Field

from .utils.doc_file_utils import get_project_dir
from .utils.doc_file_utils import get_project_dir, normalize_document_name
from .utils.html_validation import build_unsupported_error, find_unsupported_html


Expand Down Expand Up @@ -94,9 +94,7 @@ def run(self) -> str:
if not project_dir.exists():
return f"Error: Project '{self.project_name}' not found."

doc_name = (
self.document_name.replace(".html", "").replace(".docx", "").replace(".md", "")
)
doc_name = normalize_document_name(self.document_name)
source_path = project_dir / f"{doc_name}.source.html"
md_path = project_dir / f"{doc_name}.md"

Expand Down
57 changes: 28 additions & 29 deletions docs_agent/tools/RestoreDocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
from agency_swarm.tools import BaseTool
from pydantic import Field

from .utils.doc_file_utils import get_project_dir
from .utils.doc_file_utils import get_project_dir, normalize_docx_filename


class RestoreDocument(BaseTool):
Expand Down Expand Up @@ -42,35 +42,34 @@ class RestoreDocument(BaseTool):
)

def run(self) -> str:
    """Restore a document's working source from a saved DOCX snapshot.

    Looks up ``<docx_name>.snapshot.html`` in the project's documents
    directory and copies its contents over ``<doc_name>.source.html``.

    Returns:
        A confirmation message naming the restored document and its source
        path, or an ``Error: ...`` string when the snapshot is missing or
        any step fails (including name validation).
    """
    # Tool boundary: failures are reported as text rather than raised.
    try:
        project_dir = get_project_dir(self.project_name)
        docx_name = normalize_docx_filename(self.docx_filename)
        snapshot_path = project_dir / f"{docx_name}.snapshot.html"

        if not snapshot_path.exists():
            # List what is available so the caller can pick a valid snapshot.
            available = sorted(
                p.name for p in project_dir.glob("*.docx.snapshot.html")
            )
            hint = (
                "\nAvailable snapshots:\n" + "\n".join(f"  {s}" for s in available)
                if available
                else "\nNo snapshots found in this project."
            )
            return f"Error: No snapshot found for '{docx_name}'.{hint}"

        # Derive the working document name from the DOCX file name.
        # NOTE(review): _strip_version is defined later in this file;
        # presumably it removes a version suffix from the stem - confirm.
        doc_name = Path(docx_name).stem
        doc_name = _strip_version(doc_name)
        source_path = project_dir / f"{doc_name}.source.html"

        source_path.write_text(
            snapshot_path.read_text(encoding="utf-8"), encoding="utf-8"
        )

        return (
            f"Restored '{doc_name}' to the version captured in '{docx_name}'.\n"
            f"Working source: {source_path}"
        )
    except Exception as e:
        return f"Error restoring document: {str(e)}"


def _strip_version(stem: str) -> str:
Expand Down
8 changes: 2 additions & 6 deletions docs_agent/tools/ViewDocument.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from agency_swarm.tools import BaseTool
from pydantic import Field

from .utils.doc_file_utils import get_project_dir
from .utils.doc_file_utils import get_project_dir, normalize_document_name


class ViewDocument(BaseTool):
Expand Down Expand Up @@ -43,11 +43,7 @@ def run(self):
if not project_dir.exists():
return f"Error: Project '{self.project_name}' not found."

doc_name = (
self.document_name.replace(".html", "")
.replace(".docx", "")
.replace(".md", "")
)
doc_name = normalize_document_name(self.document_name)
source_path = project_dir / f"{doc_name}.source.html"
docx_path = project_dir / f"{doc_name}.docx"
md_path = project_dir / f"{doc_name}.md"
Expand Down
44 changes: 43 additions & 1 deletion docs_agent/tools/utils/doc_file_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,54 @@
from pathlib import Path


# File-name suffixes stripped from user-supplied document names.
# Order matters: normalize_document_name removes only the first suffix that
# matches, so the compound ".source.html" must be tried before ".html".
_DOCUMENT_SUFFIXES = (".source.html", ".html", ".docx", ".md", ".pdf", ".txt")


def get_mnt_dir() -> Path:
    """Return the root ``mnt`` directory used for managed files.

    When the ``/.dockerenv`` marker file exists the fixed path ``/app/mnt``
    is used; otherwise ``mnt`` is resolved three directory levels above the
    directory containing this file.
    """
    if Path("/.dockerenv").is_file():
        return Path("/app/mnt")
    return Path(__file__).parents[3] / "mnt"


def get_project_dir(project_name: str) -> Path:
    """Return the ``documents`` directory for *project_name* under the mnt root.

    The project name is validated as a single safe path component before it
    is joined, so the result cannot point outside the managed mnt tree.

    Raises:
        ValueError: If *project_name* is empty or is not a plain name
            (see ``validate_path_component``).
    """
    safe_name = validate_path_component(project_name, "project_name")
    return get_mnt_dir() / safe_name / "documents"


def validate_path_component(value: str, field_name: str) -> str:
    """Return *value* as a safe single path component, or raise ``ValueError``.

    Document tools accept user-provided project and file names. Keep those
    names inside the managed mnt tree by rejecting separators, absolute
    paths, drive prefixes, parent traversal, and NUL bytes before composing
    paths.

    Args:
        value: The user-supplied name; ``None`` is treated as empty.
        field_name: Name of the field, used in the error message.

    Returns:
        The name with surrounding whitespace stripped.

    Raises:
        ValueError: If the name is empty or is not a plain single component.
    """
    # Local import: only ``Path`` is imported at module level.
    from pathlib import PureWindowsPath

    component = str(value or "").strip()
    if not component:
        raise ValueError(f"{field_name} must not be empty.")
    if "\x00" in component:
        raise ValueError(f"{field_name} must not contain NUL bytes.")
    if "/" in component or "\\" in component:
        raise ValueError(f"{field_name} must be a name, not a path.")
    if component in {".", ".."}:
        raise ValueError(f"{field_name} must not be a relative path marker.")
    if Path(component).is_absolute():
        raise ValueError(f"{field_name} must not be an absolute path.")
    # A drive-qualified name such as "C:evil" has no separator and is not
    # absolute, yet joining it onto a base path on Windows discards the base.
    # Checked with PureWindowsPath so the rule is the same on every platform.
    if PureWindowsPath(component).drive:
        raise ValueError(f"{field_name} must not include a drive prefix.")
    return component


def normalize_document_name(document_name: str) -> str:
    """Return *document_name* validated and without a known file suffix.

    At most one suffix is removed: the first entry of ``_DOCUMENT_SUFFIXES``
    that the name ends with. The remainder is validated again so a name that
    consisted only of a suffix is rejected.

    Raises:
        ValueError: If the name, before or after stripping, is not a safe
            single path component.
    """
    cleaned = validate_path_component(document_name, "document_name")
    matched = next(
        (ext for ext in _DOCUMENT_SUFFIXES if cleaned.endswith(ext)),
        None,
    )
    if matched is not None:
        cleaned = cleaned[: -len(matched)]
    return validate_path_component(cleaned, "document_name")


def normalize_docx_filename(docx_filename: str) -> str:
    """Return *docx_filename* validated and guaranteed to end in ``.docx``.

    The extension is appended when missing. The part before ``.docx`` is
    validated as well, so a bare ``.docx`` is rejected.

    Raises:
        ValueError: If the name or its stem is not a safe single path
            component.
    """
    extension = ".docx"
    filename = validate_path_component(docx_filename, "docx_filename")
    filename = filename if filename.endswith(extension) else f"{filename}{extension}"
    # Validate the stem only for its side effect of raising on bad input.
    validate_path_component(filename[: -len(extension)], "docx_filename")
    return filename


def next_docx_version(desired: Path) -> Path:
Expand Down
Loading