From 1cd907f71f52a198f4e847486c68960108603955 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Louis-F=C3=A9lix=20Nothias?= Date: Fri, 19 Jun 2026 19:38:48 +0200 Subject: [PATCH] feat(export): opt-in APA .docx export of agentic answers Generalizable rework of the approach in #20 (closed as draft): - OFF by default via config.rag_modes.agentic.export_apa_docx (+ _dir). - python-docx is an optional [docx] extra (uv sync --extra docx), imported lazily; absent extra raises a clear ImportError, swallowed off the answer path. - Exporter tolerates Paper objects, dicts, and Author objects/strings (the original treated papers/authors as dicts/strings and silently failed). - No debug prints; export failures never break the answer; references deduped. - Hermetic tests: APA formatting, config default-off, disabled-helper no-op, and a real .docx write (skipped without the extra). Co-Authored-By: Claude Opus 4.7 --- config.example.yml | 4 + pyproject.toml | 6 ++ src/perspicacite/config/schema.py | 4 + src/perspicacite/rag/agentic/orchestrator.py | 28 ++++++ src/perspicacite/rag/export/__init__.py | 1 + .../rag/export/apa_docx_exporter.py | 99 +++++++++++++++++++ src/perspicacite/web/state.py | 2 + tests/unit/test_apa_docx_export.py | 83 ++++++++++++++++ 8 files changed, 227 insertions(+) create mode 100644 src/perspicacite/rag/export/__init__.py create mode 100644 src/perspicacite/rag/export/apa_docx_exporter.py create mode 100644 tests/unit/test_apa_docx_export.py diff --git a/config.example.yml b/config.example.yml index 2f213af..1c97097 100644 --- a/config.example.yml +++ b/config.example.yml @@ -163,6 +163,10 @@ rag_modes: # Higher values improve coverage but increase latency and token cost. # Range: 1–64; default is 8. map_reduce_max_papers: 8 + # Opt-in: write each agentic answer + an APA reference list to a .docx. + # Off by default; requires the optional [docx] extra (`uv sync --extra docx`). 
+ export_apa_docx: false + export_apa_docx_dir: "output" literature_survey: seed_known_max: 5 # known papers to keep as survey anchors when all results are already in KB diff --git a/pyproject.toml b/pyproject.toml index a7a7144..c7212ff 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -163,6 +163,12 @@ docling = [ "pandas>=2.0,<3", ] +# Optional: APA .docx export of agentic answers +# (config.rag_modes.agentic.export_apa_docx). Off by default. +docx = [ + "python-docx>=1.1.0", +] + [project.scripts] perspicacite = "perspicacite.cli:main" diff --git a/src/perspicacite/config/schema.py b/src/perspicacite/config/schema.py index 9bc1eb1..f6d152d 100644 --- a/src/perspicacite/config/schema.py +++ b/src/perspicacite/config/schema.py @@ -752,6 +752,10 @@ class RAGModeSettings(BaseModel): # deep_research: allow live web / academic-API searches in research cycles. # Set to false for KB-only evaluation environments to prevent rate-limit errors. use_websearch: bool = True + # Opt-in: write each agentic answer + an APA reference list to a .docx. + # Off by default; requires the optional [docx] extra (`uv sync --extra docx`). + export_apa_docx: bool = False + export_apa_docx_dir: str = "output" class RAGModesConfig(BaseModel): diff --git a/src/perspicacite/rag/agentic/orchestrator.py b/src/perspicacite/rag/agentic/orchestrator.py index 67d972d..a746831 100644 --- a/src/perspicacite/rag/agentic/orchestrator.py +++ b/src/perspicacite/rag/agentic/orchestrator.py @@ -441,10 +441,15 @@ def __init__( kb_metas: list | None = None, config: Any = None, app_state: Any = None, + export_apa_docx: bool = False, + export_apa_docx_dir: str = "output", ): self.llm = llm_client self.config = config self.app_state = app_state + # Opt-in APA .docx export of agentic answers (default off). 
+ self.export_apa_docx = export_apa_docx + self.export_apa_docx_dir = export_apa_docx_dir or "output" self.tools = tool_registry self.embeddings = embedding_provider self.vector_store = vector_store @@ -2233,6 +2238,27 @@ def _build_facet_overview(self, session: AgentSession) -> str: sections.append(f' [{status}] "{facet.query}" — {n} source(s): {title_list}') return "\n".join(sections) + def _maybe_export_apa_docx(self, answer: str, papers: list) -> None: + """Opt-in: write the answer + an APA reference list to a .docx. + + No-op unless ``export_apa_docx`` is enabled. Never raises into the + answer path — export failures (including the optional [docx] extra + being absent) are logged and swallowed. + """ + if not getattr(self, "export_apa_docx", False): + return + try: + import hashlib + + from perspicacite.rag.export.apa_docx_exporter import export_apa_docx + + out_dir = getattr(self, "export_apa_docx_dir", "output") or "output" + doc_id = hashlib.sha1((answer or "").encode("utf-8")).hexdigest()[:8] + path = export_apa_docx(answer, papers, f"{out_dir}/manuscript_{doc_id}.docx") + logger.info("agentic_manuscript_exported", path=str(path)) + except Exception: + logger.warning("agentic_manuscript_export_failed", exc_info=True) + async def _generate_answer( self, query: str, @@ -2427,6 +2453,7 @@ async def _generate_answer( answer = answer.rstrip() + "\n\n" + references_section logger.info("agentic_references_section_added", answer_chars=len(answer)) + self._maybe_export_apa_docx(answer, papers) return answer, citation_map _CITE_RE = re.compile(r"\[(\d+(?:\s*,\s*\d+)*)\]") @@ -2626,6 +2653,7 @@ async def _generate_single_paper_answer( if references_section: answer = answer.rstrip() + "\n\n" + references_section + self._maybe_export_apa_docx(answer, papers) return answer, citation_map def _build_numbered_paper_list( diff --git a/src/perspicacite/rag/export/__init__.py b/src/perspicacite/rag/export/__init__.py new file mode 100644 index 0000000..fe48fa3 --- /dev/null 
+++ b/src/perspicacite/rag/export/__init__.py
@@ -0,0 +1 @@
+"""Export helpers for RAG answers (optional formats)."""
diff --git a/src/perspicacite/rag/export/apa_docx_exporter.py b/src/perspicacite/rag/export/apa_docx_exporter.py
new file mode 100644
index 0000000..504a728
--- /dev/null
+++ b/src/perspicacite/rag/export/apa_docx_exporter.py
@@ -0,0 +1,126 @@
+"""Export an agentic answer + its cited papers to an APA-formatted ``.docx``.
+
+Opt-in via ``config.rag_modes.agentic.export_apa_docx`` (default off). Requires
+the optional ``[docx]`` extra (``python-docx``), imported lazily so the
+dependency is only needed when the feature is actually enabled.
+
+Inputs are tolerant: ``papers`` may be ``Paper`` model instances, plain dicts,
+or a mix. Authors may be ``Author`` objects (``.name``), dicts, or strings.
+"""
+
+from __future__ import annotations
+
+import re
+from collections.abc import Iterable
+from pathlib import Path
+from typing import Any
+
+# Leading DOI decorations stripped before the canonical resolver URL is
+# re-added: http(s)://doi.org/, http(s)://dx.doi.org/ and a bare "doi:" label.
+_DOI_PREFIX_RE = re.compile(r"^(?:https?://(?:dx\.)?doi\.org/|doi:\s*)", re.IGNORECASE)
+
+
+def _author_name(author: Any) -> str:
+    """Best-effort author display name from an Author object / dict / string."""
+    if isinstance(author, str):
+        return author.strip()
+    name = getattr(author, "name", None)
+    if name is None and isinstance(author, dict):
+        name = author.get("name")
+    return str(name).strip() if name else ""
+
+
+def format_authors(authors: list[Any] | None) -> str:
+    """APA author list: ``A``, ``A & B``, or ``A, B, & C``.
+
+    A lone string / dict / Author object is treated as a one-author list, so a
+    bare ``"Smith, J."`` is not iterated character by character.
+    """
+    if isinstance(authors, (str, dict)) or not isinstance(authors, Iterable):
+        authors = [authors] if authors else []
+    names = [n for n in (_author_name(a) for a in authors) if n]
+    if not names:
+        return ""
+    if len(names) == 1:
+        return names[0]
+    if len(names) == 2:
+        return f"{names[0]} & {names[1]}"
+    return ", ".join(names[:-1]) + f", & {names[-1]}"
+
+
+def _field(paper: Any, name: str, default: Any = None) -> Any:
+    """Read ``name`` from a Paper object or a dict."""
+    if isinstance(paper, dict):
+        return paper.get(name, default)
+    return getattr(paper, name, default)
+
+
+def _text(value: Any) -> str:
+    """Coerce a missing or non-string field value to a stripped string."""
+    return str(value).strip() if value else ""
+
+
+def _sentence(text: str) -> str:
+    """Close *text* with a period unless it is empty or already terminated."""
+    if not text or text.endswith((".", "?", "!")):
+        return text
+    return f"{text}."
+
+
+def paper_to_apa(paper: Any) -> str:
+    """Render a single APA-style reference string from a Paper / dict.
+
+    Layout: ``Authors (Year). Title. Journal. https://doi.org/<doi>``. Missing
+    parts are omitted (no stray ``.``), a missing year renders as ``n.d.``, and
+    the DOI is normalised to one ``https://doi.org/`` URL whatever prefix
+    (``http://``, ``dx.doi.org``, ``doi:``) the input carried.
+    """
+    authors = format_authors(_field(paper, "authors", []) or [])
+    year = _field(paper, "year") or "n.d."
+    title = _sentence(_text(_field(paper, "title", "")))
+    journal = _sentence(_text(_field(paper, "journal", "")))
+    doi = _DOI_PREFIX_RE.sub("", _text(_field(paper, "doi", ""))).strip()
+
+    # With no authors the lead collapses to "(Year)." rather than " (Year).".
+    parts = [f"{authors} ({year}).".strip(), title, journal]
+    if doi:
+        parts.append(f"https://doi.org/{doi}")
+    return " ".join(part for part in parts if part)
+
+
+def export_apa_docx(
+    manuscript_text: str,
+    papers: list[Any] | None,
+    output_path: str | Path,
+) -> Path:
+    """Write *manuscript_text* + an APA reference list to a ``.docx``.
+
+    Returns the written path. Lazily imports ``python-docx`` and raises a clear
+    ``ImportError`` (with the install hint) when the optional extra is absent.
+    Deduplicates references by their rendered string, preserving order.
+    """
+    try:
+        from docx import Document
+    except ImportError as exc:  # pragma: no cover - only without the [docx] extra
+        raise ImportError(
+            "APA .docx export requires the optional [docx] extra: "
+            "`uv sync --extra docx` (installs python-docx)."
+ ) from exc + + out = Path(output_path) + out.parent.mkdir(parents=True, exist_ok=True) + + doc = Document() + doc.add_heading("Manuscript", level=1) + doc.add_paragraph(manuscript_text or "") + + doc.add_heading("References", level=1) + seen: set[str] = set() + for paper in papers or []: + ref = paper_to_apa(paper) + if ref and ref not in seen: + seen.add(ref) + doc.add_paragraph(ref) + + doc.save(str(out)) + return out diff --git a/src/perspicacite/web/state.py b/src/perspicacite/web/state.py index 1f615b4..add4860 100644 --- a/src/perspicacite/web/state.py +++ b/src/perspicacite/web/state.py @@ -185,6 +185,8 @@ async def initialize(self): max_iterations=5, use_two_pass=getattr(config.knowledge_base, "use_two_pass", True), map_reduce_max_papers=getattr(config.rag_modes.agentic, "map_reduce_max_papers", 8), + export_apa_docx=getattr(config.rag_modes.agentic, "export_apa_docx", False), + export_apa_docx_dir=getattr(config.rag_modes.agentic, "export_apa_docx_dir", "output"), app_state=self, ) logger.info("Agentic orchestrator initialized") diff --git a/tests/unit/test_apa_docx_export.py b/tests/unit/test_apa_docx_export.py new file mode 100644 index 0000000..a15593b --- /dev/null +++ b/tests/unit/test_apa_docx_export.py @@ -0,0 +1,83 @@ +"""APA .docx export — opt-in, optional-extra, tolerant of Paper objects/dicts.""" + +import importlib.util +import tempfile +import unittest +from pathlib import Path + +from perspicacite.rag.export.apa_docx_exporter import ( + export_apa_docx, + format_authors, + paper_to_apa, +) + +_HAS_DOCX = importlib.util.find_spec("docx") is not None + + +class _Author: + def __init__(self, name): + self.name = name + + +class _Paper: + def __init__(self, **kw): + for k, v in kw.items(): + setattr(self, k, v) + + +class TestApaFormatting(unittest.TestCase): + def test_format_authors_variants(self): + assert format_authors([]) == "" + assert format_authors([_Author("Smith, J.")]) == "Smith, J." 
+ assert format_authors([_Author("A"), _Author("B")]) == "A & B" + assert format_authors([_Author("A"), _Author("B"), _Author("C")]) == "A, B, & C" + # dicts and plain strings are also accepted + assert format_authors([{"name": "X"}, "Y"]) == "X & Y" + + def test_paper_to_apa_with_object(self): + p = _Paper( + authors=[_Author("Doe, J."), _Author("Roe, R.")], + year=2021, title="A study of things", journal="Journal of Things", + doi="https://doi.org/10.1/abc", + ) + ref = paper_to_apa(p) + assert "Doe, J. & Roe, R." in ref + assert "(2021)." in ref + assert "A study of things." in ref + assert "Journal of Things." in ref + assert "https://doi.org/10.1/abc" in ref + assert "https://doi.org/https://" not in ref # doi prefix not doubled + + def test_paper_to_apa_with_dict_and_missing_year(self): + ref = paper_to_apa({"authors": [{"name": "Solo, H."}], "title": "T"}) + assert "Solo, H." in ref and "(n.d.)." in ref and "T." in ref + + +class TestConfigDefaultOff(unittest.TestCase): + def test_export_flag_defaults_false(self): + from perspicacite.config.schema import Config + agentic = Config().rag_modes.agentic + assert agentic.export_apa_docx is False + assert agentic.export_apa_docx_dir == "output" + + +class TestOrchestratorGate(unittest.TestCase): + def test_helper_noop_when_disabled(self): + # Unbound-method call on a duck-typed stub: disabled → returns without writing. + from perspicacite.rag.agentic.orchestrator import AgenticOrchestrator + + class _Stub: + export_apa_docx = False + # Must return None and not raise even though no exporter is reachable. 
+ assert AgenticOrchestrator._maybe_export_apa_docx(_Stub(), "answer", []) is None + + +@unittest.skipUnless(_HAS_DOCX, "python-docx ([docx] extra) required") +class TestDocxWrite(unittest.TestCase): + def test_export_writes_file(self): + with tempfile.TemporaryDirectory() as d: + out = Path(d) / "sub" / "m.docx" + p = _Paper(authors=[_Author("A")], year=2020, title="T", journal="J", doi="10.1/x") + written = export_apa_docx("Body text here.", [p, p], out) # dup paper → dedup + assert written == out + assert out.exists() and out.stat().st_size > 0