Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions config.example.yml
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,10 @@ rag_modes:
# Higher values improve coverage but increase latency and token cost.
# Range: 1–64; default is 8.
map_reduce_max_papers: 8
# Opt-in: write each agentic answer plus an APA reference list to a .docx file
# inside export_apa_docx_dir (the directory is created if it does not exist).
# Off by default; requires the optional [docx] extra (`uv sync --extra docx`).
export_apa_docx: false
export_apa_docx_dir: "output"

literature_survey:
seed_known_max: 5 # known papers to keep as survey anchors when all results are already in KB
Expand Down
6 changes: 6 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,12 @@ docling = [
"pandas>=2.0,<3",
]

# Optional: APA .docx export of agentic answers
# (config.rag_modes.agentic.export_apa_docx). Off by default.
docx = [
"python-docx>=1.1.0",
]

[project.scripts]
perspicacite = "perspicacite.cli:main"

Expand Down
4 changes: 4 additions & 0 deletions src/perspicacite/config/schema.py
Original file line number Diff line number Diff line change
Expand Up @@ -752,6 +752,10 @@ class RAGModeSettings(BaseModel):
# deep_research: allow live web / academic-API searches in research cycles.
# Set to false for KB-only evaluation environments to prevent rate-limit errors.
use_websearch: bool = True
# Opt-in: write each agentic answer + an APA reference list to a .docx.
# Off by default; requires the optional [docx] extra (`uv sync --extra docx`).
export_apa_docx: bool = False
export_apa_docx_dir: str = "output"


class RAGModesConfig(BaseModel):
Expand Down
28 changes: 28 additions & 0 deletions src/perspicacite/rag/agentic/orchestrator.py
Original file line number Diff line number Diff line change
Expand Up @@ -441,10 +441,15 @@ def __init__(
kb_metas: list | None = None,
config: Any = None,
app_state: Any = None,
export_apa_docx: bool = False,
export_apa_docx_dir: str = "output",
):
self.llm = llm_client
self.config = config
self.app_state = app_state
# Opt-in APA .docx export of agentic answers (default off).
self.export_apa_docx = export_apa_docx
self.export_apa_docx_dir = export_apa_docx_dir or "output"
self.tools = tool_registry
self.embeddings = embedding_provider
self.vector_store = vector_store
Expand Down Expand Up @@ -2233,6 +2238,27 @@ def _build_facet_overview(self, session: AgentSession) -> str:
sections.append(f' [{status}] "{facet.query}" — {n} source(s): {title_list}')
return "\n".join(sections)

def _maybe_export_apa_docx(self, answer: str, papers: list) -> None:
"""Opt-in: write the answer + an APA reference list to a .docx.

No-op unless ``export_apa_docx`` is enabled. Never raises into the
answer path — export failures (including the optional [docx] extra
being absent) are logged and swallowed.
"""
if not getattr(self, "export_apa_docx", False):
return
try:
import hashlib

from perspicacite.rag.export.apa_docx_exporter import export_apa_docx

out_dir = getattr(self, "export_apa_docx_dir", "output") or "output"
doc_id = hashlib.sha1((answer or "").encode("utf-8")).hexdigest()[:8]
path = export_apa_docx(answer, papers, f"{out_dir}/manuscript_{doc_id}.docx")
logger.info("agentic_manuscript_exported", path=str(path))
except Exception:
logger.warning("agentic_manuscript_export_failed", exc_info=True)

async def _generate_answer(
self,
query: str,
Expand Down Expand Up @@ -2427,6 +2453,7 @@ async def _generate_answer(
answer = answer.rstrip() + "\n\n" + references_section
logger.info("agentic_references_section_added", answer_chars=len(answer))

self._maybe_export_apa_docx(answer, papers)
return answer, citation_map

_CITE_RE = re.compile(r"\[(\d+(?:\s*,\s*\d+)*)\]")
Expand Down Expand Up @@ -2626,6 +2653,7 @@ async def _generate_single_paper_answer(
if references_section:
answer = answer.rstrip() + "\n\n" + references_section

self._maybe_export_apa_docx(answer, papers)
return answer, citation_map

def _build_numbered_paper_list(
Expand Down
1 change: 1 addition & 0 deletions src/perspicacite/rag/export/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
"""Export helpers for RAG answers (optional formats)."""
99 changes: 99 additions & 0 deletions src/perspicacite/rag/export/apa_docx_exporter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
"""Export an agentic answer + its cited papers to an APA-formatted ``.docx``.

Opt-in via ``config.rag_modes.agentic.export_apa_docx`` (default off). Requires
the optional ``[docx]`` extra (``python-docx``), imported lazily so the
dependency is only needed when the feature is actually enabled.

Inputs are tolerant: ``papers`` may be ``Paper`` model instances, plain dicts,
or a mix. Authors may be ``Author`` objects (``.name``), dicts, or strings.
"""

from __future__ import annotations

from pathlib import Path
from typing import Any


def _author_name(author: Any) -> str:
"""Best-effort author display name from an Author object / dict / string."""
if isinstance(author, str):
return author.strip()
name = getattr(author, "name", None)
if name is None and isinstance(author, dict):
name = author.get("name")
return str(name).strip() if name else ""


def format_authors(authors: list[Any] | None) -> str:
    """Join author names into an APA-style author string.

    Entries that resolve to an empty name are dropped. The result is ``""``
    for no authors, ``A`` for one, ``A & B`` for two, and ``A, B, & C`` for
    three or more.
    """
    names = []
    for entry in authors or []:
        display = _author_name(entry)
        if display:
            names.append(display)

    count = len(names)
    if count == 0:
        return ""
    if count == 1:
        return names[0]
    if count == 2:
        return " & ".join(names)

    *leading, final = names
    return f"{', '.join(leading)}, & {final}"


def _field(paper: Any, name: str, default: Any = None) -> Any:
"""Read ``name`` from a Paper object or a dict."""
if isinstance(paper, dict):
return paper.get(name, default)
return getattr(paper, name, default)


def paper_to_apa(paper: Any) -> str:
    """Render a single APA-style reference string from a Paper / dict.

    The reference is assembled as ``Authors (Year). Title. Journal. DOI-URL``
    with every missing component simply left out:

    * ``authors`` is formatted via ``format_authors``.
    * ``year`` falls back to ``n.d.`` when missing or falsy.
    * ``title`` and ``journal`` get a closing period unless they already end
      in ``.``, ``?`` or ``!`` (avoids ``What is X?.``).
    * ``doi`` is normalised to ``https://doi.org/<doi>``. A leading
      ``https://doi.org/``, ``http://doi.org/``, ``https://dx.doi.org/``,
      ``http://dx.doi.org/`` or ``doi:`` prefix (case-insensitive) is removed
      first so the resolver prefix is never doubled.

    Returns:
        The reference string, or ``""`` when the paper carries no authors,
        title, journal or DOI at all. A bare ``(n.d.).`` is not a usable
        reference, and callers can skip empty results.
    """

    def _text(name: str) -> str:
        # Tolerate None and non-string metadata values.
        value = _field(paper, name, "")
        return str(value).strip() if value else ""

    def _sentence(text: str) -> str:
        # Close the component with a period unless it is already terminated.
        return text if text.endswith((".", "?", "!")) else f"{text}."

    authors = format_authors(_field(paper, "authors", []) or [])
    year = _field(paper, "year") or "n.d."
    title = _text("title")
    journal = _text("journal")
    doi = _text("doi")

    if doi:
        lowered = doi.lower()
        for prefix in (
            "https://doi.org/",
            "http://doi.org/",
            "https://dx.doi.org/",
            "http://dx.doi.org/",
            "doi:",
        ):
            if lowered.startswith(prefix):
                doi = doi[len(prefix):].strip()
                break

    if not (authors or title or journal or doi):
        return ""

    parts = []
    if authors:
        parts.append(authors)
    parts.append(f"({year}).")
    if title:
        parts.append(_sentence(title))
    if journal:
        parts.append(_sentence(journal))
    if doi:
        parts.append(f"https://doi.org/{doi}")
    return " ".join(parts)


def export_apa_docx(
    manuscript_text: str,
    papers: list[Any] | None,
    output_path: str | Path,
) -> Path:
    """Save *manuscript_text* followed by an APA reference list as a ``.docx``.

    The document contains a "Manuscript" heading with the text as a single
    paragraph, then a "References" heading with one paragraph per distinct
    reference. References are de-duplicated on their rendered string and
    keep first-seen order. Missing parent directories of *output_path* are
    created.

    Returns:
        The path that was written.

    Raises:
        ImportError: ``python-docx`` (the optional ``[docx]`` extra) is not
            installed; the message carries the install hint.
    """
    try:
        from docx import Document
    except ImportError as exc:  # pragma: no cover - only without the [docx] extra
        raise ImportError(
            "APA .docx export requires the optional [docx] extra: "
            "`uv sync --extra docx` (installs python-docx)."
        ) from exc

    destination = Path(output_path)
    destination.parent.mkdir(parents=True, exist_ok=True)

    document = Document()
    document.add_heading("Manuscript", level=1)
    document.add_paragraph(manuscript_text or "")
    document.add_heading("References", level=1)

    # dict.fromkeys keeps insertion order, giving ordered de-duplication.
    rendered = (paper_to_apa(entry) for entry in (papers or []))
    for reference in dict.fromkeys(ref for ref in rendered if ref):
        document.add_paragraph(reference)

    document.save(str(destination))
    return destination
2 changes: 2 additions & 0 deletions src/perspicacite/web/state.py
Original file line number Diff line number Diff line change
Expand Up @@ -185,6 +185,8 @@ async def initialize(self):
max_iterations=5,
use_two_pass=getattr(config.knowledge_base, "use_two_pass", True),
map_reduce_max_papers=getattr(config.rag_modes.agentic, "map_reduce_max_papers", 8),
export_apa_docx=getattr(config.rag_modes.agentic, "export_apa_docx", False),
export_apa_docx_dir=getattr(config.rag_modes.agentic, "export_apa_docx_dir", "output"),
app_state=self,
)
logger.info("Agentic orchestrator initialized")
Expand Down
83 changes: 83 additions & 0 deletions tests/unit/test_apa_docx_export.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,83 @@
"""APA .docx export — opt-in, optional-extra, tolerant of Paper objects/dicts."""

import importlib.util
import tempfile
import unittest
from pathlib import Path

from perspicacite.rag.export.apa_docx_exporter import (
export_apa_docx,
format_authors,
paper_to_apa,
)

_HAS_DOCX = importlib.util.find_spec("docx") is not None


class _Author:
def __init__(self, name):
self.name = name


class _Paper:
def __init__(self, **kw):
for k, v in kw.items():
setattr(self, k, v)


class TestApaFormatting(unittest.TestCase):
    """String-formatting checks for the APA helpers (no python-docx needed)."""

    def test_format_authors_variants(self):
        """Zero, one, two and three authors, across the accepted input shapes."""
        expectations = [
            ([], ""),
            ([_Author("Smith, J.")], "Smith, J."),
            ([_Author("A"), _Author("B")], "A & B"),
            ([_Author("A"), _Author("B"), _Author("C")], "A, B, & C"),
            # dicts and plain strings are also accepted
            ([{"name": "X"}, "Y"], "X & Y"),
        ]
        for authors, expected in expectations:
            assert format_authors(authors) == expected

    def test_paper_to_apa_with_object(self):
        """An attribute-style paper renders every component exactly once."""
        paper = _Paper(
            authors=[_Author("Doe, J."), _Author("Roe, R.")],
            year=2021,
            title="A study of things",
            journal="Journal of Things",
            doi="https://doi.org/10.1/abc",
        )
        rendered = paper_to_apa(paper)
        required_fragments = (
            "Doe, J. & Roe, R.",
            "(2021).",
            "A study of things.",
            "Journal of Things.",
            "https://doi.org/10.1/abc",
        )
        for fragment in required_fragments:
            assert fragment in rendered
        assert "https://doi.org/https://" not in rendered  # doi prefix not doubled

    def test_paper_to_apa_with_dict_and_missing_year(self):
        """A dict paper without a year falls back to ``n.d.``."""
        rendered = paper_to_apa({"authors": [{"name": "Solo, H."}], "title": "T"})
        assert "Solo, H." in rendered
        assert "(n.d.)." in rendered
        assert "T." in rendered


class TestConfigDefaultOff(unittest.TestCase):
    """The export feature must be disabled in a default configuration."""

    def test_export_flag_defaults_false(self):
        """A default ``Config`` has the flag off and the directory set to "output"."""
        from perspicacite.config.schema import Config

        defaults = Config().rag_modes.agentic
        assert defaults.export_apa_docx_dir == "output"
        assert defaults.export_apa_docx is False


class TestOrchestratorGate(unittest.TestCase):
    """The orchestrator helper must not export anything while the flag is off."""

    def test_helper_noop_when_disabled(self):
        """Disabled flag: the helper returns None and writes no file.

        The helper returns None on every path (export failures are swallowed),
        so the return value alone cannot prove the gate works. The stub's
        export directory is therefore a fresh temp dir that must stay empty.
        """
        # Unbound-method call on a duck-typed stub: disabled → returns without writing.
        from perspicacite.rag.agentic.orchestrator import AgenticOrchestrator

        with tempfile.TemporaryDirectory() as d:

            class _Stub:
                export_apa_docx = False
                export_apa_docx_dir = d

            # Must return None and not raise even though no exporter is reachable.
            assert AgenticOrchestrator._maybe_export_apa_docx(_Stub(), "answer", []) is None
            assert list(Path(d).iterdir()) == []


@unittest.skipUnless(_HAS_DOCX, "python-docx ([docx] extra) required")
class TestDocxWrite(unittest.TestCase):
    """End-to-end write test; skipped when the optional [docx] extra is absent."""

    def test_export_writes_file(self):
        """The file is written, parents are created, and duplicates collapse.

        The same paper is passed twice. The document is read back to confirm
        its reference appears exactly once, so the dedup is actually checked.
        """
        # Safe to import here: the class-level skip guarantees docx is present.
        from docx import Document

        with tempfile.TemporaryDirectory() as d:
            out = Path(d) / "sub" / "m.docx"  # "sub" does not exist yet
            p = _Paper(authors=[_Author("A")], year=2020, title="T", journal="J", doi="10.1/x")
            written = export_apa_docx("Body text here.", [p, p], out)  # dup paper → dedup
            assert written == out
            assert out.exists() and out.stat().st_size > 0

            texts = [para.text for para in Document(str(out)).paragraphs]
            assert texts.count("Body text here.") == 1
            assert texts.count(paper_to_apa(p)) == 1
Loading