diff --git a/pyproject.toml b/pyproject.toml index a7a7144..ab8e1fc 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -36,6 +36,7 @@ dependencies = [ "nltk>=3.8", "langchain-text-splitters>=0.3.0", "pathspec>=1.1.1", + "python-docx>=1.1.0", ] [project.optional-dependencies] diff --git a/src/perspicacite/rag/agentic/orchestrator.py b/src/perspicacite/rag/agentic/orchestrator.py index 67d972d..6e7df59 100644 --- a/src/perspicacite/rag/agentic/orchestrator.py +++ b/src/perspicacite/rag/agentic/orchestrator.py @@ -14,6 +14,7 @@ from perspicacite.models.kb import chroma_collection_name_for_kb from perspicacite.provenance.context import get_collector from perspicacite.rag.dynamic_kb import DynamicKnowledgeBase +from perspicacite.rag.export.apa_docx_exporter import export_apa_docx from perspicacite.rag.utils import format_references_academic from perspicacite.retrieval.hybrid import hybrid_retrieval @@ -2427,6 +2428,15 @@ async def _generate_answer( answer = answer.rstrip() + "\n\n" + references_section logger.info("agentic_references_section_added", answer_chars=len(answer)) + doc_id = session.session_id.split("-")[0] if session else "manuscript" + output_path = f"output/{doc_id}_manuscript.docx" + print("OUTPUT PATH:", output_path) + try: + export_apa_docx(answer, papers, output_path) + logger.info("agentic_manuscript_exported", path=output_path) + except Exception: + logger.warning("agentic_manuscript_export_failed", exc_info=True) + return answer, citation_map _CITE_RE = re.compile(r"\[(\d+(?:\s*,\s*\d+)*)\]") @@ -2626,6 +2636,15 @@ async def _generate_single_paper_answer( if references_section: answer = answer.rstrip() + "\n\n" + references_section + doc_id = session.session_id.split("-")[0] if session else "manuscript" + output_path = f"output/{doc_id}_manuscript.docx" + print("OUTPUT PATH:", output_path) + try: + export_apa_docx(answer, papers, output_path) + logger.info("agentic_manuscript_exported", path=output_path) + except Exception: + 
"""Export a generated manuscript plus an APA-style reference list to .docx."""

import re
from pathlib import Path
from typing import Dict, List

# Resolver URLs and the "doi:" label that may precede a bare DOI.
_DOI_PREFIX_RE = re.compile(
    r"^(?:https?://(?:dx\.)?doi\.org/|doi:\s*)", re.IGNORECASE
)


def _normalize_doi(doi) -> str:
    """Return *doi* as a bare identifier, or "" when it is missing.

    Strips surrounding whitespace and a leading ``http(s)://doi.org/``,
    ``http(s)://dx.doi.org/`` or ``doi:`` prefix (case-insensitive).
    """
    if not doi:
        return ""
    return _DOI_PREFIX_RE.sub("", str(doi).strip())


def _as_sentence(text: str) -> str:
    """Terminate *text* with a period unless it already ends a sentence."""
    return text if text.endswith((".", "?", "!")) else f"{text}."


def _dedupe_papers(papers: List[Dict]) -> List[Dict]:
    """Return *papers* in order with repeated DOI/title entries removed.

    Papers are keyed by normalized DOI when present, otherwise by
    case-folded title. Papers that have neither are always kept, because
    there is nothing to prove they are duplicates of one another.
    """
    seen = set()
    unique = []
    for paper in papers:
        doi = _normalize_doi(paper.get("doi")).lower()
        title = str(paper.get("title") or "").strip().casefold()
        if doi:
            key = ("doi", doi)
        elif title:
            key = ("title", title)
        else:
            key = None
        if key is not None:
            if key in seen:
                continue
            seen.add(key)
        unique.append(paper)
    return unique


def format_authors(authors):
    """Join author names into an APA-style author string.

    Args:
        authors: A sequence of author names, a single name as a string,
            or a falsy value when no authors are known.

    Returns:
        "" for no authors, the name itself for one author, "A & B" for
        two, and "A, B, & C" for three or more. Blank entries are ignored.
    """
    if not authors:
        return ""

    # A bare string is one author; iterating it would split it into characters.
    if isinstance(authors, str):
        return authors.strip()

    names = [str(a).strip() for a in authors if a and str(a).strip()]
    if not names:
        return ""
    if len(names) == 1:
        return names[0]
    if len(names) == 2:
        return f"{names[0]} & {names[1]}"
    return ", ".join(names[:-1]) + f", & {names[-1]}"


def to_apa(paper: Dict) -> str:
    """Render one paper mapping as a single APA-style reference string.

    Reads the ``authors``, ``year``, ``title``, ``journal`` and ``doi``
    keys; any of them may be missing or ``None``. A missing year becomes
    "n.d."; missing authors, title, journal or DOI are omitted instead of
    leaving stray spaces or periods behind.

    Args:
        paper: Mapping holding the bibliographic fields listed above.

    Returns:
        Text shaped like
        ``"Authors (Year). Title. Journal. https://doi.org/<doi>"``.
    """
    authors = format_authors(paper.get("authors"))
    year = paper.get("year") or "n.d."
    title = str(paper.get("title") or "").strip()
    journal = str(paper.get("journal") or "").strip()
    doi = _normalize_doi(paper.get("doi"))

    parts = [f"{authors} ({year})." if authors else f"({year})."]
    if title:
        parts.append(_as_sentence(title))
    if journal:
        parts.append(_as_sentence(journal))
    if doi:
        parts.append(f"https://doi.org/{doi}")
    return " ".join(parts)


def export_apa_docx(manuscript_text: str, papers: List[Dict], output_path: str):
    """Write the manuscript and a numbered APA reference list to a .docx file.

    The document contains a "Manuscript" heading followed by
    *manuscript_text*, then a "References (APA Style)" heading followed by
    one numbered paragraph per unique paper.

    Args:
        manuscript_text: Body text placed under the "Manuscript" heading.
        papers: Paper mappings as accepted by :func:`to_apa`. Duplicates
            (same DOI, or same title when no DOI) are listed once.
        output_path: Destination file path. Missing parent directories
            are created.

    Returns:
        *output_path*, unchanged.

    Raises:
        ImportError: If python-docx is not installed.
        OSError: If the directory or file cannot be written.
    """
    # Imported here so the pure formatting helpers above stay importable
    # (and testable) without python-docx.
    from docx import Document

    unique_papers = _dedupe_papers(papers or [])

    doc = Document()

    doc.add_heading("Manuscript", level=1)
    doc.add_paragraph(manuscript_text or "")

    doc.add_heading("References (APA Style)", level=1)
    for number, paper in enumerate(unique_papers, 1):
        doc.add_paragraph(f"{number}. {to_apa(paper)}")

    # doc.save does not create directories; a fresh checkout has no "output/".
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)
    doc.save(output_path)
    return output_path