From 83e949d655f4ac2a2594440c498a87da927ce413 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Louis-F=C3=A9lix=20Nothias?= Date: Tue, 2 Jun 2026 15:24:26 +0200 Subject: [PATCH 01/11] feat(config): pdf_backend + docling guard knobs (R2) Co-Authored-By: Claude Opus 4.7 (cherry picked from commit f82b9a57e8422913480312d888f4e6002a55ecd8) --- config.example.yml | 7 +++++++ src/perspicacite/config/schema.py | 17 +++++++++++++++++ tests/unit/test_config.py | 12 ++++++++++++ 3 files changed, 36 insertions(+) diff --git a/config.example.yml b/config.example.yml index ed3760fd..d0ba19cd 100644 --- a/config.example.yml +++ b/config.example.yml @@ -67,6 +67,13 @@ knowledge_base: embedding_model: "text-embedding-3-small" chunk_size: 1000 chunk_overlap: 200 + # PDF extraction backend. + # fitz — text-only PyMuPDF (default-installed, fast) + # docling — layout model: structured tables + figures (needs `uv sync --extra docling`) + # auto — docling when installed and within docling_max_pages, else fitz + pdf_backend: auto + docling_max_pages: 40 # auto: PDFs larger than this use fitz + docling_timeout_s: 120 # auto: docling wall-clock cap per document; on timeout → fitz chunking_method: "token" default_top_k: 10 similarity_threshold: 0.7 diff --git a/src/perspicacite/config/schema.py b/src/perspicacite/config/schema.py index a9e7d3d3..3875ecf8 100644 --- a/src/perspicacite/config/schema.py +++ b/src/perspicacite/config/schema.py @@ -92,6 +92,23 @@ class KnowledgeBaseConfig(BaseModel): embedding_model: str = "text-embedding-3-small" chunk_size: int = Field(default=1000, ge=100, le=10000) chunk_overlap: int = Field(default=200, ge=0, le=1000) + pdf_backend: Literal["auto", "docling", "fitz"] = Field( + default="auto", + description=( + "PDF extraction backend. 'fitz' = text-only PyMuPDF (incumbent); " + "'docling' = layout model with structured tables + figures " + "(requires the [docling] extra); 'auto' = docling when importable " + "and within docling_max_pages, else fitz." 
+ ), + ) + docling_max_pages: int = Field( + default=40, ge=1, + description="In 'auto', skip docling for PDFs with more pages than this (use fitz).", + ) + docling_timeout_s: int = Field( + default=120, ge=1, + description="Per-document wall-clock cap for docling; on timeout, fall back to fitz.", + ) chunking_method: Literal["token", "semantic", "agentic"] = "token" default_top_k: int = Field(default=10, ge=1, le=100) similarity_threshold: float = Field(default=0.7, ge=0.0, le=1.0) diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index 84b81683..410d3b2d 100644 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -219,3 +219,15 @@ def test_anchor_config_near_threshold_bounds(): AnchorConfig(near_threshold=1.0) with pytest.raises(ValidationError): AnchorConfig(near_threshold=1.5) + + +def test_pdf_backend_defaults_and_validation(): + from perspicacite.config.schema import KnowledgeBaseConfig + kb = KnowledgeBaseConfig() + assert kb.pdf_backend == "auto" + assert kb.docling_max_pages == 40 + assert kb.docling_timeout_s == 120 + import pytest + from pydantic import ValidationError + with pytest.raises(ValidationError): + KnowledgeBaseConfig(pdf_backend="banana") From 1c781eacf0aa0967a878d03b74a97fbff24a9465 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Louis-F=C3=A9lix=20Nothias?= Date: Tue, 2 Jun 2026 15:27:24 +0200 Subject: [PATCH 02/11] build: add [docling] optional extra (R2) Co-Authored-By: Claude Opus 4.7 (cherry picked from commit d0ea45f4084aca111a52bf9a77323edf3fbf266c) --- pyproject.toml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/pyproject.toml b/pyproject.toml index 5bc02e0b..a7a71449 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -151,6 +151,18 @@ adapters = [ "indicium-adapters-metabolomics>=0.1.0", ] +# docling — high-fidelity PDF -> structured document conversion (layout, +# tables, sections) for the content pipeline. Heavier than the other +# extras (pulls in torch-backed models + pandas). 
Install only when +# docling-based parsing is needed. +# +# Install with: +# uv sync --extra docling +docling = [ + "docling>=2.5,<3", + "pandas>=2.0,<3", +] + [project.scripts] perspicacite = "perspicacite.cli:main" From 329928b084f4d3cd429d7d66eca76b5ba686f607 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Louis-F=C3=A9lix=20Nothias?= Date: Tue, 2 Jun 2026 15:30:17 +0200 Subject: [PATCH 03/11] feat(parsers): docling record types + ParsedContent tables/figures (R2) Co-Authored-By: Claude Opus 4.7 (cherry picked from commit 2fa29a6961b62bf8d547db940918ffadf34f5a5b) --- .../pipeline/parsers/docling_pdf.py | 44 +++++++++++++++++++ src/perspicacite/pipeline/parsers/pdf.py | 10 ++++- tests/unit/test_docling_pdf.py | 16 +++++++ 3 files changed, 68 insertions(+), 2 deletions(-) create mode 100644 src/perspicacite/pipeline/parsers/docling_pdf.py create mode 100644 tests/unit/test_docling_pdf.py diff --git a/src/perspicacite/pipeline/parsers/docling_pdf.py b/src/perspicacite/pipeline/parsers/docling_pdf.py new file mode 100644 index 00000000..a0e2a725 --- /dev/null +++ b/src/perspicacite/pipeline/parsers/docling_pdf.py @@ -0,0 +1,44 @@ +"""Docling-backed PDF extraction (R2). + +Ports the converter configuration proven in AgenticScienceBuilder's +figures.py: picture images MUST be rendered (generate_picture_images=True) +or PictureItem.get_image() returns None and every figure is dropped; figure +pixel dimensions MUST be read from the rendered image or the size filter +discards them. No dependency on ASB. 
+""" +from __future__ import annotations + +import importlib.util +from dataclasses import dataclass + +_MIN_AREA_PX = 50_000 # drop logos/icons (mirrors ASB) + + +@dataclass +class DoclingTable: + page: int + caption: str + markdown: str + headers: list[str] + rows: list[list[str]] + + @property + def n_rows(self) -> int: + return len(self.rows) + + @property + def n_cols(self) -> int: + return len(self.headers) + + +@dataclass +class DoclingFigure: + page: int + caption: str + width_px: int + height_px: int + image_bytes: bytes = b"" + + +def docling_importable() -> bool: + return importlib.util.find_spec("docling") is not None diff --git a/src/perspicacite/pipeline/parsers/pdf.py b/src/perspicacite/pipeline/parsers/pdf.py index cb503d74..c09d8a52 100644 --- a/src/perspicacite/pipeline/parsers/pdf.py +++ b/src/perspicacite/pipeline/parsers/pdf.py @@ -6,12 +6,15 @@ """ import re -from dataclasses import dataclass +from dataclasses import dataclass, field from pathlib import Path -from typing import Any +from typing import TYPE_CHECKING, Any from perspicacite.logging import get_logger +if TYPE_CHECKING: + from perspicacite.pipeline.parsers.docling_pdf import DoclingFigure, DoclingTable + logger = get_logger("perspicacite.pipeline.parsers.pdf") @@ -48,6 +51,9 @@ class ParsedContent: title: str | None = None sections: dict[str, str] | None = None metadata: dict[str, Any] | None = None + # R2 (docling): empty on the fitz path; populated when docling is used. 
+ tables: list["DoclingTable"] = field(default_factory=list) + figures: list["DoclingFigure"] = field(default_factory=list) class PDFParser: diff --git a/tests/unit/test_docling_pdf.py b/tests/unit/test_docling_pdf.py new file mode 100644 index 00000000..7d77b5c5 --- /dev/null +++ b/tests/unit/test_docling_pdf.py @@ -0,0 +1,16 @@ +import unittest + + +class TestRecordsAndParsedContent(unittest.TestCase): + def test_parsed_content_defaults_empty_tables_figures(self): + from perspicacite.pipeline.parsers.pdf import ParsedContent + pc = ParsedContent(text="hi") + assert pc.tables == [] + assert pc.figures == [] + + def test_record_dataclasses_construct(self): + from perspicacite.pipeline.parsers.docling_pdf import DoclingTable, DoclingFigure + t = DoclingTable(page=2, caption="Table 1.", markdown="| a |", headers=["a"], rows=[["1"]]) + assert t.n_rows == 1 and t.n_cols == 1 + f = DoclingFigure(page=1, caption="Figure 1.", width_px=300, height_px=300, image_bytes=b"x") + assert f.width_px == 300 From 2a66d5d36ea1498f7cc064bc3f7858942fc2d428 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Louis-F=C3=A9lix=20Nothias?= Date: Tue, 2 Jun 2026 15:34:06 +0200 Subject: [PATCH 04/11] feat(parsers): DoclingPDFParser converter + figure/table mapping (R2) Co-Authored-By: Claude Opus 4.7 (cherry picked from commit 8298502a2836103e015af1b1d0dc9d63ad277db9) --- .../pipeline/parsers/docling_pdf.py | 106 ++++++++++++++++++ tests/unit/test_docling_pdf.py | 67 +++++++++++ 2 files changed, 173 insertions(+) diff --git a/src/perspicacite/pipeline/parsers/docling_pdf.py b/src/perspicacite/pipeline/parsers/docling_pdf.py index a0e2a725..4f980080 100644 --- a/src/perspicacite/pipeline/parsers/docling_pdf.py +++ b/src/perspicacite/pipeline/parsers/docling_pdf.py @@ -10,6 +10,17 @@ import importlib.util from dataclasses import dataclass +from io import BytesIO +from typing import TYPE_CHECKING, Any + +from perspicacite.logging import get_logger +from perspicacite.pipeline.parsers.pdf import 
ParsedContent + +if TYPE_CHECKING: + from collections.abc import Callable + from pathlib import Path + +logger = get_logger("perspicacite.pipeline.parsers.docling") _MIN_AREA_PX = 50_000 # drop logos/icons (mirrors ASB) @@ -42,3 +53,98 @@ class DoclingFigure: def docling_importable() -> bool: return importlib.util.find_spec("docling") is not None + + +def _make_docling_converter(): + # Picture images MUST be enabled or get_image() returns None (zero figures). + from docling.datamodel.base_models import InputFormat + from docling.datamodel.pipeline_options import PdfPipelineOptions + from docling.document_converter import DocumentConverter, PdfFormatOption + + opts = PdfPipelineOptions() + opts.generate_picture_images = True + opts.images_scale = 2.0 + return DocumentConverter( + format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)} + ) + + +def _page_of(item) -> int: + prov = getattr(item, "prov", None) or [] + if prov and getattr(prov[0], "page_no", None) is not None: + return int(prov[0].page_no) + return 1 + + +class DoclingPDFParser: + """Extracts text + structured tables + figures via docling.""" + + def __init__(self, converter_factory: Callable[[], Any] = _make_docling_converter): + self._converter_factory = converter_factory + + def extract(self, source: str | Path) -> ParsedContent: + conv = self._converter_factory() + doc = conv.convert(str(source)).document + figures = self._figures(doc) + tables = self._tables(doc) + text = self._text(doc) + return ParsedContent( + text=text, + sections=None, + metadata={"extractor": "docling"}, + tables=tables, + figures=figures, + ) + + def _text(self, doc) -> str: + try: + return doc.export_to_markdown() + except Exception: + return "" + + def _figures(self, doc) -> list[DoclingFigure]: + out: list[DoclingFigure] = [] + for pic in getattr(doc, "pictures", []) or []: + try: + pil = pic.get_image(doc) + w, h = pil.width, pil.height + buf = BytesIO() + pil.save(buf, "PNG") + image_bytes = 
buf.getvalue() + except Exception: + continue + if len(image_bytes) < 1024: + continue + try: + caption = pic.caption_text(doc) or "" + except Exception: + caption = "" + out.append( + DoclingFigure( + page=_page_of(pic), caption=caption, + width_px=w, height_px=h, image_bytes=image_bytes, + ) + ) + return out + + def _tables(self, doc) -> list[DoclingTable]: + out: list[DoclingTable] = [] + for tbl in getattr(doc, "tables", []) or []: + try: + df = tbl.export_to_dataframe(doc) + headers = [str(c) for c in df.columns.tolist()] + rows = [[str(v) for v in row] for row in df.values.tolist()] + markdown = tbl.export_to_markdown(doc) + except Exception: + continue + try: + caption = tbl.caption_text(doc) or "" + except Exception: + caption = "" + out.append( + DoclingTable( + page=_page_of(tbl), caption=caption, + markdown=markdown, headers=headers, rows=rows, + ) + ) + return out diff --git a/tests/unit/test_docling_pdf.py b/tests/unit/test_docling_pdf.py index 7d77b5c5..2f3866dd 100644 --- a/tests/unit/test_docling_pdf.py +++ b/tests/unit/test_docling_pdf.py @@ -14,3 +14,70 @@ def test_record_dataclasses_construct(self): assert t.n_rows == 1 and t.n_cols == 1 f = DoclingFigure(page=1, caption="Figure 1.", width_px=300, height_px=300, image_bytes=b"x") assert f.width_px == 300 + + +class _FakeProv: + def __init__(self, page_no): self.page_no = page_no + +class _FakeImg: + def __init__(self, png): self._png = png; self.width = 300; self.height = 300 + def save(self, buf, fmt): buf.write(self._png) + +class _FakePicture: + def __init__(self, page, caption, png): + self.prov = [_FakeProv(page)]; self._caption = caption; self._png = png + def caption_text(self, doc): return self._caption + def get_image(self, doc): return _FakeImg(self._png) + +class _FakeTable: + def __init__(self, page, caption, headers, rows): + self.prov = [_FakeProv(page)]; self._caption = caption + self._headers = headers; self._rows = rows + def caption_text(self, doc): return self._caption + def 
export_to_markdown(self, doc=None): return "| " + " | ".join(self._headers) + " |" + def export_to_dataframe(self, doc=None): + import pandas as pd + return pd.DataFrame(self._rows, columns=self._headers) + +class _FakeDoc: + def __init__(self, pictures, tables): self.pictures = pictures; self.tables = tables + +class _FakeResult: + def __init__(self, doc): self.document = doc + +class _FakeConverter: + def __init__(self, doc): self._doc = doc + def convert(self, source): return _FakeResult(self._doc) + + +class TestDoclingExtraction(unittest.TestCase): + def test_maps_pictures_and_tables_dims_populated(self): + import importlib.util + if importlib.util.find_spec("pandas") is None: + self.skipTest("pandas required") + from perspicacite.pipeline.parsers import docling_pdf as d + png = b"\x89PNG\r\n\x1a\n" + b"\x00" * 2048 + doc = _FakeDoc( + pictures=[_FakePicture(1, "Figure 1.", png)], + tables=[_FakeTable(2, "Table 1.", ["k", "v"], [["a", "1"]])], + ) + parser = d.DoclingPDFParser(converter_factory=lambda: _FakeConverter(doc)) + res = parser.extract("/x.pdf") + assert len(res.figures) == 1 + assert res.figures[0].width_px == 300 and res.figures[0].height_px == 300 + assert len(res.tables) == 1 + assert res.tables[0].headers == ["k", "v"] and res.tables[0].rows == [["a", "1"]] + assert "k" in res.tables[0].markdown + + +class TestDoclingConverterConfig(unittest.TestCase): + def test_converter_enables_picture_images(self): + import importlib.util + if importlib.util.find_spec("docling") is None: + self.skipTest("docling extra required") + from perspicacite.pipeline.parsers.docling_pdf import _make_docling_converter + from docling.datamodel.base_models import InputFormat + conv = _make_docling_converter() + opts = conv.format_to_options[InputFormat.PDF].pipeline_options + assert opts.generate_picture_images is True + assert opts.images_scale >= 2.0 From 067c8402e835d8be0ab4a1ae0a8283e17fd4d39a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Louis-F=C3=A9lix=20Nothias?= 
Date: Tue, 2 Jun 2026 15:37:22 +0200 Subject: [PATCH 05/11] feat(parsers): docling backend selector + page/timeout guard (R2) Co-Authored-By: Claude Opus 4.7 (cherry picked from commit dcd8351d68916713a0089ae3b5c477b37eccb00e) --- src/perspicacite/pipeline/parsers/pdf.py | 72 ++++++++++++++++++- tests/unit/test_pdf_backend_guard.py | 88 ++++++++++++++++++++++++ 2 files changed, 159 insertions(+), 1 deletion(-) create mode 100644 tests/unit/test_pdf_backend_guard.py diff --git a/src/perspicacite/pipeline/parsers/pdf.py b/src/perspicacite/pipeline/parsers/pdf.py index c09d8a52..a2c047f7 100644 --- a/src/perspicacite/pipeline/parsers/pdf.py +++ b/src/perspicacite/pipeline/parsers/pdf.py @@ -18,6 +18,16 @@ logger = get_logger("perspicacite.pipeline.parsers.pdf") +def _docling_importable() -> bool: + from perspicacite.pipeline.parsers.docling_pdf import docling_importable + return docling_importable() + + +def _docling_extract_worker(path: str): + from perspicacite.pipeline.parsers.docling_pdf import DoclingPDFParser + return DoclingPDFParser().extract(path) + + def _clean_text(text: str, threshold: float = 0.05) -> str: """Collapse excess newlines when they dominate the text. 
@@ -173,11 +183,61 @@ def _extract_with_pdfplumber(self, source: str | Path | bytes) -> tuple[str, dic return "\n\n".join(all_text), sections, page_count + # ------------------------------------------------------------------ + # Backend selection + guards (R2 docling) + # ------------------------------------------------------------------ + + def _page_count(self, source) -> int: + fitz = self._get_fitz() + if fitz is None: + return 0 + try: + doc = ( + fitz.open(str(source)) + if isinstance(source, (str, Path)) + else fitz.open(stream=source, filetype="pdf") + ) + n = doc.page_count + doc.close() + return n + except Exception: + return 0 + + def _select_backend(self, source, page_count: int, config) -> str: + backend = getattr(config, "pdf_backend", "auto") + if backend == "fitz": + return "fitz" + if backend == "docling": + return "docling" + # auto: + if not _docling_importable(): + return "fitz" + if page_count > int(getattr(config, "docling_max_pages", 40)): + logger.warning("docling_fallback", reason="oversized", pages=page_count) + return "fitz" + return "docling" + + def _run_docling_with_timeout(self, source, timeout_s: int): + """Run docling in a worker process; return ParsedContent or None on + timeout/error (caller falls back to fitz).""" + from concurrent.futures import ProcessPoolExecutor + from concurrent.futures import TimeoutError as FTimeout + try: + with ProcessPoolExecutor(max_workers=1) as ex: + fut = ex.submit(_docling_extract_worker, str(source)) + return fut.result(timeout=timeout_s) + except FTimeout: + logger.warning("docling_fallback", reason="timeout", path=str(source)) + return None + except Exception as exc: + logger.warning("docling_fallback", reason="error", error=str(exc)) + return None + # ------------------------------------------------------------------ # Public API # ------------------------------------------------------------------ - async def parse(self, source: str | Path | bytes) -> ParsedContent: + async def parse(self, 
source: str | Path | bytes, config=None) -> ParsedContent: """ Parse PDF and extract text. @@ -187,6 +247,16 @@ async def parse(self, source: str | Path | bytes) -> ParsedContent: Returns: Parsed content with text and metadata """ + if config is not None: + pages = self._page_count(source) + if self._select_backend(source, pages, config) == "docling": + pc = self._run_docling_with_timeout( + source, int(getattr(config, "docling_timeout_s", 120)) + ) + if pc is not None: + return pc + # else fall through to the fitz/pdfplumber path below + # Try PyMuPDF first (better column handling) result = self._extract_with_fitz(source) diff --git a/tests/unit/test_pdf_backend_guard.py b/tests/unit/test_pdf_backend_guard.py new file mode 100644 index 00000000..c6aedc32 --- /dev/null +++ b/tests/unit/test_pdf_backend_guard.py @@ -0,0 +1,88 @@ +import unittest + + +class _Cfg: + def __init__(self, backend="auto", max_pages=40, timeout=120): + self.pdf_backend = backend + self.docling_max_pages = max_pages + self.docling_timeout_s = timeout + + +class TestBackendSelector(unittest.TestCase): + def _select(self, parser, pages, cfg=None): + return parser._select_backend("/x.pdf", pages, _Cfg(**(cfg or {}))) + + def test_explicit_fitz(self): + from perspicacite.pipeline.parsers.pdf import PDFParser + p = PDFParser() + assert self._select(p, 5, {"backend": "fitz"}) == "fitz" + + def test_explicit_docling(self): + from perspicacite.pipeline.parsers.pdf import PDFParser + p = PDFParser() + assert self._select(p, 5, {"backend": "docling"}) == "docling" + + def test_auto_uses_fitz_when_docling_absent(self): + from perspicacite.pipeline.parsers import pdf as m + p = m.PDFParser() + orig = m._docling_importable + m._docling_importable = lambda: False + try: + assert self._select(p, 5) == "fitz" + finally: + m._docling_importable = orig + + def test_auto_guard_on_pages(self): + from perspicacite.pipeline.parsers import pdf as m + p = m.PDFParser() + orig = m._docling_importable + 
m._docling_importable = lambda: True + try: + assert self._select(p, 999, {"max_pages": 40}) == "fitz" + assert self._select(p, 10, {"max_pages": 40}) == "docling" + finally: + m._docling_importable = orig + + +class TestTimeoutFallback(unittest.TestCase): + def test_timeout_branch_via_stub(self): + from concurrent.futures import TimeoutError as FTimeout + + from perspicacite.pipeline.parsers.pdf import PDFParser + p = PDFParser() + + class _Fut: + def result(self, timeout): raise FTimeout() + + class _Ex: + def __enter__(self): return self + def __exit__(self, *a): return False + def submit(self, *a, **k): return _Fut() + + import concurrent.futures as cf + orig_ex = cf.ProcessPoolExecutor + cf.ProcessPoolExecutor = lambda *a, **k: _Ex() + try: + assert p._run_docling_with_timeout("/x.pdf", timeout_s=1) is None + finally: + cf.ProcessPoolExecutor = orig_ex + + def test_error_branch_returns_none(self): + from perspicacite.pipeline.parsers.pdf import PDFParser + p = PDFParser() + + class _Fut: + def result(self, timeout): raise RuntimeError("boom") + + class _Ex: + def __enter__(self): return self + def __exit__(self, *a): return False + def submit(self, *a, **k): return _Fut() + + import concurrent.futures as cf + orig_ex = cf.ProcessPoolExecutor + cf.ProcessPoolExecutor = lambda *a, **k: _Ex() + try: + assert p._run_docling_with_timeout("/x.pdf", timeout_s=1) is None + finally: + cf.ProcessPoolExecutor = orig_ex From 440e744c9785ddc7555ead96858f8f04f9af440b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Louis-F=C3=A9lix=20Nothias?= Date: Tue, 2 Jun 2026 15:41:30 +0200 Subject: [PATCH 06/11] feat(chunking): emit content_type=table chunks from docling tables (R2) Co-Authored-By: Claude Opus 4.7 (cherry picked from commit fd47e9541c3d6f4fbaed0b36a2218ff69fc95457) --- .../pipeline/chunking_dispatch.py | 24 ++++++++++++++++++ tests/unit/test_docling_table_chunks.py | 25 +++++++++++++++++++ 2 files changed, 49 insertions(+) create mode 100644 
tests/unit/test_docling_table_chunks.py diff --git a/src/perspicacite/pipeline/chunking_dispatch.py b/src/perspicacite/pipeline/chunking_dispatch.py index be188396..156c0807 100644 --- a/src/perspicacite/pipeline/chunking_dispatch.py +++ b/src/perspicacite/pipeline/chunking_dispatch.py @@ -58,6 +58,30 @@ } +def table_records_to_chunks(tables, paper, start_index: int) -> list[DocumentChunk]: + """Turn ``DoclingTable`` records into retrievable chunks tagged ``content_type='table'``. + + No-op when ``tables`` is empty (the fitz path), preserving today's behaviour. + """ + chunks: list[DocumentChunk] = [] + for i, t in enumerate(tables): + body = (f"{t.caption}\n\n{t.markdown}" if t.caption else t.markdown).strip() + idx = start_index + i + meta = ChunkMetadata( + paper_id=getattr(paper, "paper_id", "unknown"), + chunk_index=idx, + content_type="table", + page=getattr(t, "page", None), + title=getattr(paper, "title", None), + doi=getattr(paper, "doi", None), + year=getattr(paper, "year", None), + ) + chunks.append( + DocumentChunk(id=f"{meta.paper_id}:table:{idx}", text=body, metadata=meta) + ) + return chunks + + def infer_content_type(path: Path) -> tuple[str, str | None]: """Map file extension to ``(content_type, language)``. diff --git a/tests/unit/test_docling_table_chunks.py b/tests/unit/test_docling_table_chunks.py new file mode 100644 index 00000000..cd5c038c --- /dev/null +++ b/tests/unit/test_docling_table_chunks.py @@ -0,0 +1,25 @@ +import unittest + + +class TestTableChunks(unittest.TestCase): + def test_table_records_become_table_chunks(self): + from perspicacite.pipeline.parsers.docling_pdf import DoclingTable + from perspicacite.pipeline.chunking_dispatch import table_records_to_chunks + + class _Paper: + paper_id = "local:abc" + title = "T"; doi = None; year = None + tables = [DoclingTable(page=3, caption="Table 1. 
Params.", + markdown="| k | v |\n| a | 1 |", headers=["k", "v"], rows=[["a", "1"]])] + chunks = table_records_to_chunks(tables, _Paper(), start_index=0) + assert len(chunks) == 1 + c = chunks[0] + assert c.metadata.content_type == "table" + assert c.metadata.page == 3 + assert "Table 1" in c.text and "| k | v |" in c.text + + def test_empty_tables_yield_no_chunks(self): + from perspicacite.pipeline.chunking_dispatch import table_records_to_chunks + class _Paper: + paper_id = "p"; title = None; doi = None; year = None + assert table_records_to_chunks([], _Paper(), start_index=5) == [] From 2a4ee2e3ade842f6d8e8a5bfab8201611b7d76f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Louis-F=C3=A9lix=20Nothias?= Date: Tue, 2 Jun 2026 15:46:57 +0200 Subject: [PATCH 07/11] feat(parsers): map docling figures to multimodal record shape (R2) Co-Authored-By: Claude Opus 4.7 (cherry picked from commit 16495f54590babcdc547c26e09d5edeb5250ba31) --- .../pipeline/parsers/docling_pdf.py | 16 +++++++++++++ tests/unit/test_docling_pdf.py | 23 +++++++++++++++++++ 2 files changed, 39 insertions(+) diff --git a/src/perspicacite/pipeline/parsers/docling_pdf.py b/src/perspicacite/pipeline/parsers/docling_pdf.py index 4f980080..4f119286 100644 --- a/src/perspicacite/pipeline/parsers/docling_pdf.py +++ b/src/perspicacite/pipeline/parsers/docling_pdf.py @@ -9,6 +9,7 @@ from __future__ import annotations import importlib.util +import re from dataclasses import dataclass from io import BytesIO from typing import TYPE_CHECKING, Any @@ -76,6 +77,21 @@ def _page_of(item) -> int: return 1 +_FIG_LABEL_RE = re.compile( + r"^\s*((?:supplementary\s+)?(?:fig(?:ure|\.)?|scheme)\s+[A-Za-z]?\d+[A-Za-z]?)", + re.IGNORECASE, +) + + +def figure_to_multimodal_record(fig: DoclingFigure) -> dict: + """Map a DoclingFigure to the existing multimodal record shape + {kind, label, caption, content} used by parsers/multimodal.py. 
`content` + is left empty: docling supplies the image, not a semantic description.""" + m = _FIG_LABEL_RE.match(fig.caption or "") + label = m.group(1).strip() if m else "" + return {"kind": "figure", "label": label, "caption": fig.caption or "", "content": ""} + + class DoclingPDFParser: """Extracts text + structured tables + figures via docling.""" diff --git a/tests/unit/test_docling_pdf.py b/tests/unit/test_docling_pdf.py index 2f3866dd..cd77f680 100644 --- a/tests/unit/test_docling_pdf.py +++ b/tests/unit/test_docling_pdf.py @@ -81,3 +81,26 @@ def test_converter_enables_picture_images(self): opts = conv.format_to_options[InputFormat.PDF].pipeline_options assert opts.generate_picture_images is True assert opts.images_scale >= 2.0 + + +class TestFigureToMultimodalShape(unittest.TestCase): + def test_figure_maps_to_kind_caption_content(self): + from perspicacite.pipeline.parsers.docling_pdf import ( + DoclingFigure, figure_to_multimodal_record, + ) + f = DoclingFigure(page=1, caption="Figure 2. Workflow.", + width_px=400, height_px=300, image_bytes=b"x") + rec = figure_to_multimodal_record(f) + assert rec["kind"] == "figure" + assert rec["caption"] == "Figure 2. Workflow." 
+ assert rec["label"] == "Figure 2" + assert "content" in rec + + def test_figure_without_label_caption(self): + from perspicacite.pipeline.parsers.docling_pdf import ( + DoclingFigure, figure_to_multimodal_record, + ) + f = DoclingFigure(page=1, caption="An unlabeled panel", width_px=400, height_px=300) + rec = figure_to_multimodal_record(f) + assert rec["kind"] == "figure" + assert rec["label"] == "" From de7eeec9dbbb1a64396b636a9a91b6a9d6399450 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Louis-F=C3=A9lix=20Nothias?= Date: Tue, 2 Jun 2026 15:52:40 +0200 Subject: [PATCH 08/11] feat(local-docs): run docling on PDF ingest + emit table chunks (R2) local-file ingest now passes the KB config to PDFParser.parse so the docling backend activates per the guard, and appends content_type=table chunks from any extracted tables. BibTeX/DOI path unchanged (follow-up). Co-Authored-By: Claude Opus 4.7 (cherry picked from commit 08130497b8307a927f5f7482060db63c95d55110) --- src/perspicacite/integrations/local_docs.py | 24 +++++++-- tests/unit/test_local_docs_docling_wire.py | 59 +++++++++++++++++++++ 2 files changed, 78 insertions(+), 5 deletions(-) create mode 100644 tests/unit/test_local_docs_docling_wire.py diff --git a/src/perspicacite/integrations/local_docs.py b/src/perspicacite/integrations/local_docs.py index f72275c1..57381f2e 100644 --- a/src/perspicacite/integrations/local_docs.py +++ b/src/perspicacite/integrations/local_docs.py @@ -18,6 +18,7 @@ chunk_document, infer_content_type, ) +from perspicacite.pipeline.parsers.pdf import ParsedContent logger = get_logger("perspicacite.local_docs") @@ -97,17 +98,20 @@ def _extract_year_from_text(text: str | None) -> int | None: return None -async def _read_text(path: Path, content_type: str, pdf_parser) -> str | None: +async def _read_text( + path: Path, content_type: str, pdf_parser, config=None +) -> ParsedContent | None: if content_type == "pdf": if pdf_parser is None: return None - parsed = await pdf_parser.parse(path) - 
return parsed.text or None + parsed = await pdf_parser.parse(path, config=config) + return parsed if (parsed and parsed.text) else None try: - return path.read_text(encoding="utf-8", errors="replace") + raw = path.read_text(encoding="utf-8", errors="replace") except Exception as exc: logger.warning("local_docs_read_failed", path=str(path), error=str(exc)) return None + return ParsedContent(text=raw) if raw else None async def _ingest_files( @@ -136,7 +140,8 @@ async def _ingest_files( for idx, fp in enumerate(files): content_type, language = infer_content_type(fp) paper = _paper_for_file(fp) - text = await _read_text(fp, content_type, app_state.pdf_parser) + parsed = await _read_text(fp, content_type, app_state.pdf_parser, kb_cfg) + text = parsed.text if parsed else None # F-13: opportunistic year extraction from the document body so # KB-stats by_year and recency-weighted retrieval have signal # for URL- / file-ingested sources. @@ -158,6 +163,15 @@ async def _ingest_files( text, paper, content_type=content_type, language=language, config=kb_cfg, ) + if parsed is not None and parsed.tables: + from perspicacite.pipeline.chunking_dispatch import ( + table_records_to_chunks, + ) + chunks.extend( + table_records_to_chunks( + parsed.tables, paper, start_index=len(chunks) + ) + ) # ChunkMetadata is frozen — recreate with source_file_path set, # plus optional external_metadata annotations (Cycle C). 
ext_parent = (external_metadata or {}).get("parent_paper_id") diff --git a/tests/unit/test_local_docs_docling_wire.py b/tests/unit/test_local_docs_docling_wire.py new file mode 100644 index 00000000..d83c6e4a --- /dev/null +++ b/tests/unit/test_local_docs_docling_wire.py @@ -0,0 +1,59 @@ +import asyncio +import unittest +from pathlib import Path + + +class _FakeParsed: + pass + + +class TestReadTextThreadsConfigAndTables(unittest.TestCase): + def test_pdf_returns_parsedcontent_with_tables_and_passes_config(self): + from perspicacite.integrations.local_docs import _read_text + from perspicacite.pipeline.parsers.docling_pdf import DoclingTable + from perspicacite.pipeline.parsers.pdf import ParsedContent + + seen = {} + + class _FakeParser: + async def parse(self, source, config=None): + seen["config"] = config + return ParsedContent( + text="body text", + tables=[DoclingTable(page=1, caption="Table 1.", + markdown="| a |", headers=["a"], rows=[["1"]])], + ) + + sentinel = object() + out = asyncio.run(_read_text(Path("/x.pdf"), "pdf", _FakeParser(), sentinel)) + assert isinstance(out, ParsedContent) + assert out.text == "body text" + assert len(out.tables) == 1 + assert seen["config"] is sentinel # config threaded to parse() + + def test_pdf_empty_text_returns_none(self): + from perspicacite.integrations.local_docs import _read_text + from perspicacite.pipeline.parsers.pdf import ParsedContent + + class _FakeParser: + async def parse(self, source, config=None): + return ParsedContent(text="") + + out = asyncio.run(_read_text(Path("/x.pdf"), "pdf", _FakeParser(), None)) + assert out is None + + def test_non_pdf_wraps_text_in_parsedcontent(self): + import os + import tempfile + + from perspicacite.integrations.local_docs import _read_text + from perspicacite.pipeline.parsers.pdf import ParsedContent + with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f: + f.write("hello world") + p = Path(f.name) + try: + out = asyncio.run(_read_text(p, "text", 
None, None)) + assert isinstance(out, ParsedContent) + assert "hello world" in out.text + finally: + os.unlink(p) From 23e0f3e0597886b5f0fa9e240f2606b271d6e672 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Louis-F=C3=A9lix=20Nothias?= Date: Tue, 2 Jun 2026 16:53:17 +0200 Subject: [PATCH 09/11] fix(docling): force CPU accelerator (MPS float64 crash on Apple Silicon) docling auto-selects MPS on Apple Silicon, which raises "Cannot convert a MPS Tensor to float64" and fails conversion on every page. Pin AcceleratorOptions(device=CPU). Verified: 13-page PDF extracts 6 figures on CPU (~10min); MPS unusable even with PYTORCH_ENABLE_MPS_FALLBACK=1. Co-Authored-By: Claude Opus 4.7 (cherry picked from commit a9893e738a8d6b3b5bd8afc3957a1d6ac849c942) --- src/perspicacite/pipeline/parsers/docling_pdf.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/src/perspicacite/pipeline/parsers/docling_pdf.py b/src/perspicacite/pipeline/parsers/docling_pdf.py index 4f119286..80945c9e 100644 --- a/src/perspicacite/pipeline/parsers/docling_pdf.py +++ b/src/perspicacite/pipeline/parsers/docling_pdf.py @@ -59,12 +59,21 @@ def docling_importable() -> bool: def _make_docling_converter(): # Picture images MUST be enabled or get_image() returns None (zero figures). from docling.datamodel.base_models import InputFormat - from docling.datamodel.pipeline_options import PdfPipelineOptions + from docling.datamodel.pipeline_options import ( + AcceleratorDevice, + AcceleratorOptions, + PdfPipelineOptions, + ) from docling.document_converter import DocumentConverter, PdfFormatOption opts = PdfPipelineOptions() opts.generate_picture_images = True opts.images_scale = 2.0 + # Force CPU. On Apple Silicon docling auto-selects the MPS (Metal) backend, + # which raises "Cannot convert a MPS Tensor to float64 ... MPS doesn't + # support float64" and fails conversion on every page. CPU is portable and + # matches the documented R2 device intent. 
+ opts.accelerator_options = AcceleratorOptions(device=AcceleratorDevice.CPU) return DocumentConverter( format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)} ) From f5a6500738c2796b4001ea3d6cf67fb286338f22 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Louis-F=C3=A9lix=20Nothias?= Date: Tue, 2 Jun 2026 16:58:56 +0200 Subject: [PATCH 10/11] refactor(docling): fitz text always; docling tables/figures as additive advanced opt-in (R2) Text extraction stays 100% fitz (fast, default). docling no longer replaces the text path. New advanced flag docling_extract_tables_figures (off by default) runs docling on PDF ingest ONLY to append structured table chunks, guarded by docling_max_pages + docling_timeout_s (now 600s). Text is unaffected if docling is absent/oversized/times out. Co-Authored-By: Claude Opus 4.7 (cherry picked from commit 01c4b02c3269d21ad2b8f981839091e4227d2fa2) --- config.example.yml | 14 +++--- src/perspicacite/config/schema.py | 20 +++++---- src/perspicacite/integrations/local_docs.py | 38 +++++++++-------- src/perspicacite/pipeline/parsers/pdf.py | 34 +++++---------- tests/unit/test_config.py | 12 +++--- tests/unit/test_local_docs_docling_wire.py | 47 ++++++--------------- tests/unit/test_pdf_backend_guard.py | 41 +++++++++--------- 7 files changed, 87 insertions(+), 119 deletions(-) diff --git a/config.example.yml b/config.example.yml index d0ba19cd..3968d554 100644 --- a/config.example.yml +++ b/config.example.yml @@ -67,13 +67,13 @@ knowledge_base: embedding_model: "text-embedding-3-small" chunk_size: 1000 chunk_overlap: 200 - # PDF extraction backend. 
- # fitz — text-only PyMuPDF (default-installed, fast) - # docling — layout model: structured tables + figures (needs `uv sync --extra docling`) - # auto — docling when installed and within docling_max_pages, else fitz - pdf_backend: auto - docling_max_pages: 40 # auto: PDFs larger than this use fitz - docling_timeout_s: 120 # auto: docling wall-clock cap per document; on timeout → fitz + # Advanced: extract structured tables + figures with docling, IN ADDITION to + # the always-on fast fitz text extraction. Requires `uv sync --extra docling`. + # docling is CPU-only here and slow (~minutes/page), so this is off by default + # — turn it on for high-value PDFs where tables/figures matter. + docling_extract_tables_figures: false + docling_max_pages: 40 # skip docling extras for PDFs larger than this + docling_timeout_s: 600 # per-document docling wall-clock cap; on timeout, skip extras chunking_method: "token" default_top_k: 10 similarity_threshold: 0.7 diff --git a/src/perspicacite/config/schema.py b/src/perspicacite/config/schema.py index 3875ecf8..31c574bf 100644 --- a/src/perspicacite/config/schema.py +++ b/src/perspicacite/config/schema.py @@ -92,22 +92,24 @@ class KnowledgeBaseConfig(BaseModel): embedding_model: str = "text-embedding-3-small" chunk_size: int = Field(default=1000, ge=100, le=10000) chunk_overlap: int = Field(default=200, ge=0, le=1000) - pdf_backend: Literal["auto", "docling", "fitz"] = Field( - default="auto", + docling_extract_tables_figures: bool = Field( + default=False, description=( - "PDF extraction backend. 'fitz' = text-only PyMuPDF (incumbent); " - "'docling' = layout model with structured tables + figures " - "(requires the [docling] extra); 'auto' = docling when importable " - "and within docling_max_pages, else fitz." + "Advanced: when True, run docling (CPU, slow ~min/page) to extract " + "structured tables + figures from PDFs IN ADDITION to the always-on " + "fitz text extraction. Off by default." 
), ) docling_max_pages: int = Field( default=40, ge=1, - description="In 'auto', skip docling for PDFs with more pages than this (use fitz).", + description="Skip the docling extras pass for PDFs with more pages than this.", ) docling_timeout_s: int = Field( - default=120, ge=1, - description="Per-document wall-clock cap for docling; on timeout, fall back to fitz.", + default=600, ge=1, + description=( + "Per-document wall-clock cap for the docling extras pass; " + "on timeout, skip extras." + ), ) chunking_method: Literal["token", "semantic", "agentic"] = "token" default_top_k: int = Field(default=10, ge=1, le=100) diff --git a/src/perspicacite/integrations/local_docs.py b/src/perspicacite/integrations/local_docs.py index 57381f2e..e654d468 100644 --- a/src/perspicacite/integrations/local_docs.py +++ b/src/perspicacite/integrations/local_docs.py @@ -18,7 +18,6 @@ chunk_document, infer_content_type, ) -from perspicacite.pipeline.parsers.pdf import ParsedContent logger = get_logger("perspicacite.local_docs") @@ -98,20 +97,18 @@ def _extract_year_from_text(text: str | None) -> int | None: return None -async def _read_text( - path: Path, content_type: str, pdf_parser, config=None -) -> ParsedContent | None: +async def _read_text(path: Path, content_type: str, pdf_parser) -> str | None: if content_type == "pdf": if pdf_parser is None: return None - parsed = await pdf_parser.parse(path, config=config) - return parsed if (parsed and parsed.text) else None + parsed = await pdf_parser.parse(path) + return parsed.text or None try: raw = path.read_text(encoding="utf-8", errors="replace") except Exception as exc: logger.warning("local_docs_read_failed", path=str(path), error=str(exc)) return None - return ParsedContent(text=raw) if raw else None + return raw or None async def _ingest_files( @@ -140,8 +137,7 @@ async def _ingest_files( for idx, fp in enumerate(files): content_type, language = infer_content_type(fp) paper = _paper_for_file(fp) - parsed = await _read_text(fp, 
content_type, app_state.pdf_parser, kb_cfg) - text = parsed.text if parsed else None + text = await _read_text(fp, content_type, app_state.pdf_parser) # F-13: opportunistic year extraction from the document body so # KB-stats by_year and recency-weighted retrieval have signal # for URL- / file-ingested sources. @@ -163,15 +159,23 @@ async def _ingest_files( text, paper, content_type=content_type, language=language, config=kb_cfg, ) - if parsed is not None and parsed.tables: - from perspicacite.pipeline.chunking_dispatch import ( - table_records_to_chunks, - ) - chunks.extend( - table_records_to_chunks( - parsed.tables, paper, start_index=len(chunks) + # R2 advanced: optionally augment with docling-extracted tables. + if content_type == "pdf" and getattr(kb_cfg, "docling_extract_tables_figures", False): + parser = app_state.pdf_parser + pages = parser._page_count(fp) + if parser._should_run_docling_extras(pages, kb_cfg): + pc = parser._run_docling_with_timeout( + fp, int(getattr(kb_cfg, "docling_timeout_s", 600)) ) - ) + if pc is not None and pc.tables: + from perspicacite.pipeline.chunking_dispatch import ( + table_records_to_chunks, + ) + chunks.extend( + table_records_to_chunks( + pc.tables, paper, start_index=len(chunks) + ) + ) # ChunkMetadata is frozen — recreate with source_file_path set, # plus optional external_metadata annotations (Cycle C). 
ext_parent = (external_metadata or {}).get("parent_paper_id") diff --git a/src/perspicacite/pipeline/parsers/pdf.py b/src/perspicacite/pipeline/parsers/pdf.py index a2c047f7..e7456a54 100644 --- a/src/perspicacite/pipeline/parsers/pdf.py +++ b/src/perspicacite/pipeline/parsers/pdf.py @@ -184,7 +184,7 @@ def _extract_with_pdfplumber(self, source: str | Path | bytes) -> tuple[str, dic return "\n\n".join(all_text), sections, page_count # ------------------------------------------------------------------ - # Backend selection + guards (R2 docling) + # docling extras pass: guards + worker runner (R2 docling) # ------------------------------------------------------------------ def _page_count(self, source) -> int: @@ -203,19 +203,15 @@ def _page_count(self, source) -> int: except Exception: return 0 - def _select_backend(self, source, page_count: int, config) -> str: - backend = getattr(config, "pdf_backend", "auto") - if backend == "fitz": - return "fitz" - if backend == "docling": - return "docling" - # auto: + def _should_run_docling_extras(self, page_count: int, config) -> bool: + """True when docling tables/figures extraction should run: the advanced + flag is on, the [docling] extra is importable, and the PDF is within the + page-count cap. 
The wall-clock timeout is the runtime safety net.""" + if not getattr(config, "docling_extract_tables_figures", False): + return False if not _docling_importable(): - return "fitz" - if page_count > int(getattr(config, "docling_max_pages", 40)): - logger.warning("docling_fallback", reason="oversized", pages=page_count) - return "fitz" - return "docling" + return False + return page_count <= int(getattr(config, "docling_max_pages", 40)) def _run_docling_with_timeout(self, source, timeout_s: int): """Run docling in a worker process; return ParsedContent or None on @@ -237,7 +233,7 @@ def _run_docling_with_timeout(self, source, timeout_s: int): # Public API # ------------------------------------------------------------------ - async def parse(self, source: str | Path | bytes, config=None) -> ParsedContent: + async def parse(self, source: str | Path | bytes) -> ParsedContent: """ Parse PDF and extract text. @@ -247,16 +243,6 @@ async def parse(self, source: str | Path | bytes, config=None) -> ParsedContent: Returns: Parsed content with text and metadata """ - if config is not None: - pages = self._page_count(source) - if self._select_backend(source, pages, config) == "docling": - pc = self._run_docling_with_timeout( - source, int(getattr(config, "docling_timeout_s", 120)) - ) - if pc is not None: - return pc - # else fall through to the fitz/pdfplumber path below - # Try PyMuPDF first (better column handling) result = self._extract_with_fitz(source) diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py index 410d3b2d..1b1d4e63 100644 --- a/tests/unit/test_config.py +++ b/tests/unit/test_config.py @@ -221,13 +221,11 @@ def test_anchor_config_near_threshold_bounds(): AnchorConfig(near_threshold=1.5) -def test_pdf_backend_defaults_and_validation(): +def test_docling_extras_config_defaults(): from perspicacite.config.schema import KnowledgeBaseConfig kb = KnowledgeBaseConfig() - assert kb.pdf_backend == "auto" + assert kb.docling_extract_tables_figures is 
False assert kb.docling_max_pages == 40 - assert kb.docling_timeout_s == 120 - import pytest - from pydantic import ValidationError - with pytest.raises(ValidationError): - KnowledgeBaseConfig(pdf_backend="banana") + assert kb.docling_timeout_s == 600 + kb2 = KnowledgeBaseConfig(docling_extract_tables_figures=True) + assert kb2.docling_extract_tables_figures is True diff --git a/tests/unit/test_local_docs_docling_wire.py b/tests/unit/test_local_docs_docling_wire.py index d83c6e4a..7331d5d1 100644 --- a/tests/unit/test_local_docs_docling_wire.py +++ b/tests/unit/test_local_docs_docling_wire.py @@ -3,57 +3,38 @@ from pathlib import Path -class _FakeParsed: - pass - - -class TestReadTextThreadsConfigAndTables(unittest.TestCase): - def test_pdf_returns_parsedcontent_with_tables_and_passes_config(self): +class TestReadTextIsFitzTextOnly(unittest.TestCase): + def test_pdf_returns_text_string(self): from perspicacite.integrations.local_docs import _read_text - from perspicacite.pipeline.parsers.docling_pdf import DoclingTable from perspicacite.pipeline.parsers.pdf import ParsedContent - seen = {} - class _FakeParser: - async def parse(self, source, config=None): - seen["config"] = config - return ParsedContent( - text="body text", - tables=[DoclingTable(page=1, caption="Table 1.", - markdown="| a |", headers=["a"], rows=[["1"]])], - ) - - sentinel = object() - out = asyncio.run(_read_text(Path("/x.pdf"), "pdf", _FakeParser(), sentinel)) - assert isinstance(out, ParsedContent) - assert out.text == "body text" - assert len(out.tables) == 1 - assert seen["config"] is sentinel # config threaded to parse() - - def test_pdf_empty_text_returns_none(self): + async def parse(self, source): + return ParsedContent(text="body text") + + out = asyncio.run(_read_text(Path("/x.pdf"), "pdf", _FakeParser())) + assert out == "body text" + + def test_pdf_empty_returns_none(self): from perspicacite.integrations.local_docs import _read_text from perspicacite.pipeline.parsers.pdf import 
ParsedContent class _FakeParser: - async def parse(self, source, config=None): + async def parse(self, source): return ParsedContent(text="") - out = asyncio.run(_read_text(Path("/x.pdf"), "pdf", _FakeParser(), None)) - assert out is None + assert asyncio.run(_read_text(Path("/x.pdf"), "pdf", _FakeParser())) is None - def test_non_pdf_wraps_text_in_parsedcontent(self): + def test_non_pdf_returns_text(self): import os import tempfile from perspicacite.integrations.local_docs import _read_text - from perspicacite.pipeline.parsers.pdf import ParsedContent with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f: f.write("hello world") p = Path(f.name) try: - out = asyncio.run(_read_text(p, "text", None, None)) - assert isinstance(out, ParsedContent) - assert "hello world" in out.text + out = asyncio.run(_read_text(p, "text", None)) + assert "hello world" in out finally: os.unlink(p) diff --git a/tests/unit/test_pdf_backend_guard.py b/tests/unit/test_pdf_backend_guard.py index c6aedc32..9912a87d 100644 --- a/tests/unit/test_pdf_backend_guard.py +++ b/tests/unit/test_pdf_backend_guard.py @@ -2,44 +2,41 @@ class _Cfg: - def __init__(self, backend="auto", max_pages=40, timeout=120): - self.pdf_backend = backend + def __init__(self, flag=True, max_pages=40, timeout=600): + self.docling_extract_tables_figures = flag self.docling_max_pages = max_pages self.docling_timeout_s = timeout -class TestBackendSelector(unittest.TestCase): - def _select(self, parser, pages, cfg=None): - return parser._select_backend("/x.pdf", pages, _Cfg(**(cfg or {}))) - - def test_explicit_fitz(self): +class TestShouldRunDoclingExtras(unittest.TestCase): + def test_flag_off_returns_false(self): from perspicacite.pipeline.parsers.pdf import PDFParser - p = PDFParser() - assert self._select(p, 5, {"backend": "fitz"}) == "fitz" + assert PDFParser()._should_run_docling_extras(5, _Cfg(flag=False)) is False - def test_explicit_docling(self): - from perspicacite.pipeline.parsers.pdf import 
PDFParser - p = PDFParser() - assert self._select(p, 5, {"backend": "docling"}) == "docling" - - def test_auto_uses_fitz_when_docling_absent(self): + def test_flag_on_importable_small_returns_true(self): from perspicacite.pipeline.parsers import pdf as m - p = m.PDFParser() orig = m._docling_importable - m._docling_importable = lambda: False + m._docling_importable = lambda: True try: - assert self._select(p, 5) == "fitz" + assert m.PDFParser()._should_run_docling_extras(5, _Cfg()) is True finally: m._docling_importable = orig - def test_auto_guard_on_pages(self): + def test_oversized_returns_false(self): from perspicacite.pipeline.parsers import pdf as m - p = m.PDFParser() orig = m._docling_importable m._docling_importable = lambda: True try: - assert self._select(p, 999, {"max_pages": 40}) == "fitz" - assert self._select(p, 10, {"max_pages": 40}) == "docling" + assert m.PDFParser()._should_run_docling_extras(999, _Cfg(max_pages=40)) is False + finally: + m._docling_importable = orig + + def test_not_importable_returns_false(self): + from perspicacite.pipeline.parsers import pdf as m + orig = m._docling_importable + m._docling_importable = lambda: False + try: + assert m.PDFParser()._should_run_docling_extras(5, _Cfg()) is False finally: m._docling_importable = orig From 1282ea35816661974f63447b04d4d7576fb3d891 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Louis-F=C3=A9lix=20Nothias?= Date: Tue, 2 Jun 2026 17:26:06 +0200 Subject: [PATCH 11/11] docs(pdf): document fitz text + optional docling tables/figures extraction (R2) New docs/pdf-extraction-docling.md (two-layer model, enabling, guard knobs, CPU cost + MPS limitation, scope/limits); CLAUDE.md "PDF extraction backends" pointer; README feature line. 
Co-Authored-By: Claude Opus 4.7 (cherry picked from commit 8b794ed8369aee268a1377b5b9aece071fbd3b9c) --- CLAUDE.md | 4 ++ README.md | 1 + docs/pdf-extraction-docling.md | 89 ++++++++++++++++++++++++++++++++++ 3 files changed, 94 insertions(+) create mode 100644 docs/pdf-extraction-docling.md diff --git a/CLAUDE.md b/CLAUDE.md index c968061a..70ea98c4 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -96,6 +96,10 @@ The `AgenticOrchestrator` ([src/perspicacite/rag/agentic/orchestrator.py](src/pe Publisher API keys are passed as kwargs; missing keys skip that source gracefully. Check `content_type` in the result: `"structured"` > `"full_text"` > `"abstract"` > `"none"`. +### PDF extraction backends + +PDF **text** is always extracted with PyMuPDF (`fitz`) → `pdfplumber` fallback (`pipeline/parsers/pdf.py`) — fast, default, on every ingest. **Structured tables + figures** are an opt-in advanced layer via **docling**, off by default (`knowledge_base.docling_extract_tables_figures`, guarded by `docling_max_pages` / `docling_timeout_s`; needs `uv sync --extra docling`). When enabled on the local-file ingest path, docling adds `content_type="table"` chunks on top of the fitz text; if docling is absent / oversized / times out, the text is unaffected. docling is CPU-bound (~min/page) and the MPS/GPU path is unusable on Apple Silicon. Full details: [docs/pdf-extraction-docling.md](docs/pdf-extraction-docling.md). + ### Retrieval `ChromaVectorStore` ([src/perspicacite/retrieval/chroma_store.py](src/perspicacite/retrieval/chroma_store.py)) wraps ChromaDB. KB collections are named via `chroma_collection_name_for_kb()` from `models/kb.py`. The hybrid retriever ([src/perspicacite/retrieval/hybrid.py](src/perspicacite/retrieval/hybrid.py)) combines ChromaDB cosine scores with BM25Okapi scores; weights default to 0.5/0.5 but can optionally be determined by the LLM at query time. 
`MultiKBRetriever` ([src/perspicacite/retrieval/multi_kb.py](src/perspicacite/retrieval/multi_kb.py)) fans a query across multiple KB collections, merges by score, deduplicates by `paper_id`, and tags results with `kb_name`; use `check_embedding_compat(kb_metas)` to validate that all queried KBs share the same embedding model before retrieval. diff --git a/README.md b/README.md index efc00cb8..db5b1673 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,7 @@ - **Multi-database search** — Semantic Scholar, OpenAlex, PubMed, arXiv, HAL, DBLP via SciLEx - **Unified content pipeline** — PMC JATS XML, arXiv HTML, OA PDFs, publisher APIs, and institutional-access via browser-cookie replay; quality-priority routing +- **PDF extraction** — fast PyMuPDF text on every ingest, plus optional [docling](docs/pdf-extraction-docling.md) layout extraction for structured tables/figures (advanced, off by default) - **6 RAG modes** — Basic, Advanced, Profound, Agentic, Literature Survey, Contradiction; per-stage LLM tiering (Haiku routing/screening, Sonnet synthesis) - **Knowledge base management** — BibTeX import, DOI bulk-add, local document ingest, Zotero-collection import; async ingestion with SSE progress streaming - **Citation-graph expansion** — forward + backward snowball over OpenAlex; automatic Semantic Scholar fallback for arXiv-seeded papers (see [docs/concepts/citation-graph.md](docs/concepts/citation-graph.md)) diff --git a/docs/pdf-extraction-docling.md b/docs/pdf-extraction-docling.md new file mode 100644 index 00000000..9435bcc3 --- /dev/null +++ b/docs/pdf-extraction-docling.md @@ -0,0 +1,89 @@ +# PDF extraction: fast text + optional docling tables/figures + +Perspicacité extracts PDF content in **two independent layers**: + +| Layer | Engine | Runs | Output | Speed | +|-------|--------|------|--------|-------| +| **Text** (always on) | PyMuPDF (`fitz`) → `pdfplumber` fallback | every PDF ingest | full body text + sections | fast (sub-second) | +| **Tables + 
figures** (opt-in, advanced) | docling layout model | only when enabled | structured tables as retrievable chunks | slow (CPU-bound, ~45–50 s/page) | + +The layers are decoupled: **text never depends on docling.** If the `[docling]` +extra is not installed, the PDF exceeds the page cap, or docling errors/times +out, you still get the full fitz text — you simply don't get the table chunks. +Enabling docling can only *add* content, never break ingest. + +## Why docling is off by default + +Docling runs the RT-DETR layout model + TableFormer. On CPU this is roughly +**~45–50 s per page (~10 min for a typical paper)**. On Apple Silicon the GPU +(MPS) path is currently **unusable** — the upstream `transformers` RT-DETRv2 +positional embedding hard-codes `float64`, which MPS does not support +(see [huggingface/transformers#28334](https://github.com/huggingface/transformers/issues/28334)); +`PYTORCH_ENABLE_MPS_FALLBACK=1` does not help. So docling here is a deliberate, +batch/offline choice, not a hot-path default. A CUDA machine would make it fast +enough for routine use, but the converter currently pins CPU on every host. + +## Enabling docling + +1. Install the optional extra (one-time, heavy — pulls torch + layout models): + + ```bash + uv sync --extra docling + ``` + +2. In `config.yml` under `knowledge_base:`: + + ```yaml + docling_extract_tables_figures: true # default: false + docling_max_pages: 40 # PDFs larger than this skip docling (text-only) + docling_timeout_s: 600 # per-document wall-clock cap; on timeout, keep text, skip extras + ``` + +3. Ingest **local PDF files** (the local-files / dropzone path). Each PDF gets + fitz text **plus** any tables docling extracts, added as searchable chunks + tagged `content_type="table"` (caption + page preserved in metadata). + +## Guard behaviour (config knobs) + +- `docling_extract_tables_figures` (bool, default `false`) — master switch for + the advanced layer. 
+- `docling_max_pages` (int, default `40`) — documents with more pages skip + docling and use text-only fitz (avoids the worst-case multi-minute cost). +- `docling_timeout_s` (int, default `600`) — per-document wall-clock cap. docling + runs in a worker process; on timeout it is abandoned and ingest falls back to + the already-extracted fitz text. Worker timeouts and errors log one structured + `docling_fallback` event (`reason=timeout|error`); page-cap skips are not logged. + +## Scope and current limits + +- **Wired for the local-file ingest path** (`integrations/local_docs.py`). The + DOI/BibTeX download path is text-only for now (adding table chunks there needs + a `Paper.tables` field — a follow-up). +- **Tables become chunks today; figures are extracted but not yet consumed.** + Docling figure records are produced (caption + image, dimensions populated) + and mapped to the existing multimodal record shape, but feeding figure images + into the answer/vision pipeline is a follow-up. +- **CPU-only in practice**: the converter pins CPU on every host (MPS is unusable, see above). Using a CUDA host or a + remote docling service for large batches would first require lifting that pin. + +## Implementation pointers + +- Converter + record mapping: `src/perspicacite/pipeline/parsers/docling_pdf.py` + (`DoclingPDFParser`, `DoclingTable`, `DoclingFigure`, + `figure_to_multimodal_record`). The converter forces + `AcceleratorDevice.CPU` and enables `generate_picture_images` + `images_scale=2.0` + (without picture-image rendering, `PictureItem.get_image()` returns `None` and + every figure is dropped). +- Backend guard + worker: `src/perspicacite/pipeline/parsers/pdf.py` + (`_should_run_docling_extras`, `_run_docling_with_timeout`, `_docling_importable`). +- Table → chunk: `src/perspicacite/pipeline/chunking_dispatch.py` + (`table_records_to_chunks`). +- Config: `src/perspicacite/config/schema.py` (`KnowledgeBaseConfig`). + +## Note on full text vs. 
abstracts + +If a knowledge base shows only abstracts, that is a **source** issue, not a +docling one: a Zotero `.bib` carries abstracts only. To get full text, ingest +the actual **PDFs** (local-file path) — the fast fitz layer already returns the +complete body text, no docling required. Enable docling only when you also want +the papers' **tables** as retrievable content.