From 83e949d655f4ac2a2594440c498a87da927ce413 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Louis-F=C3=A9lix=20Nothias?= <louis-felix.nothias@cnrs.fr>
Date: Tue, 2 Jun 2026 15:24:26 +0200
Subject: [PATCH 01/11] feat(config): pdf_backend + docling guard knobs (R2)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
(cherry picked from commit f82b9a57e8422913480312d888f4e6002a55ecd8)
---
 config.example.yml                |  7 +++++++
 src/perspicacite/config/schema.py | 17 +++++++++++++++++
 tests/unit/test_config.py         | 12 ++++++++++++
 3 files changed, 36 insertions(+)

diff --git a/config.example.yml b/config.example.yml
index ed3760fd..d0ba19cd 100644
--- a/config.example.yml
+++ b/config.example.yml
@@ -67,6 +67,13 @@ knowledge_base:
   embedding_model: "text-embedding-3-small"
   chunk_size: 1000
   chunk_overlap: 200
+  # PDF extraction backend.
+  #   fitz    — text-only PyMuPDF (default-installed, fast)
+  #   docling — layout model: structured tables + figures (needs `uv sync --extra docling`)
+  #   auto    — docling when installed and within docling_max_pages, else fitz
+  pdf_backend: auto
+  docling_max_pages: 40     # auto: PDFs larger than this use fitz
+  docling_timeout_s: 120    # docling wall-clock cap per document (auto and docling); on timeout → fitz
   chunking_method: "token"
   default_top_k: 10
   similarity_threshold: 0.7
diff --git a/src/perspicacite/config/schema.py b/src/perspicacite/config/schema.py
index a9e7d3d3..3875ecf8 100644
--- a/src/perspicacite/config/schema.py
+++ b/src/perspicacite/config/schema.py
@@ -92,6 +92,23 @@ class KnowledgeBaseConfig(BaseModel):
     embedding_model: str = "text-embedding-3-small"
     chunk_size: int = Field(default=1000, ge=100, le=10000)
     chunk_overlap: int = Field(default=200, ge=0, le=1000)
+    pdf_backend: Literal["auto", "docling", "fitz"] = Field(
+        default="auto",
+        description=(
+            "PDF extraction backend. 'fitz' = text-only PyMuPDF (incumbent); "
+            "'docling' = layout model with structured tables + figures "
+            "(requires the [docling] extra); 'auto' = docling when importable "
+            "and within docling_max_pages, else fitz."
+        ),
+    )
+    docling_max_pages: int = Field(
+        default=40, ge=1,
+        description="In 'auto', skip docling for PDFs with more pages than this (use fitz).",
+    )
+    docling_timeout_s: int = Field(
+        default=120, ge=1,
+        description="Per-document wall-clock cap for docling; on timeout, fall back to fitz.",
+    )
     chunking_method: Literal["token", "semantic", "agentic"] = "token"
     default_top_k: int = Field(default=10, ge=1, le=100)
     similarity_threshold: float = Field(default=0.7, ge=0.0, le=1.0)
diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py
index 84b81683..410d3b2d 100644
--- a/tests/unit/test_config.py
+++ b/tests/unit/test_config.py
@@ -219,3 +219,15 @@ def test_anchor_config_near_threshold_bounds():
     AnchorConfig(near_threshold=1.0)
     with pytest.raises(ValidationError):
         AnchorConfig(near_threshold=1.5)
+
+
+def test_pdf_backend_defaults_and_validation():
+    from perspicacite.config.schema import KnowledgeBaseConfig
+    kb = KnowledgeBaseConfig()
+    assert kb.pdf_backend == "auto"
+    assert kb.docling_max_pages == 40
+    assert kb.docling_timeout_s == 120
+    import pytest
+    from pydantic import ValidationError
+    with pytest.raises(ValidationError):
+        KnowledgeBaseConfig(pdf_backend="banana")

From 1c781eacf0aa0967a878d03b74a97fbff24a9465 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Louis-F=C3=A9lix=20Nothias?= <louis-felix.nothias@cnrs.fr>
Date: Tue, 2 Jun 2026 15:27:24 +0200
Subject: [PATCH 02/11] build: add [docling] optional extra (R2)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
(cherry picked from commit d0ea45f4084aca111a52bf9a77323edf3fbf266c)
---
 pyproject.toml | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/pyproject.toml b/pyproject.toml
index 5bc02e0b..a7a71449 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -151,6 +151,18 @@ adapters = [
     "indicium-adapters-metabolomics>=0.1.0",
 ]
 
+# docling — high-fidelity PDF -> structured document conversion (layout,
+# tables, sections) for the content pipeline. Heavier than the other
+# extras (pulls in torch-backed models + pandas). Install only when
+# docling-based parsing is needed.
+#
+# Install with:
+#   uv sync --extra docling
+docling = [
+    "docling>=2.5,<3",
+    "pandas>=2.0,<3",
+]
+
 [project.scripts]
 perspicacite = "perspicacite.cli:main"
 

From 329928b084f4d3cd429d7d66eca76b5ba686f607 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Louis-F=C3=A9lix=20Nothias?= <louis-felix.nothias@cnrs.fr>
Date: Tue, 2 Jun 2026 15:30:17 +0200
Subject: [PATCH 03/11] feat(parsers): docling record types + ParsedContent
 tables/figures (R2)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
(cherry picked from commit 2fa29a6961b62bf8d547db940918ffadf34f5a5b)
---
 .../pipeline/parsers/docling_pdf.py           | 44 +++++++++++++++++++
 src/perspicacite/pipeline/parsers/pdf.py      | 10 ++++-
 tests/unit/test_docling_pdf.py                | 16 +++++++
 3 files changed, 68 insertions(+), 2 deletions(-)
 create mode 100644 src/perspicacite/pipeline/parsers/docling_pdf.py
 create mode 100644 tests/unit/test_docling_pdf.py

diff --git a/src/perspicacite/pipeline/parsers/docling_pdf.py b/src/perspicacite/pipeline/parsers/docling_pdf.py
new file mode 100644
index 00000000..a0e2a725
--- /dev/null
+++ b/src/perspicacite/pipeline/parsers/docling_pdf.py
@@ -0,0 +1,44 @@
+"""Docling-backed PDF extraction (R2).
+
+Ports the converter configuration proven in AgenticScienceBuilder's
+figures.py: picture images MUST be rendered (generate_picture_images=True)
+or PictureItem.get_image() returns None and every figure is dropped; figure
+pixel dimensions MUST be read from the rendered image or the size filter
+discards them. No dependency on ASB.
+"""
+from __future__ import annotations
+
+import importlib.util
+from dataclasses import dataclass
+
+_MIN_AREA_PX = 50_000  # drop logos/icons (mirrors ASB)
+
+
+@dataclass
+class DoclingTable:
+    page: int
+    caption: str
+    markdown: str
+    headers: list[str]
+    rows: list[list[str]]
+
+    @property
+    def n_rows(self) -> int:
+        return len(self.rows)
+
+    @property
+    def n_cols(self) -> int:
+        return len(self.headers)
+
+
+@dataclass
+class DoclingFigure:
+    page: int
+    caption: str
+    width_px: int
+    height_px: int
+    image_bytes: bytes = b""
+
+
+def docling_importable() -> bool:
+    return importlib.util.find_spec("docling") is not None
diff --git a/src/perspicacite/pipeline/parsers/pdf.py b/src/perspicacite/pipeline/parsers/pdf.py
index cb503d74..c09d8a52 100644
--- a/src/perspicacite/pipeline/parsers/pdf.py
+++ b/src/perspicacite/pipeline/parsers/pdf.py
@@ -6,12 +6,15 @@
 """
 
 import re
-from dataclasses import dataclass
+from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any
+from typing import TYPE_CHECKING, Any
 
 from perspicacite.logging import get_logger
 
+if TYPE_CHECKING:
+    from perspicacite.pipeline.parsers.docling_pdf import DoclingFigure, DoclingTable
+
 logger = get_logger("perspicacite.pipeline.parsers.pdf")
 
 
@@ -48,6 +51,9 @@ class ParsedContent:
     title: str | None = None
     sections: dict[str, str] | None = None
     metadata: dict[str, Any] | None = None
+    # R2 (docling): empty on the fitz path; populated when docling is used.
+    tables: list["DoclingTable"] = field(default_factory=list)
+    figures: list["DoclingFigure"] = field(default_factory=list)
 
 
 class PDFParser:
diff --git a/tests/unit/test_docling_pdf.py b/tests/unit/test_docling_pdf.py
new file mode 100644
index 00000000..7d77b5c5
--- /dev/null
+++ b/tests/unit/test_docling_pdf.py
@@ -0,0 +1,16 @@
+import unittest
+
+
+class TestRecordsAndParsedContent(unittest.TestCase):
+    def test_parsed_content_defaults_empty_tables_figures(self):
+        from perspicacite.pipeline.parsers.pdf import ParsedContent
+        pc = ParsedContent(text="hi")
+        assert pc.tables == []
+        assert pc.figures == []
+
+    def test_record_dataclasses_construct(self):
+        from perspicacite.pipeline.parsers.docling_pdf import DoclingTable, DoclingFigure
+        t = DoclingTable(page=2, caption="Table 1.", markdown="| a |", headers=["a"], rows=[["1"]])
+        assert t.n_rows == 1 and t.n_cols == 1
+        f = DoclingFigure(page=1, caption="Figure 1.", width_px=300, height_px=300, image_bytes=b"x")
+        assert f.width_px == 300

From 2a66d5d36ea1498f7cc064bc3f7858942fc2d428 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Louis-F=C3=A9lix=20Nothias?= <louis-felix.nothias@cnrs.fr>
Date: Tue, 2 Jun 2026 15:34:06 +0200
Subject: [PATCH 04/11] feat(parsers): DoclingPDFParser converter +
 figure/table mapping (R2)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
(cherry picked from commit 8298502a2836103e015af1b1d0dc9d63ad277db9)
---
 .../pipeline/parsers/docling_pdf.py           | 106 ++++++++++++++++++
 tests/unit/test_docling_pdf.py                |  67 +++++++++++
 2 files changed, 173 insertions(+)

diff --git a/src/perspicacite/pipeline/parsers/docling_pdf.py b/src/perspicacite/pipeline/parsers/docling_pdf.py
index a0e2a725..4f980080 100644
--- a/src/perspicacite/pipeline/parsers/docling_pdf.py
+++ b/src/perspicacite/pipeline/parsers/docling_pdf.py
@@ -10,6 +10,17 @@
 
 import importlib.util
 from dataclasses import dataclass
+from io import BytesIO
+from typing import TYPE_CHECKING, Any
+
+from perspicacite.logging import get_logger
+from perspicacite.pipeline.parsers.pdf import ParsedContent
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+    from pathlib import Path
+
+logger = get_logger("perspicacite.pipeline.parsers.docling")
 
 _MIN_AREA_PX = 50_000  # drop logos/icons (mirrors ASB)
 
@@ -42,3 +53,98 @@ class DoclingFigure:
 
 def docling_importable() -> bool:
     return importlib.util.find_spec("docling") is not None
+
+
+def _make_docling_converter():
+    # Picture images MUST be enabled or get_image() returns None (zero figures).
+    from docling.datamodel.base_models import InputFormat
+    from docling.datamodel.pipeline_options import PdfPipelineOptions
+    from docling.document_converter import DocumentConverter, PdfFormatOption
+
+    opts = PdfPipelineOptions()
+    opts.generate_picture_images = True
+    opts.images_scale = 2.0
+    return DocumentConverter(
+        format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)}
+    )
+
+
+def _page_of(item) -> int:
+    prov = getattr(item, "prov", None) or []
+    if prov and getattr(prov[0], "page_no", None) is not None:
+        return int(prov[0].page_no)
+    return 1
+
+
+class DoclingPDFParser:
+    """Extracts text + structured tables + figures via docling."""
+
+    def __init__(self, converter_factory: Callable[[], Any] = _make_docling_converter):
+        self._converter_factory = converter_factory
+
+    def extract(self, source: str | Path) -> ParsedContent:
+        conv = self._converter_factory()
+        doc = conv.convert(str(source)).document
+        figures = self._figures(doc)
+        tables = self._tables(doc)
+        text = self._text(doc)
+        return ParsedContent(
+            text=text,
+            sections=None,
+            metadata={"extractor": "docling"},
+            tables=tables,
+            figures=figures,
+        )
+
+    def _text(self, doc) -> str:
+        try:
+            return doc.export_to_markdown()
+        except Exception:
+            return ""
+
+    def _figures(self, doc) -> list[DoclingFigure]:
+        out: list[DoclingFigure] = []
+        for pic in getattr(doc, "pictures", []) or []:
+            try:
+                pil = pic.get_image(doc)
+                w, h = pil.width, pil.height
+                buf = BytesIO()
+                pil.save(buf, "PNG")
+                image_bytes = buf.getvalue()
+            except Exception:
+                continue
+            if len(image_bytes) < 1024:
+                continue
+            try:
+                caption = pic.caption_text(doc) or ""
+            except Exception:
+                caption = ""
+            out.append(
+                DoclingFigure(
+                    page=_page_of(pic), caption=caption,
+                    width_px=w, height_px=h, image_bytes=image_bytes,
+                )
+            )
+        return out
+
+    def _tables(self, doc) -> list[DoclingTable]:
+        out: list[DoclingTable] = []
+        for tbl in getattr(doc, "tables", []) or []:
+            try:
+                df = tbl.export_to_dataframe(doc)
+                headers = [str(c) for c in df.columns.tolist()]
+                rows = [[str(v) for v in row] for row in df.values.tolist()]
+                markdown = tbl.export_to_markdown(doc)
+            except Exception:
+                continue
+            try:
+                caption = tbl.caption_text(doc) or ""
+            except Exception:
+                caption = ""
+            out.append(
+                DoclingTable(
+                    page=_page_of(tbl), caption=caption,
+                    markdown=markdown, headers=headers, rows=rows,
+                )
+            )
+        return out
diff --git a/tests/unit/test_docling_pdf.py b/tests/unit/test_docling_pdf.py
index 7d77b5c5..2f3866dd 100644
--- a/tests/unit/test_docling_pdf.py
+++ b/tests/unit/test_docling_pdf.py
@@ -14,3 +14,70 @@ def test_record_dataclasses_construct(self):
         assert t.n_rows == 1 and t.n_cols == 1
         f = DoclingFigure(page=1, caption="Figure 1.", width_px=300, height_px=300, image_bytes=b"x")
         assert f.width_px == 300
+
+
+class _FakeProv:
+    def __init__(self, page_no): self.page_no = page_no
+
+class _FakeImg:
+    def __init__(self, png): self._png = png; self.width = 300; self.height = 300
+    def save(self, buf, fmt): buf.write(self._png)
+
+class _FakePicture:
+    def __init__(self, page, caption, png):
+        self.prov = [_FakeProv(page)]; self._caption = caption; self._png = png
+    def caption_text(self, doc): return self._caption
+    def get_image(self, doc): return _FakeImg(self._png)
+
+class _FakeTable:
+    def __init__(self, page, caption, headers, rows):
+        self.prov = [_FakeProv(page)]; self._caption = caption
+        self._headers = headers; self._rows = rows
+    def caption_text(self, doc): return self._caption
+    def export_to_markdown(self, doc=None): return "| " + " | ".join(self._headers) + " |"
+    def export_to_dataframe(self, doc=None):
+        import pandas as pd
+        return pd.DataFrame(self._rows, columns=self._headers)
+
+class _FakeDoc:
+    def __init__(self, pictures, tables): self.pictures = pictures; self.tables = tables
+
+class _FakeResult:
+    def __init__(self, doc): self.document = doc
+
+class _FakeConverter:
+    def __init__(self, doc): self._doc = doc
+    def convert(self, source): return _FakeResult(self._doc)
+
+
+class TestDoclingExtraction(unittest.TestCase):
+    def test_maps_pictures_and_tables_dims_populated(self):
+        import importlib.util
+        if importlib.util.find_spec("pandas") is None:
+            self.skipTest("pandas required")
+        from perspicacite.pipeline.parsers import docling_pdf as d
+        png = b"\x89PNG\r\n\x1a\n" + b"\x00" * 2048
+        doc = _FakeDoc(
+            pictures=[_FakePicture(1, "Figure 1.", png)],
+            tables=[_FakeTable(2, "Table 1.", ["k", "v"], [["a", "1"]])],
+        )
+        parser = d.DoclingPDFParser(converter_factory=lambda: _FakeConverter(doc))
+        res = parser.extract("/x.pdf")
+        assert len(res.figures) == 1
+        assert res.figures[0].width_px == 300 and res.figures[0].height_px == 300
+        assert len(res.tables) == 1
+        assert res.tables[0].headers == ["k", "v"] and res.tables[0].rows == [["a", "1"]]
+        assert "k" in res.tables[0].markdown
+
+
+class TestDoclingConverterConfig(unittest.TestCase):
+    def test_converter_enables_picture_images(self):
+        import importlib.util
+        if importlib.util.find_spec("docling") is None:
+            self.skipTest("docling extra required")
+        from perspicacite.pipeline.parsers.docling_pdf import _make_docling_converter
+        from docling.datamodel.base_models import InputFormat
+        conv = _make_docling_converter()
+        opts = conv.format_to_options[InputFormat.PDF].pipeline_options
+        assert opts.generate_picture_images is True
+        assert opts.images_scale >= 2.0

From 067c8402e835d8be0ab4a1ae0a8283e17fd4d39a Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Louis-F=C3=A9lix=20Nothias?= <louis-felix.nothias@cnrs.fr>
Date: Tue, 2 Jun 2026 15:37:22 +0200
Subject: [PATCH 05/11] feat(parsers): docling backend selector + page/timeout
 guard (R2)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
(cherry picked from commit dcd8351d68916713a0089ae3b5c477b37eccb00e)
---
 src/perspicacite/pipeline/parsers/pdf.py | 72 ++++++++++++++++++-
 tests/unit/test_pdf_backend_guard.py     | 88 ++++++++++++++++++++++++
 2 files changed, 159 insertions(+), 1 deletion(-)
 create mode 100644 tests/unit/test_pdf_backend_guard.py

diff --git a/src/perspicacite/pipeline/parsers/pdf.py b/src/perspicacite/pipeline/parsers/pdf.py
index c09d8a52..a2c047f7 100644
--- a/src/perspicacite/pipeline/parsers/pdf.py
+++ b/src/perspicacite/pipeline/parsers/pdf.py
@@ -18,6 +18,16 @@
 logger = get_logger("perspicacite.pipeline.parsers.pdf")
 
 
+def _docling_importable() -> bool:
+    from perspicacite.pipeline.parsers.docling_pdf import docling_importable
+    return docling_importable()
+
+
+def _docling_extract_worker(path: str):
+    from perspicacite.pipeline.parsers.docling_pdf import DoclingPDFParser
+    return DoclingPDFParser().extract(path)
+
+
 def _clean_text(text: str, threshold: float = 0.05) -> str:
     """Collapse excess newlines when they dominate the text.
 
@@ -173,11 +183,61 @@ def _extract_with_pdfplumber(self, source: str | Path | bytes) -> tuple[str, dic
 
         return "\n\n".join(all_text), sections, page_count
 
+    # ------------------------------------------------------------------
+    # Backend selection + guards (R2 docling)
+    # ------------------------------------------------------------------
+
+    def _page_count(self, source) -> int:
+        fitz = self._get_fitz()
+        if fitz is None:
+            return 0
+        try:
+            doc = (
+                fitz.open(str(source))
+                if isinstance(source, (str, Path))
+                else fitz.open(stream=source, filetype="pdf")
+            )
+            n = doc.page_count
+            doc.close()
+            return n
+        except Exception:
+            return 0
+
+    def _select_backend(self, source, page_count: int, config) -> str:
+        backend = getattr(config, "pdf_backend", "auto")
+        if backend == "fitz":
+            return "fitz"
+        if backend == "docling":
+            return "docling"
+        # auto:
+        if not _docling_importable():
+            return "fitz"
+        if page_count > int(getattr(config, "docling_max_pages", 40)):
+            logger.warning("docling_fallback", reason="oversized", pages=page_count)
+            return "fitz"
+        return "docling"
+
+    def _run_docling_with_timeout(self, source, timeout_s: int):
+        """Run docling in a worker process; return ParsedContent, or None on timeout/error
+        (caller falls back to fitz). NOTE(review): bytes `source` becomes str(source)."""
+        from concurrent.futures import ProcessPoolExecutor
+        from concurrent.futures import TimeoutError as FTimeout
+        try:
+            with ProcessPoolExecutor(max_workers=1) as ex:
+                fut = ex.submit(_docling_extract_worker, str(source))
+                return fut.result(timeout=timeout_s)
+        except FTimeout:  # NOTE(review): `with` exit waits for the worker first; not a hard cap
+            logger.warning("docling_fallback", reason="timeout", path=str(source))
+            return None
+        except Exception as exc:
+            logger.warning("docling_fallback", reason="error", error=str(exc))
+            return None
+
     # ------------------------------------------------------------------
     # Public API
     # ------------------------------------------------------------------
 
-    async def parse(self, source: str | Path | bytes) -> ParsedContent:
+    async def parse(self, source: str | Path | bytes, config=None) -> ParsedContent:
         """
         Parse PDF and extract text.
 
@@ -187,6 +247,16 @@ async def parse(self, source: str | Path | bytes) -> ParsedContent:
         Returns:
             Parsed content with text and metadata
         """
+        if config is not None:
+            pages = self._page_count(source)
+            if self._select_backend(source, pages, config) == "docling":
+                pc = self._run_docling_with_timeout(
+                    source, int(getattr(config, "docling_timeout_s", 120))
+                )
+                if pc is not None:
+                    return pc
+                # else fall through to the fitz/pdfplumber path below
+
         # Try PyMuPDF first (better column handling)
         result = self._extract_with_fitz(source)
 
diff --git a/tests/unit/test_pdf_backend_guard.py b/tests/unit/test_pdf_backend_guard.py
new file mode 100644
index 00000000..c6aedc32
--- /dev/null
+++ b/tests/unit/test_pdf_backend_guard.py
@@ -0,0 +1,88 @@
+import unittest
+
+
+class _Cfg:
+    def __init__(self, backend="auto", max_pages=40, timeout=120):
+        self.pdf_backend = backend
+        self.docling_max_pages = max_pages
+        self.docling_timeout_s = timeout
+
+
+class TestBackendSelector(unittest.TestCase):
+    def _select(self, parser, pages, cfg=None):
+        return parser._select_backend("/x.pdf", pages, _Cfg(**(cfg or {})))
+
+    def test_explicit_fitz(self):
+        from perspicacite.pipeline.parsers.pdf import PDFParser
+        p = PDFParser()
+        assert self._select(p, 5, {"backend": "fitz"}) == "fitz"
+
+    def test_explicit_docling(self):
+        from perspicacite.pipeline.parsers.pdf import PDFParser
+        p = PDFParser()
+        assert self._select(p, 5, {"backend": "docling"}) == "docling"
+
+    def test_auto_uses_fitz_when_docling_absent(self):
+        from perspicacite.pipeline.parsers import pdf as m
+        p = m.PDFParser()
+        orig = m._docling_importable
+        m._docling_importable = lambda: False
+        try:
+            assert self._select(p, 5) == "fitz"
+        finally:
+            m._docling_importable = orig
+
+    def test_auto_guard_on_pages(self):
+        from perspicacite.pipeline.parsers import pdf as m
+        p = m.PDFParser()
+        orig = m._docling_importable
+        m._docling_importable = lambda: True
+        try:
+            assert self._select(p, 999, {"max_pages": 40}) == "fitz"
+            assert self._select(p, 10, {"max_pages": 40}) == "docling"
+        finally:
+            m._docling_importable = orig
+
+
+class TestTimeoutFallback(unittest.TestCase):
+    def test_timeout_branch_via_stub(self):
+        from concurrent.futures import TimeoutError as FTimeout
+
+        from perspicacite.pipeline.parsers.pdf import PDFParser
+        p = PDFParser()
+
+        class _Fut:
+            def result(self, timeout): raise FTimeout()
+
+        class _Ex:
+            def __enter__(self): return self
+            def __exit__(self, *a): return False
+            def submit(self, *a, **k): return _Fut()
+
+        import concurrent.futures as cf
+        orig_ex = cf.ProcessPoolExecutor
+        cf.ProcessPoolExecutor = lambda *a, **k: _Ex()
+        try:
+            assert p._run_docling_with_timeout("/x.pdf", timeout_s=1) is None
+        finally:
+            cf.ProcessPoolExecutor = orig_ex
+
+    def test_error_branch_returns_none(self):
+        from perspicacite.pipeline.parsers.pdf import PDFParser
+        p = PDFParser()
+
+        class _Fut:
+            def result(self, timeout): raise RuntimeError("boom")
+
+        class _Ex:
+            def __enter__(self): return self
+            def __exit__(self, *a): return False
+            def submit(self, *a, **k): return _Fut()
+
+        import concurrent.futures as cf
+        orig_ex = cf.ProcessPoolExecutor
+        cf.ProcessPoolExecutor = lambda *a, **k: _Ex()
+        try:
+            assert p._run_docling_with_timeout("/x.pdf", timeout_s=1) is None
+        finally:
+            cf.ProcessPoolExecutor = orig_ex

From 440e744c9785ddc7555ead96858f8f04f9af440b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Louis-F=C3=A9lix=20Nothias?= <louis-felix.nothias@cnrs.fr>
Date: Tue, 2 Jun 2026 15:41:30 +0200
Subject: [PATCH 06/11] feat(chunking): emit content_type=table chunks from
 docling tables (R2)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
(cherry picked from commit fd47e9541c3d6f4fbaed0b36a2218ff69fc95457)
---
 .../pipeline/chunking_dispatch.py             | 24 ++++++++++++++++++
 tests/unit/test_docling_table_chunks.py       | 25 +++++++++++++++++++
 2 files changed, 49 insertions(+)
 create mode 100644 tests/unit/test_docling_table_chunks.py

diff --git a/src/perspicacite/pipeline/chunking_dispatch.py b/src/perspicacite/pipeline/chunking_dispatch.py
index be188396..156c0807 100644
--- a/src/perspicacite/pipeline/chunking_dispatch.py
+++ b/src/perspicacite/pipeline/chunking_dispatch.py
@@ -58,6 +58,30 @@
 }
 
 
+def table_records_to_chunks(tables, paper, start_index: int) -> list[DocumentChunk]:
+    """Turn ``DoclingTable`` records into retrievable chunks tagged ``content_type='table'``.
+
+    Chunk indices/ids count up from ``start_index``; empty ``tables`` (the fitz path) yields ``[]``.
+    """
+    chunks: list[DocumentChunk] = []
+    for i, t in enumerate(tables):
+        body = (f"{t.caption}\n\n{t.markdown}" if t.caption else t.markdown).strip()
+        idx = start_index + i
+        meta = ChunkMetadata(
+            paper_id=getattr(paper, "paper_id", "unknown"),
+            chunk_index=idx,
+            content_type="table",
+            page=getattr(t, "page", None),
+            title=getattr(paper, "title", None),
+            doi=getattr(paper, "doi", None),
+            year=getattr(paper, "year", None),
+        )
+        chunks.append(
+            DocumentChunk(id=f"{meta.paper_id}:table:{idx}", text=body, metadata=meta)
+        )
+    return chunks
+
+
 def infer_content_type(path: Path) -> tuple[str, str | None]:
     """Map file extension to ``(content_type, language)``.
 
diff --git a/tests/unit/test_docling_table_chunks.py b/tests/unit/test_docling_table_chunks.py
new file mode 100644
index 00000000..cd5c038c
--- /dev/null
+++ b/tests/unit/test_docling_table_chunks.py
@@ -0,0 +1,25 @@
+import unittest
+
+
+class TestTableChunks(unittest.TestCase):
+    def test_table_records_become_table_chunks(self):
+        from perspicacite.pipeline.parsers.docling_pdf import DoclingTable
+        from perspicacite.pipeline.chunking_dispatch import table_records_to_chunks
+
+        class _Paper:
+            paper_id = "local:abc"
+            title = "T"; doi = None; year = None
+        tables = [DoclingTable(page=3, caption="Table 1. Params.",
+                               markdown="| k | v |\n| a | 1 |", headers=["k", "v"], rows=[["a", "1"]])]
+        chunks = table_records_to_chunks(tables, _Paper(), start_index=0)
+        assert len(chunks) == 1
+        c = chunks[0]
+        assert c.metadata.content_type == "table"
+        assert c.metadata.page == 3
+        assert "Table 1" in c.text and "| k | v |" in c.text
+
+    def test_empty_tables_yield_no_chunks(self):
+        from perspicacite.pipeline.chunking_dispatch import table_records_to_chunks
+        class _Paper:
+            paper_id = "p"; title = None; doi = None; year = None
+        assert table_records_to_chunks([], _Paper(), start_index=5) == []

From 2a4ee2e3ade842f6d8e8a5bfab8201611b7d76f2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Louis-F=C3=A9lix=20Nothias?= <louis-felix.nothias@cnrs.fr>
Date: Tue, 2 Jun 2026 15:46:57 +0200
Subject: [PATCH 07/11] feat(parsers): map docling figures to multimodal record
 shape (R2)

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
(cherry picked from commit 16495f54590babcdc547c26e09d5edeb5250ba31)
---
 .../pipeline/parsers/docling_pdf.py           | 16 +++++++++++++
 tests/unit/test_docling_pdf.py                | 23 +++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/src/perspicacite/pipeline/parsers/docling_pdf.py b/src/perspicacite/pipeline/parsers/docling_pdf.py
index 4f980080..4f119286 100644
--- a/src/perspicacite/pipeline/parsers/docling_pdf.py
+++ b/src/perspicacite/pipeline/parsers/docling_pdf.py
@@ -9,6 +9,7 @@
 from __future__ import annotations
 
 import importlib.util
+import re
 from dataclasses import dataclass
 from io import BytesIO
 from typing import TYPE_CHECKING, Any
@@ -76,6 +77,21 @@ def _page_of(item) -> int:
     return 1
 
 
+_FIG_LABEL_RE = re.compile(
+    r"^\s*((?:supplementary\s+)?(?:fig(?:ure|\.)?|scheme)\s+[A-Za-z]?\d+[A-Za-z]?)",
+    re.IGNORECASE,
+)
+
+
+def figure_to_multimodal_record(fig: DoclingFigure) -> dict:
+    """Map a DoclingFigure to the existing multimodal record shape
+    {kind, label, caption, content} used by parsers/multimodal.py. `content`
+    is left empty: docling supplies the image, not a semantic description."""
+    m = _FIG_LABEL_RE.match(fig.caption or "")
+    label = m.group(1).strip() if m else ""
+    return {"kind": "figure", "label": label, "caption": fig.caption or "", "content": ""}
+
+
 class DoclingPDFParser:
     """Extracts text + structured tables + figures via docling."""
 
diff --git a/tests/unit/test_docling_pdf.py b/tests/unit/test_docling_pdf.py
index 2f3866dd..cd77f680 100644
--- a/tests/unit/test_docling_pdf.py
+++ b/tests/unit/test_docling_pdf.py
@@ -81,3 +81,26 @@ def test_converter_enables_picture_images(self):
         opts = conv.format_to_options[InputFormat.PDF].pipeline_options
         assert opts.generate_picture_images is True
         assert opts.images_scale >= 2.0
+
+
+class TestFigureToMultimodalShape(unittest.TestCase):
+    def test_figure_maps_to_kind_caption_content(self):
+        from perspicacite.pipeline.parsers.docling_pdf import (
+            DoclingFigure, figure_to_multimodal_record,
+        )
+        f = DoclingFigure(page=1, caption="Figure 2. Workflow.",
+                          width_px=400, height_px=300, image_bytes=b"x")
+        rec = figure_to_multimodal_record(f)
+        assert rec["kind"] == "figure"
+        assert rec["caption"] == "Figure 2. Workflow."
+        assert rec["label"] == "Figure 2"
+        assert "content" in rec
+
+    def test_figure_without_label_caption(self):
+        from perspicacite.pipeline.parsers.docling_pdf import (
+            DoclingFigure, figure_to_multimodal_record,
+        )
+        f = DoclingFigure(page=1, caption="An unlabeled panel", width_px=400, height_px=300)
+        rec = figure_to_multimodal_record(f)
+        assert rec["kind"] == "figure"
+        assert rec["label"] == ""

From de7eeec9dbbb1a64396b636a9a91b6a9d6399450 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Louis-F=C3=A9lix=20Nothias?= <louis-felix.nothias@cnrs.fr>
Date: Tue, 2 Jun 2026 15:52:40 +0200
Subject: [PATCH 08/11] feat(local-docs): run docling on PDF ingest + emit
 table chunks (R2)

local-file ingest now passes the KB config to PDFParser.parse so the
docling backend activates per the guard, and appends content_type=table
chunks from any extracted tables. BibTeX/DOI path unchanged (follow-up).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
(cherry picked from commit 08130497b8307a927f5f7482060db63c95d55110)
---
 src/perspicacite/integrations/local_docs.py | 24 +++++++--
 tests/unit/test_local_docs_docling_wire.py  | 59 +++++++++++++++++++++
 2 files changed, 78 insertions(+), 5 deletions(-)
 create mode 100644 tests/unit/test_local_docs_docling_wire.py

diff --git a/src/perspicacite/integrations/local_docs.py b/src/perspicacite/integrations/local_docs.py
index f72275c1..57381f2e 100644
--- a/src/perspicacite/integrations/local_docs.py
+++ b/src/perspicacite/integrations/local_docs.py
@@ -18,6 +18,7 @@
     chunk_document,
     infer_content_type,
 )
+from perspicacite.pipeline.parsers.pdf import ParsedContent
 
 logger = get_logger("perspicacite.local_docs")
 
@@ -97,17 +98,20 @@ def _extract_year_from_text(text: str | None) -> int | None:
     return None
 
 
-async def _read_text(path: Path, content_type: str, pdf_parser) -> str | None:
+async def _read_text(
+    path: Path, content_type: str, pdf_parser, config=None
+) -> ParsedContent | None:
     if content_type == "pdf":
         if pdf_parser is None:
             return None
-        parsed = await pdf_parser.parse(path)
-        return parsed.text or None
+        parsed = await pdf_parser.parse(path, config=config)
+        return parsed if (parsed and parsed.text) else None
     try:
-        return path.read_text(encoding="utf-8", errors="replace")
+        raw = path.read_text(encoding="utf-8", errors="replace")
     except Exception as exc:
         logger.warning("local_docs_read_failed", path=str(path), error=str(exc))
         return None
+    return ParsedContent(text=raw) if raw else None
 
 
 async def _ingest_files(
@@ -136,7 +140,8 @@ async def _ingest_files(
         for idx, fp in enumerate(files):
             content_type, language = infer_content_type(fp)
             paper = _paper_for_file(fp)
-            text = await _read_text(fp, content_type, app_state.pdf_parser)
+            parsed = await _read_text(fp, content_type, app_state.pdf_parser, kb_cfg)
+            text = parsed.text if parsed else None
             # F-13: opportunistic year extraction from the document body so
             # KB-stats by_year and recency-weighted retrieval have signal
             # for URL- / file-ingested sources.
@@ -158,6 +163,15 @@ async def _ingest_files(
                 text, paper,
                 content_type=content_type, language=language, config=kb_cfg,
             )
+            if parsed is not None and parsed.tables:
+                from perspicacite.pipeline.chunking_dispatch import (
+                    table_records_to_chunks,
+                )
+                chunks.extend(
+                    table_records_to_chunks(
+                        parsed.tables, paper, start_index=len(chunks)
+                    )
+                )
             # ChunkMetadata is frozen — recreate with source_file_path set,
             # plus optional external_metadata annotations (Cycle C).
             ext_parent = (external_metadata or {}).get("parent_paper_id")
diff --git a/tests/unit/test_local_docs_docling_wire.py b/tests/unit/test_local_docs_docling_wire.py
new file mode 100644
index 00000000..d83c6e4a
--- /dev/null
+++ b/tests/unit/test_local_docs_docling_wire.py
@@ -0,0 +1,59 @@
+import asyncio
+import unittest
+from pathlib import Path
+
+
+class _FakeParsed:
+    pass
+
+
+class TestReadTextThreadsConfigAndTables(unittest.TestCase):
+    def test_pdf_returns_parsedcontent_with_tables_and_passes_config(self):
+        from perspicacite.integrations.local_docs import _read_text
+        from perspicacite.pipeline.parsers.docling_pdf import DoclingTable
+        from perspicacite.pipeline.parsers.pdf import ParsedContent
+
+        seen = {}
+
+        class _FakeParser:
+            async def parse(self, source, config=None):
+                seen["config"] = config
+                return ParsedContent(
+                    text="body text",
+                    tables=[DoclingTable(page=1, caption="Table 1.",
+                                         markdown="| a |", headers=["a"], rows=[["1"]])],
+                )
+
+        sentinel = object()
+        out = asyncio.run(_read_text(Path("/x.pdf"), "pdf", _FakeParser(), sentinel))
+        assert isinstance(out, ParsedContent)
+        assert out.text == "body text"
+        assert len(out.tables) == 1
+        assert seen["config"] is sentinel  # config threaded to parse()
+
+    def test_pdf_empty_text_returns_none(self):
+        from perspicacite.integrations.local_docs import _read_text
+        from perspicacite.pipeline.parsers.pdf import ParsedContent
+
+        class _FakeParser:
+            async def parse(self, source, config=None):
+                return ParsedContent(text="")
+
+        out = asyncio.run(_read_text(Path("/x.pdf"), "pdf", _FakeParser(), None))
+        assert out is None
+
+    def test_non_pdf_wraps_text_in_parsedcontent(self):
+        import os
+        import tempfile
+
+        from perspicacite.integrations.local_docs import _read_text
+        from perspicacite.pipeline.parsers.pdf import ParsedContent
+        with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
+            f.write("hello world")
+            p = Path(f.name)
+        try:
+            out = asyncio.run(_read_text(p, "text", None, None))
+            assert isinstance(out, ParsedContent)
+            assert "hello world" in out.text
+        finally:
+            os.unlink(p)

From 23e0f3e0597886b5f0fa9e240f2606b271d6e672 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Louis-F=C3=A9lix=20Nothias?= <louis-felix.nothias@cnrs.fr>
Date: Tue, 2 Jun 2026 16:53:17 +0200
Subject: [PATCH 09/11] fix(docling): force CPU accelerator (MPS float64 crash
 on Apple Silicon)

docling auto-selects MPS on Apple Silicon, which raises "Cannot convert a
MPS Tensor to float64" and fails conversion on every page. Pin
AcceleratorOptions(device=CPU). Verified: 13-page PDF extracts 6 figures
on CPU (~10min); MPS unusable even with PYTORCH_ENABLE_MPS_FALLBACK=1.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
(cherry picked from commit a9893e738a8d6b3b5bd8afc3957a1d6ac849c942)
---
 src/perspicacite/pipeline/parsers/docling_pdf.py | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/perspicacite/pipeline/parsers/docling_pdf.py b/src/perspicacite/pipeline/parsers/docling_pdf.py
index 4f119286..80945c9e 100644
--- a/src/perspicacite/pipeline/parsers/docling_pdf.py
+++ b/src/perspicacite/pipeline/parsers/docling_pdf.py
@@ -59,12 +59,21 @@ def docling_importable() -> bool:
 def _make_docling_converter():
     # Picture images MUST be enabled or get_image() returns None (zero figures).
     from docling.datamodel.base_models import InputFormat
-    from docling.datamodel.pipeline_options import PdfPipelineOptions
+    from docling.datamodel.pipeline_options import (
+        AcceleratorDevice,
+        AcceleratorOptions,
+        PdfPipelineOptions,
+    )
     from docling.document_converter import DocumentConverter, PdfFormatOption
 
     opts = PdfPipelineOptions()
     opts.generate_picture_images = True
     opts.images_scale = 2.0
+    # Force CPU. On Apple Silicon docling auto-selects the MPS (Metal) backend,
+    # which raises "Cannot convert a MPS Tensor to float64 ... MPS doesn't
+    # support float64" and fails conversion on every page. CPU is portable and
+    # matches the documented R2 device intent.
+    opts.accelerator_options = AcceleratorOptions(device=AcceleratorDevice.CPU)
     return DocumentConverter(
         format_options={InputFormat.PDF: PdfFormatOption(pipeline_options=opts)}
     )

From f5a6500738c2796b4001ea3d6cf67fb286338f22 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Louis-F=C3=A9lix=20Nothias?= <louis-felix.nothias@cnrs.fr>
Date: Tue, 2 Jun 2026 16:58:56 +0200
Subject: [PATCH 10/11] refactor(docling): fitz text always; docling
 tables/figures as additive advanced opt-in (R2)

Text extraction stays 100% fitz (fast, default). docling no longer
replaces the text path. New advanced flag docling_extract_tables_figures
(off by default) runs docling on PDF ingest ONLY to append structured
table chunks, guarded by docling_max_pages + docling_timeout_s (now 600s).
Text is unaffected if docling is absent/oversized/times out.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
(cherry picked from commit 01c4b02c3269d21ad2b8f981839091e4227d2fa2)
---
 config.example.yml                          | 14 +++---
 src/perspicacite/config/schema.py           | 20 +++++----
 src/perspicacite/integrations/local_docs.py | 38 +++++++++--------
 src/perspicacite/pipeline/parsers/pdf.py    | 34 +++++----------
 tests/unit/test_config.py                   | 12 +++---
 tests/unit/test_local_docs_docling_wire.py  | 47 ++++++---------------
 tests/unit/test_pdf_backend_guard.py        | 41 +++++++++---------
 7 files changed, 87 insertions(+), 119 deletions(-)

diff --git a/config.example.yml b/config.example.yml
index d0ba19cd..3968d554 100644
--- a/config.example.yml
+++ b/config.example.yml
@@ -67,13 +67,13 @@ knowledge_base:
   embedding_model: "text-embedding-3-small"
   chunk_size: 1000
   chunk_overlap: 200
-  # PDF extraction backend.
-  #   fitz    — text-only PyMuPDF (default-installed, fast)
-  #   docling — layout model: structured tables + figures (needs `uv sync --extra docling`)
-  #   auto    — docling when installed and within docling_max_pages, else fitz
-  pdf_backend: auto
-  docling_max_pages: 40     # auto: PDFs larger than this use fitz
-  docling_timeout_s: 120    # auto: docling wall-clock cap per document; on timeout → fitz
+  # Advanced: extract structured tables + figures with docling, IN ADDITION to
+  # the always-on fast fitz text extraction. Requires `uv sync --extra docling`.
+  # docling is CPU-only here and slow (~1 min/page), so this is off by default
+  # — turn it on for high-value PDFs where tables/figures matter.
+  docling_extract_tables_figures: false
+  docling_max_pages: 40     # skip docling extras for PDFs larger than this
+  docling_timeout_s: 600    # per-document docling wall-clock cap; on timeout, skip extras
   chunking_method: "token"
   default_top_k: 10
   similarity_threshold: 0.7
diff --git a/src/perspicacite/config/schema.py b/src/perspicacite/config/schema.py
index 3875ecf8..31c574bf 100644
--- a/src/perspicacite/config/schema.py
+++ b/src/perspicacite/config/schema.py
@@ -92,22 +92,24 @@ class KnowledgeBaseConfig(BaseModel):
     embedding_model: str = "text-embedding-3-small"
     chunk_size: int = Field(default=1000, ge=100, le=10000)
     chunk_overlap: int = Field(default=200, ge=0, le=1000)
-    pdf_backend: Literal["auto", "docling", "fitz"] = Field(
-        default="auto",
+    docling_extract_tables_figures: bool = Field(
+        default=False,
         description=(
-            "PDF extraction backend. 'fitz' = text-only PyMuPDF (incumbent); "
-            "'docling' = layout model with structured tables + figures "
-            "(requires the [docling] extra); 'auto' = docling when importable "
-            "and within docling_max_pages, else fitz."
+            "Advanced: when True, run docling (CPU, slow ~min/page) to extract "
+            "structured tables + figures from PDFs IN ADDITION to the always-on "
+            "fitz text extraction. Off by default."
         ),
     )
     docling_max_pages: int = Field(
         default=40, ge=1,
-        description="In 'auto', skip docling for PDFs with more pages than this (use fitz).",
+        description="Skip the docling extras pass for PDFs with more pages than this.",
     )
     docling_timeout_s: int = Field(
-        default=120, ge=1,
-        description="Per-document wall-clock cap for docling; on timeout, fall back to fitz.",
+        default=600, ge=1,
+        description=(
+            "Per-document wall-clock cap for the docling extras pass; "
+            "on timeout, skip extras."
+        ),
     )
     chunking_method: Literal["token", "semantic", "agentic"] = "token"
     default_top_k: int = Field(default=10, ge=1, le=100)
diff --git a/src/perspicacite/integrations/local_docs.py b/src/perspicacite/integrations/local_docs.py
index 57381f2e..e654d468 100644
--- a/src/perspicacite/integrations/local_docs.py
+++ b/src/perspicacite/integrations/local_docs.py
@@ -18,7 +18,6 @@
     chunk_document,
     infer_content_type,
 )
-from perspicacite.pipeline.parsers.pdf import ParsedContent
 
 logger = get_logger("perspicacite.local_docs")
 
@@ -98,20 +97,18 @@ def _extract_year_from_text(text: str | None) -> int | None:
     return None
 
 
-async def _read_text(
-    path: Path, content_type: str, pdf_parser, config=None
-) -> ParsedContent | None:
+async def _read_text(path: Path, content_type: str, pdf_parser) -> str | None:
     if content_type == "pdf":
         if pdf_parser is None:
             return None
-        parsed = await pdf_parser.parse(path, config=config)
-        return parsed if (parsed and parsed.text) else None
+        parsed = await pdf_parser.parse(path)
+        return parsed.text or None
     try:
         raw = path.read_text(encoding="utf-8", errors="replace")
     except Exception as exc:
         logger.warning("local_docs_read_failed", path=str(path), error=str(exc))
         return None
-    return ParsedContent(text=raw) if raw else None
+    return raw or None
 
 
 async def _ingest_files(
@@ -140,8 +137,7 @@ async def _ingest_files(
         for idx, fp in enumerate(files):
             content_type, language = infer_content_type(fp)
             paper = _paper_for_file(fp)
-            parsed = await _read_text(fp, content_type, app_state.pdf_parser, kb_cfg)
-            text = parsed.text if parsed else None
+            text = await _read_text(fp, content_type, app_state.pdf_parser)
             # F-13: opportunistic year extraction from the document body so
             # KB-stats by_year and recency-weighted retrieval have signal
             # for URL- / file-ingested sources.
@@ -163,15 +159,23 @@ async def _ingest_files(
                 text, paper,
                 content_type=content_type, language=language, config=kb_cfg,
             )
-            if parsed is not None and parsed.tables:
-                from perspicacite.pipeline.chunking_dispatch import (
-                    table_records_to_chunks,
-                )
-                chunks.extend(
-                    table_records_to_chunks(
-                        parsed.tables, paper, start_index=len(chunks)
+            # R2 advanced: optionally augment with docling-extracted tables.
+            if content_type == "pdf" and getattr(kb_cfg, "docling_extract_tables_figures", False):
+                parser = app_state.pdf_parser
+                pages = parser._page_count(fp)
+                if parser._should_run_docling_extras(pages, kb_cfg):
+                    pc = parser._run_docling_with_timeout(
+                        fp, int(getattr(kb_cfg, "docling_timeout_s", 600))
                     )
-                )
+                    if pc is not None and pc.tables:
+                        from perspicacite.pipeline.chunking_dispatch import (
+                            table_records_to_chunks,
+                        )
+                        chunks.extend(
+                            table_records_to_chunks(
+                                pc.tables, paper, start_index=len(chunks)
+                            )
+                        )
             # ChunkMetadata is frozen — recreate with source_file_path set,
             # plus optional external_metadata annotations (Cycle C).
             ext_parent = (external_metadata or {}).get("parent_paper_id")
diff --git a/src/perspicacite/pipeline/parsers/pdf.py b/src/perspicacite/pipeline/parsers/pdf.py
index a2c047f7..e7456a54 100644
--- a/src/perspicacite/pipeline/parsers/pdf.py
+++ b/src/perspicacite/pipeline/parsers/pdf.py
@@ -184,7 +184,7 @@ def _extract_with_pdfplumber(self, source: str | Path | bytes) -> tuple[str, dic
         return "\n\n".join(all_text), sections, page_count
 
     # ------------------------------------------------------------------
-    # Backend selection + guards (R2 docling)
+    # docling extras pass: guards + worker runner (R2 docling)
     # ------------------------------------------------------------------
 
     def _page_count(self, source) -> int:
@@ -203,19 +203,15 @@ def _page_count(self, source) -> int:
         except Exception:
             return 0
 
-    def _select_backend(self, source, page_count: int, config) -> str:
-        backend = getattr(config, "pdf_backend", "auto")
-        if backend == "fitz":
-            return "fitz"
-        if backend == "docling":
-            return "docling"
-        # auto:
+    def _should_run_docling_extras(self, page_count: int, config) -> bool:
+        """True when docling tables/figures extraction should run: the advanced
+        flag is on, the [docling] extra is importable, and the PDF is within the
+        page-count cap. The wall-clock timeout is the runtime safety net."""
+        if not getattr(config, "docling_extract_tables_figures", False):
+            return False
         if not _docling_importable():
-            return "fitz"
-        if page_count > int(getattr(config, "docling_max_pages", 40)):
-            logger.warning("docling_fallback", reason="oversized", pages=page_count)
-            return "fitz"
-        return "docling"
+            return False
+        return page_count <= int(getattr(config, "docling_max_pages", 40))
 
     def _run_docling_with_timeout(self, source, timeout_s: int):
         """Run docling in a worker process; return ParsedContent or None on
@@ -237,7 +233,7 @@ def _run_docling_with_timeout(self, source, timeout_s: int):
     # Public API
     # ------------------------------------------------------------------
 
-    async def parse(self, source: str | Path | bytes, config=None) -> ParsedContent:
+    async def parse(self, source: str | Path | bytes) -> ParsedContent:
         """
         Parse PDF and extract text.
 
@@ -247,16 +243,6 @@ async def parse(self, source: str | Path | bytes, config=None) -> ParsedContent:
         Returns:
             Parsed content with text and metadata
         """
-        if config is not None:
-            pages = self._page_count(source)
-            if self._select_backend(source, pages, config) == "docling":
-                pc = self._run_docling_with_timeout(
-                    source, int(getattr(config, "docling_timeout_s", 120))
-                )
-                if pc is not None:
-                    return pc
-                # else fall through to the fitz/pdfplumber path below
-
         # Try PyMuPDF first (better column handling)
         result = self._extract_with_fitz(source)
 
diff --git a/tests/unit/test_config.py b/tests/unit/test_config.py
index 410d3b2d..1b1d4e63 100644
--- a/tests/unit/test_config.py
+++ b/tests/unit/test_config.py
@@ -221,13 +221,11 @@ def test_anchor_config_near_threshold_bounds():
         AnchorConfig(near_threshold=1.5)
 
 
-def test_pdf_backend_defaults_and_validation():
+def test_docling_extras_config_defaults():
     from perspicacite.config.schema import KnowledgeBaseConfig
     kb = KnowledgeBaseConfig()
-    assert kb.pdf_backend == "auto"
+    assert kb.docling_extract_tables_figures is False
     assert kb.docling_max_pages == 40
-    assert kb.docling_timeout_s == 120
-    import pytest
-    from pydantic import ValidationError
-    with pytest.raises(ValidationError):
-        KnowledgeBaseConfig(pdf_backend="banana")
+    assert kb.docling_timeout_s == 600
+    kb2 = KnowledgeBaseConfig(docling_extract_tables_figures=True)
+    assert kb2.docling_extract_tables_figures is True
diff --git a/tests/unit/test_local_docs_docling_wire.py b/tests/unit/test_local_docs_docling_wire.py
index d83c6e4a..7331d5d1 100644
--- a/tests/unit/test_local_docs_docling_wire.py
+++ b/tests/unit/test_local_docs_docling_wire.py
@@ -3,57 +3,38 @@
 from pathlib import Path
 
 
-class _FakeParsed:
-    pass
-
-
-class TestReadTextThreadsConfigAndTables(unittest.TestCase):
-    def test_pdf_returns_parsedcontent_with_tables_and_passes_config(self):
+class TestReadTextIsFitzTextOnly(unittest.TestCase):
+    def test_pdf_returns_text_string(self):
         from perspicacite.integrations.local_docs import _read_text
-        from perspicacite.pipeline.parsers.docling_pdf import DoclingTable
         from perspicacite.pipeline.parsers.pdf import ParsedContent
 
-        seen = {}
-
         class _FakeParser:
-            async def parse(self, source, config=None):
-                seen["config"] = config
-                return ParsedContent(
-                    text="body text",
-                    tables=[DoclingTable(page=1, caption="Table 1.",
-                                         markdown="| a |", headers=["a"], rows=[["1"]])],
-                )
-
-        sentinel = object()
-        out = asyncio.run(_read_text(Path("/x.pdf"), "pdf", _FakeParser(), sentinel))
-        assert isinstance(out, ParsedContent)
-        assert out.text == "body text"
-        assert len(out.tables) == 1
-        assert seen["config"] is sentinel  # config threaded to parse()
-
-    def test_pdf_empty_text_returns_none(self):
+            async def parse(self, source):
+                return ParsedContent(text="body text")
+
+        out = asyncio.run(_read_text(Path("/x.pdf"), "pdf", _FakeParser()))
+        assert out == "body text"
+
+    def test_pdf_empty_returns_none(self):
         from perspicacite.integrations.local_docs import _read_text
         from perspicacite.pipeline.parsers.pdf import ParsedContent
 
         class _FakeParser:
-            async def parse(self, source, config=None):
+            async def parse(self, source):
                 return ParsedContent(text="")
 
-        out = asyncio.run(_read_text(Path("/x.pdf"), "pdf", _FakeParser(), None))
-        assert out is None
+        assert asyncio.run(_read_text(Path("/x.pdf"), "pdf", _FakeParser())) is None
 
-    def test_non_pdf_wraps_text_in_parsedcontent(self):
+    def test_non_pdf_returns_text(self):
         import os
         import tempfile
 
         from perspicacite.integrations.local_docs import _read_text
-        from perspicacite.pipeline.parsers.pdf import ParsedContent
         with tempfile.NamedTemporaryFile("w", suffix=".txt", delete=False) as f:
             f.write("hello world")
             p = Path(f.name)
         try:
-            out = asyncio.run(_read_text(p, "text", None, None))
-            assert isinstance(out, ParsedContent)
-            assert "hello world" in out.text
+            out = asyncio.run(_read_text(p, "text", None))
+            assert "hello world" in out
         finally:
             os.unlink(p)
diff --git a/tests/unit/test_pdf_backend_guard.py b/tests/unit/test_pdf_backend_guard.py
index c6aedc32..9912a87d 100644
--- a/tests/unit/test_pdf_backend_guard.py
+++ b/tests/unit/test_pdf_backend_guard.py
@@ -2,44 +2,41 @@
 
 
 class _Cfg:
-    def __init__(self, backend="auto", max_pages=40, timeout=120):
-        self.pdf_backend = backend
+    def __init__(self, flag=True, max_pages=40, timeout=600):
+        self.docling_extract_tables_figures = flag
         self.docling_max_pages = max_pages
         self.docling_timeout_s = timeout
 
 
-class TestBackendSelector(unittest.TestCase):
-    def _select(self, parser, pages, cfg=None):
-        return parser._select_backend("/x.pdf", pages, _Cfg(**(cfg or {})))
-
-    def test_explicit_fitz(self):
+class TestShouldRunDoclingExtras(unittest.TestCase):
+    def test_flag_off_returns_false(self):
         from perspicacite.pipeline.parsers.pdf import PDFParser
-        p = PDFParser()
-        assert self._select(p, 5, {"backend": "fitz"}) == "fitz"
+        assert PDFParser()._should_run_docling_extras(5, _Cfg(flag=False)) is False
 
-    def test_explicit_docling(self):
-        from perspicacite.pipeline.parsers.pdf import PDFParser
-        p = PDFParser()
-        assert self._select(p, 5, {"backend": "docling"}) == "docling"
-
-    def test_auto_uses_fitz_when_docling_absent(self):
+    def test_flag_on_importable_small_returns_true(self):
         from perspicacite.pipeline.parsers import pdf as m
-        p = m.PDFParser()
         orig = m._docling_importable
-        m._docling_importable = lambda: False
+        m._docling_importable = lambda: True
         try:
-            assert self._select(p, 5) == "fitz"
+            assert m.PDFParser()._should_run_docling_extras(5, _Cfg()) is True
         finally:
             m._docling_importable = orig
 
-    def test_auto_guard_on_pages(self):
+    def test_oversized_returns_false(self):
         from perspicacite.pipeline.parsers import pdf as m
-        p = m.PDFParser()
         orig = m._docling_importable
         m._docling_importable = lambda: True
         try:
-            assert self._select(p, 999, {"max_pages": 40}) == "fitz"
-            assert self._select(p, 10, {"max_pages": 40}) == "docling"
+            assert m.PDFParser()._should_run_docling_extras(999, _Cfg(max_pages=40)) is False
+        finally:
+            m._docling_importable = orig
+
+    def test_not_importable_returns_false(self):
+        from perspicacite.pipeline.parsers import pdf as m
+        orig = m._docling_importable
+        m._docling_importable = lambda: False
+        try:
+            assert m.PDFParser()._should_run_docling_extras(5, _Cfg()) is False
         finally:
             m._docling_importable = orig
 

From 1282ea35816661974f63447b04d4d7576fb3d891 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Louis-F=C3=A9lix=20Nothias?= <louis-felix.nothias@cnrs.fr>
Date: Tue, 2 Jun 2026 17:26:06 +0200
Subject: [PATCH 11/11] docs(pdf): document fitz text + optional docling
 tables/figures extraction (R2)

New docs/pdf-extraction-docling.md (two-layer model, enabling, guard knobs,
CPU cost + MPS limitation, scope/limits); CLAUDE.md "PDF extraction backends"
pointer; README feature line.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
(cherry picked from commit 8b794ed8369aee268a1377b5b9aece071fbd3b9c)
---
 CLAUDE.md                      |  4 ++
 README.md                      |  1 +
 docs/pdf-extraction-docling.md | 89 ++++++++++++++++++++++++++++++++++
 3 files changed, 94 insertions(+)
 create mode 100644 docs/pdf-extraction-docling.md

diff --git a/CLAUDE.md b/CLAUDE.md
index c968061a..70ea98c4 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -96,6 +96,10 @@ The `AgenticOrchestrator` ([src/perspicacite/rag/agentic/orchestrator.py](src/pe
 
 Publisher API keys are passed as kwargs; missing keys skip that source gracefully. Check `content_type` in the result: `"structured"` > `"full_text"` > `"abstract"` > `"none"`.
 
+### PDF extraction backends
+
+PDF **text** is always extracted with PyMuPDF (`fitz`) → `pdfplumber` fallback (`pipeline/parsers/pdf.py`) — fast, default, on every ingest. **Structured tables + figures** are an opt-in advanced layer via **docling**, off by default (`knowledge_base.docling_extract_tables_figures`, guarded by `docling_max_pages` / `docling_timeout_s`; needs `uv sync --extra docling`). When enabled on the local-file ingest path, docling adds `content_type="table"` chunks on top of the fitz text; if docling is absent / oversized / times out, the text is unaffected. docling is CPU-bound (~min/page) and the MPS/GPU path is unusable on Apple Silicon. Full details: [docs/pdf-extraction-docling.md](docs/pdf-extraction-docling.md).
+
 ### Retrieval
 
 `ChromaVectorStore` ([src/perspicacite/retrieval/chroma_store.py](src/perspicacite/retrieval/chroma_store.py)) wraps ChromaDB. KB collections are named via `chroma_collection_name_for_kb()` from `models/kb.py`. The hybrid retriever ([src/perspicacite/retrieval/hybrid.py](src/perspicacite/retrieval/hybrid.py)) combines ChromaDB cosine scores with BM25Okapi scores; weights default to 0.5/0.5 but can optionally be determined by the LLM at query time. `MultiKBRetriever` ([src/perspicacite/retrieval/multi_kb.py](src/perspicacite/retrieval/multi_kb.py)) fans a query across multiple KB collections, merges by score, deduplicates by `paper_id`, and tags results with `kb_name`; use `check_embedding_compat(kb_metas)` to validate that all queried KBs share the same embedding model before retrieval.
diff --git a/README.md b/README.md
index efc00cb8..db5b1673 100644
--- a/README.md
+++ b/README.md
@@ -28,6 +28,7 @@
 
 - **Multi-database search** — Semantic Scholar, OpenAlex, PubMed, arXiv, HAL, DBLP via SciLEx
 - **Unified content pipeline** — PMC JATS XML, arXiv HTML, OA PDFs, publisher APIs, and institutional-access via browser-cookie replay; quality-priority routing
+- **PDF extraction** — fast PyMuPDF text on every ingest, plus optional [docling](docs/pdf-extraction-docling.md) layout extraction for structured tables/figures (advanced, off by default)
 - **6 RAG modes** — Basic, Advanced, Profound, Agentic, Literature Survey, Contradiction; per-stage LLM tiering (Haiku routing/screening, Sonnet synthesis)
 - **Knowledge base management** — BibTeX import, DOI bulk-add, local document ingest, Zotero-collection import; async ingestion with SSE progress streaming
 - **Citation-graph expansion** — forward + backward snowball over OpenAlex; automatic Semantic Scholar fallback for arXiv-seeded papers (see [docs/concepts/citation-graph.md](docs/concepts/citation-graph.md))
diff --git a/docs/pdf-extraction-docling.md b/docs/pdf-extraction-docling.md
new file mode 100644
index 00000000..9435bcc3
--- /dev/null
+++ b/docs/pdf-extraction-docling.md
@@ -0,0 +1,89 @@
+# PDF extraction: fast text + optional docling tables/figures
+
+Perspicacité extracts PDF content in **two independent layers**:
+
+| Layer | Engine | Runs | Output | Speed |
+|-------|--------|------|--------|-------|
+| **Text** (always on) | PyMuPDF (`fitz`) → `pdfplumber` fallback | every PDF ingest | full body text + sections | fast (sub-second) |
+| **Tables + figures** (opt-in, advanced) | docling layout model | only when enabled | structured tables as retrievable chunks | slow (CPU-bound, ~45–50 s/page) |
+
+The layers are decoupled: **text never depends on docling.** If the `[docling]`
+extra is not installed, the PDF exceeds the page cap, or docling errors/times
+out, you still get the full fitz text — you simply don't get the table chunks.
+Enabling docling can only *add* content, never break ingest.
+
+## Why docling is off by default
+
+Docling runs the RT-DETR layout model + TableFormer. On CPU this takes roughly
+**45–50 s per page (~10 min for a typical paper)**. On Apple Silicon the GPU
+(MPS) path is currently **unusable** — the upstream `transformers` RT-DETRv2
+positional embedding hard-codes `float64`, which MPS does not support
+(see [huggingface/transformers#28334](https://github.com/huggingface/transformers/issues/28334));
+`PYTORCH_ENABLE_MPS_FALLBACK=1` does not help. So docling here is a deliberate,
+batch/offline choice, not a hot-path default. A CUDA machine makes it fast
+enough for routine use.
+
+## Enabling docling
+
+1. Install the optional extra (one-time, heavy — pulls torch + layout models):
+
+   ```bash
+   uv sync --extra docling
+   ```
+
+2. In `config.yml` under `knowledge_base:`:
+
+   ```yaml
+   docling_extract_tables_figures: true   # default: false
+   docling_max_pages: 40                  # PDFs larger than this skip docling (text-only)
+   docling_timeout_s: 600                 # per-document wall-clock cap; on timeout, keep text, skip extras
+   ```
+
+3. Ingest **local PDF files** (the local-files / dropzone path). Each PDF gets
+   fitz text **plus** any tables docling extracts, added as searchable chunks
+   tagged `content_type="table"` (caption + page preserved in metadata).
+
+## Guard behaviour (config knobs)
+
+- `docling_extract_tables_figures` (bool, default `false`) — master switch for
+  the advanced layer.
+- `docling_max_pages` (int, default `40`) — documents with more pages than this
+  skip docling and keep only the fitz text (avoids the worst-case multi-minute cost).
+- `docling_timeout_s` (int, default `600`) — per-document wall-clock cap. Docling
+  runs in a worker process; on timeout it is abandoned and ingest falls back to
+  the already-extracted fitz text. Every fallback logs one structured
+  `docling_fallback` event (`reason=oversized|timeout|error`).
+
+## Scope and current limits
+
+- **Wired for the local-file ingest path** (`integrations/local_docs.py`). The
+  DOI/BibTeX download path is text-only for now (adding table chunks there needs
+  a `Paper.tables` field — a follow-up).
+- **Tables become chunks today; figures are extracted but not yet consumed.**
+  Docling figure records are produced (caption + image, dimensions populated)
+  and mapped to the existing multimodal record shape, but feeding figure images
+  into the answer/vision pipeline is a follow-up.
+- **CPU-only in practice** on Apple Silicon (see above). Prefer a CUDA host or a
+  remote docling service for large batches.
+
+## Implementation pointers
+
+- Converter + record mapping: `src/perspicacite/pipeline/parsers/docling_pdf.py`
+  (`DoclingPDFParser`, `DoclingTable`, `DoclingFigure`,
+  `figure_to_multimodal_record`). The converter forces
+  `AcceleratorDevice.CPU` and enables `generate_picture_images` + `images_scale=2.0`
+  (without picture-image rendering, `PictureItem.get_image()` returns `None` and
+  every figure is dropped).
+- Backend guard + worker: `src/perspicacite/pipeline/parsers/pdf.py`
+  (`_should_run_docling_extras`, `_run_docling_with_timeout`, `_docling_importable`).
+- Table → chunk: `src/perspicacite/pipeline/chunking_dispatch.py`
+  (`table_records_to_chunks`).
+- Config: `src/perspicacite/config/schema.py` (`KnowledgeBaseConfig`).
+
+## Note on full text vs. abstracts
+
+If a knowledge base shows only abstracts, that is a **source** issue, not a
+docling one: a Zotero `.bib` carries abstracts only. To get full text, ingest
+the actual **PDFs** (local-file path) — the fast fitz layer already returns the
+complete body text, no docling required. Enable docling only when you also want
+the papers' **tables** as retrievable content.