From 9fcee4f8e8ce4185648e43b30d824e23f8bd3315 Mon Sep 17 00:00:00 2001
From: MollyAI <molly@openclaw.ai>
Date: Sun, 22 Mar 2026 14:58:14 -0400
Subject: [PATCH] Add OCR fallback for scanned PDF memories

---
 pyproject.toml                          |  5 +-
 src/recallforge/documents.py            | 55 ++++++++++++++++-
 src/recallforge/server.py               |  3 +-
 src/recallforge/storage/indexing_ops.py | 21 +++++++
 tests/test_config_tools.py              | 33 ++++++++++
 tests/test_documents.py                 | 81 ++++++++++++++++---------
 tests/test_storage.py                   | 45 ++++++++++++++
 7 files changed, 210 insertions(+), 33 deletions(-)

diff --git a/pyproject.toml b/pyproject.toml
index 8bccf40..3733a6c 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -49,7 +49,10 @@ mlx = [
     # transformers 5.x Qwen3VLVideoProcessor requires torchvision for processor loading
     "torchvision>=0.15,<1.0",
 ]
-docs = ["pypdf>=5.0,<6.0"]
+docs = [
+    "pypdf>=5.0,<6.0",
+    "pymupdf>=1.24,<2.0",
+]
 server = [
     "starlette>=0.36,<1.0",
     "uvicorn>=0.30,<1.0",
diff --git a/src/recallforge/documents.py b/src/recallforge/documents.py
index 4d1b684..8d4e3df 100644
--- a/src/recallforge/documents.py
+++ b/src/recallforge/documents.py
@@ -265,6 +265,27 @@ def _render_pdf_page_as_image(
     return None
 
 
+def _ocr_pdf_page_text(pdf_path: Path, page_number: int) -> str:
+    """Return cleaned OCR text for 1-indexed ``page_number``, or "" if OCR is unavailable or fails."""
+    import logging
+
+    logger = logging.getLogger("recallforge.documents")
+    # load_page() accepts negative indices, so page_number < 1 would silently OCR a different page.
+    if page_number < 1 or importlib.util.find_spec("fitz") is None:
+        return ""
+
+    try:
+        import fitz  # type: ignore
+
+        with fitz.open(str(pdf_path)) as doc:
+            page = doc.load_page(page_number - 1)  # 0-indexed
+            text_page = page.get_textpage_ocr(language="eng", dpi=150, full=True)
+            return _clean_text(page.get_text(textpage=text_page) or "")
+    except Exception as exc:  # best-effort: an OCR failure must not abort extraction
+        logger.debug("pymupdf OCR failed for %s page %d: %s", pdf_path, page_number, exc)
+        return ""
+
+
 def _extract_pdf_with_pypdf(path: Path, logical_path: str) -> DocumentArtifacts:
     import logging
     from pypdf import PdfReader  # type: ignore
@@ -295,19 +316,31 @@ def _extract_pdf_with_pypdf(path: Path, logical_path: str) -> DocumentArtifacts:
         # No text extracted - try to render page as image
         if temp_dir is None:
             temp_dir = Path(tempfile.mkdtemp(prefix="recallforge_pdf_"))
+        ocr_text = _ocr_pdf_page_text(path, index)
         image_path = _render_pdf_page_as_image(path, index, temp_dir)
         if image_path:
             sections.append(
                 DocumentSection(
                     logical_path=f"{logical_path}::page:{index:04d}",
                     title=f"{path.stem} page {index}",
-                    text="",  # No text, image will be embedded
+                    text=ocr_text,
                     section_type="page",
                     index=index,
                     content_type="image",
                     image_path=image_path,
                 )
             )
+        elif ocr_text:
+            sections.append(
+                DocumentSection(
+                    logical_path=f"{logical_path}::page:{index:04d}",
+                    title=f"{path.stem} page {index}",
+                    text=ocr_text,
+                    section_type="page",
+                    index=index,
+                    content_type="text",
+                )
+            )
 
     if not sections:
         logger.warning("No extractable text or images found in PDF: %s", path)
@@ -344,7 +377,8 @@ def _extract_pdf_fallback(path: Path, logical_path: str) -> DocumentArtifacts:
             extractor="builtin-pdf-fallback",
         )
 
-    # No text extracted - try to render first page as image using pymupdf
+    # No text extracted - try OCR + page rendering for scanned/image-only PDFs.
+    ocr_text = _ocr_pdf_page_text(path, 1)
     image_path = _render_pdf_page_as_image(path, 1, None)
     if image_path:
         return DocumentArtifacts(
@@ -352,7 +386,7 @@ def _extract_pdf_fallback(path: Path, logical_path: str) -> DocumentArtifacts:
                 DocumentSection(
                     logical_path=f"{logical_path}::page:0001",
                     title=f"{path.stem} page 1",
-                    text="",
+                    text=ocr_text,
                     section_type="page",
                     index=1,
                     content_type="image",
@@ -362,6 +396,21 @@ def _extract_pdf_fallback(path: Path, logical_path: str) -> DocumentArtifacts:
             document_type="pdf",
             extractor="builtin-pdf-fallback",
         )
+    if ocr_text:
+        return DocumentArtifacts(
+            sections=[
+                DocumentSection(
+                    logical_path=f"{logical_path}::page:0001",
+                    title=f"{path.stem} page 1",
+                    text=ocr_text,
+                    section_type="page",
+                    index=1,
+                    content_type="text",
+                )
+            ],
+            document_type="pdf",
+            extractor="builtin-pdf-fallback",
+        )
 
     logger.warning(
         "No extractable text or images found in PDF: %s. Install recallforge[docs] for richer PDF parsing.",
diff --git a/src/recallforge/server.py b/src/recallforge/server.py
index 4dc4e59..6b4d380 100644
--- a/src/recallforge/server.py
+++ b/src/recallforge/server.py
@@ -119,7 +119,6 @@ def _resolve_file_query_input(
                 for section in artifacts.sections
                 if isinstance(section.text, str)
                 and section.text.strip()
-                and section.content_type == "text"
             )
         )
         if merged:
@@ -128,7 +127,7 @@ def _resolve_file_query_input(
             None,
             None,
             None,
-            f"No extractable document text found in {resolved.name}. OCR/image-only document queries are not supported yet.",
+            f"No extractable document text found in {resolved.name}. Install OCR-capable PDF support for scanned/image-only documents.",
         )
 
     try:
diff --git a/src/recallforge/storage/indexing_ops.py b/src/recallforge/storage/indexing_ops.py
index 1ecd753..8537c4a 100644
--- a/src/recallforge/storage/indexing_ops.py
+++ b/src/recallforge/storage/indexing_ops.py
@@ -1422,6 +1422,27 @@ def index_document_file(
                 content_hash = hash_content(f"pdf_page_image:{actual_path}:page:{section.index}")
                 self._backend.insert_content(content_hash, actual_path, content_type="pdf_page_image")
                 indexed_images += 1
+
+                # Preserve OCR text for scanned/image-only pages as a sibling
+                # text child so BM25 and file-as-query can use it without
+                # dropping the visual page representation.
+                ocr_text = (section.text or "").strip()
+                if ocr_text:
+                    self.upsert_memory(
+                        path=f"{section.logical_path}::ocr",
+                        text=ocr_text,
+                        collection=collection,
+                        embed_func=embed_func,
+                        model=model,
+                        user_id=user_id,
+                        session_id=session_id,
+                        project_id=project_id,
+                        profile=profile,
+                        _skip_delete=True,
+                        memory_role="child",
+                        memory_root_path=logical_path,
+                    )
+                    indexed_sections += 1
             else:
                 # Use text embedding for text sections
                 self.upsert_memory(
diff --git a/tests/test_config_tools.py b/tests/test_config_tools.py
index e762c1c..15cea0c 100644
--- a/tests/test_config_tools.py
+++ b/tests/test_config_tools.py
@@ -31,6 +31,7 @@
     create_server,
 )
 from recallforge import __version__
+from recallforge.documents import DocumentArtifacts, DocumentSection
 from recallforge.search import HybridResult, SearchAudit
 
 
@@ -608,6 +609,38 @@ def read(self, size: int = -1) -> bytes:
         finally:
             os.unlink(file_path)
 
+    async def test_document_file_query_uses_ocr_text_from_image_only_pages(self):
+        with tempfile.NamedTemporaryFile("wb", suffix=".pdf", delete=False) as tmp:
+            tmp.write(b"%PDF-1.4 mock")
+            file_path = tmp.name
+        # extract_document_artifacts is patched below, so these bytes are just a placeholder.
+        try:
+            artifacts = DocumentArtifacts(
+                sections=[
+                    DocumentSection(
+                        logical_path=f"{file_path}::page:0001",
+                        title="scan page 1",
+                        text="Scanned invoice total due",  # OCR text carried by an image-typed section
+                        section_type="page",
+                        index=1,
+                        content_type="image",
+                        image_path="/tmp/page_0001.png",
+                    )
+                ],
+                document_type="pdf",
+                extractor="unit-test-ocr",
+            )
+            # Stub extraction so the resolver receives the image-typed section built above.
+            with unittest.mock.patch("recallforge.server.extract_document_artifacts", return_value=artifacts):
+                query_text, image_path, video_path, error = _resolve_file_query_input(file_path)
+            # Expect the section text as the query, with no image/video path and no error.
+            self.assertEqual(query_text, "Scanned invoice total due")
+            self.assertIsNone(image_path)
+            self.assertIsNone(video_path)
+            self.assertIsNone(error)
+        finally:
+            os.unlink(file_path)
+
     async def test_get_config_schema(self):
         backend = _make_backend()
         storage = _make_storage()
diff --git a/tests/test_documents.py b/tests/test_documents.py
index 71c2d70..c9c65b7 100644
--- a/tests/test_documents.py
+++ b/tests/test_documents.py
@@ -1,6 +1,7 @@
 from __future__ import annotations
 
 from pathlib import Path
+from unittest.mock import patch
 from zipfile import ZipFile
 
 from recallforge.documents import extract_document_artifacts, is_document_file
@@ -89,6 +90,35 @@ def _write_fake_pdf(path: Path) -> None:
     path.write_bytes(b"".join(chunks))
 
 
+def _write_empty_pdf(path: Path) -> None:
+    """Write a minimal one-page PDF whose only content stream is empty.
+
+    Tests use the file as a stand-in for a PDF with no extractable text.
+    """
+    bodies = (
+        b"<< /Type /Catalog /Pages 2 0 R >>",
+        b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>",
+        b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>",
+        b"<< /Length 0 >>\nstream\n\nendstream",
+    )
+    entry_count = len(bodies) + 1  # numbered objects plus the object-0 free entry
+
+    pdf = bytearray(b"%PDF-1.4\n")
+    positions = []
+    for number, body in enumerate(bodies, start=1):
+        # The xref table needs the byte offset at which each "N 0 obj" starts.
+        positions.append(len(pdf))
+        pdf += f"{number} 0 obj\n".encode("ascii") + body + b"\nendobj\n"
+
+    xref_start = len(pdf)
+    pdf += f"xref\n0 {entry_count}\n".encode("ascii")
+    pdf += b"0000000000 65535 f \n"
+    pdf += b"".join(f"{pos:010d} 00000 n \n".encode("ascii") for pos in positions)
+    trailer = f"trailer\n<< /Size {entry_count} /Root 1 0 R >>\nstartxref\n{xref_start}\n%%EOF\n"
+    pdf += trailer.encode("ascii")
+    path.write_bytes(bytes(pdf))
+
+
 def test_is_document_file():
     assert is_document_file("report.pdf")
     assert is_document_file("slides.pptx")
@@ -138,34 +168,8 @@ def test_extract_pdf_artifacts_with_builtin_fallback(tmp_path):
 
 def test_extract_pdf_empty_graceful_skip(tmp_path):
     """PDFs with no extractable text should return empty sections, not crash."""
-    # Create a minimal PDF with no text streams - just structure
     path = tmp_path / "empty.pdf"
-    objects = [
-        b"<< /Type /Catalog /Pages 2 0 R >>",
-        b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>",
-        b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>",
-        b"<< /Length 0 >>\nstream\n\nendstream",
-    ]
-
-    chunks = [b"%PDF-1.4\n"]
-    offsets = [0]
-    for index, obj in enumerate(objects, start=1):
-        offsets.append(sum(len(chunk) for chunk in chunks))
-        chunks.append(f"{index} 0 obj\n".encode("ascii"))
-        chunks.append(obj)
-        chunks.append(b"\nendobj\n")
-
-    xref_offset = sum(len(chunk) for chunk in chunks)
-    chunks.append(f"xref\n0 {len(objects) + 1}\n".encode("ascii"))
-    chunks.append(b"0000000000 65535 f \n")
-    for offset in offsets[1:]:
-        chunks.append(f"{offset:010d} 00000 n \n".encode("ascii"))
-    chunks.append(
-        f"trailer\n<< /Size {len(objects) + 1} /Root 1 0 R >>\nstartxref\n{xref_offset}\n%%EOF\n".encode(
-            "ascii"
-        )
-    )
-    path.write_bytes(b"".join(chunks))
+    _write_empty_pdf(path)
 
     artifacts = extract_document_artifacts(path, "empty/empty.pdf")
 
@@ -174,6 +178,29 @@ def test_extract_pdf_empty_graceful_skip(tmp_path):
     assert artifacts.sections == []
 
 
+def test_extract_pdf_scanned_page_carries_ocr_text_with_image(tmp_path):
+    path = tmp_path / "scanned.pdf"
+    _write_empty_pdf(path)
+    # Report pypdf as missing and every other module as present.
+    def _fake_find_spec(name: str):
+        if name == "pypdf":
+            return None
+        return object()
+    # Stub OCR and page rendering so the test controls both results.
+    with (
+        patch("recallforge.documents.importlib.util.find_spec", side_effect=_fake_find_spec),
+        patch("recallforge.documents._ocr_pdf_page_text", return_value="Scanned invoice total due"),
+        patch("recallforge.documents._render_pdf_page_as_image", return_value=str(tmp_path / "page_0001.png")),
+    ):
+        artifacts = extract_document_artifacts(path, "scanned/scanned.pdf")
+    # Expect a single image section that also carries the stubbed OCR text.
+    assert artifacts.document_type == "pdf"
+    assert len(artifacts.sections) == 1
+    assert artifacts.sections[0].content_type == "image"
+    assert artifacts.sections[0].image_path is not None
+    assert artifacts.sections[0].text == "Scanned invoice total due"
+
+
 def test_extract_docx_empty_graceful_skip(tmp_path):
     """DOCX with no extractable text should return empty sections."""
     path = tmp_path / "empty.docx"
diff --git a/tests/test_storage.py b/tests/test_storage.py
index ac48b13..3c990d3 100644
--- a/tests/test_storage.py
+++ b/tests/test_storage.py
@@ -1443,6 +1443,51 @@ def test_index_document_file_creates_root_memory_and_links_sections(self):
         self.assertEqual(child_doc.memory_role, "child")
         self.assertEqual(child_doc.memory_root_path, logical_path)
 
+    def test_index_document_file_preserves_ocr_text_for_image_only_pages(self):
+        document_path = os.path.join(self.temp_dir, "scan.pdf")
+        logical_path = str(Path(document_path).expanduser().resolve())
+        with open(document_path, "wb") as f:
+            f.write(b"%PDF-1.4 mock")
+        # One image-typed page section that also carries OCR text.
+        fake_artifacts = SimpleNamespace(
+            document_type="pdf",
+            extractor="unit-test",
+            sections=[
+                SimpleNamespace(
+                    logical_path=f"{logical_path}::page:0001",
+                    title="scan page 1",
+                    text="Scanned invoice total due on receipt.",
+                    section_type="page",
+                    index=1,
+                    content_type="image",
+                    image_path=self.frame_path,
+                )
+            ],
+        )
+        # Replace real extraction with the fake artifacts built above.
+        with patch("recallforge.storage.indexing_ops.extract_document_artifacts", return_value=fake_artifacts):
+            result = self.backend.index_document_file(
+                path=document_path,
+                collection="test",
+                embed_func=mock_embed,
+                embed_image_func=mock_embed,
+                model="mock-embedder",
+            )
+        # One image for the page, plus one text section for its OCR sibling.
+        self.assertEqual(result["indexed_images"], 1)
+        self.assertEqual(result["indexed_sections"], 1)
+        # The OCR text is stored under "<page path>::ocr" as a child of the document root.
+        ocr_doc = self.backend.find_document("test", f"{logical_path}::page:0001::ocr")
+        self.assertIsNotNone(ocr_doc)
+        self.assertEqual(ocr_doc.memory_role, "child")
+        self.assertEqual(ocr_doc.memory_root_path, logical_path)
+        # The OCR text must also appear in the embeddings table row(s) for that path.
+        ocr_rows = self.backend._embeddings_table.search().where(
+            f"collection = 'test' AND file_path = '{logical_path}::page:0001::ocr'"
+        ).to_list()
+        self.assertGreaterEqual(len(ocr_rows), 1)
+        self.assertIn("Scanned invoice total due", ocr_rows[0].get("text_body") or "")
+
     def test_ingest_caption_media_disabled_skips_image_caption(self):
         embedder = CaptioningEmbedder()
         self.backend.ingest(