From 9fcee4f8e8ce4185648e43b30d824e23f8bd3315 Mon Sep 17 00:00:00 2001 From: MollyAI Date: Sun, 22 Mar 2026 14:58:14 -0400 Subject: [PATCH] Add OCR fallback for scanned PDF memories --- pyproject.toml | 5 +- src/recallforge/documents.py | 55 ++++++++++++++++- src/recallforge/server.py | 3 +- src/recallforge/storage/indexing_ops.py | 21 +++++++ tests/test_config_tools.py | 33 ++++++++++ tests/test_documents.py | 81 ++++++++++++++++--------- tests/test_storage.py | 45 ++++++++++++++ 7 files changed, 210 insertions(+), 33 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 8bccf40..3733a6c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -49,7 +49,10 @@ mlx = [ # transformers 5.x Qwen3VLVideoProcessor requires torchvision for processor loading "torchvision>=0.15,<1.0", ] -docs = ["pypdf>=5.0,<6.0"] +docs = [ + "pypdf>=5.0,<6.0", + "pymupdf>=1.24,<2.0", +] server = [ "starlette>=0.36,<1.0", "uvicorn>=0.30,<1.0", diff --git a/src/recallforge/documents.py b/src/recallforge/documents.py index 4d1b684..8d4e3df 100644 --- a/src/recallforge/documents.py +++ b/src/recallforge/documents.py @@ -265,6 +265,27 @@ def _render_pdf_page_as_image( return None +def _ocr_pdf_page_text(pdf_path: Path, page_number: int) -> str: + """Extract OCR text for a PDF page when embedded text is unavailable.""" + import logging + + logger = logging.getLogger("recallforge.documents") + + if importlib.util.find_spec("fitz") is None: + return "" + + try: + import fitz # type: ignore + + with fitz.open(str(pdf_path)) as doc: + page = doc.load_page(page_number - 1) # 0-indexed + text_page = page.get_textpage_ocr(language="eng", dpi=150, full=True) + return _clean_text(page.get_text(textpage=text_page) or "") + except Exception as exc: + logger.debug("pymupdf OCR failed for %s page %d: %s", pdf_path, page_number, exc) + return "" + + def _extract_pdf_with_pypdf(path: Path, logical_path: str) -> DocumentArtifacts: import logging from pypdf import PdfReader # type: ignore @@ -295,19 +316,31 @@ def _extract_pdf_with_pypdf(path: Path, logical_path: str) -> DocumentArtifacts: # No text extracted - try to render page as image if temp_dir is None: temp_dir = Path(tempfile.mkdtemp(prefix="recallforge_pdf_")) + ocr_text = _ocr_pdf_page_text(path, index) image_path = _render_pdf_page_as_image(path, index, temp_dir) if image_path: sections.append( DocumentSection( logical_path=f"{logical_path}::page:{index:04d}", title=f"{path.stem} page {index}", - text="", # No text, image will be embedded + text=ocr_text, section_type="page", index=index, content_type="image", image_path=image_path, ) ) + elif ocr_text: + sections.append( + DocumentSection( + logical_path=f"{logical_path}::page:{index:04d}", + title=f"{path.stem} page {index}", + text=ocr_text, + section_type="page", + index=index, + content_type="text", + ) + ) if not sections: logger.warning("No extractable text or images found in PDF: %s", path) @@ -344,7 +377,8 @@ def _extract_pdf_fallback(path: Path, logical_path: str) -> DocumentArtifacts: extractor="builtin-pdf-fallback", ) - # No text extracted - try to render first page as image using pymupdf + # No text extracted - try OCR + page rendering for scanned/image-only PDFs. + ocr_text = _ocr_pdf_page_text(path, 1) image_path = _render_pdf_page_as_image(path, 1, None) if image_path: return DocumentArtifacts( @@ -352,7 +386,7 @@ def _extract_pdf_fallback(path: Path, logical_path: str) -> DocumentArtifacts: DocumentSection( logical_path=f"{logical_path}::page:0001", title=f"{path.stem} page 1", - text="", + text=ocr_text, section_type="page", index=1, content_type="image", @@ -362,6 +396,21 @@ def _extract_pdf_fallback(path: Path, logical_path: str) -> DocumentArtifacts: document_type="pdf", extractor="builtin-pdf-fallback", ) + if ocr_text: + return DocumentArtifacts( + sections=[ + DocumentSection( + logical_path=f"{logical_path}::page:0001", + title=f"{path.stem} page 1", + text=ocr_text, + section_type="page", + index=1, + content_type="text", + ) + ], + document_type="pdf", + extractor="builtin-pdf-fallback", + ) logger.warning( "No extractable text or images found in PDF: %s. Install recallforge[docs] for richer PDF parsing.", diff --git a/src/recallforge/server.py b/src/recallforge/server.py index 4dc4e59..6b4d380 100644 --- a/src/recallforge/server.py +++ b/src/recallforge/server.py @@ -119,7 +119,6 @@ def _resolve_file_query_input( for section in artifacts.sections if isinstance(section.text, str) and section.text.strip() - and section.content_type == "text" ) ) if merged: @@ -128,7 +127,7 @@ def _resolve_file_query_input( None, None, None, - f"No extractable document text found in {resolved.name}. OCR/image-only document queries are not supported yet.", + f"No extractable document text found in {resolved.name}. Install OCR-capable PDF support for scanned/image-only documents.", ) try: diff --git a/src/recallforge/storage/indexing_ops.py b/src/recallforge/storage/indexing_ops.py index 1ecd753..8537c4a 100644 --- a/src/recallforge/storage/indexing_ops.py +++ b/src/recallforge/storage/indexing_ops.py @@ -1422,6 +1422,27 @@ def index_document_file( content_hash = hash_content(f"pdf_page_image:{actual_path}:page:{section.index}") self._backend.insert_content(content_hash, actual_path, content_type="pdf_page_image") indexed_images += 1 + + # Preserve OCR text for scanned/image-only pages as a sibling + # text child so BM25 and file-as-query can use it without + # dropping the visual page representation. + ocr_text = (section.text or "").strip() + if ocr_text: + self.upsert_memory( + path=f"{section.logical_path}::ocr", + text=ocr_text, + collection=collection, + embed_func=embed_func, + model=model, + user_id=user_id, + session_id=session_id, + project_id=project_id, + profile=profile, + _skip_delete=True, + memory_role="child", + memory_root_path=logical_path, + ) + indexed_sections += 1 else: # Use text embedding for text sections self.upsert_memory( diff --git a/tests/test_config_tools.py b/tests/test_config_tools.py index e762c1c..15cea0c 100644 --- a/tests/test_config_tools.py +++ b/tests/test_config_tools.py @@ -31,6 +31,7 @@ create_server, ) from recallforge import __version__ +from recallforge.documents import DocumentArtifacts, DocumentSection from recallforge.search import HybridResult, SearchAudit @@ -608,6 +609,38 @@ def read(self, size: int = -1) -> bytes: finally: os.unlink(file_path) + async def test_document_file_query_uses_ocr_text_from_image_only_pages(self): + with tempfile.NamedTemporaryFile("wb", suffix=".pdf", delete=False) as tmp: + tmp.write(b"%PDF-1.4 mock") + file_path = tmp.name + + try: + artifacts = DocumentArtifacts( + sections=[ + DocumentSection( + logical_path=f"{file_path}::page:0001", + title="scan page 1", + text="Scanned invoice total due", + section_type="page", + index=1, + content_type="image", + image_path="/tmp/page_0001.png", + ) + ], + document_type="pdf", + extractor="unit-test-ocr", + ) + + with unittest.mock.patch("recallforge.server.extract_document_artifacts", return_value=artifacts): + query_text, image_path, video_path, error = _resolve_file_query_input(file_path) + + self.assertEqual(query_text, "Scanned invoice total due") + self.assertIsNone(image_path) + self.assertIsNone(video_path) + self.assertIsNone(error) + finally: + os.unlink(file_path) + async def test_get_config_schema(self): backend = _make_backend() storage = _make_storage() diff --git a/tests/test_documents.py b/tests/test_documents.py index 71c2d70..c9c65b7 100644 --- a/tests/test_documents.py +++ b/tests/test_documents.py @@ -1,6 +1,7 @@ from __future__ import annotations from pathlib import Path +from unittest.mock import patch from zipfile import ZipFile from recallforge.documents import extract_document_artifacts, is_document_file @@ -89,6 +90,35 @@ def _write_fake_pdf(path: Path) -> None: path.write_bytes(b"".join(chunks)) +def _write_empty_pdf(path: Path) -> None: + objects = [ + b"<< /Type /Catalog /Pages 2 0 R >>", + b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>", + b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>", + b"<< /Length 0 >>\nstream\n\nendstream", + ] + + chunks = [b"%PDF-1.4\n"] + offsets = [0] + for index, obj in enumerate(objects, start=1): + offsets.append(sum(len(chunk) for chunk in chunks)) + chunks.append(f"{index} 0 obj\n".encode("ascii")) + chunks.append(obj) + chunks.append(b"\nendobj\n") + + xref_offset = sum(len(chunk) for chunk in chunks) + chunks.append(f"xref\n0 {len(objects) + 1}\n".encode("ascii")) + chunks.append(b"0000000000 65535 f \n") + for offset in offsets[1:]: + chunks.append(f"{offset:010d} 00000 n \n".encode("ascii")) + chunks.append( + f"trailer\n<< /Size {len(objects) + 1} /Root 1 0 R >>\nstartxref\n{xref_offset}\n%%EOF\n".encode( + "ascii" + ) + ) + path.write_bytes(b"".join(chunks)) + + def test_is_document_file(): assert is_document_file("report.pdf") assert is_document_file("slides.pptx") @@ -138,34 +168,8 @@ def test_extract_pdf_artifacts_with_builtin_fallback(tmp_path): def test_extract_pdf_empty_graceful_skip(tmp_path): """PDFs with no extractable text should return empty sections, not crash.""" - # Create a minimal PDF with no text streams - just structure path = tmp_path / "empty.pdf" - objects = [ - b"<< /Type /Catalog /Pages 2 0 R >>", - b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>", - b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>", - b"<< /Length 0 >>\nstream\n\nendstream", - ] - - chunks = [b"%PDF-1.4\n"] - offsets = [0] - for index, obj in enumerate(objects, start=1): - offsets.append(sum(len(chunk) for chunk in chunks)) - chunks.append(f"{index} 0 obj\n".encode("ascii")) - chunks.append(obj) - chunks.append(b"\nendobj\n") - - xref_offset = sum(len(chunk) for chunk in chunks) - chunks.append(f"xref\n0 {len(objects) + 1}\n".encode("ascii")) - chunks.append(b"0000000000 65535 f \n") - for offset in offsets[1:]: - chunks.append(f"{offset:010d} 00000 n \n".encode("ascii")) - chunks.append( - f"trailer\n<< /Size {len(objects) + 1} /Root 1 0 R >>\nstartxref\n{xref_offset}\n%%EOF\n".encode( - "ascii" - ) - ) - path.write_bytes(b"".join(chunks)) + _write_empty_pdf(path) artifacts = extract_document_artifacts(path, "empty/empty.pdf") @@ -174,6 +178,29 @@ def test_extract_pdf_empty_graceful_skip(tmp_path): assert artifacts.sections == [] +def test_extract_pdf_scanned_page_carries_ocr_text_with_image(tmp_path): + path = tmp_path / "scanned.pdf" + _write_empty_pdf(path) + + def _fake_find_spec(name: str): + if name == "pypdf": + return None + return object() + + with ( + patch("recallforge.documents.importlib.util.find_spec", side_effect=_fake_find_spec), + patch("recallforge.documents._ocr_pdf_page_text", return_value="Scanned invoice total due"), + patch("recallforge.documents._render_pdf_page_as_image", return_value=str(tmp_path / "page_0001.png")), + ): + artifacts = extract_document_artifacts(path, "scanned/scanned.pdf") + + assert artifacts.document_type == "pdf" + assert len(artifacts.sections) == 1 + assert artifacts.sections[0].content_type == "image" + assert artifacts.sections[0].image_path is not None + assert artifacts.sections[0].text == "Scanned invoice total due" + + def test_extract_docx_empty_graceful_skip(tmp_path): """DOCX with no extractable text should return empty sections.""" path = tmp_path / "empty.docx" diff --git a/tests/test_storage.py b/tests/test_storage.py index ac48b13..3c990d3 100644 --- a/tests/test_storage.py +++ b/tests/test_storage.py @@ -1443,6 +1443,51 @@ def test_index_document_file_creates_root_memory_and_links_sections(self): self.assertEqual(child_doc.memory_role, "child") self.assertEqual(child_doc.memory_root_path, logical_path) + def test_index_document_file_preserves_ocr_text_for_image_only_pages(self): + document_path = os.path.join(self.temp_dir, "scan.pdf") + logical_path = str(Path(document_path).expanduser().resolve()) + with open(document_path, "wb") as f: + f.write(b"%PDF-1.4 mock") + + fake_artifacts = SimpleNamespace( + document_type="pdf", + extractor="unit-test", + sections=[ + SimpleNamespace( + logical_path=f"{logical_path}::page:0001", + title="scan page 1", + text="Scanned invoice total due on receipt.", + section_type="page", + index=1, + content_type="image", + image_path=self.frame_path, + ) + ], + ) + + with patch("recallforge.storage.indexing_ops.extract_document_artifacts", return_value=fake_artifacts): + result = self.backend.index_document_file( + path=document_path, + collection="test", + embed_func=mock_embed, + embed_image_func=mock_embed, + model="mock-embedder", + ) + + self.assertEqual(result["indexed_images"], 1) + self.assertEqual(result["indexed_sections"], 1) + + ocr_doc = self.backend.find_document("test", f"{logical_path}::page:0001::ocr") + self.assertIsNotNone(ocr_doc) + self.assertEqual(ocr_doc.memory_role, "child") + self.assertEqual(ocr_doc.memory_root_path, logical_path) + + ocr_rows = self.backend._embeddings_table.search().where( + f"collection = 'test' AND file_path = '{logical_path}::page:0001::ocr'" + ).to_list() + self.assertGreaterEqual(len(ocr_rows), 1) + self.assertIn("Scanned invoice total due", ocr_rows[0].get("text_body") or "") + def test_ingest_caption_media_disabled_skips_image_caption(self): embedder = CaptioningEmbedder() self.backend.ingest(