Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,10 @@ mlx = [
# transformers 5.x Qwen3VLVideoProcessor requires torchvision for processor loading
"torchvision>=0.15,<1.0",
]
docs = ["pypdf>=5.0,<6.0"]
docs = [
"pypdf>=5.0,<6.0",
"pymupdf>=1.24,<2.0",
]
server = [
"starlette>=0.36,<1.0",
"uvicorn>=0.30,<1.0",
Expand Down
55 changes: 52 additions & 3 deletions src/recallforge/documents.py
Original file line number Diff line number Diff line change
Expand Up @@ -265,6 +265,27 @@ def _render_pdf_page_as_image(
return None


def _ocr_pdf_page_text(pdf_path: Path, page_number: int) -> str:
"""Extract OCR text for a PDF page when embedded text is unavailable."""
import logging

logger = logging.getLogger("recallforge.documents")

if importlib.util.find_spec("fitz") is None:
return ""

try:
import fitz # type: ignore

with fitz.open(str(pdf_path)) as doc:
page = doc.load_page(page_number - 1) # 0-indexed
text_page = page.get_textpage_ocr(language="eng", dpi=150, full=True)
return _clean_text(page.get_text(textpage=text_page) or "")
except Exception as exc:
logger.debug("pymupdf OCR failed for %s page %d: %s", pdf_path, page_number, exc)
return ""


def _extract_pdf_with_pypdf(path: Path, logical_path: str) -> DocumentArtifacts:
import logging
from pypdf import PdfReader # type: ignore
Expand Down Expand Up @@ -295,19 +316,31 @@ def _extract_pdf_with_pypdf(path: Path, logical_path: str) -> DocumentArtifacts:
# No text extracted - try to render page as image
if temp_dir is None:
temp_dir = Path(tempfile.mkdtemp(prefix="recallforge_pdf_"))
ocr_text = _ocr_pdf_page_text(path, index)
image_path = _render_pdf_page_as_image(path, index, temp_dir)
if image_path:
sections.append(
DocumentSection(
logical_path=f"{logical_path}::page:{index:04d}",
title=f"{path.stem} page {index}",
text="", # No text, image will be embedded
text=ocr_text,
section_type="page",
index=index,
content_type="image",
image_path=image_path,
)
)
elif ocr_text:
sections.append(
DocumentSection(
logical_path=f"{logical_path}::page:{index:04d}",
title=f"{path.stem} page {index}",
text=ocr_text,
section_type="page",
index=index,
content_type="text",
)
)

if not sections:
logger.warning("No extractable text or images found in PDF: %s", path)
Expand Down Expand Up @@ -344,15 +377,16 @@ def _extract_pdf_fallback(path: Path, logical_path: str) -> DocumentArtifacts:
extractor="builtin-pdf-fallback",
)

# No text extracted - try to render first page as image using pymupdf
# No text extracted - try OCR + page rendering for scanned/image-only PDFs.
ocr_text = _ocr_pdf_page_text(path, 1)
image_path = _render_pdf_page_as_image(path, 1, None)
if image_path:
return DocumentArtifacts(
sections=[
DocumentSection(
logical_path=f"{logical_path}::page:0001",
title=f"{path.stem} page 1",
text="",
text=ocr_text,
section_type="page",
index=1,
content_type="image",
Expand All @@ -362,6 +396,21 @@ def _extract_pdf_fallback(path: Path, logical_path: str) -> DocumentArtifacts:
document_type="pdf",
extractor="builtin-pdf-fallback",
)
if ocr_text:
return DocumentArtifacts(
sections=[
DocumentSection(
logical_path=f"{logical_path}::page:0001",
title=f"{path.stem} page 1",
text=ocr_text,
section_type="page",
index=1,
content_type="text",
)
],
document_type="pdf",
extractor="builtin-pdf-fallback",
)

logger.warning(
"No extractable text or images found in PDF: %s. Install recallforge[docs] for richer PDF parsing.",
Expand Down
3 changes: 1 addition & 2 deletions src/recallforge/server.py
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,6 @@ def _resolve_file_query_input(
for section in artifacts.sections
if isinstance(section.text, str)
and section.text.strip()
and section.content_type == "text"
)
)
if merged:
Expand All @@ -128,7 +127,7 @@ def _resolve_file_query_input(
None,
None,
None,
f"No extractable document text found in {resolved.name}. OCR/image-only document queries are not supported yet.",
f"No extractable document text found in {resolved.name}. Install OCR-capable PDF support for scanned/image-only documents.",
)

try:
Expand Down
21 changes: 21 additions & 0 deletions src/recallforge/storage/indexing_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -1422,6 +1422,27 @@ def index_document_file(
content_hash = hash_content(f"pdf_page_image:{actual_path}:page:{section.index}")
self._backend.insert_content(content_hash, actual_path, content_type="pdf_page_image")
indexed_images += 1

# Preserve OCR text for scanned/image-only pages as a sibling
# text child so BM25 and file-as-query can use it without
# dropping the visual page representation.
ocr_text = (section.text or "").strip()
if ocr_text:
self.upsert_memory(
path=f"{section.logical_path}::ocr",
text=ocr_text,
collection=collection,
embed_func=embed_func,
model=model,
user_id=user_id,
session_id=session_id,
project_id=project_id,
profile=profile,
_skip_delete=True,
memory_role="child",
memory_root_path=logical_path,
)
indexed_sections += 1
else:
# Use text embedding for text sections
self.upsert_memory(
Expand Down
33 changes: 33 additions & 0 deletions tests/test_config_tools.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
create_server,
)
from recallforge import __version__
from recallforge.documents import DocumentArtifacts, DocumentSection
from recallforge.search import HybridResult, SearchAudit


Expand Down Expand Up @@ -608,6 +609,38 @@ def read(self, size: int = -1) -> bytes:
finally:
os.unlink(file_path)

async def test_document_file_query_uses_ocr_text_from_image_only_pages(self):
    """An image-typed section that carries text must feed the file query.

    The extractor is patched to return a single ``content_type="image"``
    section with OCR text; the resolver is expected to hand that text back
    as the query with no image path, video path, or error.
    """
    with tempfile.NamedTemporaryFile("wb", suffix=".pdf", delete=False) as handle:
        handle.write(b"%PDF-1.4 mock")
        file_path = handle.name

    try:
        scanned_page = DocumentSection(
            logical_path=f"{file_path}::page:0001",
            title="scan page 1",
            text="Scanned invoice total due",
            section_type="page",
            index=1,
            content_type="image",
            image_path="/tmp/page_0001.png",
        )
        artifacts = DocumentArtifacts(
            sections=[scanned_page],
            document_type="pdf",
            extractor="unit-test-ocr",
        )

        # Replace the real extractor so no PDF parsing or OCR is attempted.
        extractor_patch = unittest.mock.patch(
            "recallforge.server.extract_document_artifacts",
            return_value=artifacts,
        )
        with extractor_patch:
            resolved = _resolve_file_query_input(file_path)

        query_text, image_path, video_path, error = resolved
        self.assertEqual(query_text, "Scanned invoice total due")
        self.assertIsNone(image_path)
        self.assertIsNone(video_path)
        self.assertIsNone(error)
    finally:
        # delete=False above means the temp file must be removed by hand.
        os.unlink(file_path)

async def test_get_config_schema(self):
backend = _make_backend()
storage = _make_storage()
Expand Down
81 changes: 54 additions & 27 deletions tests/test_documents.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
from __future__ import annotations

from pathlib import Path
from unittest.mock import patch
from zipfile import ZipFile

from recallforge.documents import extract_document_artifacts, is_document_file
Expand Down Expand Up @@ -89,6 +90,35 @@ def _write_fake_pdf(path: Path) -> None:
path.write_bytes(b"".join(chunks))


def _write_empty_pdf(path: Path) -> None:
objects = [
b"<< /Type /Catalog /Pages 2 0 R >>",
b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>",
b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>",
b"<< /Length 0 >>\nstream\n\nendstream",
]

chunks = [b"%PDF-1.4\n"]
offsets = [0]
for index, obj in enumerate(objects, start=1):
offsets.append(sum(len(chunk) for chunk in chunks))
chunks.append(f"{index} 0 obj\n".encode("ascii"))
chunks.append(obj)
chunks.append(b"\nendobj\n")

xref_offset = sum(len(chunk) for chunk in chunks)
chunks.append(f"xref\n0 {len(objects) + 1}\n".encode("ascii"))
chunks.append(b"0000000000 65535 f \n")
for offset in offsets[1:]:
chunks.append(f"{offset:010d} 00000 n \n".encode("ascii"))
chunks.append(
f"trailer\n<< /Size {len(objects) + 1} /Root 1 0 R >>\nstartxref\n{xref_offset}\n%%EOF\n".encode(
"ascii"
)
)
path.write_bytes(b"".join(chunks))


def test_is_document_file():
assert is_document_file("report.pdf")
assert is_document_file("slides.pptx")
Expand Down Expand Up @@ -138,34 +168,8 @@ def test_extract_pdf_artifacts_with_builtin_fallback(tmp_path):

def test_extract_pdf_empty_graceful_skip(tmp_path):
"""PDFs with no extractable text should return empty sections, not crash."""
# Create a minimal PDF with no text streams - just structure
path = tmp_path / "empty.pdf"
objects = [
b"<< /Type /Catalog /Pages 2 0 R >>",
b"<< /Type /Pages /Kids [3 0 R] /Count 1 >>",
b"<< /Type /Page /Parent 2 0 R /MediaBox [0 0 612 792] /Contents 4 0 R >>",
b"<< /Length 0 >>\nstream\n\nendstream",
]

chunks = [b"%PDF-1.4\n"]
offsets = [0]
for index, obj in enumerate(objects, start=1):
offsets.append(sum(len(chunk) for chunk in chunks))
chunks.append(f"{index} 0 obj\n".encode("ascii"))
chunks.append(obj)
chunks.append(b"\nendobj\n")

xref_offset = sum(len(chunk) for chunk in chunks)
chunks.append(f"xref\n0 {len(objects) + 1}\n".encode("ascii"))
chunks.append(b"0000000000 65535 f \n")
for offset in offsets[1:]:
chunks.append(f"{offset:010d} 00000 n \n".encode("ascii"))
chunks.append(
f"trailer\n<< /Size {len(objects) + 1} /Root 1 0 R >>\nstartxref\n{xref_offset}\n%%EOF\n".encode(
"ascii"
)
)
path.write_bytes(b"".join(chunks))
_write_empty_pdf(path)

artifacts = extract_document_artifacts(path, "empty/empty.pdf")

Expand All @@ -174,6 +178,29 @@ def test_extract_pdf_empty_graceful_skip(tmp_path):
assert artifacts.sections == []


def test_extract_pdf_scanned_page_carries_ocr_text_with_image(tmp_path):
    """A rendered page section should also carry the OCR text for that page."""
    pdf_file = tmp_path / "scanned.pdf"
    _write_empty_pdf(pdf_file)

    def _fake_find_spec(name: str):
        # Report pypdf as missing and every other module as available.
        return None if name == "pypdf" else object()

    with (
        patch("recallforge.documents.importlib.util.find_spec", side_effect=_fake_find_spec),
        patch("recallforge.documents._ocr_pdf_page_text", return_value="Scanned invoice total due"),
        patch("recallforge.documents._render_pdf_page_as_image", return_value=str(tmp_path / "page_0001.png")),
    ):
        artifacts = extract_document_artifacts(pdf_file, "scanned/scanned.pdf")

    assert artifacts.document_type == "pdf"
    assert len(artifacts.sections) == 1

    only_section = artifacts.sections[0]
    assert only_section.content_type == "image"
    assert only_section.image_path is not None
    assert only_section.text == "Scanned invoice total due"


def test_extract_docx_empty_graceful_skip(tmp_path):
"""DOCX with no extractable text should return empty sections."""
path = tmp_path / "empty.docx"
Expand Down
45 changes: 45 additions & 0 deletions tests/test_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -1443,6 +1443,51 @@ def test_index_document_file_creates_root_memory_and_links_sections(self):
self.assertEqual(child_doc.memory_role, "child")
self.assertEqual(child_doc.memory_root_path, logical_path)

def test_index_document_file_preserves_ocr_text_for_image_only_pages(self):
    """Indexing an image page that has OCR text also stores an ``::ocr`` child.

    The extractor is patched to return one image section with text. The
    test then checks the reported counts, the child document's linkage to
    the root path, and that the stored text body contains the OCR text.
    """
    document_path = os.path.join(self.temp_dir, "scan.pdf")
    logical_path = str(Path(document_path).expanduser().resolve())
    with open(document_path, "wb") as pdf_file:
        pdf_file.write(b"%PDF-1.4 mock")

    page_path = f"{logical_path}::page:0001"
    ocr_path = f"{logical_path}::page:0001::ocr"

    scanned_page = SimpleNamespace(
        logical_path=page_path,
        title="scan page 1",
        text="Scanned invoice total due on receipt.",
        section_type="page",
        index=1,
        content_type="image",
        image_path=self.frame_path,
    )
    fake_artifacts = SimpleNamespace(
        document_type="pdf",
        extractor="unit-test",
        sections=[scanned_page],
    )

    # Replace the real extractor so the mock PDF bytes are never parsed.
    with patch("recallforge.storage.indexing_ops.extract_document_artifacts", return_value=fake_artifacts):
        result = self.backend.index_document_file(
            path=document_path,
            collection="test",
            embed_func=mock_embed,
            embed_image_func=mock_embed,
            model="mock-embedder",
        )

    self.assertEqual(result["indexed_images"], 1)
    self.assertEqual(result["indexed_sections"], 1)

    ocr_doc = self.backend.find_document("test", ocr_path)
    self.assertIsNotNone(ocr_doc)
    self.assertEqual(ocr_doc.memory_role, "child")
    self.assertEqual(ocr_doc.memory_root_path, logical_path)

    ocr_filter = f"collection = 'test' AND file_path = '{logical_path}::page:0001::ocr'"
    ocr_rows = self.backend._embeddings_table.search().where(ocr_filter).to_list()
    self.assertGreaterEqual(len(ocr_rows), 1)
    self.assertIn("Scanned invoice total due", ocr_rows[0].get("text_body") or "")

def test_ingest_caption_media_disabled_skips_image_caption(self):
embedder = CaptioningEmbedder()
self.backend.ingest(
Expand Down
Loading