Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
138 changes: 136 additions & 2 deletions apps/document_rag.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,87 @@
from base_rag_example import BaseRAGExample
from chunking import create_text_chunks
from llama_index.core import SimpleDirectoryReader
# MinerU is an optional dependency: probe for it once at import time so the
# rest of the module can cheaply decide whether the OCR fallback is usable.
# A missing package only disables OCR; it must not break the module import.
try:
    import mineru  # noqa: F401  (imported only to test availability)
except ImportError:
    OCR_AVAILABLE = False
else:
    OCR_AVAILABLE = True

# Minimum number of non-whitespace-trimmed characters for a text layer to be
# considered "real" text rather than the stray fragments of a scanned PDF.
_MIN_TEXT_CHARS = 100


def _extract_with_pymupdf(pdf_path: str) -> str:
    """Return the text layer of *pdf_path* via PyMuPDF, or "" if unavailable."""
    try:
        import fitz  # PyMuPDF
    except ImportError:
        # Optional backend that is simply not installed: skip quietly.
        return ""
    try:
        doc = fitz.open(pdf_path)
        try:
            return "".join(page.get_text() for page in doc)
        finally:
            # Close even when a page fails to decode, so the handle is not leaked.
            doc.close()
    except Exception as exc:
        # Best-effort: report and let the caller try the next backend.
        print(f"  PyMuPDF extraction failed for {pdf_path}: {exc}")
        return ""


def _extract_with_pdfplumber(pdf_path: str) -> str:
    """Return the text layer of *pdf_path* via pdfplumber, or "" if unavailable."""
    try:
        import pdfplumber
    except ImportError:
        return ""
    try:
        with pdfplumber.open(pdf_path) as pdf:
            # extract_text() may return None for pages without a text layer.
            return "".join(page.extract_text() or "" for page in pdf.pages)
    except Exception as exc:
        print(f"  pdfplumber extraction failed for {pdf_path}: {exc}")
        return ""


def _run_mineru(pdf_path: str):
    """Run MinerU on *pdf_path* through whichever entry point is exposed.

    NOTE(review): the entry points probed here (``MinerUProcessor.process``,
    ``mineru.process``, ``mineru.extract_text``) are guesses carried over from
    the original code -- confirm them against the installed MinerU version.

    Returns the raw MinerU result, or None when no known entry point exists.
    """
    try:
        from mineru import MinerUProcessor

        processor = MinerUProcessor()
        if hasattr(processor, "process"):
            return processor.process(pdf_path)
    except (ImportError, AttributeError, TypeError):
        # Class-based API missing or incompatible: try module-level functions.
        pass

    import mineru

    for entry_point in ("process", "extract_text"):
        func = getattr(mineru, entry_point, None)
        if callable(func):
            return func(pdf_path)
    return None


def _ocr_result_to_text(result) -> str:
    """Normalize an OCR result (str, dict, or object) to a plain string."""
    if not result:
        return ""
    if isinstance(result, str):
        return result

    if isinstance(result, dict):
        values = [result.get(key) for key in ("text", "markdown", "content")]
        recognized = True
    else:
        values = [getattr(result, name, None) for name in ("text", "markdown")]
        recognized = any(hasattr(result, name) for name in ("text", "markdown"))

    # Take the first non-empty field, so an empty "text" no longer hides a
    # populated "markdown"/"content".
    for value in values:
        if value:
            return str(value)
    # Unknown result shapes are stringified as a last resort.
    return "" if recognized else str(result)


def extract_pdf_with_ocr_fallback(
    pdf_path: str,
    use_ocr: bool = False,
    *,
    min_chars: int = _MIN_TEXT_CHARS,
) -> str:
    """
    Extract text from PDF with OCR fallback.
    Used as a custom file extractor for SimpleDirectoryReader.

    Backends are tried in order: PyMuPDF, pdfplumber, then (optionally)
    MinerU OCR. The first standard backend that yields more than
    ``min_chars`` characters (after stripping) wins.

    Args:
        pdf_path: Path to PDF file
        use_ocr: Whether to try OCR if standard extraction fails
        min_chars: Stripped-length threshold above which a text layer is
            accepted without OCR

    Returns:
        Extracted text string. When no backend clears the threshold and OCR
        is disabled or yields nothing, the longest short text that was found
        is returned (possibly ""), so small but genuine documents are not
        discarded.
    """
    best_short = ""
    for extractor in (_extract_with_pymupdf, _extract_with_pdfplumber):
        text = extractor(pdf_path)
        stripped_len = len(text.strip())
        if stripped_len > min_chars:
            return text
        if stripped_len > len(best_short.strip()):
            best_short = text

    # Try OCR if enabled
    if use_ocr and OCR_AVAILABLE:
        try:
            ocr_text = _ocr_result_to_text(_run_mineru(pdf_path))
        except Exception as e:
            print(f" OCR failed for {pdf_path}: {e}")
        else:
            if ocr_text:
                return ocr_text

    return best_short

class DocumentRAG(BaseRAGExample):
"""RAG example for document processing (PDF, TXT, MD, etc.)."""
Expand Down Expand Up @@ -51,6 +131,26 @@ def _add_specific_arguments(self, parser):
help="Enable AST-aware chunking for code files in the data directory",
)

# OCR parameters
ocr_group = parser.add_argument_group("OCR Parameters (for scanned PDFs)")
ocr_group.add_argument(
"--use-ocr",
action="store_true",
help="Force OCR processing for all PDFs (even if they contain text)",
)
ocr_group.add_argument(
"--auto-detect-scanned",
action="store_true",
default=True,
help="Automatically detect and OCR scanned PDFs (default: True)",
)
ocr_group.add_argument(
"--no-auto-detect-scanned",
dest="auto_detect_scanned",
action="store_false",
help="Disable automatic detection of scanned PDFs",
)

async def load_data(self, args) -> list[str]:
"""Load documents and convert to text chunks."""
print(f"Loading documents from: {args.data_dir}")
Expand All @@ -63,14 +163,41 @@ async def load_data(self, args) -> list[str]:
data_path = Path(args.data_dir)
if not data_path.exists():
raise ValueError(f"Data directory not found: {args.data_dir}")

# Load documents

use_ocr_for_all = args.use_ocr
auto_detect_scanned = args.auto_detect_scanned and OCR_AVAILABLE

# Create custom PDF extractor with OCR fallback
class _PdfOcrExtractor:
    """Custom extractor for PDFs with OCR support.

    Instances are callable (``extractor(path) -> str``) and also provide
    ``load_data``, because SimpleDirectoryReader invokes ``load_data`` on
    whatever object is registered in ``file_extractor``.
    """

    def __call__(self, file_path: str) -> str:
        """Return the text of *file_path*, using OCR when requested or needed."""
        pdf_name = Path(file_path).name

        if not use_ocr_for_all:
            # Standard extraction first; reuse its result instead of
            # extracting the same file a second time.
            text = extract_pdf_with_ocr_fallback(file_path, use_ocr=False)
            if not auto_detect_scanned or len(text.strip()) >= 100:
                return text
            # If we got very little text, it's likely scanned
            print(f"Detected scanned PDF: {pdf_name}")

        # Extract with OCR
        text = extract_pdf_with_ocr_fallback(file_path, use_ocr=True)
        if text:
            print(f"✓ OCR: {pdf_name}")
        return text

    def load_data(self, file, extra_info=None, **kwargs):
        """Reader entry point: wrap the extracted text in a Document list.

        Extra keyword arguments passed by the directory reader are accepted
        and ignored. Files with no extractable text produce no documents.
        """
        from llama_index.core import Document

        text = self(str(file))
        if not text:
            return []
        return [Document(text=text, metadata=extra_info or {})]


pdf_extractor = _PdfOcrExtractor()

# Load documents with custom PDF extractor
reader_kwargs = {
"recursive": True,
"encoding": "utf-8",
}
if args.file_types:
reader_kwargs["required_exts"] = args.file_types

# Add custom PDF extractor if we need OCR
if use_ocr_for_all or auto_detect_scanned:
reader_kwargs["file_extractor"] = {".pdf": pdf_extractor}

documents = SimpleDirectoryReader(args.data_dir, **reader_kwargs).load_data(
show_progress=True
Expand Down Expand Up @@ -125,6 +252,13 @@ async def load_data(self, args) -> list[str]:
print("- Use --enable-code-chunking to enable AST-aware chunking for code files")
print("- Supports Python, Java, C#, TypeScript files")
print("- Better semantic understanding of code structure")
if OCR_AVAILABLE:
print("\n📄 OCR Support: Scanned PDF processing available!")
print("- Use --use-ocr to force OCR for all PDFs")
print("- Use --auto-detect-scanned (default) to automatically detect scanned PDFs")
else:
print("\n📄 OCR Support: Install mineru for scanned PDF processing:")
print(" pip install mineru or uv pip install -e .[ocr]")
print("\nOr run without --query for interactive mode\n")

rag = DocumentRAG()
Expand Down
49 changes: 49 additions & 0 deletions benchmarks/ocr_benchmark/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
# OCR Benchmark Evaluation with olmOCR-Bench

This benchmark evaluates OCR accuracy using the [olmOCR-Bench dataset](https://huggingface.co/datasets/allenai/olmOCR-bench) from AllenAI.

## Dataset Information

- **Dataset**: [allenai/olmOCR-bench](https://huggingface.co/datasets/allenai/olmOCR-bench)
- **Size**: 1,403 PDF files with 7,010 test cases
- **Splits**: arxiv_math, headers_footers, long_tiny_text, multi_column, old_scans, old_scans_math, table_tests
- **Purpose**: Evaluates OCR systems' ability to accurately convert PDFs to markdown while preserving textual and structural information

## Setup

1. Install dependencies:
```bash
pip install datasets huggingface_hub
```

2. Download the dataset (the setup script does this automatically):
```bash
python benchmarks/ocr_benchmark/setup_ocr_bench.py
```

## Evaluation

Run the evaluation:
```bash
# Evaluate on all splits
python benchmarks/ocr_benchmark/evaluate_ocr_bench.py

# Evaluate on specific split
python benchmarks/ocr_benchmark/evaluate_ocr_bench.py --split arxiv_math

# Limit number of samples
python benchmarks/ocr_benchmark/evaluate_ocr_bench.py --max-samples 50
```

## Metrics

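- **Character Error Rate (CER)**: Character-level edit distance between the extracted text and the reference, as a percentage of the reference length (lower is better)
- **Word Error Rate (WER)**: Word-level edit distance between the extracted text and the reference, as a percentage of the reference word count (lower is better)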
- **Extraction Success Rate**: Percentage of PDFs successfully processed
- **Processing Time**: Time taken for standard vs OCR extraction
- **Test Case Pass Rate**: Percentage of test cases passed (if ground truth available)

## Reference

Based on the olmOCR-Bench paper and dataset from AllenAI.

2 changes: 2 additions & 0 deletions benchmarks/ocr_benchmark/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
"""OCR benchmark evaluation module using olmOCR-Bench dataset."""

Loading
Loading