StarTrail-org · ljy03 · Dec 22, 2025
diff --git a/apps/document_rag.py b/apps/document_rag.py
@@ -12,7 +12,87 @@
 from base_rag_example import BaseRAGExample
 from chunking import create_text_chunks
 from llama_index.core import SimpleDirectoryReader
+# Probe for MinerU once at import time; OCR is offered only when it imports.
+try:
+    import mineru  # noqa: F401  (imported solely to test availability)
+except ImportError:
+    OCR_AVAILABLE = False
+else:
+    OCR_AVAILABLE = True
 
+def _extract_with_pymupdf(pdf_path: str) -> str:
+    """Return the concatenated page text of ``pdf_path`` using PyMuPDF."""
+    import fitz  # PyMuPDF
+
+    doc = fitz.open(pdf_path)
+    try:
+        return "".join(page.get_text() for page in doc)
+    finally:
+        # Close the document even when a page fails to render its text.
+        doc.close()
+
+
+def _extract_with_pdfplumber(pdf_path: str) -> str:
+    """Return the concatenated page text of ``pdf_path`` using pdfplumber."""
+    import pdfplumber
+
+    with pdfplumber.open(pdf_path) as pdf:
+        # extract_text() may yield None for a page; treat that as empty.
+        return "".join(page.extract_text() or "" for page in pdf.pages)
+
+
+def _run_mineru(pdf_path: str):
+    """Invoke the first MinerU entry point that exists; return None if none does."""
+    # NOTE(review): the entry points probed here (MinerUProcessor.process,
+    # mineru.process, mineru.extract_text) are guesses carried over from the
+    # original code -- confirm against the installed MinerU release.
+    try:
+        from mineru import MinerUProcessor
+
+        processor = MinerUProcessor()
+        if hasattr(processor, "process"):
+            return processor.process(pdf_path)
+    except (ImportError, AttributeError, TypeError):
+        pass
+
+    import mineru
+
+    for name in ("process", "extract_text"):
+        func = getattr(mineru, name, None)
+        if callable(func):
+            return func(pdf_path)
+    return None
+
+
+def _ocr_result_to_text(result) -> str:
+    """Normalize an OCR result (str, dict, or object) to a plain string."""
+    if not result:
+        return ""
+    if isinstance(result, str):
+        return result
+    if isinstance(result, dict):
+        values = [result.get(key) for key in ("text", "markdown", "content")]
+    elif hasattr(result, "text") or hasattr(result, "markdown"):
+        values = [getattr(result, attr, None) for attr in ("text", "markdown")]
+    else:
+        return str(result)
+    # First non-empty field wins; str() guards against non-string payloads.
+    return next((str(value) for value in values if value), "")
+
+
+def extract_pdf_with_ocr_fallback(
+    pdf_path: str, use_ocr: bool = False, min_chars: int = 100
+) -> str:
+    """
+    Extract text from a PDF, falling back to OCR when little text is found.
+
+    Extraction is attempted with PyMuPDF, then pdfplumber. The first result
+    whose stripped length exceeds ``min_chars`` is returned immediately.
+    Otherwise OCR is attempted (when requested and MinerU is importable).
+    If nothing better is available, the longest short text found by the
+    standard extractors is returned instead of being discarded.
+
+    Args:
+        pdf_path: Path to PDF file
+        use_ocr: Whether to try OCR if standard extraction yields too little text
+        min_chars: Stripped-length threshold above which standard extraction
+            is considered successful
+
+    Returns:
+        Extracted text string; empty only if every method produced nothing
+    """
+    best = ""
+
+    for extractor in (_extract_with_pymupdf, _extract_with_pdfplumber):
+        try:
+            text = extractor(pdf_path)
+        except Exception:
+            # Best effort: the library may be missing or the file unreadable;
+            # fall through to the next extraction method.
+            continue
+
+        stripped_len = len(text.strip())
+        if stripped_len > min_chars:
+            return text
+        if stripped_len > len(best.strip()):
+            best = text
+
+    if use_ocr and OCR_AVAILABLE:
+        try:
+            ocr_text = _ocr_result_to_text(_run_mineru(pdf_path))
+        except Exception as e:
+            print(f"  OCR failed for {pdf_path}: {e}")
+        else:
+            if ocr_text:
+                return ocr_text
+
+    # Short-but-real text beats nothing; "" only when every method came up empty.
+    return best
 
 class DocumentRAG(BaseRAGExample):
     """RAG example for document processing (PDF, TXT, MD, etc.)."""
@@ -51,6 +131,26 @@ def _add_specific_arguments(self, parser):
             help="Enable AST-aware chunking for code files in the data directory",
         )
 
+        # OCR parameters
+        ocr_group = parser.add_argument_group("OCR Parameters (for scanned PDFs)")
+        ocr_group.add_argument(
+            "--use-ocr",
+            action="store_true",
+            help="Force OCR processing for all PDFs (even if they contain text)",
+        )
+        ocr_group.add_argument(
+            "--auto-detect-scanned",
+            action="store_true",
+            default=True,
+            help="Automatically detect and OCR scanned PDFs (default: True)",
+        )
+        ocr_group.add_argument(
+            "--no-auto-detect-scanned",
+            dest="auto_detect_scanned",
+            action="store_false",
+            help="Disable automatic detection of scanned PDFs",
+        )
+
     async def load_data(self, args) -> list[str]:
         """Load documents and convert to text chunks."""
         print(f"Loading documents from: {args.data_dir}")
@@ -63,14 +163,41 @@ async def load_data(self, args) -> list[str]:
         data_path = Path(args.data_dir)
         if not data_path.exists():
             raise ValueError(f"Data directory not found: {args.data_dir}")
-
-        # Load documents
+
+        use_ocr_for_all = args.use_ocr
+        auto_detect_scanned = args.auto_detect_scanned and OCR_AVAILABLE
+
+        # Create custom PDF reader with OCR fallback. SimpleDirectoryReader
+        # calls ``.load_data(path, ...)`` on each ``file_extractor`` entry, so
+        # the extractor must be a reader object rather than a bare function.
+        from llama_index.core import Document
+        from llama_index.core.readers.base import BaseReader
+
+        class _OcrPdfReader(BaseReader):
+            """PDF reader that runs OCR when forced or when a PDF looks scanned."""
+
+            def __call__(self, file_path: str) -> str:
+                """Return the text of ``file_path``, using OCR when warranted."""
+                name = Path(file_path).name
+
+                if use_ocr_for_all:
+                    text = extract_pdf_with_ocr_fallback(file_path, use_ocr=True)
+                    if text:
+                        print(f"✓ OCR: {name}")
+                    return text
+
+                # Standard extraction first; reuse its result instead of
+                # extracting the same file a second time.
+                text = extract_pdf_with_ocr_fallback(file_path, use_ocr=False)
+                if not auto_detect_scanned or len(text.strip()) >= 100:
+                    return text
+
+                # If we got very little text, it's likely scanned
+                print(f"Detected scanned PDF: {name}")
+                ocr_text = extract_pdf_with_ocr_fallback(file_path, use_ocr=True)
+                if ocr_text:
+                    print(f"✓ OCR: {name}")
+                return ocr_text or text
+
+            def load_data(self, file, extra_info=None, **load_kwargs):
+                """Return the PDF as a one-element Document list, or [] if no text."""
+                text = self(str(file))
+                if not text:
+                    print(f"No text extracted from {Path(file).name}; skipping")
+                    return []
+                return [Document(text=text, metadata=extra_info or {})]
+
+        pdf_extractor = _OcrPdfReader()
+
+        # Load documents with custom PDF extractor
         reader_kwargs = {
             "recursive": True,
             "encoding": "utf-8",
         }
         if args.file_types:
             reader_kwargs["required_exts"] = args.file_types
+
+        # Add custom PDF extractor if we need OCR
+        if use_ocr_for_all or auto_detect_scanned:
+            reader_kwargs["file_extractor"] = {".pdf": pdf_extractor}
 
         documents = SimpleDirectoryReader(args.data_dir, **reader_kwargs).load_data(
             show_progress=True
@@ -125,6 +252,13 @@ async def load_data(self, args) -> list[str]:
     print("- Use --enable-code-chunking to enable AST-aware chunking for code files")
     print("- Supports Python, Java, C#, TypeScript files")
     print("- Better semantic understanding of code structure")
+    if OCR_AVAILABLE:
+        print("\n📄 OCR Support: Scanned PDF processing available!")
+        print("- Use --use-ocr to force OCR for all PDFs")
+        print("- Use --auto-detect-scanned (default) to automatically detect scanned PDFs")
+    else:
+        print("\n📄 OCR Support: Install mineru for scanned PDF processing:")
+        print("  pip install mineru  or  uv pip install -e .[ocr]")
     print("\nOr run without --query for interactive mode\n")
 
     rag = DocumentRAG()

diff --git a/benchmarks/ocr_benchmark/README.md b/benchmarks/ocr_benchmark/README.md
@@ -0,0 +1,49 @@
+# OCR Benchmark Evaluation with olmOCR-Bench
+
+This benchmark evaluates OCR accuracy using the [olmOCR-Bench dataset](https://huggingface.co/datasets/allenai/olmOCR-bench) from AllenAI.
+
+## Dataset Information
+
+- **Dataset**: [allenai/olmOCR-bench](https://huggingface.co/datasets/allenai/olmOCR-bench)
+- **Size**: 1,403 PDF files with 7,010 test cases
+- **Splits**: arxiv_math, headers_footers, long_tiny_text, multi_column, old_scans, old_scans_math, table_tests
+- **Purpose**: Evaluates OCR systems' ability to accurately convert PDFs to markdown while preserving textual and structural information
+
+## Setup
+
+1. Install dependencies:
+```bash
+pip install datasets huggingface_hub
+```
+
+2. Download the dataset by running the setup script (it fetches the data automatically):
+```bash
+python benchmarks/ocr_benchmark/setup_ocr_bench.py
+```
+
+## Evaluation
+
+Run the evaluation:
+```bash
+# Evaluate on all splits
+python benchmarks/ocr_benchmark/evaluate_ocr_bench.py
+
+# Evaluate on specific split
+python benchmarks/ocr_benchmark/evaluate_ocr_bench.py --split arxiv_math
+
+# Limit number of samples
+python benchmarks/ocr_benchmark/evaluate_ocr_bench.py --max-samples 50
+```
+
+## Metrics
+
+- **Character Error Rate (CER)**: Percentage of character-level errors
+- **Word Error Rate (WER)**: Percentage of word-level errors
+- **Extraction Success Rate**: Percentage of PDFs successfully processed
+- **Processing Time**: Time taken for standard vs OCR extraction
+- **Test Case Pass Rate**: Percentage of test cases passed (reported only when ground truth is available)
+
+## Reference
+
+Based on the olmOCR-Bench paper and dataset from AllenAI.
+
diff --git a/benchmarks/ocr_benchmark/__init__.py b/benchmarks/ocr_benchmark/__init__.py
@@ -0,0 +1,2 @@
+"""OCR benchmark evaluation module using olmOCR-Bench dataset."""
+
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		"""OCR benchmark evaluation module using olmOCR-Bench dataset."""