diff --git a/.github/workflows/license-check.yml b/.github/workflows/license-check.yml new file mode 100644 index 0000000..63f9217 --- /dev/null +++ b/.github/workflows/license-check.yml @@ -0,0 +1,50 @@ +name: License Safety Check + +on: [push, pull_request] + +jobs: + license-check: + name: Ensure no GPL/AGPL imports in core + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Check core files for GPL/AGPL imports + run: | + echo "=== License Safety Check ===" + echo "Verifying no GPL/AGPL package is imported in core code..." + echo "" + + FAIL=0 + + # List of GPL/AGPL package import patterns to block + BLOCKED_PATTERNS="import[[:space:]]+pymupdf|from[[:space:]]+pymupdf|import[[:space:]]+marker\.|from[[:space:]]+marker\.|import[[:space:]]+surya|from[[:space:]]+surya" + + # Files that ARE allowed to import these (isolated backends) + ALLOWED_FILES=( + "pymupdf_extractor.py" + "marker_extractor.py" + ) + + # Build grep exclude args + EXCLUDE_ARGS="" + for f in "${ALLOWED_FILES[@]}"; do + EXCLUDE_ARGS="$EXCLUDE_ARGS --exclude=$f" + done + + # Search all Python files in src/longparser EXCEPT allowed files + MATCHES=$(grep -rnE "$BLOCKED_PATTERNS" src/longparser/ \ + --include='*.py' $EXCLUDE_ARGS || true) + + if [ -n "$MATCHES" ]; then + echo "❌ FAIL: GPL/AGPL imports found in core code!" + echo "" + echo "$MATCHES" + echo "" + echo "These packages must ONLY be imported in their isolated extractor files." + FAIL=1 + else + echo "✅ PASS: No GPL/AGPL imports in core code." + fi + + exit $FAIL diff --git a/FEATURE_ROADMAP.md b/FEATURE_ROADMAP.md new file mode 100644 index 0000000..2ea7b1a --- /dev/null +++ b/FEATURE_ROADMAP.md @@ -0,0 +1,150 @@ +# LongParser — Product & Feature Roadmap + +> This roadmap reflects the current development direction based on community trends, +> competitor analysis, and the RAG ecosystem in 2025–2026. Items are ordered by +> priority within each phase. All dates are targets, not guarantees. 
+ +--- + +## Current State — v0.1.x ✅ + +- 5-stage extraction pipeline (Extract → Validate → HITL → Chunk → Embed → Index) +- Multi-format support: PDF, DOCX, PPTX, XLSX, CSV via Docling +- `HybridChunker` — 6-strategy token-aware, hierarchy-aware, table-aware chunking +- Human-in-the-Loop (HITL) review via LangGraph `interrupt()` +- 3-layer memory chat engine (short-term + rolling summary + long-term facts) +- Multi-provider LLM: OpenAI, Gemini, Groq, OpenRouter +- Multi-backend vector stores: Chroma, FAISS, Qdrant +- FastAPI REST server + ARQ/Redis job queue + Motor/MongoDB +- LangChain `BaseRetriever` + LlamaIndex `BaseReader` adapters +- CPU / GPU install separation via extras + +--- + +## Phase 1 — Accuracy & Quality (v0.2.x) — Q2 2026 + +### Parser Enhancements + +- [ ] **Marker backend** — add `marker-pdf` as an optional extraction backend for higher-fidelity Markdown output on complex academic PDFs +- [ ] **PyMuPDF4LLM backend** — lightweight, fast alternative for speed-critical pipelines (10× faster than Docling for simple PDFs) +- [ ] **Scanned PDF fast path** — route documents to Tesseract vs pix2tex vs Surya automatically based on page complexity score +- [ ] **Multi-column layout detection** — prevent reading-order errors in newspaper/journal-style layouts +- [ ] **Image extraction** — export embedded figures with captions into separate chunks with `type: figure` +- [ ] **Document language auto-detection** — select OCR model automatically based on detected script + +### Chunking Improvements + +- [ ] **Semantic chunking** — optional embedding-based boundary detection (split at semantic shifts, not just token counts) +- [ ] **Sliding window overlap** — configurable overlap strategy per chunk type (more overlap for tables, less for headings) +- [ ] **Cross-reference resolution** — link `(see Figure 3)` and `(Table 2)` references to their target blocks +- [ ] **Summary chunks** — auto-generate a 1–2 sentence summary chunk per section for hierarchical retrieval + +### Quality & Validation + +- [ ] **Chunk quality scorer** — assign a confidence score per chunk based on OCR confidence, completeness, and structural integrity +- [ ] **PII detection** — flag and optionally redact personal information (names, emails, phone numbers) before embedding +- [ ] **Duplicate block detection** — suppress repeated headers/footers that appear on every page + +--- + +## Phase 2 — Agentic & Multimodal (v0.3.x) — Q3 2026 + +### Agentic RAG + +- [ ] **Agentic retrieval loop** — implement query rewriting + iterative retrieval + self-reflection before answer generation +- [ ] **Multi-hop question answering** — chain retrieval steps for questions that span multiple sections or documents +- [ ] **Tool-calling integration** — expose document pipeline as a LangChain/LangGraph tool callable by autonomous agents +- [ ] **Hypothetical Document Embeddings (HyDE)** — generate hypothetical answers to queries for improved retrieval recall + +### Multimodal + +- [ ] **Vision-Language Model (VLM) integration** — use GPT-4o / Gemini Vision to describe figures, charts, and diagrams as text chunks +- [ ] **Chart data extraction** — parse bar/line/pie charts into structured data tables +- [ ] **Slide layout understanding** — treat PPTX slides as visual units with spatial layout context, not just text extraction + +### Reranking & Retrieval + +- [ ] **Cross-encoder reranker** — add optional `sentence-transformers` cross-encoder reranking step after initial retrieval +- [ ] **Hybrid search** — combine dense vector search 
with BM25 sparse retrieval (reciprocal rank fusion) +- [ ] **Maximum Marginal Relevance (MMR)** — reduce redundancy in retrieved chunks +- [ ] **Metadata filtering** — filter chunks by `page_number`, `section`, `doc_type`, `date` at query time + +--- + +## Phase 3 — Enterprise & Observability (v0.4.x) — Q4 2026 + +### Knowledge Graph + +- [ ] **Entity extraction** — extract named entities (people, organizations, dates, locations) from chunks +- [ ] **Relationship mapping** — build entity relationship graphs from document content +- [ ] **Graph-based retrieval** — traverse the entity graph for multi-hop retrieval (GraphRAG pattern) +- [ ] **Neo4j / NetworkX integration** — persist the knowledge graph to a graph database + +### Evaluation Framework + +- [ ] **Built-in RAG evaluator** — measure retrieval recall@k, answer faithfulness, and context adherence +- [ ] **Chunk attribution** — trace every answer sentence back to the source chunk and page +- [ ] **RAGAS integration** — plug into the RAGAS evaluation framework +- [ ] **Benchmark suite** — reproducible benchmarks against Unstructured, LlamaParse, Docling standalone + +### Observability & Compliance + +- [ ] **LangSmith integration** — trace every pipeline run end-to-end +- [ ] **OpenTelemetry support** — emit spans/traces to any OTel-compatible backend +- [ ] **Audit log** — immutable log of every HITL decision (approve/reject/edit) with timestamps and user IDs +- [ ] **GDPR compliance mode** — PII redaction + right-to-erasure support (delete all chunks for a document) +- [ ] **Role-based access control (RBAC)** — multi-tenant document access in the REST API + +--- + +## Phase 4 — Scale & Ecosystem (v0.5.x+) — 2027 + +### Performance & Scale + +- [ ] **Async parallel extraction** — process multiple documents concurrently in the background worker +- [ ] **Streaming extraction** — yield blocks as they are extracted (no need to wait for full document) +- [ ] **Incremental indexing** — update only changed pages/sections on re-upload +- [ ] **S3 / GCS / Azure Blob** — native cloud storage input (not just local files) +- [ ] **Kubernetes Helm chart** — one-command production deployment + +### New Integrations + +- [ ] **Weaviate** vector store adapter +- [ ] **Pinecone** vector store adapter +- [ ] **Milvus** vector store adapter +- [ ] **DSPy** integration — use DSPy to auto-optimize retrieval prompts +- [ ] **Haystack `DocumentConverter`** component +- [ ] **Flowise / Langflow** node — drag-and-drop visual pipeline builder support + +### Developer Experience + +- [ ] **LongParser CLI** — `longparser parse document.pdf --output chunks.json` +- [ ] **Web UI (HITL Dashboard)** — visual interface for reviewing and editing blocks before embedding +- [ ] **VS Code extension** — preview parsed chunks directly from the editor +- [ ] **Webhook support** — notify external systems when a job completes or requires HITL review + +--- + +## Competitive Positioning + +| Capability | LongParser | Unstructured | LlamaParse | Docling | +|---|---|---|---|---| +| Privacy-first (fully local) | ✅ | ⚠️ (cloud option) | ❌ (API-only) | ✅ | +| HITL review workflow | ✅ | ❌ | ❌ | ❌ | +| Bundled REST API server | ✅ | ✅ (paid) | ✅ (cloud) | ❌ | +| Table-aware chunking | ✅ | ⚠️ | ✅ | ✅ | +| LaTeX / equation OCR | ✅ | ❌ | ⚠️ | ⚠️ | +| LangChain + LlamaIndex | ✅ | ✅ | ✅ | ⚠️ | +| Open source (MIT) | ✅ | ⚠️ (core only) | ❌ | ✅ | +| Knowledge graph (planned) | 🔜 | ❌ | ❌ | ❌ | +| Agentic retrieval (planned) | 🔜 | ❌ | ⚠️ | ❌ | + +--- + +## Guiding Principles + +1. 
**Privacy by default** — all processing runs locally; no data leaves user infrastructure +2. **Human oversight** — HITL is a first-class citizen, not an afterthought +3. **Composable** — every stage is independently usable; no forced lock-in to the full stack +4. **Production-grade** — async, typed, tested, documented from day one +5. **Ecosystem-native** — LangChain, LlamaIndex, and HuggingFace are first-class integration targets diff --git a/LICENSE-THIRD-PARTY.md b/LICENSE-THIRD-PARTY.md new file mode 100644 index 0000000..257709f --- /dev/null +++ b/LICENSE-THIRD-PARTY.md @@ -0,0 +1,50 @@ +# Third-Party Licenses + +LongParser core is licensed under the **MIT License**. + +Some **optional** backends and integrations use different licenses. +These packages are **never loaded by default** — they are only imported +when you explicitly install them and select them in your configuration. + +## Optional Backend Licenses + +| Package | License | Install Command | When Loaded | +|---------|---------|-----------------|-------------| +| `pymupdf4llm` | AGPL-3.0 or Artifex Commercial | `pip install "longparser[pymupdf]"` | Only when you set `backend="pymupdf"` | +| `marker-pdf` | GPL-3.0-or-later | `pip install "longparser[marker]"` | Only when you set `backend="marker"` *(future)* | +| `surya-ocr` | GPL-3.0-or-later | `pip install "longparser[surya]"` | Only when explicitly imported *(future)* | + +## Core Dependency Licenses (always installed) + +| Package | License | Purpose | +|---------|---------|---------| +| `pydantic` | MIT | Schema validation | +| `docling` | MIT | Default PDF extraction engine | +| `docling-core` | MIT | Docling data models | +| `fast-langdetect` | Apache-2.0 | Document language detection | + +## What This Means for You + +- **If you only use `pip install longparser`** — everything is MIT or Apache-2.0. + You can use LongParser in any project (commercial, proprietary, open source). + +- **If you install `longparser[pymupdf]`** — the `pymupdf4llm` library is + AGPL-3.0 licensed. You must comply with AGPL terms for the PyMuPDF component, + OR purchase a commercial license from [Artifex](https://artifex.com). + LongParser core code remains MIT. + +- **If you install `longparser[marker]`** *(future)* — the `marker-pdf` library + is GPL-3.0 licensed. You must comply with GPL terms for the Marker component. + LongParser core code remains MIT. + +## License Isolation Guarantee + +LongParser uses **lazy imports** to ensure GPL/AGPL packages are never loaded +unless explicitly requested. The following guarantees hold: + +1. `import longparser` does NOT import any GPL/AGPL package +2. `from longparser import DocumentPipeline` does NOT import any GPL/AGPL package +3. `DocumentPipeline().process_file("doc.pdf")` does NOT import any GPL/AGPL + package (uses Docling, which is MIT) +4. GPL/AGPL code is only loaded when you explicitly set `backend="pymupdf"` or + `backend="marker"` in `ProcessingConfig` diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md index 5356c04..4ee1d42 100644 --- a/docs/getting-started/installation.md +++ b/docs/getting-started/installation.md @@ -104,5 +104,5 @@ The server starts on `http://localhost:8000`. 
```python import longparser -print(longparser.__version__) # 0.1.3 +print(longparser.__version__) # 0.1.4 ``` diff --git a/pyproject.toml b/pyproject.toml index afea16d..dbb7cbe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "longparser" -version = "0.1.3" +version = "0.1.4" description = "Privacy-first document intelligence engine — converts PDFs, DOCX, PPTX, XLSX, and CSV into AI-ready Markdown + structured JSON for RAG pipelines." readme = {file = "README.md", content-type = "text/markdown"} requires-python = ">=3.10" @@ -36,6 +36,7 @@ dependencies = [ "docling>=2.14", "docling-core>=2.13", "langgraph-checkpoint-mongodb>=0.3.1", + "fast-langdetect>=0.3,<1.0", # Apache-2.0 — document language detection ] [project.optional-dependencies] @@ -51,6 +52,20 @@ langchain = [ llamaindex = [ "llama-index-core>=0.10", ] +# ----------- v0.1.4: Optional extraction backends ----------- +# ⚠️ pymupdf4llm is AGPL-3.0 licensed. See LICENSE-THIRD-PARTY.md. +# Only loaded when user sets backend="pymupdf". +pymupdf = [ + "pymupdf4llm>=1.27", +] +# ⚠️ marker-pdf is GPL-3.0. GPU recommended. Future release. +# marker = [ +# "marker-pdf", +# ] +# ⚠️ surya-ocr is GPL-3.0. GPU recommended. Future release. +# surya = [ +# "surya-ocr>=0.17", +# ] # FastAPI REST server + MongoDB + job queue + LangChain chat engine server = [ "fastapi>=0.115", diff --git a/src/longparser/__init__.py b/src/longparser/__init__.py index 7d00c7e..b1b9794 100755 --- a/src/longparser/__init__.py +++ b/src/longparser/__init__.py @@ -25,7 +25,7 @@ from __future__ import annotations -__version__ = "0.1.3" +__version__ = "0.1.4" __author__ = "ENDEVSOLS Team" __license__ = "MIT" @@ -59,6 +59,10 @@ def __getattr__(name: str): if name == "DoclingExtractor": from .extractors import DoclingExtractor return DoclingExtractor + if name == "PyMuPDFExtractor": + # AGPL-isolated — only loaded when explicitly requested + from .extractors.pymupdf_extractor import PyMuPDFExtractor + return PyMuPDFExtractor if name == "PipelineOrchestrator": from .pipeline import PipelineOrchestrator return PipelineOrchestrator @@ -101,6 +105,7 @@ def __getattr__(name: str): "JobResult", # Lazily imported (require extras) "DoclingExtractor", + "PyMuPDFExtractor", "PipelineOrchestrator", "DocumentPipeline", "PipelineResult", diff --git a/src/longparser/extractors/pymupdf_extractor.py b/src/longparser/extractors/pymupdf_extractor.py new file mode 100644 index 0000000..aecf375 --- /dev/null +++ b/src/longparser/extractors/pymupdf_extractor.py @@ -0,0 +1,493 @@ +"""PyMuPDF4LLM-based extractor for fast, CPU-native PDF extraction. + +⚠️ LICENSE NOTICE — AGPL-3.0 + pymupdf4llm is dual-licensed under AGPL-3.0 or Artifex Commercial License. + By using this backend, you agree to the terms of the AGPL-3.0 license + unless you have purchased a commercial license from Artifex Software, Inc. + + This module is NOT imported by default — users must explicitly opt in + via ``pip install longparser[pymupdf]`` and ``backend='pymupdf'``. + +⚠️ ISOLATION RULES (do NOT violate) + 1. This file must NEVER be imported by ``extractors/__init__.py`` + 2. This file must NEVER be imported at module level by ``orchestrator.py`` + 3. This file must ONLY be imported behind ``if backend == "pymupdf":`` + 4. 
``import longparser`` must NEVER trigger loading this file + +Best for: + - Native PDFs with embedded text (not scanned) + - Speed-critical pipelines (10-50× faster than Docling) + - CPU-only environments (no GPU, no ML models) + +NOT suitable for: + - Scanned PDFs (no OCR capability) + - Complex tables with merged cells + - Documents needing deep heading hierarchy detection + +Usage:: + + from longparser import ProcessingConfig, DocumentPipeline + + pipeline = DocumentPipeline( + config=ProcessingConfig(backend="pymupdf") + ) + result = pipeline.process_file("report.pdf") +""" + +from __future__ import annotations + +import hashlib +import logging +import uuid +from pathlib import Path +from typing import Optional, List, Tuple + +from ..schemas import ( + Document, Page, Block, Table, TableCell, + BlockType, ExtractorType, ProcessingConfig, + BoundingBox, Provenance, Confidence, BlockFlags, + DocumentMetadata, PageProfile, ExtractionMetadata, +) +from .base import BaseExtractor + +logger = logging.getLogger(__name__) + + +def _require_pymupdf(): + """Check that pymupdf4llm is installed; raise clear error if not. + + Returns the ``pymupdf4llm`` module on success. + """ + try: + import pymupdf4llm + return pymupdf4llm + except ImportError: + raise ImportError( + "\n" + "╔══════════════════════════════════════════════════════════╗\n" + "║ pymupdf4llm is not installed. ║\n" + "║ ║\n" + "║ Install: pip install 'longparser[pymupdf]' ║\n" + "║ ║\n" + "║ ⚠️ pymupdf4llm is licensed under AGPL-3.0. ║\n" + "║ By installing it, you agree to AGPL terms for that ║\n" + "║ component. LongParser core remains MIT-licensed. ║\n" + "║ ║\n" + "║ For commercial use without AGPL obligations, purchase ║\n" + "║ a license from https://artifex.com ║\n" + "╚══════════════════════════════════════════════════════════╝\n" + ) + + +def _require_pymupdf_fitz(): + """Import the fitz (PyMuPDF) module for page-level operations.""" + try: + import pymupdf as fitz + return fitz + except ImportError: + try: + import fitz + return fitz + except ImportError: + raise ImportError( + "PyMuPDF (fitz) is required for the pymupdf backend. " + "Install with: pip install 'longparser[pymupdf]'" + ) + + +class PyMuPDFExtractor(BaseExtractor): + """Fast, CPU-native PDF extractor using PyMuPDF4LLM. + + Converts PDFs to structured Markdown and maps the output to + LongParser's ``Document`` / ``Block`` model. Uses no ML models, + no GPU — pure C-based PDF parsing via MuPDF. + + Attributes + ---------- + extractor_type : ExtractorType + Always ``ExtractorType.NATIVE_PDF``. + version : str + Extractor version string. + """ + + extractor_type = ExtractorType.NATIVE_PDF + version = "1.0.0" + + def __init__(self): + """Initialize and verify pymupdf4llm is available.""" + _require_pymupdf() + self._images: list = [] + logger.info( + "PyMuPDF4LLM backend initialized (CPU-native, no OCR, no GPU)" + ) + + def extract( + self, + file_path: Path, + config: ProcessingConfig, + page_numbers: Optional[List[int]] = None, + ) -> Tuple[Document, ExtractionMetadata]: + """Extract a PDF using PyMuPDF4LLM. + + Parameters + ---------- + file_path: + Path to the PDF file. + config: + Processing configuration. + page_numbers: + Optional list of 0-indexed page numbers to extract. + + Returns + ------- + tuple[Document, ExtractionMetadata] + Extracted document and metadata. 
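+
+        Examples
+        --------
+        A minimal sketch (assumes the ``[pymupdf]`` extra is installed; the
+        file name is illustrative)::
+
+            from pathlib import Path
+            from longparser.schemas import ProcessingConfig
+
+            extractor = PyMuPDFExtractor()
+            document, meta = extractor.extract(Path("report.pdf"), ProcessingConfig())
+            print(document.metadata.total_pages, meta.strategy_used)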
+ """ + import pymupdf4llm + + file_path = Path(file_path) + logger.info("Extracting with PyMuPDF4LLM: %s", file_path.name) + + # Validate file type + if file_path.suffix.lower() != ".pdf": + raise ValueError( + f"PyMuPDF4LLM backend only supports PDF files, got: {file_path.suffix}" + ) + + # File hash + file_hash = hashlib.sha256(file_path.read_bytes()).hexdigest()[:16] + + # Extract with pymupdf4llm + kwargs = {"show_progress": False} + if page_numbers is not None: + kwargs["pages"] = page_numbers + + md_text = pymupdf4llm.to_markdown(str(file_path), **kwargs) + + # Get page-level info using PyMuPDF directly + fitz = _require_pymupdf_fitz() + pdf_doc = fitz.open(str(file_path)) + total_pages = len(pdf_doc) + + # Extract images if config.export_images + self._images = [] + if config.export_images: + self._extract_images(pdf_doc, config) + + # Build Document from Markdown + document = self._markdown_to_document( + md_text=md_text, + pdf_doc=pdf_doc, + file_path=file_path, + file_hash=file_hash, + total_pages=total_pages, + config=config, + ) + + pdf_doc.close() + + meta = ExtractionMetadata( + strategy_used="pymupdf4llm", + ocr_backend_used="none (native text)", + ) + + logger.info( + "PyMuPDF4LLM extraction complete: %d pages, %d blocks", + total_pages, len(document.all_blocks), + ) + + return document, meta + + def _markdown_to_document( + self, + md_text: str, + pdf_doc, + file_path: Path, + file_hash: str, + total_pages: int, + config: ProcessingConfig, + ) -> Document: + """Convert Markdown text to a LongParser Document model.""" + metadata = DocumentMetadata( + source_file=str(file_path), + file_hash=file_hash, + total_pages=total_pages, + ) + + pages: list[Page] = [] + + # Split markdown by page breaks (pymupdf4llm uses "---" or form feeds) + page_chunks = self._split_by_pages(md_text, total_pages) + + for page_idx, page_md in enumerate(page_chunks): + page_no = page_idx + 1 + + # Get page dimensions from PyMuPDF + if page_idx < len(pdf_doc): + rect = pdf_doc[page_idx].rect + width, height = rect.width, rect.height + else: + width, height = 612.0, 792.0 # Letter default + + # Parse markdown blocks + blocks = self._parse_markdown_blocks(page_md, page_no, file_path) + + # Build page profile + profile = PageProfile( + page_number=page_no, + layout_confidence=0.9, # PyMuPDF is reliable for native PDFs + ) + + pages.append(Page( + page_number=page_no, + width=width, + height=height, + blocks=blocks, + profile=profile, + )) + + return Document(metadata=metadata, pages=pages) + + def _split_by_pages(self, md_text: str, total_pages: int) -> list[str]: + """Split markdown text into per-page chunks.""" + import re + + # pymupdf4llm inserts page separators + # Common patterns: "-----" (5+ dashes), or form feed characters + parts = re.split(r'\n-{3,}\n|\f', md_text) + + # If splitting didn't work, put everything on page 1 + if len(parts) <= 1: + return [md_text] + + # Pad to total_pages if needed + while len(parts) < total_pages: + parts.append("") + + return parts[:total_pages] + + def _parse_markdown_blocks( + self, + page_md: str, + page_no: int, + file_path: Path, + ) -> list[Block]: + """Parse markdown text into Block objects.""" + blocks: list[Block] = [] + lines = page_md.strip().split("\n") + order_idx = 0 + + i = 0 + while i < len(lines): + line = lines[i] + stripped = line.strip() + + if not stripped: + i += 1 + continue + + # Detect block type + if stripped.startswith("#"): + # Heading + level = len(stripped) - len(stripped.lstrip("#")) + text = stripped.lstrip("#").strip() + block = 
self._make_block( + BlockType.HEADING, text, order_idx, page_no, + file_path, heading_level=min(level, 6), + ) + blocks.append(block) + + elif stripped.startswith("|") and "|" in stripped[1:]: + # Table — collect all table lines + table_lines = [stripped] + i += 1 + while i < len(lines) and lines[i].strip().startswith("|"): + table_lines.append(lines[i].strip()) + i += 1 + table_md = "\n".join(table_lines) + table_obj = self._parse_table(table_lines) + block = self._make_block( + BlockType.TABLE, table_md, order_idx, page_no, + file_path, table=table_obj, + ) + blocks.append(block) + order_idx += 1 + continue # Already incremented i + + elif stripped.startswith(("- ", "* ", "+ ")) or ( + len(stripped) > 2 and stripped[0].isdigit() and stripped[1] in ".)" + ): + # List item + text = stripped.lstrip("-*+ ").lstrip("0123456789.)").strip() + block = self._make_block( + BlockType.LIST_ITEM, text, order_idx, page_no, file_path, + ) + blocks.append(block) + + elif stripped.startswith("```"): + # Code block + code_lines = [] + i += 1 + while i < len(lines) and not lines[i].strip().startswith("```"): + code_lines.append(lines[i]) + i += 1 + code_text = "\n".join(code_lines) + block = self._make_block( + BlockType.CODE, code_text, order_idx, page_no, file_path, + ) + blocks.append(block) + i += 1 # Skip closing ``` + order_idx += 1 + continue + + elif stripped.startswith("$$") or stripped.startswith("\\["): + # Equation block + eq_lines = [stripped] + if not (stripped.endswith("$$") and len(stripped) > 2): + i += 1 + while i < len(lines): + eq_line = lines[i].strip() + eq_lines.append(eq_line) + if eq_line.endswith("$$") or eq_line.endswith("\\]"): + break + i += 1 + eq_text = "\n".join(eq_lines) + block = self._make_block( + BlockType.EQUATION, eq_text, order_idx, page_no, file_path, + ) + blocks.append(block) + + else: + # Regular paragraph + block = self._make_block( + BlockType.PARAGRAPH, stripped, order_idx, page_no, file_path, + ) + blocks.append(block) + + order_idx += 1 + i += 1 + + return blocks + + def _make_block( + self, + block_type: BlockType, + text: str, + order_index: int, + page_no: int, + file_path: Path, + heading_level: Optional[int] = None, + table: Optional[Table] = None, + ) -> Block: + """Create a Block with standard provenance.""" + return Block( + type=block_type, + text=text, + order_index=order_index, + heading_level=heading_level, + provenance=Provenance( + source_file=str(file_path), + page_number=page_no, + bbox=BoundingBox(x0=0, y0=0, x1=0, y1=0), + extractor=self.extractor_type, + extractor_version=self.version, + ), + confidence=Confidence(overall=0.9), + table=table, + ) + + def _parse_table(self, table_lines: list[str]) -> Table: + """Parse a Markdown table into a Table object.""" + # Filter out separator lines (|---|---|) + data_lines = [ + line for line in table_lines + if line.strip() and not all(c in "|-: " for c in line.strip()) + ] + + if not data_lines: + return Table(n_rows=0, n_cols=0) + + cells: list[TableCell] = [] + n_cols = 0 + + for row_idx, line in enumerate(data_lines): + parts = [p.strip() for p in line.strip("|").split("|")] + n_cols = max(n_cols, len(parts)) + for col_idx, cell_text in enumerate(parts): + cells.append(TableCell( + r0=row_idx, c0=col_idx, text=cell_text + )) + + return Table( + n_rows=len(data_lines), + n_cols=n_cols, + cells=cells, + table_confidence=0.85, + ) + + def _extract_images(self, pdf_doc, config: ProcessingConfig): + """Extract images from PDF pages.""" + for page_idx in range(len(pdf_doc)): + page = pdf_doc[page_idx] 
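+            # get_images() lists the image XObjects used by this page as tuples
+            # whose first element is the image's xref; extract_image(xref) then
+            # returns the raw image bytes plus the original file extension.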
+ image_list = page.get_images(full=True) + for img_idx, img in enumerate(image_list): + try: + xref = img[0] + base_image = pdf_doc.extract_image(xref) + if base_image: + self._images.append({ + "page": page_idx + 1, + "index": img_idx, + "data": base_image["image"], + "ext": base_image.get("ext", "png"), + }) + except Exception as e: + logger.debug("Failed to extract image on page %d: %s", page_idx + 1, e) + + def save_images(self, output_dir: Path) -> list[Path]: + """Save extracted images to disk. + + Parameters + ---------- + output_dir: + Directory to save images to. + + Returns + ------- + list[Path] + Paths to saved image files. + """ + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + saved = [] + + for img_info in self._images: + fname = f"page_{img_info['page']:03d}_img_{img_info['index']:02d}.{img_info['ext']}" + fpath = output_dir / fname + with open(fpath, "wb") as f: + f.write(img_info["data"]) + saved.append(fpath) + + logger.info("Saved %d images to %s", len(saved), output_dir) + return saved + + def to_markdown(self, document: Document) -> str: + """Convert Document back to Markdown.""" + parts = [] + for page in document.pages: + for block in page.blocks: + if block.type == BlockType.HEADING: + level = block.heading_level or 1 + parts.append(f"{'#' * level} {block.text}") + elif block.type == BlockType.TABLE: + parts.append(block.text) + elif block.type == BlockType.LIST_ITEM: + parts.append(f"- {block.text}") + elif block.type == BlockType.CODE: + parts.append(f"```\n{block.text}\n```") + elif block.type == BlockType.EQUATION: + parts.append(f"$$\n{block.text}\n$$") + else: + parts.append(block.text) + parts.append("") + return "\n".join(parts) diff --git a/src/longparser/integrations/__init__.py b/src/longparser/integrations/__init__.py index b8eae82..44055de 100755 --- a/src/longparser/integrations/__init__.py +++ b/src/longparser/integrations/__init__.py @@ -2,9 +2,9 @@ Install the extras to use these adapters:: - pip install clean_rag[langchain] - pip install clean_rag[llamaindex] - pip install clean_rag[all] + pip install longparser[langchain] + pip install longparser[llamaindex] + pip install longparser[all] """ from __future__ import annotations diff --git a/src/longparser/integrations/langchain.py b/src/longparser/integrations/langchain.py index 59bdba0..7848c31 100755 --- a/src/longparser/integrations/langchain.py +++ b/src/longparser/integrations/langchain.py @@ -5,7 +5,7 @@ Install the extra to use this adapter:: - pip install clean_rag[langchain] + pip install longparser[langchain] Usage:: @@ -27,7 +27,7 @@ _INSTALL_MSG = ( "langchain-core is required for the LangChain adapter. " - "Install it with: pip install clean_rag[langchain]" + "Install it with: pip install longparser[langchain]" ) @@ -95,6 +95,7 @@ def lazy_load(self) -> Iterator["LCDocument"]: from ..pipeline import PipelineOrchestrator pipeline = PipelineOrchestrator( + config=self.config, tesseract_lang=self.tesseract_lang, tessdata_path=self.tessdata_path, ) diff --git a/src/longparser/integrations/llamaindex.py b/src/longparser/integrations/llamaindex.py index a8d4344..d5437b9 100755 --- a/src/longparser/integrations/llamaindex.py +++ b/src/longparser/integrations/llamaindex.py @@ -5,7 +5,7 @@ Install the extra to use this adapter:: - pip install clean_rag[llamaindex] + pip install longparser[llamaindex] Usage:: @@ -27,7 +27,7 @@ _INSTALL_MSG = ( "llama-index-core is required for the LlamaIndex adapter. 
" - "Install it with: pip install clean_rag[llamaindex]" + "Install it with: pip install longparser[llamaindex]" ) @@ -105,6 +105,7 @@ def load_data( file = Path(file) pipeline = PipelineOrchestrator( + config=self.config, tesseract_lang=self.tesseract_lang, tessdata_path=self.tessdata_path, ) diff --git a/src/longparser/pipeline/orchestrator.py b/src/longparser/pipeline/orchestrator.py index 202be9e..5062a48 100755 --- a/src/longparser/pipeline/orchestrator.py +++ b/src/longparser/pipeline/orchestrator.py @@ -1,4 +1,13 @@ -"""Simple pipeline orchestrator for LongParser.""" +"""Simple pipeline orchestrator for LongParser. + +Supports multiple extraction backends: + +- ``"docling"`` (default) — Docling with Tesseract CLI OCR (MIT) +- ``"pymupdf"`` — PyMuPDF4LLM for fast native PDF extraction (AGPL, optional) +- ``"auto"`` — Automatic backend selection based on document properties + +Language detection runs before OCR to set the correct Tesseract language. +""" from pathlib import Path from dataclasses import dataclass, field @@ -11,6 +20,7 @@ from ..extractors import DoclingExtractor from ..extractors.docling_extractor import HierarchyChunk from ..chunkers import HybridChunker +from ..utils.lang_detect import detect_language, get_tesseract_langs, extract_sample_text logger = logging.getLogger(__name__) @@ -30,43 +40,189 @@ def total_blocks(self) -> int: class PipelineOrchestrator: """ - Simple pipeline orchestrator using Docling. + Pipeline orchestrator with backend selection and language detection. Flow: - 1. Docling extracts with Tesseract CLI OCR - 2. Layout analysis detects structure - 3. HierarchicalChunker preserves heading hierarchy + 1. (Optional) Auto-detect document language + 2. Select backend: Docling, PyMuPDF, or auto-route + 3. Extract with chosen backend + 4. HierarchicalChunker preserves heading hierarchy + + Parameters + ---------- + config: + Processing configuration with backend, language, and layout settings. + Only used for backend selection during init. Per-file config is passed + to ``process_file()``. + tesseract_lang: + Languages for Tesseract OCR (default: ``["eng"]``). Overridden by + ``config.languages`` or auto-detection if enabled. + tessdata_path: + Path to tessdata directory with language models and configs. + force_full_page_ocr: + If True, OCR entire page even if embedded text exists. """ - def __init__(self, tesseract_lang: List[str] = None, tessdata_path: str = None, force_full_page_ocr: bool = False): - """ - Initialize pipeline. - - Args: - tesseract_lang: Languages for Tesseract OCR (default: ["eng"]) - tessdata_path: Path to tessdata directory with language models and configs. - force_full_page_ocr: If True, OCR entire page even if embedded text exists. 
+ def __init__( + self, + config: Optional[ProcessingConfig] = None, + tesseract_lang: List[str] = None, + tessdata_path: str = None, + force_full_page_ocr: bool = False, + ): + self._config = config or ProcessingConfig() + self._tessdata_path = tessdata_path + self._force_full_page_ocr = force_full_page_ocr + self._base_tesseract_lang = tesseract_lang + + # Determine backend from config + backend = self._config.backend + + if backend == "pymupdf": + # Lazy import — only loaded when user explicitly requests it + from ..extractors.pymupdf_extractor import PyMuPDFExtractor + self.extractor = PyMuPDFExtractor() + self._backend_name = "pymupdf" + logger.info("Pipeline initialized with PyMuPDF4LLM backend (CPU-native, fast)") + + elif backend == "auto": + # Auto mode: start with Docling (safe default), route at process time + self.extractor = DoclingExtractor( + tesseract_lang=tesseract_lang, + tessdata_path=tessdata_path, + force_full_page_ocr=force_full_page_ocr, + ) + self._backend_name = "auto" + logger.info("Pipeline initialized in auto mode (will choose backend per document)") + + else: + # Default: Docling (MIT, always available) + self.extractor = DoclingExtractor( + tesseract_lang=tesseract_lang, + tessdata_path=tessdata_path, + force_full_page_ocr=force_full_page_ocr, + ) + self._backend_name = "docling" + logger.info("Pipeline initialized with Docling backend (default)") + + def _resolve_languages( + self, + file_path: Path, + config: ProcessingConfig, + ) -> list[str]: + """Resolve OCR languages via user override or auto-detection. + + Priority order: + 1. ``config.languages`` (explicit user override — always wins) + 2. ``self._base_tesseract_lang`` (constructor param) + 3. Auto-detection via ``fast-langdetect`` (if enabled) + 4. Default: ``["eng"]`` """ - self.extractor = DoclingExtractor( - tesseract_lang=tesseract_lang, - tessdata_path=tessdata_path, - force_full_page_ocr=force_full_page_ocr, - ) - + # 1. Explicit user override + if config.languages: + logger.info("Using user-specified languages: %s", config.languages) + return config.languages + + # 2. Constructor param + if self._base_tesseract_lang: + # If auto-detect is enabled, try to improve on constructor default + if config.auto_detect_language: + detected_langs = self._auto_detect(file_path) + if detected_langs: + return detected_langs + return self._base_tesseract_lang + + # 3. Auto-detect + if config.auto_detect_language: + detected_langs = self._auto_detect(file_path) + if detected_langs: + return detected_langs + + # 4. 
Default + return ["eng"] + + def _auto_detect(self, file_path: Path) -> Optional[list[str]]: + """Run language detection and return Tesseract codes, or None.""" + sample = extract_sample_text(file_path) + if not sample or len(sample.strip()) < 20: + return None + + lang_code, confidence = detect_language(sample) + if confidence > 0.0: + tess_langs = get_tesseract_langs(lang_code) + logger.info( + "Auto-detected language: %s (%.0f%%) → Tesseract: %s", + lang_code, confidence * 100, tess_langs, + ) + # Store for later use in document metadata + self._detected_lang = lang_code + self._detected_lang_confidence = confidence + return tess_langs + + return None + + def _should_use_pymupdf(self, file_path: Path) -> bool: + """Check if PyMuPDF is a better choice for this file (auto mode).""" + ext = file_path.suffix.lower() + + # PyMuPDF only handles PDFs + if ext != ".pdf": + return False + + # Check if PDF has a text layer (= native, not scanned) + sample = extract_sample_text(file_path, max_chars=500) + if sample and len(sample.strip()) > 100: + # Has text → native PDF → PyMuPDF is faster + try: + from ..extractors.pymupdf_extractor import PyMuPDFExtractor + return True + except ImportError: + # pymupdf4llm not installed — fall back to Docling + logger.debug("Auto mode: pymupdf4llm not installed, using Docling") + return False + + # Scanned PDF or too little text → use Docling (has OCR) + return False + def process(self, request: JobRequest) -> PipelineResult: """Process a document.""" start_time = time.time() file_path = Path(request.file_path) config = request.config + + # Initialize language detection state + self._detected_lang = None + self._detected_lang_confidence = 0.0 logger.info(f"Processing: {file_path.name}") - + + # Auto-mode: decide backend per document + if self._backend_name == "auto" and self._should_use_pymupdf(file_path): + from ..extractors.pymupdf_extractor import PyMuPDFExtractor + extractor = PyMuPDFExtractor() + logger.info("Auto mode selected: PyMuPDF4LLM (native PDF detected)") + else: + extractor = self.extractor + + # Resolve languages for Docling backend + if isinstance(extractor, DoclingExtractor): + resolved_langs = self._resolve_languages(file_path, config) + extractor._languages = resolved_langs + # Extract document - document, meta = self.extractor.extract(file_path, config) - - # Get hierarchy - hierarchy = self.extractor.get_hierarchy(file_path, config) + document, meta = extractor.extract(file_path, config) + + # Inject language detection results into metadata + if self._detected_lang: + document.metadata.detected_language = self._detected_lang + document.metadata.language_confidence = self._detected_lang_confidence + + # Get hierarchy (only DoclingExtractor has this) + if isinstance(extractor, DoclingExtractor): + hierarchy = extractor.get_hierarchy(file_path, config) + else: + hierarchy = [] processing_time = time.time() - start_time logger.info(f"Completed in {processing_time:.2f}s") @@ -164,6 +320,8 @@ def export_results(self, result: PipelineResult, output_dir: Path) -> dict: "total_blocks": len(all_blocks), "total_tables": total_tables, "processing_time_seconds": result.processing_time_seconds, + "detected_language": result.document.metadata.detected_language, + "language_confidence": result.document.metadata.language_confidence, "stages_completed": [ "stage1_extraction", "stage2_validation", @@ -228,3 +386,4 @@ def export_chunks(self, result: PipelineResult, output_dir: Path) -> Path: def save_images(self, output_dir: Path) -> List[Path]: """Save 
extracted images.""" return self.extractor.save_images(output_dir) + diff --git a/src/longparser/schemas.py b/src/longparser/schemas.py index 60bd47f..6e54f1e 100755 --- a/src/longparser/schemas.py +++ b/src/longparser/schemas.py @@ -118,6 +118,8 @@ class PageProfile(BaseModel): table_confidence: Optional[float] = None has_rtl: bool = False has_math: bool = False + detected_columns: int = Field(default=1, description="Number of text columns detected on page") + reading_order_confidence: float = Field(default=1.0, ge=0.0, le=1.0, description="Confidence of reading-order reconstruction") class Page(BaseModel): @@ -135,6 +137,8 @@ class DocumentMetadata(BaseModel): source_file: str file_hash: str = "" language: Optional[str] = None + detected_language: Optional[str] = Field(default=None, description="Auto-detected language code (ISO 639-1) via fast-langdetect") + language_confidence: float = Field(default=0.0, ge=0.0, le=1.0, description="Confidence of auto-detected language") total_pages: int = 0 academic_mode: bool = False rtl_hint: bool = False @@ -163,6 +167,17 @@ def all_tables(self) -> list[Table]: class ProcessingConfig(BaseModel): """Configuration for pipeline execution.""" + # --- v0.1.4: Backend selection --- + backend: str = Field(default="docling", description="Extraction backend: 'docling' | 'pymupdf' | 'auto'") + + # --- v0.1.4: Language detection --- + languages: Optional[list[str]] = Field(default=None, description="Explicit Tesseract language codes, e.g. ['eng','ara']. Overrides auto-detect.") + auto_detect_language: bool = Field(default=True, description="Auto-detect document language before OCR (uses fast-langdetect)") + + # --- v0.1.4: Multi-column layout --- + column_count_hint: Optional[int] = Field(default=None, description="Manual column count hint. 
None = auto-detect by Docling") + force_left_to_right: bool = Field(default=False, description="Force left-to-right top-to-bottom reading order") + academic_mode: bool = False rtl_hint: bool = False do_ocr: bool = True @@ -202,6 +217,10 @@ class ExtractionMetadata(BaseModel): reprocessed_pages: list[int] = Field(default_factory=list) ocr_backend_used: Optional[str] = None reasons: list[str] = Field(default_factory=list) + # --- v0.1.4: OCR routing metadata --- + ocr_strategy: str = Field(default="standard", description="OCR strategy used: 'standard' | 'math' | 'full_ocr'") + is_scanned: bool = Field(default=False, description="Whether the document was detected as scanned (no text layer)") + page_complexity_scores: dict[int, int] = Field(default_factory=dict, description="Per-page complexity scores used for OCR routing") class ChunkingConfig(BaseModel): @@ -222,12 +241,13 @@ class Chunk(BaseModel): chunk_id: str = Field(default_factory=lambda: str(uuid.uuid4())) text: str token_count: int - chunk_type: str # "section" | "table" | "table_schema" | "list" | "equation" | "continuation" + chunk_type: str # "section" | "table" | "table_schema" | "list" | "equation" | "figure" | "continuation" section_path: list[str] = Field(default_factory=list) page_numbers: list[int] = Field(default_factory=list) block_ids: list[str] = Field(default_factory=list) overlap_with_previous: bool = False equation_detected: bool = False + image_path: Optional[str] = Field(default=None, description="Path to figure image if chunk_type == 'figure'") metadata: dict = Field(default_factory=dict) # row_start, row_end, sheet, col_band diff --git a/src/longparser/utils/__init__.py b/src/longparser/utils/__init__.py index c642b45..7c7ea22 100755 --- a/src/longparser/utils/__init__.py +++ b/src/longparser/utils/__init__.py @@ -1,5 +1,14 @@ """Utility modules for LongParser.""" from .rtl_detector import detect_rtl_language +from .lang_detect import detect_language, get_tesseract_langs +from .ocr_router import is_page_scanned, score_page_complexity, get_ocr_strategy -__all__ = ["detect_rtl_language"] +__all__ = [ + "detect_rtl_language", + "detect_language", + "get_tesseract_langs", + "is_page_scanned", + "score_page_complexity", + "get_ocr_strategy", +] diff --git a/src/longparser/utils/lang_detect.py b/src/longparser/utils/lang_detect.py new file mode 100644 index 0000000..b544d4b --- /dev/null +++ b/src/longparser/utils/lang_detect.py @@ -0,0 +1,193 @@ +"""Language detection for document text samples. + +Uses ``fast-langdetect`` (Apache-2.0, Facebook FastText model) to detect +the primary language of a text sample and map it to Tesseract language codes. 
+ +This module is designed for zero-failure operation: +- Falls back to English if ``fast-langdetect`` is not installed +- Falls back to English if detection confidence is too low +- Falls back to English on any unexpected error +- Never raises exceptions that would break the pipeline + +Usage:: + + from longparser.utils.lang_detect import detect_language, get_tesseract_langs + + lang, confidence = detect_language("هذا نص عربي") # ("ar", 0.99) + tess_codes = get_tesseract_langs("ar") # ["ara"] +""" + +from __future__ import annotations + +import logging +from typing import Optional + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Mapping: ISO 639-1 code (fast-langdetect) → Tesseract language code(s) +# --------------------------------------------------------------------------- +_LANG_TO_TESSERACT: dict[str, list[str]] = { + "af": ["afr"], "am": ["amh"], "ar": ["ara"], "az": ["aze"], + "be": ["bel"], "bg": ["bul"], "bn": ["ben"], "bs": ["bos"], + "ca": ["cat"], "cs": ["ces"], "cy": ["cym"], "da": ["dan"], + "de": ["deu"], "el": ["ell"], "en": ["eng"], "es": ["spa"], + "et": ["est"], "eu": ["eus"], "fa": ["fas"], "fi": ["fin"], + "fr": ["fra"], "ga": ["gle"], "gl": ["glg"], "gu": ["guj"], + "ha": ["hau"], "he": ["heb"], "hi": ["hin"], "hr": ["hrv"], + "hu": ["hun"], "hy": ["hye"], "id": ["ind"], "is": ["isl"], + "it": ["ita"], "ja": ["jpn"], "jv": ["jav"], "ka": ["kat"], + "kk": ["kaz"], "km": ["khm"], "kn": ["kan"], "ko": ["kor"], + "la": ["lat"], "lt": ["lit"], "lv": ["lav"], "mk": ["mkd"], + "ml": ["mal"], "mn": ["mon"], "mr": ["mar"], "ms": ["msa"], + "my": ["mya"], "ne": ["nep"], "nl": ["nld"], "no": ["nor"], + "pa": ["pan"], "pl": ["pol"], "pt": ["por"], "ro": ["ron"], + "ru": ["rus"], "si": ["sin"], "sk": ["slk"], "sl": ["slv"], + "sq": ["sqi"], "sr": ["srp"], "sv": ["swe"], "sw": ["swa"], + "ta": ["tam"], "te": ["tel"], "th": ["tha"], "tl": ["tgl"], + "tr": ["tur"], "uk": ["ukr"], "ur": ["urd"], "uz": ["uzb"], + "vi": ["vie"], "yo": ["yor"], + # Chinese variants + "zh": ["chi_sim", "chi_tra"], +} + + +def detect_language( + text: str, + min_confidence: float = 0.5, +) -> tuple[str, float]: + """Detect the primary language of a text sample. + + Parameters + ---------- + text: + Text sample to analyze. At least 20 characters recommended. + min_confidence: + Minimum confidence threshold. Below this, falls back to ``"en"``. + + Returns + ------- + tuple[str, float] + ``(language_code, confidence)`` — e.g. ``("ar", 0.99)``. + Falls back to ``("en", 0.0)`` on any failure. + """ + if not text or len(text.strip()) < 20: + logger.debug("Text too short for language detection, defaulting to English") + return "en", 0.0 + + try: + from fast_langdetect import detect + result = detect(text) + lang = result.get("lang", "en") + score = result.get("score", 0.0) + + if score < min_confidence: + logger.info( + "Language detection low confidence (%.2f for '%s'), " + "defaulting to English", score, lang + ) + return "en", score + + logger.info("Detected language: %s (confidence: %.2f)", lang, score) + return lang, score + + except ImportError: + logger.warning( + "fast-langdetect is not installed. Language detection disabled. 
" + "Install with: pip install fast-langdetect" + ) + return "en", 0.0 + except Exception as e: + logger.warning("Language detection failed: %s — defaulting to English", e) + return "en", 0.0 + + +def get_tesseract_langs(lang_code: str) -> list[str]: + """Map a detected language code to Tesseract language code(s). + + Parameters + ---------- + lang_code: + ISO 639-1 language code (e.g. ``"ar"``, ``"en"``). + + Returns + ------- + list[str] + Tesseract language codes (e.g. ``["ara"]``, ``["eng"]``). + """ + return _LANG_TO_TESSERACT.get(lang_code, ["eng"]) + + +def extract_sample_text(file_path, max_chars: int = 2000) -> str: + """Extract a sample of text from a document for language detection. + + Uses a lightweight approach: reads first few KB of the file and + extracts printable text. For PDFs, attempts to use PyMuPDF if + available, otherwise falls back to reading raw bytes. + + Parameters + ---------- + file_path: + Path to the document file. + max_chars: + Maximum characters to extract. + + Returns + ------- + str + Extracted text sample, or empty string if extraction fails. + """ + from pathlib import Path + file_path = Path(file_path) + + if not file_path.exists(): + return "" + + ext = file_path.suffix.lower() + + # For PDFs: try lightweight text extraction + if ext == ".pdf": + return _extract_pdf_sample(file_path, max_chars) + + # For text-like files: read directly + if ext in (".csv", ".txt", ".md"): + try: + with open(file_path, "r", encoding="utf-8", errors="ignore") as f: + return f.read(max_chars) + except Exception: + return "" + + # For other formats: return empty (language detection will use + # text extracted by Docling later) + return "" + + +def _extract_pdf_sample(file_path, max_chars: int) -> str: + """Extract text sample from a PDF using the lightest method available.""" + # Try pdfplumber (lightweight, often available) + try: + import pdfplumber + with pdfplumber.open(str(file_path)) as pdf: + text = "" + for page in pdf.pages[:3]: # First 3 pages + page_text = page.extract_text() or "" + text += page_text + "\n" + if len(text) >= max_chars: + break + return text[:max_chars] + except ImportError: + pass + except Exception: + pass + + # Fallback: read raw bytes and extract printable chars + try: + with open(file_path, "rb") as f: + raw = f.read(max_chars * 4) # Read more bytes since not all are text + # Extract ASCII/Unicode text from raw bytes + text = raw.decode("utf-8", errors="ignore") + # Filter to printable characters + printable = "".join(c for c in text if c.isprintable() or c in "\n\t ") + return printable[:max_chars] + except Exception: + return "" diff --git a/src/longparser/utils/ocr_router.py b/src/longparser/utils/ocr_router.py new file mode 100644 index 0000000..dd3586d --- /dev/null +++ b/src/longparser/utils/ocr_router.py @@ -0,0 +1,148 @@ +"""Smart OCR routing for scanned PDFs. + +Routes pages to the best OCR strategy based on content complexity: + +- **standard** — Tesseract with default settings (fast, CPU-native) +- **math** — Tesseract for text + pix2tex for equations +- **full_ocr** — Tesseract with ``force_full_page_ocr=True`` + +All strategies are CPU-friendly. No GPU-dependent engines (Surya, Marker) +are used in the routing — those are available as separate optional backends. 
+ +Usage:: + + from longparser.utils.ocr_router import ( + is_page_scanned, score_page_complexity, get_ocr_strategy, + ) + + if is_page_scanned(page_text): + score = score_page_complexity(page_text, num_blocks=15, has_tables=True) + strategy = get_ocr_strategy(score) + # strategy = "full_ocr" for score >= 5 +""" + +from __future__ import annotations + +import logging +import re + +logger = logging.getLogger(__name__) + +# Pattern to detect math symbols and simple equations in text. +# Matches Unicode math symbols and simple algebraic patterns like "x = 5". +_MATH_RE = re.compile( + r'[\u2211\u220F\u222B\u221A\u00B1\u2264\u2265\u2248\u2260\u03B1-\u03C9\u03A3]' + r'|[a-z]\s*=\s*[a-z0-9]', + re.IGNORECASE, +) + + +def is_page_scanned(page_text: str, min_chars: int = 30) -> bool: + """Check if a page is likely scanned (no usable text layer). + + Parameters + ---------- + page_text: + Extracted text from the page. + min_chars: + Minimum character count to consider the page as having a text layer. + + Returns + ------- + bool + ``True`` if the page has fewer than ``min_chars`` printable characters + (indicating it's likely a scanned image with no embedded text). + """ + clean = page_text.strip() + return len(clean) < min_chars + + +def has_math_content(text: str) -> bool: + """Check if text contains mathematical symbols or equation patterns. + + Parameters + ---------- + text: + Text to check for math content. + + Returns + ------- + bool + ``True`` if math symbols or equation patterns are found. + """ + return bool(_MATH_RE.search(text)) + + +def score_page_complexity( + page_text: str, + num_blocks: int = 0, + has_tables: bool = False, +) -> int: + """Score page complexity on a scale of 0-10. + + Used to decide which OCR strategy to apply: + + - **0-2** → ``"standard"`` — Simple page, Tesseract is enough + - **3-4** → ``"math"`` — Has equations, add pix2tex + - **5+** → ``"full_ocr"`` — Complex layout, use full-page OCR + + Parameters + ---------- + page_text: + Extracted text from the page. + num_blocks: + Number of content blocks on the page. + has_tables: + Whether the page contains tables. + + Returns + ------- + int + Complexity score from 0 to 10. + """ + score = 0 + + # Tables add significant complexity + if has_tables: + score += 3 + + # Math content needs pix2tex + if has_math_content(page_text): + score += 2 + + # Many blocks suggest a dense/complex layout + if num_blocks > 20: + score += 2 + elif num_blocks > 10: + score += 1 + + # Very short text on a page with blocks = likely OCR issues + if page_text and len(page_text.strip()) < 100 and num_blocks > 5: + score += 1 + + return min(score, 10) + + +def get_ocr_strategy(complexity_score: int) -> str: + """Pick OCR strategy based on page complexity score. + + Parameters + ---------- + complexity_score: + Score from :func:`score_page_complexity` (0-10). + + Returns + ------- + str + One of: + + - ``"standard"`` — Tesseract with default settings + - ``"math"`` — Tesseract + pix2tex for equations + - ``"full_ocr"`` — Tesseract with ``force_full_page_ocr=True`` + """ + if complexity_score <= 2: + return "standard" + elif complexity_score <= 4: + return "math" + else: + return "full_ocr" diff --git a/tests/benchmarks/benchmark_pipeline.py b/tests/benchmarks/benchmark_pipeline.py new file mode 100644 index 0000000..716ee44 --- /dev/null +++ b/tests/benchmarks/benchmark_pipeline.py @@ -0,0 +1,98 @@ +"""Pipeline performance benchmark for regression testing. + +Run this BEFORE and AFTER v0.2.x changes to prove no speed regression. 
+ +Usage: + # Save baseline (v0.1.3) + python tests/benchmarks/benchmark_pipeline.py > benchmark_v013.txt + + # After v0.2.x changes + python tests/benchmarks/benchmark_pipeline.py > benchmark_v020.txt + + # Compare + diff benchmark_v013.txt benchmark_v020.txt +""" + +import time +import sys +from pathlib import Path + + +def benchmark_file(file_path: str) -> dict: + """Benchmark a single file through the pipeline.""" + from longparser import DocumentPipeline, ProcessingConfig + + path = Path(file_path) + if not path.exists(): + return {"file": file_path, "status": "SKIPPED (file not found)"} + + pipeline = DocumentPipeline() + config = ProcessingConfig() + + t0 = time.time() + try: + result = pipeline.process_file(path, config=config) + elapsed = time.time() - t0 + + return { + "file": path.name, + "time_seconds": round(elapsed, 2), + "total_blocks": result.total_blocks, + "total_pages": result.document.metadata.total_pages, + "status": "OK", + } + except Exception as e: + elapsed = time.time() - t0 + return { + "file": path.name, + "time_seconds": round(elapsed, 2), + "status": f"ERROR: {e}", + } + + +def main(): + """Run benchmark on all available test fixtures.""" + # Look for test PDFs in common locations + fixture_dirs = [ + Path("tests/fixtures"), + Path("tests"), + Path("uploads"), + ] + + test_files = [] + for d in fixture_dirs: + if d.exists(): + test_files.extend(sorted(d.glob("*.pdf"))) + + if not test_files: + print("No PDF test files found in tests/fixtures/ or uploads/") + print("Place some PDFs there and re-run.") + sys.exit(1) + + print("=" * 60) + print("LongParser Pipeline Benchmark") + print("=" * 60) + print(f"Files found: {len(test_files)}") + print() + + results = [] + for f in test_files[:5]: # Cap at 5 files for reasonable benchmark time + print(f"Benchmarking: {f.name} ...", end=" ", flush=True) + result = benchmark_file(str(f)) + results.append(result) + print(f"{result.get('time_seconds', '?')}s — {result['status']}") + + print() + print("-" * 60) + print(f"{'File':<30} {'Time':>8} {'Blocks':>8} {'Pages':>6}") + print("-" * 60) + for r in results: + if r["status"] == "OK": + print(f"{r['file']:<30} {r['time_seconds']:>7.2f}s {r['total_blocks']:>8} {r['total_pages']:>6}") + else: + print(f"{r['file']:<30} {r['status']}") + print("-" * 60) + + +if __name__ == "__main__": + main() diff --git a/tests/unit/test_backward_compat.py b/tests/unit/test_backward_compat.py new file mode 100644 index 0000000..fae7d49 --- /dev/null +++ b/tests/unit/test_backward_compat.py @@ -0,0 +1,142 @@ +"""Backward compatibility tests for v0.2.x changes. + +Ensures that users who wrote code against v0.1.3 can upgrade to v0.2.x +without changing a single line of their code. Every new field must have +a default that matches the v0.1.3 behavior. 
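+
+For example, a typical v0.1.3 call pattern such as::
+
+    from longparser import DocumentPipeline, ProcessingConfig
+
+    pipeline = DocumentPipeline()
+    result = pipeline.process_file("report.pdf", config=ProcessingConfig())
+
+must keep working unchanged, with the Docling backend still selected by
+default ("report.pdf" here is only a placeholder file name).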
+""" + +import pytest + + +class TestProcessingConfigCompat: + """ProcessingConfig() with no args must behave exactly like v0.1.3.""" + + def test_default_values_match_v013(self): + from longparser.schemas import ProcessingConfig + config = ProcessingConfig() + + # v0.1.3 defaults — these must NEVER change + assert config.academic_mode is False + assert config.rtl_hint is False + assert config.do_ocr is True + assert config.formula_ocr is True + assert config.do_table_structure is True + assert config.export_images is True + assert config.formula_mode == "smart" + assert config.smart_max_equations == 25 + assert config.smart_max_ocr_seconds == 300.0 + assert config.exclude_page_headers_footers is True + + def test_new_fields_have_safe_defaults(self): + """New v0.2.x fields must default to values that don't change behavior.""" + from longparser.schemas import ProcessingConfig + config = ProcessingConfig() + + # backend must default to docling (existing behavior) + backend = getattr(config, "backend", "docling") + assert backend == "docling" + + # auto_detect_language defaults to True but only runs if languages=None + auto_detect = getattr(config, "auto_detect_language", True) + assert auto_detect is True + + # languages=None means "use existing tesseract_lang param" + languages = getattr(config, "languages", None) + assert languages is None + + +class TestDocumentMetadataCompat: + """DocumentMetadata must keep all v0.1.3 fields.""" + + def test_v013_fields_exist(self): + from longparser.schemas import DocumentMetadata + meta = DocumentMetadata(source_file="test.pdf") + + assert meta.source_file == "test.pdf" + assert meta.file_hash == "" + assert meta.language is None + assert meta.total_pages == 0 + assert meta.academic_mode is False + assert meta.rtl_hint is False + + +class TestBlockCompat: + """Block schema must keep all v0.1.3 fields and types.""" + + def test_block_type_values_unchanged(self): + from longparser.schemas import BlockType + + # All v0.1.3 values must still exist + assert BlockType.HEADING == "heading" + assert BlockType.PARAGRAPH == "paragraph" + assert BlockType.LIST_ITEM == "list_item" + assert BlockType.TABLE == "table" + assert BlockType.FIGURE == "figure" + assert BlockType.CAPTION == "caption" + assert BlockType.FOOTER == "footer" + assert BlockType.HEADER == "header" + assert BlockType.EQUATION == "equation" + assert BlockType.CODE == "code" + + def test_extractor_type_values_unchanged(self): + from longparser.schemas import ExtractorType + + # All v0.1.3 values must still exist + assert ExtractorType.DOCLING == "docling" + assert ExtractorType.SURYA == "surya" + assert ExtractorType.MARKER == "marker" + assert ExtractorType.NATIVE_PDF == "native_pdf" + assert ExtractorType.PADDLE == "paddle" + + +class TestChunkCompat: + """Chunk schema must keep all v0.1.3 fields.""" + + def test_chunk_fields_exist(self): + from longparser.schemas import Chunk + chunk = Chunk(text="test", token_count=1, chunk_type="section") + + assert chunk.text == "test" + assert chunk.token_count == 1 + assert chunk.chunk_type == "section" + assert chunk.section_path == [] + assert chunk.page_numbers == [] + assert chunk.block_ids == [] + assert chunk.overlap_with_previous is False + assert chunk.equation_detected is False + + +class TestPublicAPICompat: + """All v0.1.3 public names must still be importable.""" + + def test_all_v013_exports_available(self): + from longparser import ( # noqa: F401 + __version__, + Document, + Page, + Block, + Table, + TableCell, + BlockType, + ExtractorType, + 
ProcessingConfig, + BoundingBox, + Provenance, + Confidence, + BlockFlags, + DocumentMetadata, + PageProfile, + ExtractionMetadata, + ChunkingConfig, + Chunk, + JobRequest, + JobResult, + ) + + def test_lazy_imports_still_work(self): + """Lazy imports from v0.1.3 must still resolve.""" + from longparser import DocumentPipeline # noqa: F401 + from longparser import PipelineOrchestrator # noqa: F401 + from longparser import PipelineResult # noqa: F401 + from longparser import HybridChunker # noqa: F401 + from longparser import DoclingExtractor # noqa: F401 diff --git a/tests/unit/test_license_safety.py b/tests/unit/test_license_safety.py new file mode 100644 index 0000000..8afac8b --- /dev/null +++ b/tests/unit/test_license_safety.py @@ -0,0 +1,82 @@ +"""License safety tests — ensure GPL/AGPL packages are never loaded by default. + +These tests verify that importing ``longparser`` and using its default +pipeline does NOT load any GPL/AGPL-licensed package (pymupdf4llm, marker, +surya). This is critical to maintain LongParser's MIT license. +""" + +import sys +import pytest + + +# Packages that must NEVER appear in sys.modules after a default import +_BLOCKED_MODULES = [ + "pymupdf4llm", + "pymupdf", + "fitz", # PyMuPDF's internal module name + "marker", + "marker.converters", + "surya", + "surya.ocr", +] + + +def _clear_blocked_modules(): + """Remove any pre-loaded blocked modules from sys.modules.""" + for mod_name in list(sys.modules): + for blocked in _BLOCKED_MODULES: + if mod_name == blocked or mod_name.startswith(blocked + "."): + del sys.modules[mod_name] + + +class TestLicenseSafety: + """Verify that core imports do not load GPL/AGPL dependencies.""" + + def test_import_longparser_does_not_load_agpl(self): + """``import longparser`` must not load any GPL/AGPL module.""" + _clear_blocked_modules() + + import longparser # noqa: F401 + + for mod_name in _BLOCKED_MODULES: + assert mod_name not in sys.modules, ( + f"GPL/AGPL module '{mod_name}' was loaded by 'import longparser'. " + f"This violates the MIT license isolation. " + f"Check __init__.py and extractors/__init__.py for stray imports." + ) + + def test_import_schemas_does_not_load_agpl(self): + """``from longparser.schemas import ...`` must not load GPL/AGPL.""" + _clear_blocked_modules() + + from longparser.schemas import ( # noqa: F401 + ProcessingConfig, Document, Block, Chunk + ) + + for mod_name in _BLOCKED_MODULES: + assert mod_name not in sys.modules, ( + f"GPL/AGPL module '{mod_name}' was loaded by schema import." + ) + + def test_processing_config_default_backend_is_docling(self): + """Default backend must be 'docling' (MIT), not a GPL/AGPL backend.""" + from longparser.schemas import ProcessingConfig + config = ProcessingConfig() + + # If backend field exists, it must default to docling + backend = getattr(config, "backend", "docling") + assert backend == "docling", ( + f"Default backend is '{backend}', expected 'docling'. " + f"Defaulting to a GPL/AGPL backend would violate MIT license." + ) + + def test_pymupdf_extractor_not_in_extractors_init(self): + """PyMuPDFExtractor must NOT be exported from extractors/__init__.py.""" + from longparser import extractors + + public_names = getattr(extractors, "__all__", dir(extractors)) + + assert "PyMuPDFExtractor" not in public_names, ( + "PyMuPDFExtractor must NOT be in extractors/__init__.py. " + "It must only be imported lazily when backend='pymupdf' is set." + )
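+
+    def test_import_utils_does_not_load_agpl(self):
+        """Importing ``longparser.utils`` must not pull in GPL/AGPL modules.
+
+        A lightweight extra check in the same spirit as the tests above; it
+        assumes the utils modules (lang_detect, ocr_router, rtl_detector)
+        keep all heavy/optional imports inside function bodies, as they do
+        in this changeset.
+        """
+        _clear_blocked_modules()
+
+        from longparser.utils import detect_language, get_ocr_strategy  # noqa: F401
+
+        for mod_name in _BLOCKED_MODULES:
+            assert mod_name not in sys.modules, (
+                f"GPL/AGPL module '{mod_name}' was loaded by importing longparser.utils."
+            )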