diff --git a/.github/workflows/license-check.yml b/.github/workflows/license-check.yml new file mode 100644 index 0000000..63f9217 --- /dev/null +++ b/.github/workflows/license-check.yml @@ -0,0 +1,50 @@ +name: License Safety Check + +on: [push, pull_request] + +jobs: + license-check: + name: Ensure no GPL/AGPL imports in core + runs-on: ubuntu-latest + steps: + - uses: actions/checkout@v4 + + - name: Check core files for GPL/AGPL imports + run: | + echo "=== License Safety Check ===" + echo "Verifying no GPL/AGPL package is imported in core code..." + echo "" + + FAIL=0 + + # List of GPL/AGPL package import patterns to block + BLOCKED_PATTERNS="import[[:space:]]+pymupdf|from[[:space:]]+pymupdf|import[[:space:]]+marker\.|from[[:space:]]+marker\.|import[[:space:]]+surya|from[[:space:]]+surya" + + # Files that ARE allowed to import these (isolated backends) + ALLOWED_FILES=( + "pymupdf_extractor.py" + "marker_extractor.py" + ) + + # Build grep exclude args + EXCLUDE_ARGS="" + for f in "${ALLOWED_FILES[@]}"; do + EXCLUDE_ARGS="$EXCLUDE_ARGS --exclude=$f" + done + + # Search all Python files in src/longparser EXCEPT allowed files + MATCHES=$(grep -rnE "$BLOCKED_PATTERNS" src/longparser/ \ + --include='*.py' $EXCLUDE_ARGS || true) + + if [ -n "$MATCHES" ]; then + echo "❌ FAIL: GPL/AGPL imports found in core code!" + echo "" + echo "$MATCHES" + echo "" + echo "These packages must ONLY be imported in their isolated extractor files." + FAIL=1 + else + echo "✅ PASS: No GPL/AGPL imports in core code." + fi + + exit $FAIL diff --git a/FEATURE_ROADMAP.md b/FEATURE_ROADMAP.md new file mode 100644 index 0000000..2ea7b1a --- /dev/null +++ b/FEATURE_ROADMAP.md @@ -0,0 +1,150 @@ +# LongParser — Product & Feature Roadmap + +> This roadmap reflects the current development direction based on community trends, +> competitor analysis, and the RAG ecosystem in 2025–2026. Items are ordered by +> priority within each phase. All dates are targets, not guarantees. 
+ +--- + +## Current State — v0.1.x ✅ + +- 5-stage extraction pipeline (Extract → Validate → HITL → Chunk → Embed → Index) +- Multi-format support: PDF, DOCX, PPTX, XLSX, CSV via Docling +- `HybridChunker` — 6-strategy token-aware, hierarchy-aware, table-aware chunking +- Human-in-the-Loop (HITL) review via LangGraph `interrupt()` +- 3-layer memory chat engine (short-term + rolling summary + long-term facts) +- Multi-provider LLM: OpenAI, Gemini, Groq, OpenRouter +- Multi-backend vector stores: Chroma, FAISS, Qdrant +- FastAPI REST server + ARQ/Redis job queue + Motor/MongoDB +- LangChain `BaseRetriever` + LlamaIndex `BaseReader` adapters +- CPU / GPU install separation via extras + +--- + +## Phase 1 — Accuracy & Quality (v0.2.x) — Q2 2026 + +### Parser Enhancements + +- [ ] **Marker backend** — add `marker-pdf` as an optional extraction backend for higher-fidelity Markdown output on complex academic PDFs +- [ ] **PyMuPDF4LLM backend** — lightweight, fast alternative for speed-critical pipelines (10× faster than Docling for simple PDFs) +- [ ] **Scanned PDF fast path** — route documents to Tesseract vs pix2tex vs Surya automatically based on page complexity score +- [ ] **Multi-column layout detection** — prevent reading-order errors in newspaper/journal-style layouts +- [ ] **Image extraction** — export embedded figures with captions into separate chunks with `type: figure` +- [ ] **Document language auto-detection** — select OCR model automatically based on detected script + +### Chunking Improvements + +- [ ] **Semantic chunking** — optional embedding-based boundary detection (split at semantic shifts, not just token counts) +- [ ] **Sliding window overlap** — configurable overlap strategy per chunk type (more overlap for tables, less for headings) +- [ ] **Cross-reference resolution** — link `(see Figure 3)` and `(Table 2)` references to their target blocks +- [ ] **Summary chunks** — auto-generate a 1–2 sentence summary chunk per section for hierarchical retrieval + +### Quality & Validation + +- [ ] **Chunk quality scorer** — assign a confidence score per chunk based on OCR confidence, completeness, and structural integrity +- [ ] **PII detection** — flag and optionally redact personal information (names, emails, phone numbers) before embedding +- [ ] **Duplicate block detection** — suppress repeated headers/footers that appear on every page + +--- + +## Phase 2 — Agentic & Multimodal (v0.3.x) — Q3 2026 + +### Agentic RAG + +- [ ] **Agentic retrieval loop** — implement query rewriting + iterative retrieval + self-reflection before answer generation +- [ ] **Multi-hop question answering** — chain retrieval steps for questions that span multiple sections or documents +- [ ] **Tool-calling integration** — expose document pipeline as a LangChain/LangGraph tool callable by autonomous agents +- [ ] **Hypothetical Document Embeddings (HyDE)** — generate hypothetical answers to queries for improved retrieval recall + +### Multimodal + +- [ ] **Vision-Language Model (VLM) integration** — use GPT-4o / Gemini Vision to describe figures, charts, and diagrams as text chunks +- [ ] **Chart data extraction** — parse bar/line/pie charts into structured data tables +- [ ] **Slide layout understanding** — treat PPTX slides as visual units with spatial layout context, not just text extraction + +### Reranking & Retrieval + +- [ ] **Cross-encoder reranker** — add optional `sentence-transformers` cross-encoder reranking step after initial retrieval +- [ ] **Hybrid search** — combine dense vector search 
with BM25 sparse retrieval (reciprocal rank fusion) +- [ ] **Maximum Marginal Relevance (MMR)** — reduce redundancy in retrieved chunks +- [ ] **Metadata filtering** — filter chunks by `page_number`, `section`, `doc_type`, `date` at query time + +--- + +## Phase 3 — Enterprise & Observability (v0.4.x) — Q4 2026 + +### Knowledge Graph + +- [ ] **Entity extraction** — extract named entities (people, organizations, dates, locations) from chunks +- [ ] **Relationship mapping** — build entity relationship graphs from document content +- [ ] **Graph-based retrieval** — traverse the entity graph for multi-hop retrieval (GraphRAG pattern) +- [ ] **Neo4j / NetworkX integration** — persist the knowledge graph to a graph database + +### Evaluation Framework + +- [ ] **Built-in RAG evaluator** — measure retrieval recall@k, answer faithfulness, and context adherence +- [ ] **Chunk attribution** — trace every answer sentence back to the source chunk and page +- [ ] **RAGAS integration** — plug into the RAGAS evaluation framework +- [ ] **Benchmark suite** — reproducible benchmarks against Unstructured, LlamaParse, Docling standalone + +### Observability & Compliance + +- [ ] **LangSmith integration** — trace every pipeline run end-to-end +- [ ] **OpenTelemetry support** — emit spans/traces to any OTel-compatible backend +- [ ] **Audit log** — immutable log of every HITL decision (approve/reject/edit) with timestamps and user IDs +- [ ] **GDPR compliance mode** — PII redaction + right-to-erasure support (delete all chunks for a document) +- [ ] **Role-based access control (RBAC)** — multi-tenant document access in the REST API + +--- + +## Phase 4 — Scale & Ecosystem (v0.5.x+) — 2027 + +### Performance & Scale + +- [ ] **Async parallel extraction** — process multiple documents concurrently in the background worker +- [ ] **Streaming extraction** — yield blocks as they are extracted (no need to wait for full document) +- [ ] **Incremental indexing** — update only changed pages/sections on re-upload +- [ ] **S3 / GCS / Azure Blob** — native cloud storage input (not just local files) +- [ ] **Kubernetes Helm chart** — one-command production deployment + +### New Integrations + +- [ ] **Weaviate** vector store adapter +- [ ] **Pinecone** vector store adapter +- [ ] **Milvus** vector store adapter +- [ ] **DSPy** integration — use DSPy to auto-optimize retrieval prompts +- [ ] **Haystack `DocumentConverter`** component +- [ ] **Flowise / Langflow** node — drag-and-drop visual pipeline builder support + +### Developer Experience + +- [ ] **LongParser CLI** — `longparser parse document.pdf --output chunks.json` +- [ ] **Web UI (HITL Dashboard)** — visual interface for reviewing and editing blocks before embedding +- [ ] **VS Code extension** — preview parsed chunks directly from the editor +- [ ] **Webhook support** — notify external systems when a job completes or requires HITL review + +--- + +## Competitive Positioning + +| Capability | LongParser | Unstructured | LlamaParse | Docling | +|---|---|---|---|---| +| Privacy-first (fully local) | ✅ | ⚠️ (cloud option) | ❌ (API-only) | ✅ | +| HITL review workflow | ✅ | ❌ | ❌ | ❌ | +| Bundled REST API server | ✅ | ✅ (paid) | ✅ (cloud) | ❌ | +| Table-aware chunking | ✅ | ⚠️ | ✅ | ✅ | +| LaTeX / equation OCR | ✅ | ❌ | ⚠️ | ⚠️ | +| LangChain + LlamaIndex | ✅ | ✅ | ✅ | ⚠️ | +| Open source (MIT) | ✅ | ⚠️ (core only) | ❌ | ✅ | +| Knowledge graph (planned) | 🔜 | ❌ | ❌ | ❌ | +| Agentic retrieval (planned) | 🔜 | ❌ | ⚠️ | ❌ | + +--- + +## Guiding Principles + +1. 
**Privacy by default** — all processing runs locally; no data leaves user infrastructure +2. **Human oversight** — HITL is a first-class citizen, not an afterthought +3. **Composable** — every stage is independently usable; no forced lock-in to the full stack +4. **Production-grade** — async, typed, tested, documented from day one +5. **Ecosystem-native** — LangChain, LlamaIndex, and HuggingFace are first-class integration targets diff --git a/LICENSE-THIRD-PARTY.md b/LICENSE-THIRD-PARTY.md new file mode 100644 index 0000000..257709f --- /dev/null +++ b/LICENSE-THIRD-PARTY.md @@ -0,0 +1,50 @@ +# Third-Party Licenses + +LongParser core is licensed under the **MIT License**. + +Some **optional** backends and integrations use different licenses. +These packages are **never loaded by default** — they are only imported +when you explicitly install them and select them in your configuration. + +## Optional Backend Licenses + +| Package | License | Install Command | When Loaded | +|---------|---------|-----------------|-------------| +| `pymupdf4llm` | AGPL-3.0 or Artifex Commercial | `pip install "longparser[pymupdf]"` | Only when you set `backend="pymupdf"` | +| `marker-pdf` | GPL-3.0-or-later | `pip install "longparser[marker]"` | Only when you set `backend="marker"` *(future)* | +| `surya-ocr` | GPL-3.0-or-later | `pip install "longparser[surya]"` | Only when explicitly imported *(future)* | + +## Core Dependency Licenses (always installed) + +| Package | License | Purpose | +|---------|---------|---------| +| `pydantic` | MIT | Schema validation | +| `docling` | MIT | Default PDF extraction engine | +| `docling-core` | MIT | Docling data models | +| `fast-langdetect` | Apache-2.0 | Document language detection | + +## What This Means for You + +- **If you only use `pip install longparser`** — everything is MIT or Apache-2.0. + You can use LongParser in any project (commercial, proprietary, open source). + +- **If you install `longparser[pymupdf]`** — the `pymupdf4llm` library is + AGPL-3.0 licensed. You must comply with AGPL terms for the PyMuPDF component, + OR purchase a commercial license from [Artifex](https://artifex.com). + LongParser core code remains MIT. + +- **If you install `longparser[marker]`** *(future)* — the `marker-pdf` library + is GPL-3.0 licensed. You must comply with GPL terms for the Marker component. + LongParser core code remains MIT. + +## License Isolation Guarantee + +LongParser uses **lazy imports** to ensure GPL/AGPL packages are never loaded +unless explicitly requested. The following guarantees hold: + +1. `import longparser` does NOT import any GPL/AGPL package +2. `from longparser import DocumentPipeline` does NOT import any GPL/AGPL package +3. `DocumentPipeline().process_file("doc.pdf")` does NOT import any GPL/AGPL + package (uses Docling, which is MIT) +4. GPL/AGPL code is only loaded when you explicitly set `backend="pymupdf"` or + `backend="marker"` in `ProcessingConfig` diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md index 5356c04..4ee1d42 100644 --- a/docs/getting-started/installation.md +++ b/docs/getting-started/installation.md @@ -104,5 +104,5 @@ The server starts on `http://localhost:8000`. 
```python import longparser -print(longparser.__version__) # 0.1.3 +print(longparser.__version__) # 0.1.4 ``` diff --git a/pyproject.toml b/pyproject.toml index afea16d..dbb7cbe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "longparser" -version = "0.1.3" +version = "0.1.4" description = "Privacy-first document intelligence engine — converts PDFs, DOCX, PPTX, XLSX, and CSV into AI-ready Markdown + structured JSON for RAG pipelines." readme = {file = "README.md", content-type = "text/markdown"} requires-python = ">=3.10" @@ -36,6 +36,7 @@ dependencies = [ "docling>=2.14", "docling-core>=2.13", "langgraph-checkpoint-mongodb>=0.3.1", + "fast-langdetect>=0.3,<1.0", # Apache-2.0 — document language detection ] [project.optional-dependencies] @@ -51,6 +52,20 @@ langchain = [ llamaindex = [ "llama-index-core>=0.10", ] +# ----------- v0.1.4: Optional extraction backends ----------- +# ⚠️ pymupdf4llm is AGPL-3.0 licensed. See LICENSE-THIRD-PARTY.md. +# Only loaded when user sets backend="pymupdf". +pymupdf = [ + "pymupdf4llm>=1.27", +] +# ⚠️ marker-pdf is GPL-3.0. GPU recommended. Future release. +# marker = [ +# "marker-pdf", +# ] +# ⚠️ surya-ocr is GPL-3.0. GPU recommended. Future release. +# surya = [ +# "surya-ocr>=0.17", +# ] # FastAPI REST server + MongoDB + job queue + LangChain chat engine server = [ "fastapi>=0.115", diff --git a/src/longparser/__init__.py b/src/longparser/__init__.py index 7d00c7e..b1b9794 100755 --- a/src/longparser/__init__.py +++ b/src/longparser/__init__.py @@ -25,7 +25,7 @@ from __future__ import annotations -__version__ = "0.1.3" +__version__ = "0.1.4" __author__ = "ENDEVSOLS Team" __license__ = "MIT" @@ -59,6 +59,10 @@ def __getattr__(name: str): if name == "DoclingExtractor": from .extractors import DoclingExtractor return DoclingExtractor + if name == "PyMuPDFExtractor": + # AGPL-isolated — only loaded when explicitly requested + from .extractors.pymupdf_extractor import PyMuPDFExtractor + return PyMuPDFExtractor if name == "PipelineOrchestrator": from .pipeline import PipelineOrchestrator return PipelineOrchestrator @@ -101,6 +105,7 @@ def __getattr__(name: str): "JobResult", # Lazily imported (require extras) "DoclingExtractor", + "PyMuPDFExtractor", "PipelineOrchestrator", "DocumentPipeline", "PipelineResult", diff --git a/src/longparser/extractors/pymupdf_extractor.py b/src/longparser/extractors/pymupdf_extractor.py new file mode 100644 index 0000000..aecf375 --- /dev/null +++ b/src/longparser/extractors/pymupdf_extractor.py @@ -0,0 +1,493 @@ +"""PyMuPDF4LLM-based extractor for fast, CPU-native PDF extraction. + +⚠️ LICENSE NOTICE — AGPL-3.0 + pymupdf4llm is dual-licensed under AGPL-3.0 or Artifex Commercial License. + By using this backend, you agree to the terms of the AGPL-3.0 license + unless you have purchased a commercial license from Artifex Software, Inc. + + This module is NOT imported by default — users must explicitly opt in + via ``pip install longparser[pymupdf]`` and ``backend='pymupdf'``. + +⚠️ ISOLATION RULES (do NOT violate) + 1. This file must NEVER be imported by ``extractors/__init__.py`` + 2. This file must NEVER be imported at module level by ``orchestrator.py`` + 3. This file must ONLY be imported behind ``if backend == "pymupdf":`` + 4. 
``import longparser`` must NEVER trigger loading this file + +Best for: + - Native PDFs with embedded text (not scanned) + - Speed-critical pipelines (10-50× faster than Docling) + - CPU-only environments (no GPU, no ML models) + +NOT suitable for: + - Scanned PDFs (no OCR capability) + - Complex tables with merged cells + - Documents needing deep heading hierarchy detection + +Usage:: + + from longparser import ProcessingConfig, DocumentPipeline + + pipeline = DocumentPipeline( + config=ProcessingConfig(backend="pymupdf") + ) + result = pipeline.process_file("report.pdf") +""" + +from __future__ import annotations + +import hashlib +import logging +import uuid +from pathlib import Path +from typing import Optional, List, Tuple + +from ..schemas import ( + Document, Page, Block, Table, TableCell, + BlockType, ExtractorType, ProcessingConfig, + BoundingBox, Provenance, Confidence, BlockFlags, + DocumentMetadata, PageProfile, ExtractionMetadata, +) +from .base import BaseExtractor + +logger = logging.getLogger(__name__) + + +def _require_pymupdf(): + """Check that pymupdf4llm is installed; raise clear error if not. + + Returns the ``pymupdf4llm`` module on success. + """ + try: + import pymupdf4llm + return pymupdf4llm + except ImportError: + raise ImportError( + "\n" + "╔══════════════════════════════════════════════════════════╗\n" + "║ pymupdf4llm is not installed. ║\n" + "║ ║\n" + "║ Install: pip install 'longparser[pymupdf]' ║\n" + "║ ║\n" + "║ ⚠️ pymupdf4llm is licensed under AGPL-3.0. ║\n" + "║ By installing it, you agree to AGPL terms for that ║\n" + "║ component. LongParser core remains MIT-licensed. ║\n" + "║ ║\n" + "║ For commercial use without AGPL obligations, purchase ║\n" + "║ a license from https://artifex.com ║\n" + "╚══════════════════════════════════════════════════════════╝\n" + ) + + +def _require_pymupdf_fitz(): + """Import the fitz (PyMuPDF) module for page-level operations.""" + try: + import pymupdf as fitz + return fitz + except ImportError: + try: + import fitz + return fitz + except ImportError: + raise ImportError( + "PyMuPDF (fitz) is required for the pymupdf backend. " + "Install with: pip install 'longparser[pymupdf]'" + ) + + +class PyMuPDFExtractor(BaseExtractor): + """Fast, CPU-native PDF extractor using PyMuPDF4LLM. + + Converts PDFs to structured Markdown and maps the output to + LongParser's ``Document`` / ``Block`` model. Uses no ML models, + no GPU — pure C-based PDF parsing via MuPDF. + + Attributes + ---------- + extractor_type : ExtractorType + Always ``ExtractorType.NATIVE_PDF``. + version : str + Extractor version string. + """ + + extractor_type = ExtractorType.NATIVE_PDF + version = "1.0.0" + + def __init__(self): + """Initialize and verify pymupdf4llm is available.""" + _require_pymupdf() + self._images: list = [] + logger.info( + "PyMuPDF4LLM backend initialized (CPU-native, no OCR, no GPU)" + ) + + def extract( + self, + file_path: Path, + config: ProcessingConfig, + page_numbers: Optional[List[int]] = None, + ) -> Tuple[Document, ExtractionMetadata]: + """Extract a PDF using PyMuPDF4LLM. + + Parameters + ---------- + file_path: + Path to the PDF file. + config: + Processing configuration. + page_numbers: + Optional list of 0-indexed page numbers to extract. + + Returns + ------- + tuple[Document, ExtractionMetadata] + Extracted document and metadata. 
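+
+        Examples
+        --------
+        A minimal sketch (assumes the ``[pymupdf]`` extra is installed; the
+        file name is illustrative)::
+
+            from pathlib import Path
+            from longparser.schemas import ProcessingConfig
+
+            extractor = PyMuPDFExtractor()
+            document, meta = extractor.extract(Path("report.pdf"), ProcessingConfig())
+            print(document.metadata.total_pages, meta.strategy_used)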
+ """ + import pymupdf4llm + + file_path = Path(file_path) + logger.info("Extracting with PyMuPDF4LLM: %s", file_path.name) + + # Validate file type + if file_path.suffix.lower() != ".pdf": + raise ValueError( + f"PyMuPDF4LLM backend only supports PDF files, got: {file_path.suffix}" + ) + + # File hash + file_hash = hashlib.sha256(file_path.read_bytes()).hexdigest()[:16] + + # Extract with pymupdf4llm + kwargs = {"show_progress": False} + if page_numbers is not None: + kwargs["pages"] = page_numbers + + md_text = pymupdf4llm.to_markdown(str(file_path), **kwargs) + + # Get page-level info using PyMuPDF directly + fitz = _require_pymupdf_fitz() + pdf_doc = fitz.open(str(file_path)) + total_pages = len(pdf_doc) + + # Extract images if config.export_images + self._images = [] + if config.export_images: + self._extract_images(pdf_doc, config) + + # Build Document from Markdown + document = self._markdown_to_document( + md_text=md_text, + pdf_doc=pdf_doc, + file_path=file_path, + file_hash=file_hash, + total_pages=total_pages, + config=config, + ) + + pdf_doc.close() + + meta = ExtractionMetadata( + strategy_used="pymupdf4llm", + ocr_backend_used="none (native text)", + ) + + logger.info( + "PyMuPDF4LLM extraction complete: %d pages, %d blocks", + total_pages, len(document.all_blocks), + ) + + return document, meta + + def _markdown_to_document( + self, + md_text: str, + pdf_doc, + file_path: Path, + file_hash: str, + total_pages: int, + config: ProcessingConfig, + ) -> Document: + """Convert Markdown text to a LongParser Document model.""" + metadata = DocumentMetadata( + source_file=str(file_path), + file_hash=file_hash, + total_pages=total_pages, + ) + + pages: list[Page] = [] + + # Split markdown by page breaks (pymupdf4llm uses "---" or form feeds) + page_chunks = self._split_by_pages(md_text, total_pages) + + for page_idx, page_md in enumerate(page_chunks): + page_no = page_idx + 1 + + # Get page dimensions from PyMuPDF + if page_idx < len(pdf_doc): + rect = pdf_doc[page_idx].rect + width, height = rect.width, rect.height + else: + width, height = 612.0, 792.0 # Letter default + + # Parse markdown blocks + blocks = self._parse_markdown_blocks(page_md, page_no, file_path) + + # Build page profile + profile = PageProfile( + page_number=page_no, + layout_confidence=0.9, # PyMuPDF is reliable for native PDFs + ) + + pages.append(Page( + page_number=page_no, + width=width, + height=height, + blocks=blocks, + profile=profile, + )) + + return Document(metadata=metadata, pages=pages) + + def _split_by_pages(self, md_text: str, total_pages: int) -> list[str]: + """Split markdown text into per-page chunks.""" + import re + + # pymupdf4llm inserts page separators + # Common patterns: "-----" (5+ dashes), or form feed characters + parts = re.split(r'\n-{3,}\n|\f', md_text) + + # If splitting didn't work, put everything on page 1 + if len(parts) <= 1: + return [md_text] + + # Pad to total_pages if needed + while len(parts) < total_pages: + parts.append("") + + return parts[:total_pages] + + def _parse_markdown_blocks( + self, + page_md: str, + page_no: int, + file_path: Path, + ) -> list[Block]: + """Parse markdown text into Block objects.""" + blocks: list[Block] = [] + lines = page_md.strip().split("\n") + order_idx = 0 + + i = 0 + while i < len(lines): + line = lines[i] + stripped = line.strip() + + if not stripped: + i += 1 + continue + + # Detect block type + if stripped.startswith("#"): + # Heading + level = len(stripped) - len(stripped.lstrip("#")) + text = stripped.lstrip("#").strip() + block = 
self._make_block( + BlockType.HEADING, text, order_idx, page_no, + file_path, heading_level=min(level, 6), + ) + blocks.append(block) + + elif stripped.startswith("|") and "|" in stripped[1:]: + # Table — collect all table lines + table_lines = [stripped] + i += 1 + while i < len(lines) and lines[i].strip().startswith("|"): + table_lines.append(lines[i].strip()) + i += 1 + table_md = "\n".join(table_lines) + table_obj = self._parse_table(table_lines) + block = self._make_block( + BlockType.TABLE, table_md, order_idx, page_no, + file_path, table=table_obj, + ) + blocks.append(block) + order_idx += 1 + continue # Already incremented i + + elif stripped.startswith(("- ", "* ", "+ ")) or ( + len(stripped) > 2 and stripped[0].isdigit() and stripped[1] in ".)" + ): + # List item + text = stripped.lstrip("-*+ ").lstrip("0123456789.)").strip() + block = self._make_block( + BlockType.LIST_ITEM, text, order_idx, page_no, file_path, + ) + blocks.append(block) + + elif stripped.startswith("```"): + # Code block + code_lines = [] + i += 1 + while i < len(lines) and not lines[i].strip().startswith("```"): + code_lines.append(lines[i]) + i += 1 + code_text = "\n".join(code_lines) + block = self._make_block( + BlockType.CODE, code_text, order_idx, page_no, file_path, + ) + blocks.append(block) + i += 1 # Skip closing ``` + order_idx += 1 + continue + + elif stripped.startswith("$$") or stripped.startswith("\\["): + # Equation block + eq_lines = [stripped] + if not (stripped.endswith("$$") and len(stripped) > 2): + i += 1 + while i < len(lines): + eq_line = lines[i].strip() + eq_lines.append(eq_line) + if eq_line.endswith("$$") or eq_line.endswith("\\]"): + break + i += 1 + eq_text = "\n".join(eq_lines) + block = self._make_block( + BlockType.EQUATION, eq_text, order_idx, page_no, file_path, + ) + blocks.append(block) + + else: + # Regular paragraph + block = self._make_block( + BlockType.PARAGRAPH, stripped, order_idx, page_no, file_path, + ) + blocks.append(block) + + order_idx += 1 + i += 1 + + return blocks + + def _make_block( + self, + block_type: BlockType, + text: str, + order_index: int, + page_no: int, + file_path: Path, + heading_level: Optional[int] = None, + table: Optional[Table] = None, + ) -> Block: + """Create a Block with standard provenance.""" + return Block( + type=block_type, + text=text, + order_index=order_index, + heading_level=heading_level, + provenance=Provenance( + source_file=str(file_path), + page_number=page_no, + bbox=BoundingBox(x0=0, y0=0, x1=0, y1=0), + extractor=self.extractor_type, + extractor_version=self.version, + ), + confidence=Confidence(overall=0.9), + table=table, + ) + + def _parse_table(self, table_lines: list[str]) -> Table: + """Parse a Markdown table into a Table object.""" + # Filter out separator lines (|---|---|) + data_lines = [ + line for line in table_lines + if line.strip() and not all(c in "|-: " for c in line.strip()) + ] + + if not data_lines: + return Table(n_rows=0, n_cols=0) + + cells: list[TableCell] = [] + n_cols = 0 + + for row_idx, line in enumerate(data_lines): + parts = [p.strip() for p in line.strip("|").split("|")] + n_cols = max(n_cols, len(parts)) + for col_idx, cell_text in enumerate(parts): + cells.append(TableCell( + r0=row_idx, c0=col_idx, text=cell_text + )) + + return Table( + n_rows=len(data_lines), + n_cols=n_cols, + cells=cells, + table_confidence=0.85, + ) + + def _extract_images(self, pdf_doc, config: ProcessingConfig): + """Extract images from PDF pages.""" + for page_idx in range(len(pdf_doc)): + page = pdf_doc[page_idx] 
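+            # get_images() lists the image XObjects used by this page as tuples
+            # whose first element is the image's xref; extract_image(xref) then
+            # returns the raw image bytes plus the original file extension.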
+ image_list = page.get_images(full=True) + for img_idx, img in enumerate(image_list): + try: + xref = img[0] + base_image = pdf_doc.extract_image(xref) + if base_image: + self._images.append({ + "page": page_idx + 1, + "index": img_idx, + "data": base_image["image"], + "ext": base_image.get("ext", "png"), + }) + except Exception as e: + logger.debug("Failed to extract image on page %d: %s", page_idx + 1, e) + + def save_images(self, output_dir: Path) -> list[Path]: + """Save extracted images to disk. + + Parameters + ---------- + output_dir: + Directory to save images to. + + Returns + ------- + list[Path] + Paths to saved image files. + """ + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + saved = [] + + for img_info in self._images: + fname = f"page_{img_info['page']:03d}_img_{img_info['index']:02d}.{img_info['ext']}" + fpath = output_dir / fname + with open(fpath, "wb") as f: + f.write(img_info["data"]) + saved.append(fpath) + + logger.info("Saved %d images to %s", len(saved), output_dir) + return saved + + def to_markdown(self, document: Document) -> str: + """Convert Document back to Markdown.""" + parts = [] + for page in document.pages: + for block in page.blocks: + if block.type == BlockType.HEADING: + level = block.heading_level or 1 + parts.append(f"{'#' * level} {block.text}") + elif block.type == BlockType.TABLE: + parts.append(block.text) + elif block.type == BlockType.LIST_ITEM: + parts.append(f"- {block.text}") + elif block.type == BlockType.CODE: + parts.append(f"```\n{block.text}\n```") + elif block.type == BlockType.EQUATION: + parts.append(f"$$\n{block.text}\n$$") + else: + parts.append(block.text) + parts.append("") + return "\n".join(parts) diff --git a/src/longparser/integrations/__init__.py b/src/longparser/integrations/__init__.py index b8eae82..44055de 100755 --- a/src/longparser/integrations/__init__.py +++ b/src/longparser/integrations/__init__.py @@ -2,9 +2,9 @@ Install the extras to use these adapters:: - pip install clean_rag[langchain] - pip install clean_rag[llamaindex] - pip install clean_rag[all] + pip install longparser[langchain] + pip install longparser[llamaindex] + pip install longparser[all] """ from __future__ import annotations diff --git a/src/longparser/integrations/langchain.py b/src/longparser/integrations/langchain.py index 59bdba0..7848c31 100755 --- a/src/longparser/integrations/langchain.py +++ b/src/longparser/integrations/langchain.py @@ -5,7 +5,7 @@ Install the extra to use this adapter:: - pip install clean_rag[langchain] + pip install longparser[langchain] Usage:: @@ -27,7 +27,7 @@ _INSTALL_MSG = ( "langchain-core is required for the LangChain adapter. " - "Install it with: pip install clean_rag[langchain]" + "Install it with: pip install longparser[langchain]" ) @@ -95,6 +95,7 @@ def lazy_load(self) -> Iterator["LCDocument"]: from ..pipeline import PipelineOrchestrator pipeline = PipelineOrchestrator( + config=self.config, tesseract_lang=self.tesseract_lang, tessdata_path=self.tessdata_path, ) diff --git a/src/longparser/integrations/llamaindex.py b/src/longparser/integrations/llamaindex.py index a8d4344..d5437b9 100755 --- a/src/longparser/integrations/llamaindex.py +++ b/src/longparser/integrations/llamaindex.py @@ -5,7 +5,7 @@ Install the extra to use this adapter:: - pip install clean_rag[llamaindex] + pip install longparser[llamaindex] Usage:: @@ -27,7 +27,7 @@ _INSTALL_MSG = ( "llama-index-core is required for the LlamaIndex adapter. 
" - "Install it with: pip install clean_rag[llamaindex]" + "Install it with: pip install longparser[llamaindex]" ) @@ -105,6 +105,7 @@ def load_data( file = Path(file) pipeline = PipelineOrchestrator( + config=self.config, tesseract_lang=self.tesseract_lang, tessdata_path=self.tessdata_path, ) diff --git a/src/longparser/pipeline/orchestrator.py b/src/longparser/pipeline/orchestrator.py index 202be9e..5062a48 100755 --- a/src/longparser/pipeline/orchestrator.py +++ b/src/longparser/pipeline/orchestrator.py @@ -1,4 +1,13 @@ -"""Simple pipeline orchestrator for LongParser.""" +"""Simple pipeline orchestrator for LongParser. + +Supports multiple extraction backends: + +- ``"docling"`` (default) — Docling with Tesseract CLI OCR (MIT) +- ``"pymupdf"`` — PyMuPDF4LLM for fast native PDF extraction (AGPL, optional) +- ``"auto"`` — Automatic backend selection based on document properties + +Language detection runs before OCR to set the correct Tesseract language. +""" from pathlib import Path from dataclasses import dataclass, field @@ -11,6 +20,7 @@ from ..extractors import DoclingExtractor from ..extractors.docling_extractor import HierarchyChunk from ..chunkers import HybridChunker +from ..utils.lang_detect import detect_language, get_tesseract_langs, extract_sample_text logger = logging.getLogger(__name__) @@ -30,43 +40,189 @@ def total_blocks(self) -> int: class PipelineOrchestrator: """ - Simple pipeline orchestrator using Docling. + Pipeline orchestrator with backend selection and language detection. Flow: - 1. Docling extracts with Tesseract CLI OCR - 2. Layout analysis detects structure - 3. HierarchicalChunker preserves heading hierarchy + 1. (Optional) Auto-detect document language + 2. Select backend: Docling, PyMuPDF, or auto-route + 3. Extract with chosen backend + 4. HierarchicalChunker preserves heading hierarchy + + Parameters + ---------- + config: + Processing configuration with backend, language, and layout settings. + Only used for backend selection during init. Per-file config is passed + to ``process_file()``. + tesseract_lang: + Languages for Tesseract OCR (default: ``["eng"]``). Overridden by + ``config.languages`` or auto-detection if enabled. + tessdata_path: + Path to tessdata directory with language models and configs. + force_full_page_ocr: + If True, OCR entire page even if embedded text exists. """ - def __init__(self, tesseract_lang: List[str] = None, tessdata_path: str = None, force_full_page_ocr: bool = False): - """ - Initialize pipeline. - - Args: - tesseract_lang: Languages for Tesseract OCR (default: ["eng"]) - tessdata_path: Path to tessdata directory with language models and configs. - force_full_page_ocr: If True, OCR entire page even if embedded text exists. 
+ def __init__( + self, + config: Optional[ProcessingConfig] = None, + tesseract_lang: List[str] = None, + tessdata_path: str = None, + force_full_page_ocr: bool = False, + ): + self._config = config or ProcessingConfig() + self._tessdata_path = tessdata_path + self._force_full_page_ocr = force_full_page_ocr + self._base_tesseract_lang = tesseract_lang + + # Determine backend from config + backend = self._config.backend + + if backend == "pymupdf": + # Lazy import — only loaded when user explicitly requests it + from ..extractors.pymupdf_extractor import PyMuPDFExtractor + self.extractor = PyMuPDFExtractor() + self._backend_name = "pymupdf" + logger.info("Pipeline initialized with PyMuPDF4LLM backend (CPU-native, fast)") + + elif backend == "auto": + # Auto mode: start with Docling (safe default), route at process time + self.extractor = DoclingExtractor( + tesseract_lang=tesseract_lang, + tessdata_path=tessdata_path, + force_full_page_ocr=force_full_page_ocr, + ) + self._backend_name = "auto" + logger.info("Pipeline initialized in auto mode (will choose backend per document)") + + else: + # Default: Docling (MIT, always available) + self.extractor = DoclingExtractor( + tesseract_lang=tesseract_lang, + tessdata_path=tessdata_path, + force_full_page_ocr=force_full_page_ocr, + ) + self._backend_name = "docling" + logger.info("Pipeline initialized with Docling backend (default)") + + def _resolve_languages( + self, + file_path: Path, + config: ProcessingConfig, + ) -> list[str]: + """Resolve OCR languages via user override or auto-detection. + + Priority order: + 1. ``config.languages`` (explicit user override — always wins) + 2. ``self._base_tesseract_lang`` (constructor param) + 3. Auto-detection via ``fast-langdetect`` (if enabled) + 4. Default: ``["eng"]`` """ - self.extractor = DoclingExtractor( - tesseract_lang=tesseract_lang, - tessdata_path=tessdata_path, - force_full_page_ocr=force_full_page_ocr, - ) - + # 1. Explicit user override + if config.languages: + logger.info("Using user-specified languages: %s", config.languages) + return config.languages + + # 2. Constructor param + if self._base_tesseract_lang: + # If auto-detect is enabled, try to improve on constructor default + if config.auto_detect_language: + detected_langs = self._auto_detect(file_path) + if detected_langs: + return detected_langs + return self._base_tesseract_lang + + # 3. Auto-detect + if config.auto_detect_language: + detected_langs = self._auto_detect(file_path) + if detected_langs: + return detected_langs + + # 4. 
Default + return ["eng"] + + def _auto_detect(self, file_path: Path) -> Optional[list[str]]: + """Run language detection and return Tesseract codes, or None.""" + sample = extract_sample_text(file_path) + if not sample or len(sample.strip()) < 20: + return None + + lang_code, confidence = detect_language(sample) + if confidence > 0.0: + tess_langs = get_tesseract_langs(lang_code) + logger.info( + "Auto-detected language: %s (%.0f%%) → Tesseract: %s", + lang_code, confidence * 100, tess_langs, + ) + # Store for later use in document metadata + self._detected_lang = lang_code + self._detected_lang_confidence = confidence + return tess_langs + + return None + + def _should_use_pymupdf(self, file_path: Path) -> bool: + """Check if PyMuPDF is a better choice for this file (auto mode).""" + ext = file_path.suffix.lower() + + # PyMuPDF only handles PDFs + if ext != ".pdf": + return False + + # Check if PDF has a text layer (= native, not scanned) + sample = extract_sample_text(file_path, max_chars=500) + if sample and len(sample.strip()) > 100: + # Has text → native PDF → PyMuPDF is faster + try: + from ..extractors.pymupdf_extractor import PyMuPDFExtractor + return True + except ImportError: + # pymupdf4llm not installed — fall back to Docling + logger.debug("Auto mode: pymupdf4llm not installed, using Docling") + return False + + # Scanned PDF or too little text → use Docling (has OCR) + return False + def process(self, request: JobRequest) -> PipelineResult: """Process a document.""" start_time = time.time() file_path = Path(request.file_path) config = request.config + + # Initialize language detection state + self._detected_lang = None + self._detected_lang_confidence = 0.0 logger.info(f"Processing: {file_path.name}") - + + # Auto-mode: decide backend per document + if self._backend_name == "auto" and self._should_use_pymupdf(file_path): + from ..extractors.pymupdf_extractor import PyMuPDFExtractor + extractor = PyMuPDFExtractor() + logger.info("Auto mode selected: PyMuPDF4LLM (native PDF detected)") + else: + extractor = self.extractor + + # Resolve languages for Docling backend + if isinstance(extractor, DoclingExtractor): + resolved_langs = self._resolve_languages(file_path, config) + extractor._languages = resolved_langs + # Extract document - document, meta = self.extractor.extract(file_path, config) - - # Get hierarchy - hierarchy = self.extractor.get_hierarchy(file_path, config) + document, meta = extractor.extract(file_path, config) + + # Inject language detection results into metadata + if self._detected_lang: + document.metadata.detected_language = self._detected_lang + document.metadata.language_confidence = self._detected_lang_confidence + + # Get hierarchy (only DoclingExtractor has this) + if isinstance(extractor, DoclingExtractor): + hierarchy = extractor.get_hierarchy(file_path, config) + else: + hierarchy = [] processing_time = time.time() - start_time logger.info(f"Completed in {processing_time:.2f}s") @@ -164,6 +320,8 @@ def export_results(self, result: PipelineResult, output_dir: Path) -> dict: "total_blocks": len(all_blocks), "total_tables": total_tables, "processing_time_seconds": result.processing_time_seconds, + "detected_language": result.document.metadata.detected_language, + "language_confidence": result.document.metadata.language_confidence, "stages_completed": [ "stage1_extraction", "stage2_validation", @@ -228,3 +386,4 @@ def export_chunks(self, result: PipelineResult, output_dir: Path) -> Path: def save_images(self, output_dir: Path) -> List[Path]: """Save 
extracted images.""" return self.extractor.save_images(output_dir) + diff --git a/src/longparser/schemas.py b/src/longparser/schemas.py index 60bd47f..6e54f1e 100755 --- a/src/longparser/schemas.py +++ b/src/longparser/schemas.py @@ -118,6 +118,8 @@ class PageProfile(BaseModel): table_confidence: Optional[float] = None has_rtl: bool = False has_math: bool = False + detected_columns: int = Field(default=1, description="Number of text columns detected on page") + reading_order_confidence: float = Field(default=1.0, ge=0.0, le=1.0, description="Confidence of reading-order reconstruction") class Page(BaseModel): @@ -135,6 +137,8 @@ class DocumentMetadata(BaseModel): source_file: str file_hash: str = "" language: Optional[str] = None + detected_language: Optional[str] = Field(default=None, description="Auto-detected language code (ISO 639-1) via fast-langdetect") + language_confidence: float = Field(default=0.0, ge=0.0, le=1.0, description="Confidence of auto-detected language") total_pages: int = 0 academic_mode: bool = False rtl_hint: bool = False @@ -163,6 +167,17 @@ def all_tables(self) -> list[Table]: class ProcessingConfig(BaseModel): """Configuration for pipeline execution.""" + # --- v0.1.4: Backend selection --- + backend: str = Field(default="docling", description="Extraction backend: 'docling' | 'pymupdf' | 'auto'") + + # --- v0.1.4: Language detection --- + languages: Optional[list[str]] = Field(default=None, description="Explicit Tesseract language codes, e.g. ['eng','ara']. Overrides auto-detect.") + auto_detect_language: bool = Field(default=True, description="Auto-detect document language before OCR (uses fast-langdetect)") + + # --- v0.1.4: Multi-column layout --- + column_count_hint: Optional[int] = Field(default=None, description="Manual column count hint. 
None = auto-detect by Docling") + force_left_to_right: bool = Field(default=False, description="Force left-to-right top-to-bottom reading order") + academic_mode: bool = False rtl_hint: bool = False do_ocr: bool = True @@ -202,6 +217,10 @@ class ExtractionMetadata(BaseModel): reprocessed_pages: list[int] = Field(default_factory=list) ocr_backend_used: Optional[str] = None reasons: list[str] = Field(default_factory=list) + # --- v0.1.4: OCR routing metadata --- + ocr_strategy: str = Field(default="standard", description="OCR strategy used: 'standard' | 'math' | 'full_ocr'") + is_scanned: bool = Field(default=False, description="Whether the document was detected as scanned (no text layer)") + page_complexity_scores: dict[int, int] = Field(default_factory=dict, description="Per-page complexity scores used for OCR routing") class ChunkingConfig(BaseModel): @@ -222,12 +241,13 @@ class Chunk(BaseModel): chunk_id: str = Field(default_factory=lambda: str(uuid.uuid4())) text: str token_count: int - chunk_type: str # "section" | "table" | "table_schema" | "list" | "equation" | "continuation" + chunk_type: str # "section" | "table" | "table_schema" | "list" | "equation" | "figure" | "continuation" section_path: list[str] = Field(default_factory=list) page_numbers: list[int] = Field(default_factory=list) block_ids: list[str] = Field(default_factory=list) overlap_with_previous: bool = False equation_detected: bool = False + image_path: Optional[str] = Field(default=None, description="Path to figure image if chunk_type == 'figure'") metadata: dict = Field(default_factory=dict) # row_start, row_end, sheet, col_band diff --git a/src/longparser/utils/__init__.py b/src/longparser/utils/__init__.py index c642b45..7c7ea22 100755 --- a/src/longparser/utils/__init__.py +++ b/src/longparser/utils/__init__.py @@ -1,5 +1,14 @@ """Utility modules for LongParser.""" from .rtl_detector import detect_rtl_language +from .lang_detect import detect_language, get_tesseract_langs +from .ocr_router import is_page_scanned, score_page_complexity, get_ocr_strategy -__all__ = ["detect_rtl_language"] +__all__ = [ + "detect_rtl_language", + "detect_language", + "get_tesseract_langs", + "is_page_scanned", + "score_page_complexity", + "get_ocr_strategy", +] diff --git a/src/longparser/utils/lang_detect.py b/src/longparser/utils/lang_detect.py new file mode 100644 index 0000000..b544d4b --- /dev/null +++ b/src/longparser/utils/lang_detect.py @@ -0,0 +1,193 @@ +"""Language detection for document text samples. + +Uses ``fast-langdetect`` (Apache-2.0, Facebook FastText model) to detect +the primary language of a text sample and map it to Tesseract language codes. 
+ +This module is designed for zero-failure operation: +- Falls back to English if ``fast-langdetect`` is not installed +- Falls back to English if detection confidence is too low +- Falls back to English on any unexpected error +- Never raises exceptions that would break the pipeline + +Usage:: + + from longparser.utils.lang_detect import detect_language, get_tesseract_langs + + lang, confidence = detect_language("هذا نص عربي") # ("ar", 0.99) + tess_codes = get_tesseract_langs("ar") # ["ara"] +""" + +from __future__ import annotations + +import logging +from typing import Optional + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Mapping: ISO 639-1 code (fast-langdetect) → Tesseract language code(s) +# --------------------------------------------------------------------------- +_LANG_TO_TESSERACT: dict[str, list[str]] = { + "af": ["afr"], "am": ["amh"], "ar": ["ara"], "az": ["aze"], + "be": ["bel"], "bg": ["bul"], "bn": ["ben"], "bs": ["bos"], + "ca": ["cat"], "cs": ["ces"], "cy": ["cym"], "da": ["dan"], + "de": ["deu"], "el": ["ell"], "en": ["eng"], "es": ["spa"], + "et": ["est"], "eu": ["eus"], "fa": ["fas"], "fi": ["fin"], + "fr": ["fra"], "ga": ["gle"], "gl": ["glg"], "gu": ["guj"], + "ha": ["hau"], "he": ["heb"], "hi": ["hin"], "hr": ["hrv"], + "hu": ["hun"], "hy": ["hye"], "id": ["ind"], "is": ["isl"], + "it": ["ita"], "ja": ["jpn"], "jv": ["jav"], "ka": ["kat"], + "kk": ["kaz"], "km": ["khm"], "kn": ["kan"], "ko": ["kor"], + "la": ["lat"], "lt": ["lit"], "lv": ["lav"], "mk": ["mkd"], + "ml": ["mal"], "mn": ["mon"], "mr": ["mar"], "ms": ["msa"], + "my": ["mya"], "ne": ["nep"], "nl": ["nld"], "no": ["nor"], + "pa": ["pan"], "pl": ["pol"], "pt": ["por"], "ro": ["ron"], + "ru": ["rus"], "si": ["sin"], "sk": ["slk"], "sl": ["slv"], + "sq": ["sqi"], "sr": ["srp"], "sv": ["swe"], "sw": ["swa"], + "ta": ["tam"], "te": ["tel"], "th": ["tha"], "tl": ["tgl"], + "tr": ["tur"], "uk": ["ukr"], "ur": ["urd"], "uz": ["uzb"], + "vi": ["vie"], "yo": ["yor"], + # Chinese variants + "zh": ["chi_sim", "chi_tra"], +} + + +def detect_language( + text: str, + min_confidence: float = 0.5, +) -> tuple[str, float]: + """Detect the primary language of a text sample. + + Parameters + ---------- + text: + Text sample to analyze. At least 20 characters recommended. + min_confidence: + Minimum confidence threshold. Below this, falls back to ``"en"``. + + Returns + ------- + tuple[str, float] + ``(language_code, confidence)`` — e.g. ``("ar", 0.99)``. + Falls back to ``("en", 0.0)`` on any failure. + """ + if not text or len(text.strip()) < 20: + logger.debug("Text too short for language detection, defaulting to English") + return "en", 0.0 + + try: + from fast_langdetect import detect + result = detect(text) + lang = result.get("lang", "en") + score = result.get("score", 0.0) + + if score < min_confidence: + logger.info( + "Language detection low confidence (%.2f for '%s'), " + "defaulting to English", score, lang + ) + return "en", score + + logger.info("Detected language: %s (confidence: %.2f)", lang, score) + return lang, score + + except ImportError: + logger.warning( + "fast-langdetect is not installed. Language detection disabled. 
" + "Install with: pip install fast-langdetect" + ) + return "en", 0.0 + except Exception as e: + logger.warning("Language detection failed: %s — defaulting to English", e) + return "en", 0.0 + + +def get_tesseract_langs(lang_code: str) -> list[str]: + """Map a detected language code to Tesseract language code(s). + + Parameters + ---------- + lang_code: + ISO 639-1 language code (e.g. ``"ar"``, ``"en"``). + + Returns + ------- + list[str] + Tesseract language codes (e.g. ``["ara"]``, ``["eng"]``). + """ + return _LANG_TO_TESSERACT.get(lang_code, ["eng"]) + + +def extract_sample_text(file_path, max_chars: int = 2000) -> str: + """Extract a sample of text from a document for language detection. + + Uses a lightweight approach: reads first few KB of the file and + extracts printable text. For PDFs, attempts to use PyMuPDF if + available, otherwise falls back to reading raw bytes. + + Parameters + ---------- + file_path: + Path to the document file. + max_chars: + Maximum characters to extract. + + Returns + ------- + str + Extracted text sample, or empty string if extraction fails. + """ + from pathlib import Path + file_path = Path(file_path) + + if not file_path.exists(): + return "" + + ext = file_path.suffix.lower() + + # For PDFs: try lightweight text extraction + if ext == ".pdf": + return _extract_pdf_sample(file_path, max_chars) + + # For text-like files: read directly + if ext in (".csv", ".txt", ".md"): + try: + with open(file_path, "r", encoding="utf-8", errors="ignore") as f: + return f.read(max_chars) + except Exception: + return "" + + # For other formats: return empty (language detection will use + # text extracted by Docling later) + return "" + + +def _extract_pdf_sample(file_path, max_chars: int) -> str: + """Extract text sample from a PDF using the lightest method available.""" + # Try pdfplumber (lightweight, often available) + try: + import pdfplumber + with pdfplumber.open(str(file_path)) as pdf: + text = "" + for page in pdf.pages[:3]: # First 3 pages + page_text = page.extract_text() or "" + text += page_text + "\n" + if len(text) >= max_chars: + break + return text[:max_chars] + except ImportError: + pass + except Exception: + pass + + # Fallback: read raw bytes and extract printable chars + try: + with open(file_path, "rb") as f: + raw = f.read(max_chars * 4) # Read more bytes since not all are text + # Extract ASCII/Unicode text from raw bytes + text = raw.decode("utf-8", errors="ignore") + # Filter to printable characters + printable = "".join(c for c in text if c.isprintable() or c in "\n\t ") + return printable[:max_chars] + except Exception: + return "" diff --git a/src/longparser/utils/ocr_router.py b/src/longparser/utils/ocr_router.py new file mode 100644 index 0000000..dd3586d --- /dev/null +++ b/src/longparser/utils/ocr_router.py @@ -0,0 +1,148 @@ +"""Smart OCR routing for scanned PDFs. + +Routes pages to the best OCR strategy based on content complexity: + +- **standard** — Tesseract with default settings (fast, CPU-native) +- **math** — Tesseract for text + pix2tex for equations +- **full_ocr** — Tesseract with ``force_full_page_ocr=True`` + +All strategies are CPU-friendly. No GPU-dependent engines (Surya, Marker) +are used in the routing — those are available as separate optional backends. 
+ +Usage:: + + from longparser.utils.ocr_router import ( + is_page_scanned, score_page_complexity, get_ocr_strategy, + ) + + if is_page_scanned(page_text): + score = score_page_complexity(page_text, num_blocks=15, has_tables=True) + strategy = get_ocr_strategy(score) + # strategy = "full_ocr" for score >= 5 +""" + +from __future__ import annotations + +import logging +import re + +logger = logging.getLogger(__name__) + +# Pattern to detect math symbols and simple equations in text. +# Matches Unicode math symbols and simple algebraic patterns like "x = 5". +_MATH_RE = re.compile( + r'[\u2211\u220F\u222B\u221A\u00B1\u2264\u2265\u2248\u2260\u03B1-\u03C9\u03A3]' + r'|[a-z]\s*=\s*[a-z0-9]', + re.IGNORECASE, +) + + +def is_page_scanned(page_text: str, min_chars: int = 30) -> bool: + """Check if a page is likely scanned (no usable text layer). + + Parameters + ---------- + page_text: + Extracted text from the page. + min_chars: + Minimum character count to consider the page as having a text layer. + + Returns + ------- + bool + ``True`` if the page has fewer than ``min_chars`` printable characters + (indicating it's likely a scanned image with no embedded text). + """ + clean = page_text.strip() + return len(clean) < min_chars + + +def has_math_content(text: str) -> bool: + """Check if text contains mathematical symbols or equation patterns. + + Parameters + ---------- + text: + Text to check for math content. + + Returns + ------- + bool + ``True`` if math symbols or equation patterns are found. + """ + return bool(_MATH_RE.search(text)) + + +def score_page_complexity( + page_text: str, + num_blocks: int = 0, + has_tables: bool = False, +) -> int: + """Score page complexity on a scale of 0-10. + + Used to decide which OCR strategy to apply: + + - **0-2** → ``"standard"`` — Simple page, Tesseract is enough + - **3-4** → ``"math"`` — Has equations, add pix2tex + - **5+** → ``"full_ocr"`` — Complex layout, use full-page OCR + + Parameters + ---------- + page_text: + Extracted text from the page. + num_blocks: + Number of content blocks on the page. + has_tables: + Whether the page contains tables. + + Returns + ------- + int + Complexity score from 0 to 10. + """ + score = 0 + + # Tables add significant complexity + if has_tables: + score += 3 + + # Math content needs pix2tex + if has_math_content(page_text): + score += 2 + + # Many blocks suggest a dense/complex layout + if num_blocks > 20: + score += 2 + elif num_blocks > 10: + score += 1 + + # Very short text on a page with blocks = likely OCR issues + if page_text and len(page_text.strip()) < 100 and num_blocks > 5: + score += 1 + + return min(score, 10) + + +def get_ocr_strategy(complexity_score: int) -> str: + """Pick OCR strategy based on page complexity score. + + Parameters + ---------- + complexity_score: + Score from :func:`score_page_complexity` (0-10). + + Returns + ------- + str + One of: + + - ``"standard"`` — Tesseract with default settings + - ``"math"`` — Tesseract + pix2tex for equations + - ``"full_ocr"`` — Tesseract with ``force_full_page_ocr=True`` + """ + if complexity_score <= 2: + return "standard" + elif complexity_score <= 4: + return "math" + else: + return "full_ocr" diff --git a/tests/benchmarks/benchmark_pipeline.py b/tests/benchmarks/benchmark_pipeline.py new file mode 100644 index 0000000..716ee44 --- /dev/null +++ b/tests/benchmarks/benchmark_pipeline.py @@ -0,0 +1,98 @@ +"""Pipeline performance benchmark for regression testing. + +Run this BEFORE and AFTER v0.2.x changes to prove no speed regression. 
+ +Usage: + # Save baseline (v0.1.3) + python tests/benchmarks/benchmark_pipeline.py > benchmark_v013.txt + + # After v0.2.x changes + python tests/benchmarks/benchmark_pipeline.py > benchmark_v020.txt + + # Compare + diff benchmark_v013.txt benchmark_v020.txt +""" + +import time +import sys +from pathlib import Path + + +def benchmark_file(file_path: str) -> dict: + """Benchmark a single file through the pipeline.""" + from longparser import DocumentPipeline, ProcessingConfig + + path = Path(file_path) + if not path.exists(): + return {"file": file_path, "status": "SKIPPED (file not found)"} + + pipeline = DocumentPipeline() + config = ProcessingConfig() + + t0 = time.time() + try: + result = pipeline.process_file(path, config=config) + elapsed = time.time() - t0 + + return { + "file": path.name, + "time_seconds": round(elapsed, 2), + "total_blocks": result.total_blocks, + "total_pages": result.document.metadata.total_pages, + "status": "OK", + } + except Exception as e: + elapsed = time.time() - t0 + return { + "file": path.name, + "time_seconds": round(elapsed, 2), + "status": f"ERROR: {e}", + } + + +def main(): + """Run benchmark on all available test fixtures.""" + # Look for test PDFs in common locations + fixture_dirs = [ + Path("tests/fixtures"), + Path("tests"), + Path("uploads"), + ] + + test_files = [] + for d in fixture_dirs: + if d.exists(): + test_files.extend(sorted(d.glob("*.pdf"))) + + if not test_files: + print("No PDF test files found in tests/fixtures/ or uploads/") + print("Place some PDFs there and re-run.") + sys.exit(1) + + print("=" * 60) + print("LongParser Pipeline Benchmark") + print("=" * 60) + print(f"Files found: {len(test_files)}") + print() + + results = [] + for f in test_files[:5]: # Cap at 5 files for reasonable benchmark time + print(f"Benchmarking: {f.name} ...", end=" ", flush=True) + result = benchmark_file(str(f)) + results.append(result) + print(f"{result.get('time_seconds', '?')}s — {result['status']}") + + print() + print("-" * 60) + print(f"{'File':<30} {'Time':>8} {'Blocks':>8} {'Pages':>6}") + print("-" * 60) + for r in results: + if r["status"] == "OK": + print(f"{r['file']:<30} {r['time_seconds']:>7.2f}s {r['total_blocks']:>8} {r['total_pages']:>6}") + else: + print(f"{r['file']:<30} {r['status']}") + print("-" * 60) + + +if __name__ == "__main__": + main() diff --git a/tests/unit/test_backward_compat.py b/tests/unit/test_backward_compat.py new file mode 100644 index 0000000..fae7d49 --- /dev/null +++ b/tests/unit/test_backward_compat.py @@ -0,0 +1,142 @@ +"""Backward compatibility tests for v0.2.x changes. + +Ensures that users who wrote code against v0.1.3 can upgrade to v0.2.x +without changing a single line of their code. Every new field must have +a default that matches the v0.1.3 behavior. 
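+
+For example, a typical v0.1.3 call pattern such as::
+
+    from longparser import DocumentPipeline, ProcessingConfig
+
+    pipeline = DocumentPipeline()
+    result = pipeline.process_file("report.pdf", config=ProcessingConfig())
+
+must keep working unchanged, with the Docling backend still selected by
+default ("report.pdf" here is only a placeholder file name).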
+""" + +import pytest + + +class TestProcessingConfigCompat: + """ProcessingConfig() with no args must behave exactly like v0.1.3.""" + + def test_default_values_match_v013(self): + from longparser.schemas import ProcessingConfig + config = ProcessingConfig() + + # v0.1.3 defaults — these must NEVER change + assert config.academic_mode is False + assert config.rtl_hint is False + assert config.do_ocr is True + assert config.formula_ocr is True + assert config.do_table_structure is True + assert config.export_images is True + assert config.formula_mode == "smart" + assert config.smart_max_equations == 25 + assert config.smart_max_ocr_seconds == 300.0 + assert config.exclude_page_headers_footers is True + + def test_new_fields_have_safe_defaults(self): + """New v0.2.x fields must default to values that don't change behavior.""" + from longparser.schemas import ProcessingConfig + config = ProcessingConfig() + + # backend must default to docling (existing behavior) + backend = getattr(config, "backend", "docling") + assert backend == "docling" + + # auto_detect_language defaults to True but only runs if languages=None + auto_detect = getattr(config, "auto_detect_language", True) + assert auto_detect is True + + # languages=None means "use existing tesseract_lang param" + languages = getattr(config, "languages", None) + assert languages is None + + +class TestDocumentMetadataCompat: + """DocumentMetadata must keep all v0.1.3 fields.""" + + def test_v013_fields_exist(self): + from longparser.schemas import DocumentMetadata + meta = DocumentMetadata(source_file="test.pdf") + + assert meta.source_file == "test.pdf" + assert meta.file_hash == "" + assert meta.language is None + assert meta.total_pages == 0 + assert meta.academic_mode is False + assert meta.rtl_hint is False + + +class TestBlockCompat: + """Block schema must keep all v0.1.3 fields and types.""" + + def test_block_type_values_unchanged(self): + from longparser.schemas import BlockType + + # All v0.1.3 values must still exist + assert BlockType.HEADING == "heading" + assert BlockType.PARAGRAPH == "paragraph" + assert BlockType.LIST_ITEM == "list_item" + assert BlockType.TABLE == "table" + assert BlockType.FIGURE == "figure" + assert BlockType.CAPTION == "caption" + assert BlockType.FOOTER == "footer" + assert BlockType.HEADER == "header" + assert BlockType.EQUATION == "equation" + assert BlockType.CODE == "code" + + def test_extractor_type_values_unchanged(self): + from longparser.schemas import ExtractorType + + # All v0.1.3 values must still exist + assert ExtractorType.DOCLING == "docling" + assert ExtractorType.SURYA == "surya" + assert ExtractorType.MARKER == "marker" + assert ExtractorType.NATIVE_PDF == "native_pdf" + assert ExtractorType.PADDLE == "paddle" + + +class TestChunkCompat: + """Chunk schema must keep all v0.1.3 fields.""" + + def test_chunk_fields_exist(self): + from longparser.schemas import Chunk + chunk = Chunk(text="test", token_count=1, chunk_type="section") + + assert chunk.text == "test" + assert chunk.token_count == 1 + assert chunk.chunk_type == "section" + assert chunk.section_path == [] + assert chunk.page_numbers == [] + assert chunk.block_ids == [] + assert chunk.overlap_with_previous is False + assert chunk.equation_detected is False + + +class TestPublicAPICompat: + """All v0.1.3 public names must still be importable.""" + + def test_all_v013_exports_available(self): + from longparser import ( # noqa: F401 + __version__, + Document, + Page, + Block, + Table, + TableCell, + BlockType, + ExtractorType, + 
ProcessingConfig, + BoundingBox, + Provenance, + Confidence, + BlockFlags, + DocumentMetadata, + PageProfile, + ExtractionMetadata, + ChunkingConfig, + Chunk, + JobRequest, + JobResult, + ) + + def test_lazy_imports_still_work(self): + """Lazy imports from v0.1.3 must still resolve.""" + from longparser import DocumentPipeline # noqa: F401 + from longparser import PipelineOrchestrator # noqa: F401 + from longparser import PipelineResult # noqa: F401 + from longparser import HybridChunker # noqa: F401 + from longparser import DoclingExtractor # noqa: F401 diff --git a/tests/unit/test_license_safety.py b/tests/unit/test_license_safety.py new file mode 100644 index 0000000..8afac8b --- /dev/null +++ b/tests/unit/test_license_safety.py @@ -0,0 +1,82 @@ +"""License safety tests — ensure GPL/AGPL packages are never loaded by default. + +These tests verify that importing ``longparser`` and using its default +pipeline does NOT load any GPL/AGPL-licensed package (pymupdf4llm, marker, +surya). This is critical to maintain LongParser's MIT license. +""" + +import sys +import pytest + + +# Packages that must NEVER appear in sys.modules after a default import +_BLOCKED_MODULES = [ + "pymupdf4llm", + "pymupdf", + "fitz", # PyMuPDF's internal module name + "marker", + "marker.converters", + "surya", + "surya.ocr", +] + + +def _clear_blocked_modules(): + """Remove any pre-loaded blocked modules from sys.modules.""" + for mod_name in list(sys.modules): + for blocked in _BLOCKED_MODULES: + if mod_name == blocked or mod_name.startswith(blocked + "."): + del sys.modules[mod_name] + + +class TestLicenseSafety: + """Verify that core imports do not load GPL/AGPL dependencies.""" + + def test_import_longparser_does_not_load_agpl(self): + """``import longparser`` must not load any GPL/AGPL module.""" + _clear_blocked_modules() + + import longparser # noqa: F401 + + for mod_name in _BLOCKED_MODULES: + assert mod_name not in sys.modules, ( + f"GPL/AGPL module '{mod_name}' was loaded by 'import longparser'. " + f"This violates the MIT license isolation. " + f"Check __init__.py and extractors/__init__.py for stray imports." + ) + + def test_import_schemas_does_not_load_agpl(self): + """``from longparser.schemas import ...`` must not load GPL/AGPL.""" + _clear_blocked_modules() + + from longparser.schemas import ( # noqa: F401 + ProcessingConfig, Document, Block, Chunk + ) + + for mod_name in _BLOCKED_MODULES: + assert mod_name not in sys.modules, ( + f"GPL/AGPL module '{mod_name}' was loaded by schema import." + ) + + def test_processing_config_default_backend_is_docling(self): + """Default backend must be 'docling' (MIT), not a GPL/AGPL backend.""" + from longparser.schemas import ProcessingConfig + config = ProcessingConfig() + + # If backend field exists, it must default to docling + backend = getattr(config, "backend", "docling") + assert backend == "docling", ( + f"Default backend is '{backend}', expected 'docling'. " + f"Defaulting to a GPL/AGPL backend would violate MIT license." + ) + + def test_pymupdf_extractor_not_in_extractors_init(self): + """PyMuPDFExtractor must NOT be exported from extractors/__init__.py.""" + from longparser import extractors + + public_names = getattr(extractors, "__all__", dir(extractors)) + + assert "PyMuPDFExtractor" not in public_names, ( + "PyMuPDFExtractor must NOT be in extractors/__init__.py. " + "It must only be imported lazily when backend='pymupdf' is set." + )
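+
+    def test_import_utils_does_not_load_agpl(self):
+        """Importing ``longparser.utils`` must not pull in GPL/AGPL modules.
+
+        A lightweight extra check in the same spirit as the tests above; it
+        assumes the utils modules (lang_detect, ocr_router, rtl_detector)
+        keep all heavy/optional imports inside function bodies, as they do
+        in this changeset.
+        """
+        _clear_blocked_modules()
+
+        from longparser.utils import detect_language, get_ocr_strategy  # noqa: F401
+
+        for mod_name in _BLOCKED_MODULES:
+            assert mod_name not in sys.modules, (
+                f"GPL/AGPL module '{mod_name}' was loaded by importing longparser.utils."
+            )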