50 changes: 50 additions & 0 deletions .github/workflows/license-check.yml
@@ -0,0 +1,50 @@
name: License Safety Check

on: [push, pull_request]

jobs:
  license-check:
    name: Ensure no GPL/AGPL imports in core
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Check core files for GPL/AGPL imports
        run: |
          echo "=== License Safety Check ==="
          echo "Verifying no GPL/AGPL package is imported in core code..."
          echo ""

          FAIL=0

          # List of GPL/AGPL package import patterns to block
          BLOCKED_PATTERNS="import[[:space:]]+pymupdf|from[[:space:]]+pymupdf|import[[:space:]]+marker\.|from[[:space:]]+marker\.|import[[:space:]]+surya|from[[:space:]]+surya"

          # Files that ARE allowed to import these (isolated backends)
          ALLOWED_FILES=(
            "pymupdf_extractor.py"
            "marker_extractor.py"
          )

          # Build grep exclude args
          EXCLUDE_ARGS=""
          for f in "${ALLOWED_FILES[@]}"; do
            EXCLUDE_ARGS="$EXCLUDE_ARGS --exclude=$f"
          done

          # Search all Python files in src/longparser EXCEPT allowed files
          MATCHES=$(grep -rnE "$BLOCKED_PATTERNS" src/longparser/ \
            --include='*.py' $EXCLUDE_ARGS || true)

          if [ -n "$MATCHES" ]; then
            echo "❌ FAIL: GPL/AGPL imports found in core code!"
            echo ""
            echo "$MATCHES"
            echo ""
            echo "These packages must ONLY be imported in their isolated extractor files."
            FAIL=1
          else
            echo "✅ PASS: No GPL/AGPL imports in core code."
          fi

          exit $FAIL
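The grep logic in the workflow can be exercised locally before pushing. A minimal sketch under assumed paths — the sandbox directory, file names, and the shortened pattern list are illustrative only:

```shell
#!/bin/sh
# Sandbox: one offending core file plus one allowed isolated backend file.
set -eu
tmp=$(mktemp -d)
mkdir -p "$tmp/src/longparser"
printf 'import pymupdf4llm\n' > "$tmp/src/longparser/core.py"
printf 'import pymupdf4llm\n' > "$tmp/src/longparser/pymupdf_extractor.py"

BLOCKED="import[[:space:]]+pymupdf|from[[:space:]]+pymupdf"
# --exclude skips the isolated backend; || true keeps a clean exit on no match.
MATCHES=$(grep -rnE "$BLOCKED" "$tmp/src/longparser/" \
  --include='*.py' --exclude=pymupdf_extractor.py || true)
echo "$MATCHES"
```

Only `core.py` should appear in `$MATCHES`; the excluded backend file is skipped even though it contains the same import.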
150 changes: 150 additions & 0 deletions FEATURE_ROADMAP.md
@@ -0,0 +1,150 @@
# LongParser — Product & Feature Roadmap

> This roadmap reflects the current development direction based on community trends,
> competitor analysis, and the RAG ecosystem in 2025–2026. Items are ordered by
> priority within each phase. All dates are targets, not guarantees.

---

## Current State — v0.1.x ✅

- 6-stage extraction pipeline (Extract → Validate → HITL → Chunk → Embed → Index)
- Multi-format support: PDF, DOCX, PPTX, XLSX, CSV via Docling
- `HybridChunker` — 6-strategy token-aware, hierarchy-aware, table-aware chunking
- Human-in-the-Loop (HITL) review via LangGraph `interrupt()`
- 3-layer memory chat engine (short-term + rolling summary + long-term facts)
- Multi-provider LLM: OpenAI, Gemini, Groq, OpenRouter
- Multi-backend vector stores: Chroma, FAISS, Qdrant
- FastAPI REST server + ARQ/Redis job queue + Motor/MongoDB
- LangChain `BaseRetriever` + LlamaIndex `BaseReader` adapters
- CPU / GPU install separation via extras

---

## Phase 1 — Accuracy & Quality (v0.2.x) — Q2 2026

### Parser Enhancements

- [ ] **Marker backend** — add `marker-pdf` as an optional extraction backend for higher-fidelity Markdown output on complex academic PDFs
- [ ] **PyMuPDF4LLM backend** — lightweight, fast alternative for speed-critical pipelines (10× faster than Docling for simple PDFs)
- [ ] **Scanned PDF fast path** — route documents to Tesseract vs pix2tex vs Surya automatically based on page complexity score
- [ ] **Multi-column layout detection** — prevent reading-order errors in newspaper/journal-style layouts
- [ ] **Image extraction** — export embedded figures with captions into separate chunks with `type: figure`
- [ ] **Document language auto-detection** — select OCR model automatically based on detected script

### Chunking Improvements

- [ ] **Semantic chunking** — optional embedding-based boundary detection (split at semantic shifts, not just token counts)
- [ ] **Sliding window overlap** — configurable overlap strategy per chunk type (more overlap for tables, less for headings)
- [ ] **Cross-reference resolution** — link `(see Figure 3)` and `(Table 2)` references to their target blocks
- [ ] **Summary chunks** — auto-generate a 1–2 sentence summary chunk per section for hierarchical retrieval
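The semantic-chunking item above can be sketched as cosine-similarity boundary detection between consecutive sentence embeddings. This is a sketch only: `embed` stands in for any embedding function, and the `0.5` threshold is an illustrative default, not a tuned value:

```python
import math

def cosine(a, b):
    # Cosine similarity between two equal-length vectors.
    dot = sum(x * y for x, y in zip(a, b))
    na = math.sqrt(sum(x * x for x in a))
    nb = math.sqrt(sum(x * x for x in b))
    return dot / (na * nb) if na and nb else 0.0

def semantic_chunks(sentences, embed, threshold=0.5):
    """Start a new chunk whenever similarity to the previous sentence drops."""
    if not sentences:
        return []
    vectors = [embed(s) for s in sentences]
    chunks, current = [], [sentences[0]]
    for prev, vec, sent in zip(vectors, vectors[1:], sentences[1:]):
        if cosine(prev, vec) < threshold:  # semantic shift -> chunk boundary
            chunks.append(" ".join(current))
            current = []
        current.append(sent)
    chunks.append(" ".join(current))
    return chunks
```

A production version would compare against a rolling window of recent sentences rather than only the immediate predecessor, but the boundary rule is the same.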

### Quality & Validation

- [ ] **Chunk quality scorer** — assign a confidence score per chunk based on OCR confidence, completeness, and structural integrity
- [ ] **PII detection** — flag and optionally redact personal information (names, emails, phone numbers) before embedding
- [ ] **Duplicate block detection** — suppress repeated headers/footers that appear on every page
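The duplicate-block item can be sketched as a frequency count across pages: any line that recurs on most pages is treated as a running header or footer and dropped. The `0.8` fraction is an assumed default:

```python
from collections import Counter

def strip_repeated_lines(pages, min_fraction=0.8):
    """Remove lines repeating on >= min_fraction of pages (headers/footers)."""
    counts = Counter()
    for page in pages:
        # set() so a line repeated within one page counts once per page
        counts.update(set(page.splitlines()))
    cutoff = min_fraction * len(pages)
    boiler = {line for line, n in counts.items() if n >= cutoff and line.strip()}
    return [
        "\n".join(l for l in page.splitlines() if l not in boiler)
        for page in pages
    ]
```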

---

## Phase 2 — Agentic & Multimodal (v0.3.x) — Q3 2026

### Agentic RAG

- [ ] **Agentic retrieval loop** — implement query rewriting + iterative retrieval + self-reflection before answer generation
- [ ] **Multi-hop question answering** — chain retrieval steps for questions that span multiple sections or documents
- [ ] **Tool-calling integration** — expose document pipeline as a LangChain/LangGraph tool callable by autonomous agents
- [ ] **Hypothetical Document Embeddings (HyDE)** — generate hypothetical answers to queries for improved retrieval recall
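The HyDE item reduces to a small composition: generate a hypothetical answer, then embed that instead of the raw question. In this sketch `llm` and `embed` are stand-ins for whatever model callables the pipeline configures; the prompt wording is illustrative:

```python
def hyde_vector(question, llm, embed):
    """HyDE: embed a hypothetical answer instead of the raw question.

    The generated passage usually lies closer to real answer passages in
    embedding space than the question itself does, improving recall.
    """
    prompt = f"Write a short passage that plausibly answers: {question}"
    hypothetical = llm(prompt)
    return embed(hypothetical)
```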

### Multimodal

- [ ] **Vision-Language Model (VLM) integration** — use GPT-4o / Gemini Vision to describe figures, charts, and diagrams as text chunks
- [ ] **Chart data extraction** — parse bar/line/pie charts into structured data tables
- [ ] **Slide layout understanding** — treat PPTX slides as visual units with spatial layout context, not just text extraction

### Reranking & Retrieval

- [ ] **Cross-encoder reranker** — add optional `sentence-transformers` cross-encoder reranking step after initial retrieval
- [ ] **Hybrid search** — combine dense vector search with BM25 sparse retrieval (reciprocal rank fusion)
- [ ] **Maximum Marginal Relevance (MMR)** — reduce redundancy in retrieved chunks
- [ ] **Metadata filtering** — filter chunks by `page_number`, `section`, `doc_type`, `date` at query time
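The reciprocal rank fusion mentioned in the hybrid-search item is small enough to show in full. A sketch: each chunk ID scores `1 / (k + rank)` per ranked list, and `k=60` is the constant from the original RRF paper:

```python
def reciprocal_rank_fusion(rankings, k=60):
    """Fuse ranked lists of chunk IDs (e.g. dense vectors + BM25)."""
    scores = {}
    for ranking in rankings:
        for rank, doc_id in enumerate(ranking, start=1):
            scores[doc_id] = scores.get(doc_id, 0.0) + 1.0 / (k + rank)
    # Highest fused score first
    return sorted(scores, key=scores.get, reverse=True)
```

An item ranked well by both retrievers beats one ranked first by only one of them, which is exactly the behavior hybrid search wants.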

---

## Phase 3 — Enterprise & Observability (v0.4.x) — Q4 2026

### Knowledge Graph

- [ ] **Entity extraction** — extract named entities (people, organizations, dates, locations) from chunks
- [ ] **Relationship mapping** — build entity relationship graphs from document content
- [ ] **Graph-based retrieval** — traverse the entity graph for multi-hop retrieval (GraphRAG pattern)
- [ ] **Neo4j / NetworkX integration** — persist the knowledge graph to a graph database
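The entity-extraction and relationship-mapping items above can be approximated with a naive co-occurrence graph: entities sharing a chunk get an edge. This sketch uses a plain adjacency dict; `extract_entities` is a stand-in for a real NER step:

```python
from collections import defaultdict
from itertools import combinations

def build_cooccurrence_graph(chunks, extract_entities):
    """Link entities that appear in the same chunk (a naive GraphRAG substrate)."""
    graph = defaultdict(set)
    for chunk in chunks:
        for a, b in combinations(sorted(set(extract_entities(chunk))), 2):
            graph[a].add(b)
            graph[b].add(a)
    return graph
```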

### Evaluation Framework

- [ ] **Built-in RAG evaluator** — measure retrieval recall@k, answer faithfulness, and context adherence
- [ ] **Chunk attribution** — trace every answer sentence back to the source chunk and page
- [ ] **RAGAS integration** — plug into the RAGAS evaluation framework
- [ ] **Benchmark suite** — reproducible benchmarks against Unstructured, LlamaParse, Docling standalone
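The retrieval recall@k metric named above is simple enough to pin down here — the fraction of known-relevant chunk IDs that appear in the top-k retrieved results:

```python
def recall_at_k(retrieved_ids, relevant_ids, k):
    """Fraction of relevant chunks that appear in the top-k retrieved."""
    if not relevant_ids:
        return 0.0
    hits = len(set(retrieved_ids[:k]) & set(relevant_ids))
    return hits / len(relevant_ids)
```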

### Observability & Compliance

- [ ] **LangSmith integration** — trace every pipeline run end-to-end
- [ ] **OpenTelemetry support** — emit spans/traces to any OTel-compatible backend
- [ ] **Audit log** — immutable log of every HITL decision (approve/reject/edit) with timestamps and user IDs
- [ ] **GDPR compliance mode** — PII redaction + right-to-erasure support (delete all chunks for a document)
- [ ] **Role-based access control (RBAC)** — multi-tenant document access in the REST API

---

## Phase 4 — Scale & Ecosystem (v0.5.x+) — 2027

### Performance & Scale

- [ ] **Async parallel extraction** — process multiple documents concurrently in the background worker
- [ ] **Streaming extraction** — yield blocks as they are extracted (no need to wait for full document)
- [ ] **Incremental indexing** — update only changed pages/sections on re-upload
- [ ] **S3 / GCS / Azure Blob** — native cloud storage input (not just local files)
- [ ] **Kubernetes Helm chart** — one-command production deployment

### New Integrations

- [ ] **Weaviate** vector store adapter
- [ ] **Pinecone** vector store adapter
- [ ] **Milvus** vector store adapter
- [ ] **DSPy** integration — use DSPy to auto-optimize retrieval prompts
- [ ] **Haystack `DocumentConverter`** component
- [ ] **Flowise / Langflow** node — drag-and-drop visual pipeline builder support

### Developer Experience

- [ ] **LongParser CLI** — `longparser parse document.pdf --output chunks.json`
- [ ] **Web UI (HITL Dashboard)** — visual interface for reviewing and editing blocks before embedding
- [ ] **VS Code extension** — preview parsed chunks directly from the editor
- [ ] **Webhook support** — notify external systems when a job completes or requires HITL review

---

## Competitive Positioning

| Capability | LongParser | Unstructured | LlamaParse | Docling |
|---|---|---|---|---|
| Privacy-first (fully local) | ✅ | ⚠️ (cloud option) | ❌ (API-only) | ✅ |
| HITL review workflow | ✅ | ❌ | ❌ | ❌ |
| Bundled REST API server | ✅ | ✅ (paid) | ✅ (cloud) | ❌ |
| Table-aware chunking | ✅ | ⚠️ | ✅ | ✅ |
| LaTeX / equation OCR | ✅ | ❌ | ⚠️ | ⚠️ |
| LangChain + LlamaIndex | ✅ | ✅ | ✅ | ⚠️ |
| Open source (MIT) | ✅ | ⚠️ (core only) | ❌ | ✅ |
| Knowledge graph (planned) | 🔜 | ❌ | ❌ | ❌ |
| Agentic retrieval (planned) | 🔜 | ❌ | ⚠️ | ❌ |

---

## Guiding Principles

1. **Privacy by default** — all processing runs locally; no data leaves user infrastructure
2. **Human oversight** — HITL is a first-class citizen, not an afterthought
3. **Composable** — every stage is independently usable; no forced lock-in to the full stack
4. **Production-grade** — async, typed, tested, documented from day one
5. **Ecosystem-native** — LangChain, LlamaIndex, and HuggingFace are first-class integration targets
50 changes: 50 additions & 0 deletions LICENSE-THIRD-PARTY.md
@@ -0,0 +1,50 @@
# Third-Party Licenses

LongParser core is licensed under the **MIT License**.

Some **optional** backends and integrations use different licenses.
These packages are **never loaded by default** — they are only imported
when you explicitly install them and select them in your configuration.

## Optional Backend Licenses

| Package | License | Install Command | When Loaded |
|---------|---------|-----------------|-------------|
| `pymupdf4llm` | AGPL-3.0 or Artifex Commercial | `pip install "longparser[pymupdf]"` | Only when you set `backend="pymupdf"` |
| `marker-pdf` | GPL-3.0-or-later | `pip install "longparser[marker]"` | Only when you set `backend="marker"` *(future)* |
| `surya-ocr` | GPL-3.0-or-later | `pip install "longparser[surya]"` | Only when explicitly imported *(future)* |

## Core Dependency Licenses (always installed)

| Package | License | Purpose |
|---------|---------|---------|
| `pydantic` | MIT | Schema validation |
| `docling` | MIT | Default PDF extraction engine |
| `docling-core` | MIT | Docling data models |
| `fast-langdetect` | Apache-2.0 | Document language detection |

## What This Means for You

- **If you only use `pip install longparser`** — everything is MIT or Apache-2.0.
You can use LongParser in any project (commercial, proprietary, open source).

- **If you install `longparser[pymupdf]`** — the `pymupdf4llm` library is
AGPL-3.0 licensed. You must comply with AGPL terms for the PyMuPDF component,
OR purchase a commercial license from [Artifex](https://artifex.com).
LongParser core code remains MIT.

- **If you install `longparser[marker]`** *(future)* — the `marker-pdf` library
is GPL-3.0 licensed. You must comply with GPL terms for the Marker component.
LongParser core code remains MIT.

## License Isolation Guarantee

LongParser uses **lazy imports** to ensure GPL/AGPL packages are never loaded
unless explicitly requested. The following guarantees hold:

1. `import longparser` does NOT import any GPL/AGPL package
2. `from longparser import DocumentPipeline` does NOT import any GPL/AGPL package
3. `DocumentPipeline().process_file("doc.pdf")` does NOT import any GPL/AGPL
package (uses Docling, which is MIT)
4. GPL/AGPL code is only loaded when you explicitly set `backend="pymupdf"` or
`backend="marker"` in `ProcessingConfig`
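The mechanism behind these guarantees is the PEP 562 module-level `__getattr__` visible in the `src/longparser/__init__.py` diff in this PR. The following standalone sketch simulates it with a throwaway module — `demo_pkg` and `HeavyBackend` are invented names for illustration, not part of LongParser's API:

```python
import sys
import types

# Build a throwaway module to demonstrate PEP 562 lazy attribute access.
pkg = types.ModuleType("demo_pkg")

def _lazy_getattr(name):
    if name == "HeavyBackend":
        # In the real package this would be the deferred GPL/AGPL import.
        heavy = types.ModuleType("demo_pkg._heavy")
        heavy.loaded = True
        return heavy
    raise AttributeError(f"module 'demo_pkg' has no attribute {name!r}")

pkg.__getattr__ = _lazy_getattr  # PEP 562: consulted only on lookup miss
sys.modules["demo_pkg"] = pkg

import demo_pkg
backend = demo_pkg.HeavyBackend  # only now does the "heavy" import run
```

Until the attribute is actually touched, nothing heavyweight executes — which is exactly why `import longparser` alone never pulls in a GPL/AGPL dependency.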
2 changes: 1 addition & 1 deletion docs/getting-started/installation.md
@@ -104,5 +104,5 @@ The server starts on `http://localhost:8000`.

```python
import longparser
-print(longparser.__version__)  # 0.1.3
+print(longparser.__version__)  # 0.1.4
```
17 changes: 16 additions & 1 deletion pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta"

[project]
name = "longparser"
-version = "0.1.3"
+version = "0.1.4"
description = "Privacy-first document intelligence engine — converts PDFs, DOCX, PPTX, XLSX, and CSV into AI-ready Markdown + structured JSON for RAG pipelines."
readme = {file = "README.md", content-type = "text/markdown"}
requires-python = ">=3.10"
@@ -36,6 +36,7 @@ dependencies = [
"docling>=2.14",
"docling-core>=2.13",
"langgraph-checkpoint-mongodb>=0.3.1",
"fast-langdetect>=0.3,<1.0", # Apache-2.0 — document language detection
]

[project.optional-dependencies]
@@ -51,6 +52,20 @@ langchain = [
llamaindex = [
"llama-index-core>=0.10",
]
# ----------- v0.1.4: Optional extraction backends -----------
# ⚠️ pymupdf4llm is AGPL-3.0 licensed. See LICENSE-THIRD-PARTY.md.
# Only loaded when user sets backend="pymupdf".
pymupdf = [
"pymupdf4llm>=1.27",
]
# ⚠️ marker-pdf is GPL-3.0. GPU recommended. Future release.
# marker = [
# "marker-pdf",
# ]
# ⚠️ surya-ocr is GPL-3.0. GPU recommended. Future release.
# surya = [
# "surya-ocr>=0.17",
# ]
# FastAPI REST server + MongoDB + job queue + LangChain chat engine
server = [
"fastapi>=0.115",
7 changes: 6 additions & 1 deletion src/longparser/__init__.py
@@ -25,7 +25,7 @@

from __future__ import annotations

-__version__ = "0.1.3"
+__version__ = "0.1.4"
__author__ = "ENDEVSOLS Team"
__license__ = "MIT"

@@ -59,6 +59,10 @@ def __getattr__(name: str):
if name == "DoclingExtractor":
from .extractors import DoclingExtractor
return DoclingExtractor
if name == "PyMuPDFExtractor":
# AGPL-isolated — only loaded when explicitly requested
from .extractors.pymupdf_extractor import PyMuPDFExtractor
return PyMuPDFExtractor
if name == "PipelineOrchestrator":
from .pipeline import PipelineOrchestrator
return PipelineOrchestrator
@@ -101,6 +105,7 @@ def __getattr__(name: str):
"JobResult",
# Lazily imported (require extras)
"DoclingExtractor",
"PyMuPDFExtractor",
"PipelineOrchestrator",
"DocumentPipeline",
"PipelineResult",