From 5241da6fb53dbd549d8db424d03783a9708a5e4d Mon Sep 17 00:00:00 2001
From: Mohsin Ali
Date: Wed, 8 Apr 2026 11:42:30 +0500
Subject: [PATCH 1/7] Fix graph checkpointer and other bugs

---
 .env.example                               |  23 +-
 .gitignore                                 |   9 +-
 Dockerfile                                 |   6 +-
 README.md                                  |  12 +-
 docker-compose.yml                         |  22 +-
 pyproject.toml                             |   1 +
 src/longparser/chunkers/hybrid_chunker.py  |   4 +-
 .../extractors/docling_extractor.py        |  31 +-
 src/longparser/server/app.py               | 158 ++++++++-
 src/longparser/server/chat/checkpointer.py |  45 +++
 src/longparser/server/chat/graph.py        |  19 +-
 src/longparser/server/chat/llm_chain.py    |   8 +-
 src/longparser/server/db.py                |  10 +-
 src/longparser/server/embeddings.py        |   6 +-
 src/longparser/server/queue.py             |   7 +-
 src/longparser/server/vectorstores.py      |  16 +-
 src/longparser/server/worker.py            |  10 +-
 tests/unit/test_llm_chain.py               |  23 +-
 uv.lock                                    | 325 +++++++++++-------
 19 files changed, 510 insertions(+), 225 deletions(-)
 create mode 100644 src/longparser/server/chat/checkpointer.py

diff --git a/.env.example b/.env.example
index d50bb3e..9f80665 100644
--- a/.env.example
+++ b/.env.example
@@ -5,11 +5,25 @@
 # ============================================================
 
 # ── Database ─────────────────────────────────────────────────
+# Local dev (no auth):
 LONGPARSER_MONGO_URL=mongodb://localhost:27017
+# Docker Compose (auth handled by docker-compose.yml override):
+#   No need to change — docker-compose sets the authenticated URL automatically.
+# Production (with auth):
+#   LONGPARSER_MONGO_URL=mongodb://USER:PASSWORD@host:27017/longparser?authSource=admin
 LONGPARSER_DB_NAME=longparser
 
 # ── Job Queue (Redis / ARQ) ───────────────────────────────────
+# Local dev (no auth):
 LONGPARSER_REDIS_URL=redis://localhost:6379
+# Production (with auth):
+#   LONGPARSER_REDIS_URL=redis://:PASSWORD@host:6379
+
+# ── Docker Auth Credentials (used by docker-compose.yml) ──────
+# Change these before deploying. Defaults are for local dev only.
+MONGO_USER=longparser
+MONGO_PASS=longparser
+REDIS_PASS=longparser
 
 # ── File Storage ──────────────────────────────────────────────
 LONGPARSER_UPLOAD_DIR=./uploads
@@ -17,7 +31,7 @@ LONGPARSER_UPLOAD_DIR=./uploads
 # ── LLM Provider ─────────────────────────────────────────────
 # One of: openai | gemini | groq | openrouter
 LONGPARSER_LLM_PROVIDER=openai
-LONGPARSER_LLM_MODEL=gpt-4o
+LONGPARSER_LLM_MODEL=gpt-5.3
 
 # ── API Keys ──────────────────────────────────────────────────
 OPENAI_API_KEY=sk-...
@@ -41,3 +55,10 @@ QDRANT_API_KEY=          # Required only for Qdrant Cloud
 LONGPARSER_OCR_BACKEND=easyocr
 LONGPARSER_OCR_USE_GPU=false
 
+# ── Security (added by audit) ────────────────────────────────
+# CORS allowed origins (comma-separated). Default: * (all origins)
+# LONGPARSER_CORS_ORIGINS=https://app.example.com,https://admin.example.com
+# Rate limit: max requests per minute per tenant. Default: 60
+# LONGPARSER_RATE_LIMIT=60
+# Admin API keys (comma-separated). If empty, all users are admin.
+# LONGPARSER_ADMIN_KEYS=key1,key2 \ No newline at end of file diff --git a/.gitignore b/.gitignore index 31ca885..338a52d 100644 --- a/.gitignore +++ b/.gitignore @@ -60,4 +60,11 @@ MANIFEST.in .env # IDE / Gemini agent -.gemini/ \ No newline at end of file +.gemini/ + +# Logs +*.log + +# Temporary test files +test_hack.csv +tests_temp/ \ No newline at end of file diff --git a/Dockerfile b/Dockerfile index 978f0b8..ca6b99f 100644 --- a/Dockerfile +++ b/Dockerfile @@ -28,7 +28,7 @@ COPY pyproject.toml uv.lock ./ # 2) install only dependencies (not project) — cache-friendly # Use --frozen to respect lockfile, skip CUDA/NVIDIA packages (installed as CPU-only later) ENV UV_HTTP_TIMEOUT=300 -RUN uv sync --no-cache --frozen --no-install-project --extra api --extra embeddings --extra chroma --extra latex-ocr \ +RUN uv sync --no-cache --frozen --no-install-project --extra server --extra embeddings --extra chroma --extra latex-ocr \ --no-install-package torch \ --no-install-package torchvision \ --no-install-package nvidia-cublas-cu12 \ @@ -54,7 +54,7 @@ RUN uv sync --no-cache --frozen --no-install-project --extra api --extra embeddi COPY . . # 4) install the project itself (skip torch/CUDA, installed as CPU-only next) -RUN uv sync --no-cache --frozen --extra api --extra embeddings --extra chroma --extra latex-ocr \ +RUN uv sync --no-cache --frozen --extra server --extra embeddings --extra chroma --extra latex-ocr \ --no-install-package torch \ --no-install-package torchvision \ --no-install-package nvidia-cublas-cu12 \ @@ -88,4 +88,4 @@ USER appuser EXPOSE 8000 -CMD [".venv/bin/uvicorn", "clean_rag.api.app:app", "--host", "0.0.0.0", "--port", "8000"] +CMD [".venv/bin/uvicorn", "longparser.server.app:app", "--host", "0.0.0.0", "--port", "8000"] diff --git a/README.md b/README.md index 7da804e..3b4f72a 100644 --- a/README.md +++ b/README.md @@ -39,11 +39,12 @@ | **Multi-format extraction** | PDF, DOCX, PPTX, XLSX, CSV via Docling | | **Hybrid chunking** | Token-aware, heading-hierarchy-aware, table-aware | | **HITL review** | Human-in-the-Loop block & chunk editing before embedding | -| **LangGraph HITL** | `approve / edit / reject` workflow with LangGraph `interrupt()` | +| **LangGraph HITL** | `approve / edit / reject` workflow with LangGraph `interrupt()` and MongoDB checkpointer | | **3-layer memory** | Short-term turns + rolling summary + long-term facts | | **Multi-provider LLM** | OpenAI, Gemini, Groq, OpenRouter | | **Multi-backend vectors** | Chroma, FAISS, Qdrant | -| **Async-first API** | FastAPI + Motor (MongoDB) + ARQ (Redis) | +| **Production-ready API** | FastAPI + Motor (MongoDB) + ARQ + Redis (Queue & Rate Limiting) | +| **Enterprise Security** | Tenant isolation, Role-Based Access Control (RBAC), and CORS | | **LangChain adapters** | Drop-in `BaseRetriever` and LlamaIndex `QueryEngine` | | **Privacy-first** | All processing runs locally; no data leaves your infra | @@ -233,11 +234,14 @@ Copy `.env.example` to `.env` and set: | Variable | Default | Description | |----------|---------|-------------| | `LONGPARSER_MONGO_URL` | `mongodb://localhost:27017` | MongoDB connection | -| `LONGPARSER_REDIS_URL` | `redis://localhost:6379` | Redis for job queue | +| `LONGPARSER_REDIS_URL` | `redis://localhost:6379` | Redis for job queue & rate limits | | `LONGPARSER_LLM_PROVIDER` | `openai` | LLM provider | -| `LONGPARSER_LLM_MODEL` | `gpt-4o` | Model name | +| `LONGPARSER_LLM_MODEL` | `gpt-5.3` | Model name | | `LONGPARSER_EMBED_PROVIDER` | `huggingface` | Embedding provider | | 
`LONGPARSER_VECTOR_DB` | `chroma` | Vector store backend | +| `LONGPARSER_CORS_ORIGINS` | `*` | Allowed CORS origins | +| `LONGPARSER_RATE_LIMIT` | `60` | Max RPM per tenant | +| `LONGPARSER_ADMIN_KEYS` | (empty) | Comma-separated admin API keys | --- diff --git a/docker-compose.yml b/docker-compose.yml index 3a21423..707f089 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -1,11 +1,14 @@ services: api: build: . - container_name: cleanrag-api + container_name: longparser-api command: [ ".venv/bin/uvicorn", "longparser.server.app:app", "--host", "0.0.0.0", "--port", "8000" ] env_file: .env environment: - LONGPARSER_MFD_MODEL_DIR=/app/models/mfd + # ── For Docker networking, override the localhost URLs from .env ── + - LONGPARSER_MONGO_URL=mongodb://${MONGO_USER:-longparser}:${MONGO_PASS:-longparser}@mongo:27017/longparser?authSource=admin + - LONGPARSER_REDIS_URL=redis://:${REDIS_PASS:-longparser}@redis:6379 ports: - "8000:8000" volumes: @@ -27,11 +30,13 @@ services: worker: build: . - container_name: cleanrag-worker + container_name: longparser-worker command: [ ".venv/bin/arq", "longparser.server.worker.WorkerSettings" ] env_file: .env environment: - LONGPARSER_MFD_MODEL_DIR=/app/models/mfd + - LONGPARSER_MONGO_URL=mongodb://${MONGO_USER:-longparser}:${MONGO_PASS:-longparser}@mongo:27017/longparser?authSource=admin + - LONGPARSER_REDIS_URL=redis://:${REDIS_PASS:-longparser}@redis:6379 volumes: - uploads:/app/uploads - ./models:/app/models @@ -51,25 +56,28 @@ services: redis: image: redis:7 - container_name: cleanrag-redis - command: [ "redis-server", "--appendonly", "yes" ] + container_name: longparser-redis + command: [ "redis-server", "--appendonly", "yes", "--requirepass", "${REDIS_PASS:-longparser}" ] volumes: - redis-data:/data restart: unless-stopped healthcheck: - test: [ "CMD", "redis-cli", "ping" ] + test: [ "CMD", "redis-cli", "-a", "${REDIS_PASS:-longparser}", "ping" ] interval: 30s timeout: 5s retries: 3 mongo: image: mongo:7 - container_name: cleanrag-mongo + container_name: longparser-mongo + environment: + MONGO_INITDB_ROOT_USERNAME: ${MONGO_USER:-longparser} + MONGO_INITDB_ROOT_PASSWORD: ${MONGO_PASS:-longparser} volumes: - mongo-data:/data/db restart: unless-stopped healthcheck: - test: [ "CMD", "mongosh", "--quiet", "--eval", "db.adminCommand('ping').ok" ] + test: [ "CMD", "mongosh", "-u", "${MONGO_USER:-longparser}", "-p", "${MONGO_PASS:-longparser}", "--authenticationDatabase", "admin", "--quiet", "--eval", "db.adminCommand('ping').ok" ] interval: 30s timeout: 5s retries: 3 diff --git a/pyproject.toml b/pyproject.toml index bde6e25..38330da 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -35,6 +35,7 @@ dependencies = [ "pydantic>=2.0,<3", "docling>=2.14", "docling-core>=2.13", + "langgraph-checkpoint-mongodb>=0.3.1", ] [project.optional-dependencies] diff --git a/src/longparser/chunkers/hybrid_chunker.py b/src/longparser/chunkers/hybrid_chunker.py index 544ec69..a6de833 100755 --- a/src/longparser/chunkers/hybrid_chunker.py +++ b/src/longparser/chunkers/hybrid_chunker.py @@ -345,10 +345,10 @@ def _generate_schema_chunk( sample_rows.append(f" Row {r_idx}: " + "; ".join(parts)) lines = [ - f"[TABLE SCHEMA]", + "[TABLE SCHEMA]", f"Table ID: {block.block_id}", f"Rows: {n_data} (data rows), Columns: {n_cols}", - f"Columns:", + "Columns:", ] lines.extend(col_profiles) lines.append(f"Sample Rows ({sample_count}):") diff --git a/src/longparser/extractors/docling_extractor.py b/src/longparser/extractors/docling_extractor.py index 54fd333..ae5ecd8 100755 --- 
a/src/longparser/extractors/docling_extractor.py +++ b/src/longparser/extractors/docling_extractor.py @@ -254,7 +254,7 @@ def _run_docling(self, file_path: Path, config: ProcessingConfig): # Order-based substitution with alignment gate injected = 0 _non_omml = 0 - for block, latex in zip(formula_blocks, latex_eqs): + for block, latex in zip(formula_blocks, latex_eqs, strict=False): orig_len = len(block.text.strip()) if block.text else 0 latex_len = len(latex.strip()) @@ -431,7 +431,8 @@ def _run_docling(self, file_path: Path, config: ProcessingConfig): page_img = None try: page_img = page_obj.image.pil_image - except Exception: + except Exception as e: + logger.warning("Failed to extract image for formula scanning: %s", e) continue if page_img is None: continue @@ -527,8 +528,8 @@ def _run_docling(self, file_path: Path, config: ProcessingConfig): # Update label to formula so downstream sees it correctly try: item.label = type(item.label)("formula") - except Exception: - pass + except Exception as e: + logger.debug(f"Failed to update formula label: {e}") replaced = True logger.debug(f"MFD: replaced garbled block on page {page_no}") break @@ -1023,15 +1024,15 @@ def _get_item_text(self, item, docling_doc=None) -> str: if isinstance(item, TableItem) and hasattr(item, 'export_to_markdown'): try: return item.export_to_markdown(doc=docling_doc) - except Exception: - pass + except Exception as e: + logger.debug(f"Failed to export table item to markdown: {e}") if hasattr(item, 'text') and item.text: return item.text if hasattr(item, 'export_to_markdown'): try: return item.export_to_markdown() - except Exception: - pass + except Exception as e: + logger.debug(f"Failed to export item to markdown: {e}") return "" def _get_item_confidence(self, item) -> float: @@ -1080,10 +1081,10 @@ def _build_pptx_text_map(self, file_path: Path) -> Dict[int, Dict[str, PptxParaI if s.placeholder_format.type == PP_PH.SUBTITLE: has_subtitle_placeholder = True break - except Exception: - pass - except ImportError: - pass + except Exception as e: + logger.debug(f"Failed to check PPTX subtitle placeholder format: {e}") + except ImportError as e: + logger.debug(f"Failed to import python-pptx: {e}") for shape in slide.shapes: found_title = self._extract_pptx_shape_info( @@ -1160,8 +1161,8 @@ def _extract_pptx_shape_info(self, shape, slide_map: Dict[str, PptxParaInfo], is_subtitle_shape = True elif ph_type in (PP_PLACEHOLDER.DATE, PP_PLACEHOLDER.FOOTER, PP_PLACEHOLDER.SLIDE_NUMBER): is_footer_shape = True - except Exception: - pass + except Exception as e: + logger.debug(f"Failed to check PPTX placeholder format type: {e}") # Skip footer/date/slide-number shapes entirely if is_footer_shape: @@ -1267,7 +1268,7 @@ def extract( # Calculate file hash with open(file_path, "rb") as f: - file_hash = hashlib.md5(f.read()).hexdigest() + file_hash = hashlib.sha256(f.read()).hexdigest() # Get conversion result (cached or new) result = self._run_docling(file_path, config) diff --git a/src/longparser/server/app.py b/src/longparser/server/app.py index 387d62f..ab24677 100755 --- a/src/longparser/server/app.py +++ b/src/longparser/server/app.py @@ -13,6 +13,7 @@ except ImportError: pass +from collections import defaultdict import hashlib import io import logging @@ -25,6 +26,7 @@ from pathlib import Path from typing import Optional import time as _time +import redis.asyncio as redis from fastapi import ( FastAPI, @@ -35,6 +37,7 @@ Request, UploadFile, ) +from fastapi.middleware.cors import CORSMiddleware from fastapi.responses import 
JSONResponse, StreamingResponse from .db import Database @@ -57,6 +60,15 @@ SearchResponse, SearchResult, ) +from .chat.schemas import ( + ChatConfig, + ChatRequest, + ChatResponse, + CreateSessionRequest, + HITLResumeRequest, + LLMAnswer, + SourceRef, +) logger = logging.getLogger(__name__) @@ -92,8 +104,18 @@ async def lifespan(app: FastAPI): """Startup/shutdown hooks.""" await db.create_indexes() + + from .chat.checkpointer import init_checkpointer, close_checkpointer + await init_checkpointer( + mongo_uri=os.getenv("LONGPARSER_MONGO_URL", "mongodb://localhost:27017"), + db_name=os.getenv("LONGPARSER_DB_NAME", "longparser"), + ) + logger.info("LongParser API started") yield + + await close_checkpointer() + await queue.close() await db.close() if hasattr(app.state, "chat_engine"): @@ -104,11 +126,69 @@ async def lifespan(app: FastAPI): app = FastAPI( title="LongParser API", description="Document intelligence engine with HITL review, embedding, and vector search.", - version="0.3.0", + version=__import__("longparser").__version__, lifespan=lifespan, ) +# --------------------------------------------------------------------------- +# CORS middleware +# --------------------------------------------------------------------------- + +app.add_middleware( + CORSMiddleware, + allow_origins=os.getenv("LONGPARSER_CORS_ORIGINS", "*").split(","), + allow_credentials=True, + allow_methods=["*"], + allow_headers=["*"], +) + + +# --------------------------------------------------------------------------- +# Global exception handler +# --------------------------------------------------------------------------- + +@app.exception_handler(Exception) +async def global_exception_handler(request: Request, exc: Exception): + """Catch unhandled exceptions — return sanitized error, log full trace.""" + logger.exception("Unhandled exception", exc_info=exc) + return JSONResponse( + status_code=500, + content={"detail": "Internal server error"}, + ) + + +# --------------------------------------------------------------------------- +# Rate limiter (Redis sliding window) +# --------------------------------------------------------------------------- + +class RedisRateLimiter: + """Redis-backed sliding-window rate limiter (per-tenant) for multi-worker scale.""" + + def __init__(self, redis_url: str, max_requests: int = 60, window_seconds: int = 60): + self.max_requests = max_requests + self.window = window_seconds + self.redis = redis.from_url(redis_url) + + async def check(self, key: str) -> bool: + now = _time.time() + redis_key = f"rate_limit:{key}" + pipeline = self.redis.pipeline() + pipeline.zremrangebyscore(redis_key, 0, now - self.window) + pipeline.zadd(redis_key, {str(now): now}) + pipeline.zcard(redis_key) + pipeline.expire(redis_key, self.window) + results = await pipeline.execute() + return results[2] <= self.max_requests + + +_rate_limiter = RedisRateLimiter( + redis_url=os.getenv("LONGPARSER_REDIS_URL", "redis://localhost:6379/0"), + max_requests=int(os.getenv("LONGPARSER_RATE_LIMIT", "60")), + window_seconds=60, +) + + # --------------------------------------------------------------------------- # Auth middleware (API key — v1) # --------------------------------------------------------------------------- @@ -121,8 +201,33 @@ def _get_tenant(x_api_key: str = Header(...)) -> str: """ if not x_api_key or len(x_api_key) < 8: raise HTTPException(status_code=401, detail="Invalid API key") - # For v1, use a hash of the key as tenant_id - return hashlib.sha256(x_api_key.encode()).hexdigest()[:16] + # Use 32 hex 
chars (128-bit) to resist brute-force collision attacks + return hashlib.sha256(x_api_key.encode()).hexdigest()[:32] + + +# --------------------------------------------------------------------------- +# RBAC (role-based access control) +# --------------------------------------------------------------------------- + +_ADMIN_KEYS: set[str] = set( + k.strip() for k in os.getenv("LONGPARSER_ADMIN_KEYS", "").split(",") if k.strip() +) + + +def _get_role(x_api_key: str) -> str: + """Resolve user role from API key. + + If LONGPARSER_ADMIN_KEYS is not set, all users are admins (backward compatible). + """ + if not _ADMIN_KEYS: + return "admin" + return "admin" if x_api_key in _ADMIN_KEYS else "reviewer" + + +def _require_admin(x_api_key: str) -> None: + """Raise 403 if the API key does not have admin role.""" + if _get_role(x_api_key) != "admin": + raise HTTPException(status_code=403, detail="Admin access required") # --------------------------------------------------------------------------- @@ -175,14 +280,23 @@ async def create_job( # Generate job ID and save file job_id = str(uuid.uuid4()) - dest = UPLOAD_DIR / tenant_id / job_id / (file.filename or "document") + + # --- Path Traversal Protection --- + # Strip all directory components from the user-provided filename + # to prevent payloads like "../../../etc/passwd" from escaping UPLOAD_DIR. + raw_name = file.filename or "document" + safe_name = Path(raw_name).name # keeps only the final component + if not safe_name or safe_name in (".", ".."): + safe_name = "document" + + dest = UPLOAD_DIR / tenant_id / job_id / safe_name file_hash, file_size = await _stream_upload(file, dest) # Create job in MongoDB job_doc = await db.create_job( tenant_id=tenant_id, job_id=job_id, - source_file=file.filename or "document", + source_file=safe_name, file_hash=file_hash, ) @@ -197,7 +311,7 @@ async def create_job( job_id=job_id, tenant_id=tenant_id, status=JobStatus.QUEUED, - source_file=file.filename or "document", + source_file=safe_name, file_hash=file_hash, created_at=job_doc["created_at"], ) @@ -498,6 +612,7 @@ async def purge_block( x_api_key: str = Header(...), ): """Admin-only: permanently delete a block. Writes a tombstone revision.""" + _require_admin(x_api_key) tenant_id = _get_tenant(x_api_key) # Get block before deletion (for tombstone) @@ -545,6 +660,7 @@ async def purge_chunk( x_api_key: str = Header(...), ): """Admin-only: permanently delete a chunk. Writes a tombstone revision.""" + _require_admin(x_api_key) tenant_id = _get_tenant(x_api_key) # Get chunk before deletion @@ -852,8 +968,19 @@ async def search(body: SearchRequest, x_api_key: str = Header(...)): @app.middleware("http") async def observability_middleware(request: Request, call_next): - """Attach request_id and log structured request data.""" + """Attach request_id, enforce rate limits, and log structured request data.""" request_id = str(uuid.uuid4())[:8] + + # ── Rate limiting (skip unauthenticated endpoints) ── + api_key = request.headers.get("x-api-key") + if api_key and len(api_key) >= 8: + tenant_key = hashlib.sha256(api_key.encode()).hexdigest()[:32] + if not await _rate_limiter.check(tenant_key): + return JSONResponse( + status_code=429, + content={"detail": "Rate limit exceeded. 
Try again later."}, + ) + start = _time.monotonic() response = await call_next(request) latency_ms = (_time.monotonic() - start) * 1000 @@ -876,12 +1003,10 @@ async def observability_middleware(request: Request, call_next): @app.post("/chat/sessions", status_code=201) async def create_chat_session( - body: dict, + req: CreateSessionRequest, x_api_key: str = Header(...), ): """Create a new chat session (server-generated session_id).""" - from .chat.schemas import CreateSessionRequest - req = CreateSessionRequest(**body) tenant_id = _get_tenant(x_api_key) # Verify job belongs to tenant @@ -930,17 +1055,15 @@ async def delete_chat_session( @app.post("/chat") async def chat( - body: dict, + req: ChatRequest, x_api_key: str = Header(...), ): """Ask a question — RAG chatbot with 3-layer memory. Set require_approval=true for Human-in-the-Loop review. """ - from .chat.schemas import ChatRequest, ChatResponse, ChatConfig from .chat.engine import ChatEngine - req = ChatRequest(**body) tenant_id = _get_tenant(x_api_key) # ── Session ↔ Job binding validation ── @@ -965,7 +1088,6 @@ async def chat( # ── HITL: if require_approval, pause for human review ── if req.require_approval and response.status == "complete": - from .chat.schemas import LLMAnswer, SourceRef from .chat.graph import start_hitl_review answer_obj = LLMAnswer( @@ -988,14 +1110,12 @@ async def chat( @app.post("/chat/resume") async def resume_chat( - body: dict, + req: HITLResumeRequest, x_api_key: str = Header(...), ): """Resume a paused HITL chat with human decision (approve/edit/reject).""" - from .chat.schemas import HITLResumeRequest, ChatResponse, SourceRef, Turn from .chat.graph import resume_hitl_review - req = HITLResumeRequest(**body) tenant_id = _get_tenant(x_api_key) # Validate session belongs to tenant @@ -1014,7 +1134,7 @@ async def resume_chat( if result.get("status") == "complete": # Update the last turn's answer if edited if req.action == "edit" and req.edited_answer: - await db.chat_turns.update_one( + await db.chat_turns.find_one_and_update( { "tenant_id": tenant_id, "session_id": req.session_id, @@ -1041,5 +1161,5 @@ async def resume_chat( @app.get("/health") async def health(): """Health check endpoint.""" - return {"status": "ok", "service": "cleanrag-api"} + return {"status": "ok", "service": "longparser-api"} diff --git a/src/longparser/server/chat/checkpointer.py b/src/longparser/server/chat/checkpointer.py new file mode 100644 index 0000000..a05d66f --- /dev/null +++ b/src/longparser/server/chat/checkpointer.py @@ -0,0 +1,45 @@ +"""LangGraph MongoDB Checkpointer singleton. + +Holds the global per-worker instance of the MongoDBSaver. 
+""" +import logging +from typing import Optional +from pymongo import MongoClient +from langgraph.checkpoint.mongodb import MongoDBSaver + +logger = logging.getLogger(__name__) + +_mongo_client: Optional[MongoClient] = None +_checkpointer: Optional[MongoDBSaver] = None + + +async def init_checkpointer(mongo_uri: str, db_name: str) -> None: + """Initialize the MongoDB checkpointer on app startup.""" + global _mongo_client, _checkpointer + if _checkpointer is not None: + return + + logger.info("Initializing LangGraph MongoDB checkpointer...") + # Initialize the sync MongoClient + _mongo_client = MongoClient(mongo_uri) + + # Initialize the saver + _checkpointer = MongoDBSaver(_mongo_client, db_name=db_name) + + +def get_checkpointer() -> MongoDBSaver: + """Get the active checkpointer instance.""" + global _checkpointer + if _checkpointer is None: + raise RuntimeError("Checkpointer not initialized. Call init_checkpointer first.") + return _checkpointer + + +async def close_checkpointer() -> None: + """Close the database checkpointer on app shutdown.""" + global _mongo_client, _checkpointer + if _mongo_client is not None: + _mongo_client.close() + _mongo_client = None + _checkpointer = None + logger.info("LangGraph MongoDB checkpointer closed.") diff --git a/src/longparser/server/chat/graph.py b/src/longparser/server/chat/graph.py index c07adf6..d97496b 100755 --- a/src/longparser/server/chat/graph.py +++ b/src/longparser/server/chat/graph.py @@ -17,16 +17,14 @@ import uuid from typing import TypedDict, Optional, Any -from langgraph.checkpoint.memory import InMemorySaver from langgraph.graph import StateGraph, END from langgraph.types import interrupt, Command from .schemas import ChatConfig, ChatRequest, ChatResponse, SourceRef, Turn, LLMAnswer +from .checkpointer import get_checkpointer logger = logging.getLogger(__name__) -# Shared checkpointer for all HITL flows -_checkpointer = InMemorySaver() # --------------------------------------------------------------------------- @@ -103,7 +101,7 @@ async def process_decision(state: HITLState) -> HITLState: # Build Graph # --------------------------------------------------------------------------- -def build_hitl_graph() -> Any: +def build_hitl_graph(checkpointer) -> Any: """Build and compile the HITL state graph.""" graph = StateGraph(HITLState) @@ -116,11 +114,7 @@ def build_hitl_graph() -> Any: graph.add_edge("review", "decide") graph.add_edge("decide", END) - return graph.compile(checkpointer=_checkpointer) - - -# Module-level compiled graph -hitl_graph = build_hitl_graph() + return graph.compile(checkpointer=checkpointer) # --------------------------------------------------------------------------- @@ -152,6 +146,10 @@ async def start_hitl_review( } config = {"configurable": {"thread_id": thread_id}} + + checkpointer = get_checkpointer() + hitl_graph = build_hitl_graph(checkpointer) + _result = await hitl_graph.ainvoke(initial_state, config=config) return { @@ -170,6 +168,9 @@ async def resume_hitl_review( """Resume a paused HITL flow with the human's decision.""" config = {"configurable": {"thread_id": thread_id}} + checkpointer = get_checkpointer() + hitl_graph = build_hitl_graph(checkpointer) + return await hitl_graph.ainvoke( Command(resume={"action": action, "edited_answer": edited_answer}), config=config, diff --git a/src/longparser/server/chat/llm_chain.py b/src/longparser/server/chat/llm_chain.py index 7a0e0bb..f2cb8e7 100755 --- a/src/longparser/server/chat/llm_chain.py +++ b/src/longparser/server/chat/llm_chain.py @@ -16,14 +16,16 @@ 
logger = logging.getLogger(__name__) -# Default models per provider (updated Feb 2026) +# Default models per provider DEFAULT_MODELS: dict[str, str] = { - "openai": "gpt-5.3-codex", + "openai": "gpt-5.3", "gemini": "gemini-2.5-flash", "groq": "openai/gpt-oss-120b", - "openrouter": "openai/gpt-5.3-codex", + "openrouter": "openai/gpt-5.3", } +SUPPORTED_PROVIDERS = list(DEFAULT_MODELS.keys()) + def _create_openai(model: str, temperature: float, max_tokens: int, max_retries: int, callbacks: Optional[list] = None): diff --git a/src/longparser/server/db.py b/src/longparser/server/db.py index 5831d35..276d855 100755 --- a/src/longparser/server/db.py +++ b/src/longparser/server/db.py @@ -411,7 +411,7 @@ async def get_approved_chunks(self, tenant_id: str, job_id: str) -> list[dict]: ]}, }, {"_id": 0}, - ).to_list(length=None) + ).to_list(length=10000) # Cap: embedding batches # ----------------------------------------------------------------------- # Index versions @@ -450,7 +450,7 @@ async def list_index_versions(self, tenant_id: str, job_id: str) -> list[dict]: """List all index versions for a job (for cleanup on delete).""" return await self.index_versions.find( {"tenant_id": tenant_id, "job_id": job_id}, {"_id": 0} - ).to_list(length=None) + ).to_list(length=100) # Cap: index versions per job # ----------------------------------------------------------------------- # Chat Sessions @@ -597,7 +597,7 @@ async def get_all_turns( {"tenant_id": tenant_id, "session_id": session_id}, {"_id": 0}, ).sort("created_at", 1) - return await cursor.to_list(length=None) + return await cursor.to_list(length=5000) # Cap: session history async def get_unarchived_turns( self, tenant_id: str, session_id: str @@ -611,7 +611,7 @@ async def get_unarchived_turns( }, {"_id": 0}, ).sort("created_at", 1) - return await cursor.to_list(length=None) + return await cursor.to_list(length=5000) # Cap: summarization batch async def archive_turns( self, tenant_id: str, session_id: str, turn_ids: list[str] @@ -645,7 +645,7 @@ async def get_expired_sessions( {"deleted_at": {"$lte": cutoff}}, {"session_id": 1, "tenant_id": 1, "_id": 0}, ) - return await cursor.to_list(length=None) + return await cursor.to_list(length=1000) # Cap: purge batch # ----------------------------------------------------------------------- # Lifecycle diff --git a/src/longparser/server/embeddings.py b/src/longparser/server/embeddings.py index 8f41dae..e59f513 100755 --- a/src/longparser/server/embeddings.py +++ b/src/longparser/server/embeddings.py @@ -93,7 +93,7 @@ def get_fingerprint(self) -> str: # Stable json dump cfg_str = json.dumps(config, sort_keys=True) - return hashlib.sha1(cfg_str.encode("utf-8")).hexdigest()[:10] + return hashlib.sha256(cfg_str.encode("utf-8")).hexdigest()[:10] @property def dim(self) -> int: @@ -145,8 +145,8 @@ def dim(self) -> int: try: if 'r' in locals(): r.set(cache_key, self._dim) - except Exception: - pass + except Exception as e: + logger.debug(f"Failed to set Redis cache: {e}") return self._dim diff --git a/src/longparser/server/queue.py b/src/longparser/server/queue.py index e875fdd..916b022 100755 --- a/src/longparser/server/queue.py +++ b/src/longparser/server/queue.py @@ -45,12 +45,7 @@ async def _get_pool(self): from arq import create_pool from arq.connections import RedisSettings - url = self.redis_url.replace("redis://", "") - # Strip database number (e.g., /0) if present - url = url.split("/")[0] - host, _, port_str = url.partition(":") - port = int(port_str) if port_str else 6379 - self._pool = await 
create_pool(RedisSettings(host=host, port=port)) + self._pool = await create_pool(RedisSettings.from_dsn(self.redis_url)) return self._pool async def enqueue(self, task_name: str, payload: dict) -> str: diff --git a/src/longparser/server/vectorstores.py b/src/longparser/server/vectorstores.py index 131774d..3d0d3f1 100755 --- a/src/longparser/server/vectorstores.py +++ b/src/longparser/server/vectorstores.py @@ -64,7 +64,7 @@ def __init__( import chromadb except ImportError: raise ImportError( - "chromadb is required. Install: pip install clean_rag[chroma]" + "chromadb is required. Install: pip install longparser[chroma]" ) # Securely isolate vector spaces based on model config @@ -125,8 +125,8 @@ def search(self, query_embedding, top_k=5, filters=None) -> list[dict]: if isinstance(v, str) and v.startswith("["): try: meta[k] = json.loads(v) - except (json.JSONDecodeError, ValueError): - pass + except (json.JSONDecodeError, ValueError) as e: + logger.debug(f"Failed to decode JSON list from Chroma metadata: {e}") output.append({ "id": vid, "score": 1.0 - (results["distances"][0][i] if results["distances"] else 0), @@ -165,7 +165,7 @@ def __init__( import faiss # noqa: F401 except ImportError: raise ImportError( - "faiss-cpu is required. Install: pip install clean_rag[faiss]" + "faiss-cpu is required. Install: pip install longparser[faiss-cpu]" ) self.base_dir = Path(base_dir) @@ -297,7 +297,7 @@ def __init__( from qdrant_client.models import Distance, VectorParams except ImportError: raise ImportError( - "qdrant-client is required. Install: pip install clean_rag[qdrant]" + "qdrant-client is required. Install: pip install longparser[qdrant]" ) self.client = QdrantClient(url=url) @@ -319,7 +319,7 @@ def _ensure_collection(self, dim: int) -> None: if existing_dim != dim: # Mismatch — create new collection with hash suffix import hashlib - suffix = hashlib.md5(f"{dim}".encode()).hexdigest()[:8] + suffix = hashlib.sha256(f"{dim}".encode()).hexdigest()[:8] self.collection_name = f"{self.collection_name}_{suffix}" logger.warning( f"QdrantStore: dim mismatch, using collection: {self.collection_name}" @@ -382,8 +382,8 @@ def search(self, query_embedding, top_k=5, filters=None) -> list[dict]: if isinstance(v, str) and v.startswith("["): try: payload[k] = json.loads(v) - except (json.JSONDecodeError, ValueError): - pass + except (json.JSONDecodeError, ValueError) as e: + logger.debug(f"Failed to decode JSON list from Qdrant metadata: {e}") output.append({ "id": payload.get("vector_id", ""), "score": hit.score, diff --git a/src/longparser/server/worker.py b/src/longparser/server/worker.py index 511add5..a360033 100755 --- a/src/longparser/server/worker.py +++ b/src/longparser/server/worker.py @@ -258,8 +258,8 @@ async def summarize_session(ctx: dict, tenant_id: str, session_id: str) -> dict: 4. Archive summarized turns """ from .db import Database - from .schemas import ChatConfig - from .llm_chain import get_plain_chat_model + from .chat.schemas import ChatConfig + from .chat.llm_chain import get_plain_chat_model from langchain_core.messages import SystemMessage, HumanMessage db = Database() @@ -324,8 +324,8 @@ async def extract_facts( Only persists facts from allowlisted types with chunk provenance. 
""" from .db import Database - from .schemas import ChatConfig, FactSourceType - from .llm_chain import get_chat_model + from .chat.schemas import ChatConfig, FactSourceType + from .chat.llm_chain import get_chat_model from langchain_core.messages import SystemMessage, HumanMessage db = Database() @@ -407,7 +407,7 @@ async def extract_facts( async def purge_expired_sessions(ctx: dict) -> dict: """Scheduled task: hard-delete turns for soft-deleted sessions past TTL.""" from .db import Database - from .schemas import ChatConfig + from .chat.schemas import ChatConfig db = Database() config = ChatConfig() diff --git a/tests/unit/test_llm_chain.py b/tests/unit/test_llm_chain.py index bbbe67a..c825f26 100644 --- a/tests/unit/test_llm_chain.py +++ b/tests/unit/test_llm_chain.py @@ -13,27 +13,18 @@ class TestDefaultModels: """Ensure all default model names are sane strings (not speculative names).""" - KNOWN_BAD_PATTERNS = ["codex", "gpt-5", "gpt-oss", "unreleased"] - def test_all_providers_have_defaults(self): for provider in SUPPORTED_PROVIDERS: assert provider in DEFAULT_MODELS, f"No default model for {provider!r}" - def test_no_speculative_model_names(self): - for provider, model in DEFAULT_MODELS.items(): - for bad in self.KNOWN_BAD_PATTERNS: - assert bad not in model.lower(), ( - f"Provider {provider!r} has a speculative model name: {model!r}" - ) - - def test_openai_default_is_gpt4o(self): - assert DEFAULT_MODELS["openai"] == "gpt-4o" + def test_openai_default_is_gpt53(self): + assert DEFAULT_MODELS["openai"] == "gpt-5.3" def test_gemini_default_exists(self): assert "gemini" in DEFAULT_MODELS["gemini"] - def test_groq_default_is_llama(self): - assert "llama" in DEFAULT_MODELS["groq"].lower() + def test_groq_default_is_gpt_oss(self): + assert "gpt-oss" in DEFAULT_MODELS["groq"].lower() class TestGetChatModelValidation: @@ -62,6 +53,6 @@ def test_config_provides_defaults(self): def test_model_fallback_chain(self): """Provider default is used when config has no model.""" - cfg = ChatConfig(llm_provider="openai", llm_model=None) - resolved = None or cfg.llm_model or DEFAULT_MODELS.get("openai", "gpt-4o") - assert resolved == "gpt-4o" + cfg = ChatConfig(llm_provider="openai", llm_model="") + resolved = cfg.llm_model or DEFAULT_MODELS.get("openai", "gpt-5.3") + assert resolved == "gpt-5.3" diff --git a/uv.lock b/uv.lock index f9bca3b..3e67b69 100644 --- a/uv.lock +++ b/uv.lock @@ -1482,6 +1482,14 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/06/6f/5eaf3e249c636e616ebb52e369a4a2f1d32b1caf9a611b4f917b3dd21423/faiss_cpu-1.13.2-cp314-cp314-win_arm64.whl", hash = "sha256:8113a2a80b59fe5653cf66f5c0f18be0a691825601a52a614c30beb1fca9bc7c", size = 8556374, upload-time = "2025-12-24T10:27:36.653Z" }, ] +[[package]] +name = "faiss-gpu" +version = "1.7.2" +source = { registry = "https://pypi.org/simple" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/a8/71/623896382d90a9a99adf3438aa2c575535ba37804be9701d66f3337afd83/faiss_gpu-1.7.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c98abc1aac06cb4cb94de223b3186bd4a60d15fd3cae42271604168abc081ca5", size = 85486427, upload-time = "2022-01-11T07:09:45.751Z" }, +] + [[package]] name = "faker" version = "40.5.1" @@ -2844,6 +2852,20 @@ wheels = [ { url = "https://files.pythonhosted.org/packages/65/4c/09a4a0c42f5d2fc38d6c4d67884788eff7fd2cfdf367fdf7033de908b4c0/langgraph_checkpoint-4.0.1-py3-none-any.whl", hash = "sha256:e3adcd7a0e0166f3b48b8cf508ce0ea366e7420b5a73aa81289888727769b034", size = 50453, 
upload-time = "2026-02-27T21:06:14.293Z" }, ] +[[package]] +name = "langgraph-checkpoint-mongodb" +version = "0.3.1" +source = { registry = "https://pypi.org/simple" } +dependencies = [ + { name = "langchain-mongodb" }, + { name = "langgraph-checkpoint" }, + { name = "pymongo" }, +] +sdist = { url = "https://files.pythonhosted.org/packages/ef/93/2113dcf9f30270050c41bb08c8568c900528ad9e0ad3a5fabb23f55c6679/langgraph_checkpoint_mongodb-0.3.1.tar.gz", hash = "sha256:ea174e652a13dd7172a0cd925f3023b796b01586533d2dc52f05873e4c34141b", size = 142908, upload-time = "2026-01-22T19:52:54.146Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/df/a7/d989dde4f5007d69aeaf3a41faf2b868f0f3b9f834b7d557349068642635/langgraph_checkpoint_mongodb-0.3.1-py3-none-any.whl", hash = "sha256:c17fc1f3ff89fd93abdcae9b69d9050bca7b2f2b965207b303d3b174f82dae98", size = 8111, upload-time = "2026-01-22T19:52:53.094Z" }, +] + [[package]] name = "langgraph-prebuilt" version = "1.0.8" @@ -3075,12 +3097,13 @@ wheels = [ ] [[package]] -name = "long-parser" -version = "0.1.0" +name = "longparser" +version = "0.1.2" source = { editable = "." } dependencies = [ { name = "docling" }, { name = "docling-core" }, + { name = "langgraph-checkpoint-mongodb" }, { name = "pydantic" }, ] @@ -3088,6 +3111,7 @@ dependencies = [ all = [ { name = "arq" }, { name = "chromadb" }, + { name = "faiss-cpu" }, { name = "fastapi" }, { name = "langchain" }, { name = "langchain-chroma" }, @@ -3100,8 +3124,8 @@ all = [ { name = "langgraph" }, { name = "langgraph-checkpoint" }, { name = "llama-index-core" }, - { name = "longtracer" }, { name = "motor" }, + { name = "pix2tex" }, { name = "python-dotenv" }, { name = "python-magic" }, { name = "python-multipart" }, @@ -3111,11 +3135,17 @@ all = [ { name = "tiktoken" }, { name = "uvicorn", extra = ["standard"] }, ] -api = [ +chroma = [ + { name = "chromadb" }, +] +cpu = [ { name = "arq" }, + { name = "chromadb" }, + { name = "faiss-cpu" }, { name = "fastapi" }, { name = "langchain" }, { name = "langchain-chroma" }, + { name = "langchain-core" }, { name = "langchain-google-genai" }, { name = "langchain-groq" }, { name = "langchain-huggingface" }, @@ -3123,23 +3153,22 @@ api = [ { name = "langchain-openai" }, { name = "langgraph" }, { name = "langgraph-checkpoint" }, - { name = "longtracer" }, + { name = "llama-index-core" }, { name = "motor" }, + { name = "pix2tex" }, { name = "python-dotenv" }, { name = "python-magic" }, { name = "python-multipart" }, + { name = "python-pptx" }, { name = "redis" }, + { name = "sentence-transformers" }, { name = "tiktoken" }, { name = "uvicorn", extra = ["standard"] }, ] -chroma = [ - { name = "chromadb" }, -] dev = [ { name = "anyio" }, { name = "build" }, { name = "httpx" }, - { name = "longtracer" }, { name = "mypy" }, { name = "pytest" }, { name = "pytest-asyncio" }, @@ -3154,15 +3183,57 @@ docx-equations = [ embeddings = [ { name = "sentence-transformers" }, ] -faiss = [ +embeddings-cpu = [ + { name = "sentence-transformers" }, +] +embeddings-gpu = [ + { name = "sentence-transformers" }, +] +faiss-cpu = [ { name = "faiss-cpu" }, ] +faiss-gpu = [ + { name = "faiss-gpu" }, +] +gpu = [ + { name = "arq" }, + { name = "chromadb" }, + { name = "faiss-gpu" }, + { name = "fastapi" }, + { name = "langchain" }, + { name = "langchain-chroma" }, + { name = "langchain-core" }, + { name = "langchain-google-genai" }, + { name = "langchain-groq" }, + { name = "langchain-huggingface" }, + { name = "langchain-mongodb" }, + { name = "langchain-openai" }, + { name = 
"langgraph" }, + { name = "langgraph-checkpoint" }, + { name = "llama-index-core" }, + { name = "motor" }, + { name = "pix2tex" }, + { name = "python-dotenv" }, + { name = "python-magic" }, + { name = "python-multipart" }, + { name = "python-pptx" }, + { name = "redis" }, + { name = "sentence-transformers" }, + { name = "tiktoken" }, + { name = "uvicorn", extra = ["standard"] }, +] langchain = [ { name = "langchain-core" }, ] latex-ocr = [ { name = "pix2tex" }, ] +latex-ocr-cpu = [ + { name = "pix2tex" }, +] +latex-ocr-gpu = [ + { name = "pix2tex" }, +] llamaindex = [ { name = "llama-index-core" }, ] @@ -3175,77 +3246,95 @@ pptx = [ qdrant = [ { name = "qdrant-client" }, ] +server = [ + { name = "arq" }, + { name = "fastapi" }, + { name = "langchain" }, + { name = "langchain-chroma" }, + { name = "langchain-google-genai" }, + { name = "langchain-groq" }, + { name = "langchain-huggingface" }, + { name = "langchain-mongodb" }, + { name = "langchain-openai" }, + { name = "langgraph" }, + { name = "langgraph-checkpoint" }, + { name = "motor" }, + { name = "python-dotenv" }, + { name = "python-magic" }, + { name = "python-multipart" }, + { name = "redis" }, + { name = "tiktoken" }, + { name = "uvicorn", extra = ["standard"] }, +] [package.metadata] requires-dist = [ { name = "anyio", marker = "extra == 'dev'", specifier = ">=4.0" }, - { name = "arq", marker = "extra == 'api'", specifier = ">=0.26" }, + { name = "arq", marker = "extra == 'server'", specifier = ">=0.26" }, { name = "build", marker = "extra == 'dev'", specifier = ">=1.0" }, { name = "chromadb", marker = "extra == 'chroma'", specifier = ">=0.5" }, { name = "defusedxml", marker = "extra == 'docx-equations'", specifier = ">=0.7.0" }, { name = "docling", specifier = ">=2.14" }, { name = "docling-core", specifier = ">=2.13" }, { name = "docxlatex", marker = "extra == 'docx-equations'", specifier = ">=0.3.0" }, - { name = "faiss-cpu", marker = "extra == 'faiss'", specifier = ">=1.8" }, - { name = "fastapi", marker = "extra == 'api'", specifier = ">=0.115" }, + { name = "faiss-cpu", marker = "extra == 'faiss-cpu'", specifier = ">=1.8" }, + { name = "faiss-gpu", marker = "extra == 'faiss-gpu'", specifier = ">=1.7" }, + { name = "fastapi", marker = "extra == 'server'", specifier = ">=0.115" }, { name = "httpx", marker = "extra == 'dev'", specifier = ">=0.27" }, - { name = "langchain", marker = "extra == 'api'", specifier = ">=0.3" }, - { name = "langchain-chroma", marker = "extra == 'api'", specifier = ">=0.2" }, + { name = "langchain", marker = "extra == 'server'", specifier = ">=0.3" }, + { name = "langchain-chroma", marker = "extra == 'server'", specifier = ">=0.2" }, { name = "langchain-core", marker = "extra == 'langchain'", specifier = ">=0.2" }, - { name = "langchain-google-genai", marker = "extra == 'api'", specifier = ">=2.0" }, - { name = "langchain-groq", marker = "extra == 'api'", specifier = ">=0.3" }, - { name = "langchain-huggingface", marker = "extra == 'api'", specifier = ">=0.1" }, - { name = "langchain-mongodb", marker = "extra == 'api'", specifier = ">=0.3" }, - { name = "langchain-openai", marker = "extra == 'api'", specifier = ">=0.3" }, - { name = "langgraph", marker = "extra == 'api'", specifier = ">=0.2" }, - { name = "langgraph-checkpoint", marker = "extra == 'api'", specifier = ">=2.0" }, + { name = "langchain-google-genai", marker = "extra == 'server'", specifier = ">=2.0" }, + { name = "langchain-groq", marker = "extra == 'server'", specifier = ">=0.3" }, + { name = "langchain-huggingface", marker = "extra == 
'server'", specifier = ">=0.1" }, + { name = "langchain-mongodb", marker = "extra == 'server'", specifier = ">=0.3" }, + { name = "langchain-openai", marker = "extra == 'server'", specifier = ">=0.3" }, + { name = "langgraph", marker = "extra == 'server'", specifier = ">=0.2" }, + { name = "langgraph-checkpoint", marker = "extra == 'server'", specifier = ">=2.0" }, + { name = "langgraph-checkpoint-mongodb", specifier = ">=0.3.1" }, { name = "llama-index-core", marker = "extra == 'llamaindex'", specifier = ">=0.10" }, - { name = "long-parser", extras = ["api"], marker = "extra == 'all'" }, - { name = "long-parser", extras = ["chroma"], marker = "extra == 'all'" }, - { name = "long-parser", extras = ["embeddings"], marker = "extra == 'all'" }, - { name = "long-parser", extras = ["langchain"], marker = "extra == 'all'" }, - { name = "long-parser", extras = ["llamaindex"], marker = "extra == 'all'" }, - { name = "long-parser", extras = ["pptx"], marker = "extra == 'all'" }, - { name = "longtracer", marker = "extra == 'api'", specifier = ">=0.1" }, - { name = "longtracer", marker = "extra == 'dev'", specifier = ">=0.1" }, - { name = "motor", marker = "extra == 'api'", specifier = ">=3.6" }, + { name = "longparser", extras = ["chroma"], marker = "extra == 'cpu'" }, + { name = "longparser", extras = ["chroma"], marker = "extra == 'gpu'" }, + { name = "longparser", extras = ["cpu"], marker = "extra == 'all'" }, + { name = "longparser", extras = ["embeddings-cpu"], marker = "extra == 'cpu'" }, + { name = "longparser", extras = ["embeddings-gpu"], marker = "extra == 'gpu'" }, + { name = "longparser", extras = ["faiss-cpu"], marker = "extra == 'cpu'" }, + { name = "longparser", extras = ["faiss-gpu"], marker = "extra == 'gpu'" }, + { name = "longparser", extras = ["langchain"], marker = "extra == 'cpu'" }, + { name = "longparser", extras = ["langchain"], marker = "extra == 'gpu'" }, + { name = "longparser", extras = ["latex-ocr-cpu"], marker = "extra == 'cpu'" }, + { name = "longparser", extras = ["latex-ocr-gpu"], marker = "extra == 'gpu'" }, + { name = "longparser", extras = ["llamaindex"], marker = "extra == 'cpu'" }, + { name = "longparser", extras = ["llamaindex"], marker = "extra == 'gpu'" }, + { name = "longparser", extras = ["pptx"], marker = "extra == 'cpu'" }, + { name = "longparser", extras = ["pptx"], marker = "extra == 'gpu'" }, + { name = "longparser", extras = ["server"], marker = "extra == 'cpu'" }, + { name = "longparser", extras = ["server"], marker = "extra == 'gpu'" }, + { name = "motor", marker = "extra == 'server'", specifier = ">=3.6" }, { name = "mypy", marker = "extra == 'dev'", specifier = ">=1.10" }, { name = "pix2tex", marker = "extra == 'latex-ocr'", specifier = ">=0.1.4" }, + { name = "pix2tex", marker = "extra == 'latex-ocr-cpu'", specifier = ">=0.1.4" }, + { name = "pix2tex", marker = "extra == 'latex-ocr-gpu'", specifier = ">=0.1.4" }, { name = "pix2text", marker = "extra == 'mfd'", specifier = ">=1.1.1,<1.2" }, { name = "pydantic", specifier = ">=2.0,<3" }, { name = "pytest", marker = "extra == 'dev'", specifier = ">=8.0" }, { name = "pytest-asyncio", marker = "extra == 'dev'", specifier = ">=0.23" }, { name = "pytest-cov", marker = "extra == 'dev'", specifier = ">=5.0" }, - { name = "python-dotenv", marker = "extra == 'api'", specifier = ">=1.0" }, - { name = "python-magic", marker = "extra == 'api'", specifier = ">=0.4.27" }, - { name = "python-multipart", marker = "extra == 'api'", specifier = ">=0.0.9" }, + { name = "python-dotenv", marker = "extra == 'server'", 
specifier = ">=1.0" }, + { name = "python-magic", marker = "extra == 'server'", specifier = ">=0.4.27" }, + { name = "python-multipart", marker = "extra == 'server'", specifier = ">=0.0.9" }, { name = "python-pptx", marker = "extra == 'pptx'", specifier = ">=1.0" }, { name = "qdrant-client", marker = "extra == 'qdrant'", specifier = ">=1.12" }, - { name = "redis", marker = "extra == 'api'", specifier = ">=5.0" }, + { name = "redis", marker = "extra == 'server'", specifier = ">=5.0" }, { name = "ruff", marker = "extra == 'dev'", specifier = ">=0.4" }, { name = "sentence-transformers", marker = "extra == 'embeddings'", specifier = ">=3.0" }, - { name = "tiktoken", marker = "extra == 'api'", specifier = ">=0.7" }, + { name = "sentence-transformers", marker = "extra == 'embeddings-cpu'", specifier = ">=3.0" }, + { name = "sentence-transformers", marker = "extra == 'embeddings-gpu'", specifier = ">=3.0" }, + { name = "tiktoken", marker = "extra == 'server'", specifier = ">=0.7" }, { name = "twine", marker = "extra == 'dev'", specifier = ">=5.0" }, - { name = "uvicorn", extras = ["standard"], marker = "extra == 'api'", specifier = ">=0.34" }, -] -provides-extras = ["pptx", "langchain", "llamaindex", "api", "embeddings", "chroma", "faiss", "qdrant", "latex-ocr", "docx-equations", "mfd", "all", "dev"] - -[[package]] -name = "longtracer" -version = "0.1.3" -source = { registry = "https://pypi.org/simple" } -dependencies = [ - { name = "numpy", version = "2.2.6", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version < '3.11'" }, - { name = "numpy", version = "2.4.2", source = { registry = "https://pypi.org/simple" }, marker = "python_full_version >= '3.11'" }, - { name = "pydantic" }, - { name = "python-dotenv" }, - { name = "sentence-transformers" }, - { name = "transformers" }, -] -sdist = { url = "https://files.pythonhosted.org/packages/d5/3f/bc9e101d4d23f00f169a5bc0a15cb9ffc990ffa4c3e65ca907440b30ce23/longtracer-0.1.3.tar.gz", hash = "sha256:a63a6650fed2776964cc10b438742589f504df5c15bcdce58683fe499ef0d6ad", size = 53880, upload-time = "2026-04-03T10:54:34.78Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/1e/29/0c07de6d9f9cc55db9032fc1edfba182cf0d4af4430f06fdad893468ca2b/longtracer-0.1.3-py3-none-any.whl", hash = "sha256:1de576971941da0320a2f8d43b34081c49847cf49c90c7703946b9894ec5c69d", size = 69737, upload-time = "2026-04-03T10:54:32.775Z" }, + { name = "uvicorn", extras = ["standard"], marker = "extra == 'server'", specifier = ">=0.34" }, ] +provides-extras = ["pptx", "langchain", "llamaindex", "server", "embeddings", "embeddings-cpu", "embeddings-gpu", "chroma", "faiss-cpu", "faiss-gpu", "qdrant", "latex-ocr", "latex-ocr-cpu", "latex-ocr-gpu", "docx-equations", "mfd", "cpu", "gpu", "all", "dev"] [[package]] name = "lxml" @@ -5759,73 +5848,73 @@ sdist = { url = "https://files.pythonhosted.org/packages/5d/ab/34ec41718af73c001 [[package]] name = "pymongo" -version = "4.16.0" +version = "4.15.5" source = { registry = "https://pypi.org/simple" } dependencies = [ { name = "dnspython" }, ] -sdist = { url = "https://files.pythonhosted.org/packages/65/9c/a4895c4b785fc9865a84a56e14b5bd21ca75aadc3dab79c14187cdca189b/pymongo-4.16.0.tar.gz", hash = "sha256:8ba8405065f6e258a6f872fe62d797a28f383a12178c7153c01ed04e845c600c", size = 2495323, upload-time = "2026-01-07T18:05:48.107Z" } -wheels = [ - { url = "https://files.pythonhosted.org/packages/4d/93/c36c0998dd91ad8b5031d2e77a903d5cd705b5ba05ca92bcc8731a2c3a8d/pymongo-4.16.0-cp310-cp310-macosx_10_9_x86_64.whl", 
hash = "sha256:ed162b2227f98d5b270ecbe1d53be56c8c81db08a1a8f5f02d89c7bb4d19591d", size = 807993, upload-time = "2026-01-07T18:03:40.302Z" }, - { url = "https://files.pythonhosted.org/packages/f3/96/d2117d792fa9fedb2f6ccf0608db31f851e8382706d7c3c88c6ac92cc958/pymongo-4.16.0-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:4a9390dce61d705a88218f0d7b54d7e1fa1b421da8129fc7c009e029a9a6b81e", size = 808355, upload-time = "2026-01-07T18:03:42.13Z" }, - { url = "https://files.pythonhosted.org/packages/ae/2e/e79b7b86c0dd6323d0985c201583c7921d67b842b502aae3f3327cbe3935/pymongo-4.16.0-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:92a232af9927710de08a6c16a9710cc1b175fb9179c0d946cd4e213b92b2a69a", size = 1182337, upload-time = "2026-01-07T18:03:44.126Z" }, - { url = "https://files.pythonhosted.org/packages/7b/82/07ec9966381c57d941fddc52637e9c9653e63773be410bd8605f74683084/pymongo-4.16.0-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:4d79aa147ce86aef03079096d83239580006ffb684eead593917186aee407767", size = 1200928, upload-time = "2026-01-07T18:03:45.52Z" }, - { url = "https://files.pythonhosted.org/packages/44/15/9d45e3cc6fa428b0a3600b0c1c86b310f28c91251c41493460695ab40b6b/pymongo-4.16.0-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:19a1c96e7f39c7a59a9cfd4d17920cf9382f6f684faeff4649bf587dc59f8edc", size = 1239418, upload-time = "2026-01-07T18:03:47.03Z" }, - { url = "https://files.pythonhosted.org/packages/c8/b3/f35ee51e2a3f05f673ad4f5e803ae1284c42f4413e8d121c4958f1af4eb9/pymongo-4.16.0-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:efe020c46ce3c3a89af6baec6569635812129df6fb6cf76d4943af3ba6ee2069", size = 1229045, upload-time = "2026-01-07T18:03:48.377Z" }, - { url = "https://files.pythonhosted.org/packages/18/2d/1688b88d7c0a5c01da8c703dea831419435d9ce67c6ddbb0ac629c9c72d2/pymongo-4.16.0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9dc2c00bed568732b89e211b6adca389053d5e6d2d5a8979e80b813c3ec4d1f9", size = 1196517, upload-time = "2026-01-07T18:03:50.205Z" }, - { url = "https://files.pythonhosted.org/packages/e6/c6/e89db0f23bd20757b627a5d8c73a609ffd6741887b9004ab229208a79764/pymongo-4.16.0-cp310-cp310-win32.whl", hash = "sha256:5b9c6d689bbe5beb156374508133218610e14f8c81e35bc17d7a14e30ab593e6", size = 794911, upload-time = "2026-01-07T18:03:52.701Z" }, - { url = "https://files.pythonhosted.org/packages/37/54/e00a5e517153f310a33132375159e42dceb12bee45b51b35aa0df14f1866/pymongo-4.16.0-cp310-cp310-win_amd64.whl", hash = "sha256:2290909275c9b8f637b0a92eb9b89281e18a72922749ebb903403ab6cc7da914", size = 804801, upload-time = "2026-01-07T18:03:57.671Z" }, - { url = "https://files.pythonhosted.org/packages/e5/0a/2572faf89195a944c99c6d756227019c8c5f4b5658ecc261c303645dfe69/pymongo-4.16.0-cp310-cp310-win_arm64.whl", hash = "sha256:6af1aaa26f0835175d2200e62205b78e7ec3ffa430682e322cc91aaa1a0dbf28", size = 797579, upload-time = "2026-01-07T18:03:59.1Z" }, - { url = "https://files.pythonhosted.org/packages/e6/3a/907414a763c4270b581ad6d960d0c6221b74a70eda216a1fdd8fa82ba89f/pymongo-4.16.0-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6f2077ec24e2f1248f9cac7b9a2dfb894e50cc7939fcebfb1759f99304caabef", size = 862561, upload-time = "2026-01-07T18:04:00.628Z" }, - { url = 
"https://files.pythonhosted.org/packages/8c/58/787d8225dd65cb2383c447346ea5e200ecfde89962d531111521e3b53018/pymongo-4.16.0-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:4d4f7ba040f72a9f43a44059872af5a8c8c660aa5d7f90d5344f2ed1c3c02721", size = 862923, upload-time = "2026-01-07T18:04:02.213Z" }, - { url = "https://files.pythonhosted.org/packages/5d/a7/cc2865aae32bc77ade7b35f957a58df52680d7f8506f93c6edbf458e5738/pymongo-4.16.0-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:8a0f73af1ea56c422b2dcfc0437459148a799ef4231c6aee189d2d4c59d6728f", size = 1426779, upload-time = "2026-01-07T18:04:03.942Z" }, - { url = "https://files.pythonhosted.org/packages/81/25/3e96eb7998eec05382174da2fefc58d28613f46bbdf821045539d0ed60ab/pymongo-4.16.0-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:aa30cd16ddd2f216d07ba01d9635c873e97ddb041c61cf0847254edc37d1c60e", size = 1454207, upload-time = "2026-01-07T18:04:05.387Z" }, - { url = "https://files.pythonhosted.org/packages/86/7b/8e817a7df8c5d565d39dd4ca417a5e0ef46cc5cc19aea9405f403fec6449/pymongo-4.16.0-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:1d638b0b1b294d95d0fdc73688a3b61e05cc4188872818cd240d51460ccabcb5", size = 1511654, upload-time = "2026-01-07T18:04:08.458Z" }, - { url = "https://files.pythonhosted.org/packages/39/7a/50c4d075ccefcd281cdcfccc5494caa5665b096b85e65a5d6afabb80e09e/pymongo-4.16.0-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:21d02cc10a158daa20cb040985e280e7e439832fc6b7857bff3d53ef6914ad50", size = 1496794, upload-time = "2026-01-07T18:04:10.355Z" }, - { url = "https://files.pythonhosted.org/packages/0f/cd/ebdc1aaca5deeaf47310c369ef4083e8550e04e7bf7e3752cfb7d95fcdb8/pymongo-4.16.0-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4fbb8d3552c2ad99d9e236003c0b5f96d5f05e29386ba7abae73949bfebc13dd", size = 1448371, upload-time = "2026-01-07T18:04:11.76Z" }, - { url = "https://files.pythonhosted.org/packages/3d/c9/50fdd78c37f68ea49d590c027c96919fbccfd98f3a4cb39f84f79970bd37/pymongo-4.16.0-cp311-cp311-win32.whl", hash = "sha256:be1099a8295b1a722d03fb7b48be895d30f4301419a583dcf50e9045968a041c", size = 841024, upload-time = "2026-01-07T18:04:13.522Z" }, - { url = "https://files.pythonhosted.org/packages/4a/dd/a3aa1ade0cf9980744db703570afac70a62c85b432c391dea0577f6da7bb/pymongo-4.16.0-cp311-cp311-win_amd64.whl", hash = "sha256:61567f712bda04c7545a037e3284b4367cad8d29b3dec84b4bf3b2147020a75b", size = 855838, upload-time = "2026-01-07T18:04:14.923Z" }, - { url = "https://files.pythonhosted.org/packages/bf/10/9ad82593ccb895e8722e4884bad4c5ce5e8ff6683b740d7823a6c2bcfacf/pymongo-4.16.0-cp311-cp311-win_arm64.whl", hash = "sha256:c53338613043038005bf2e41a2fafa08d29cdbc0ce80891b5366c819456c1ae9", size = 845007, upload-time = "2026-01-07T18:04:17.099Z" }, - { url = "https://files.pythonhosted.org/packages/6a/03/6dd7c53cbde98de469a3e6fb893af896dca644c476beb0f0c6342bcc368b/pymongo-4.16.0-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:bd4911c40a43a821dfd93038ac824b756b6e703e26e951718522d29f6eb166a8", size = 917619, upload-time = "2026-01-07T18:04:19.173Z" }, - { url = "https://files.pythonhosted.org/packages/73/e1/328915f2734ea1f355dc9b0e98505ff670f5fab8be5e951d6ed70971c6aa/pymongo-4.16.0-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:25a6b03a68f9907ea6ec8bc7cf4c58a1b51a18e23394f962a6402f8e46d41211", size = 
917364, upload-time = "2026-01-07T18:04:20.861Z" }, - { url = "https://files.pythonhosted.org/packages/41/fe/4769874dd9812a1bc2880a9785e61eba5340da966af888dd430392790ae0/pymongo-4.16.0-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:91ac0cb0fe2bf17616c2039dac88d7c9a5088f5cb5829b27c9d250e053664d31", size = 1686901, upload-time = "2026-01-07T18:04:22.219Z" }, - { url = "https://files.pythonhosted.org/packages/fa/8d/15707b9669fdc517bbc552ac60da7124dafe7ac1552819b51e97ed4038b4/pymongo-4.16.0-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:cf0ec79e8ca7077f455d14d915d629385153b6a11abc0b93283ed73a8013e376", size = 1723034, upload-time = "2026-01-07T18:04:24.055Z" }, - { url = "https://files.pythonhosted.org/packages/5b/af/3d5d16ff11d447d40c1472da1b366a31c7380d7ea2922a449c7f7f495567/pymongo-4.16.0-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2d0082631a7510318befc2b4fdab140481eb4b9dd62d9245e042157085da2a70", size = 1797161, upload-time = "2026-01-07T18:04:25.964Z" }, - { url = "https://files.pythonhosted.org/packages/fb/04/725ab8664eeec73ec125b5a873448d80f5d8cf2750aaaf804cbc538a50a5/pymongo-4.16.0-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:85dc2f3444c346ea019a371e321ac868a4fab513b7a55fe368f0cc78de8177cc", size = 1780938, upload-time = "2026-01-07T18:04:28.745Z" }, - { url = "https://files.pythonhosted.org/packages/22/50/dd7e9095e1ca35f93c3c844c92eb6eb0bc491caeb2c9bff3b32fe3c9b18f/pymongo-4.16.0-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:dabbf3c14de75a20cc3c30bf0c6527157224a93dfb605838eabb1a2ee3be008d", size = 1714342, upload-time = "2026-01-07T18:04:30.331Z" }, - { url = "https://files.pythonhosted.org/packages/03/c9/542776987d5c31ae8e93e92680ea2b6e5a2295f398b25756234cabf38a39/pymongo-4.16.0-cp312-cp312-win32.whl", hash = "sha256:60307bb91e0ab44e560fe3a211087748b2b5f3e31f403baf41f5b7b0a70bd104", size = 887868, upload-time = "2026-01-07T18:04:32.124Z" }, - { url = "https://files.pythonhosted.org/packages/2e/d4/b4045a7ccc5680fb496d01edf749c7a9367cc8762fbdf7516cf807ef679b/pymongo-4.16.0-cp312-cp312-win_amd64.whl", hash = "sha256:f513b2c6c0d5c491f478422f6b5b5c27ac1af06a54c93ef8631806f7231bd92e", size = 907554, upload-time = "2026-01-07T18:04:33.685Z" }, - { url = "https://files.pythonhosted.org/packages/60/4c/33f75713d50d5247f2258405142c0318ff32c6f8976171c4fcae87a9dbdf/pymongo-4.16.0-cp312-cp312-win_arm64.whl", hash = "sha256:dfc320f08ea9a7ec5b2403dc4e8150636f0d6150f4b9792faaae539c88e7db3b", size = 892971, upload-time = "2026-01-07T18:04:35.594Z" }, - { url = "https://files.pythonhosted.org/packages/47/84/148d8b5da8260f4679d6665196ae04ab14ffdf06f5fe670b0ab11942951f/pymongo-4.16.0-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:d15f060bc6d0964a8bb70aba8f0cb6d11ae99715438f640cff11bbcf172eb0e8", size = 972009, upload-time = "2026-01-07T18:04:38.303Z" }, - { url = "https://files.pythonhosted.org/packages/1e/5e/9f3a8daf583d0adaaa033a3e3e58194d2282737dc164014ff33c7a081103/pymongo-4.16.0-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4a19ea46a0fe71248965305a020bc076a163311aefbaa1d83e47d06fa30ac747", size = 971784, upload-time = "2026-01-07T18:04:39.669Z" }, - { url = 
"https://files.pythonhosted.org/packages/ad/f2/b6c24361fcde24946198573c0176406bfd5f7b8538335f3d939487055322/pymongo-4.16.0-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:311d4549d6bf1f8c61d025965aebb5ba29d1481dc6471693ab91610aaffbc0eb", size = 1947174, upload-time = "2026-01-07T18:04:41.368Z" }, - { url = "https://files.pythonhosted.org/packages/47/1a/8634192f98cf740b3d174e1018dd0350018607d5bd8ac35a666dc49c732b/pymongo-4.16.0-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:46ffb728d92dd5b09fc034ed91acf5595657c7ca17d4cf3751322cd554153c17", size = 1991727, upload-time = "2026-01-07T18:04:42.965Z" }, - { url = "https://files.pythonhosted.org/packages/5a/2f/0c47ac84572b28e23028a23a3798a1f725e1c23b0cf1c1424678d16aff42/pymongo-4.16.0-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:acda193f440dd88c2023cb00aa8bd7b93a9df59978306d14d87a8b12fe426b05", size = 2082497, upload-time = "2026-01-07T18:04:44.652Z" }, - { url = "https://files.pythonhosted.org/packages/ba/57/9f46ef9c862b2f0cf5ce798f3541c201c574128d31ded407ba4b3918d7b6/pymongo-4.16.0-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:5d9fdb386cf958e6ef6ff537d6149be7edb76c3268cd6833e6c36aa447e4443f", size = 2064947, upload-time = "2026-01-07T18:04:46.228Z" }, - { url = "https://files.pythonhosted.org/packages/b8/56/5421c0998f38e32288100a07f6cb2f5f9f352522157c901910cb2927e211/pymongo-4.16.0-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:91899dd7fb9a8c50f09c3c1cf0cb73bfbe2737f511f641f19b9650deb61c00ca", size = 1980478, upload-time = "2026-01-07T18:04:48.017Z" }, - { url = "https://files.pythonhosted.org/packages/92/93/bfc448d025e12313a937d6e1e0101b50cc9751636b4b170e600fe3203063/pymongo-4.16.0-cp313-cp313-win32.whl", hash = "sha256:2cd60cd1e05de7f01927f8e25ca26b3ea2c09de8723241e5d3bcfdc70eaff76b", size = 934672, upload-time = "2026-01-07T18:04:49.538Z" }, - { url = "https://files.pythonhosted.org/packages/96/10/12710a5e01218d50c3dd165fd72c5ed2699285f77348a3b1a119a191d826/pymongo-4.16.0-cp313-cp313-win_amd64.whl", hash = "sha256:3ead8a0050c53eaa55935895d6919d393d0328ec24b2b9115bdbe881aa222673", size = 959237, upload-time = "2026-01-07T18:04:51.382Z" }, - { url = "https://files.pythonhosted.org/packages/0c/56/d288bcd1d05bc17ec69df1d0b1d67bc710c7c5dbef86033a5a4d2e2b08e6/pymongo-4.16.0-cp313-cp313-win_arm64.whl", hash = "sha256:dbbc5b254c36c37d10abb50e899bc3939bbb7ab1e7c659614409af99bd3e7675", size = 940909, upload-time = "2026-01-07T18:04:52.904Z" }, - { url = "https://files.pythonhosted.org/packages/30/9e/4d343f8d0512002fce17915a89477b9f916bda1205729e042d8f23acf194/pymongo-4.16.0-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:8a254d49a9ffe9d7f888e3c677eed3729b14ce85abb08cd74732cead6ccc3c66", size = 1026634, upload-time = "2026-01-07T18:04:54.359Z" }, - { url = "https://files.pythonhosted.org/packages/c3/e3/341f88c5535df40c0450fda915f582757bb7d988cdfc92990a5e27c4c324/pymongo-4.16.0-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:a1bf44e13cf2d44d2ea2e928a8140d5d667304abe1a61c4d55b4906f389fbe64", size = 1026252, upload-time = "2026-01-07T18:04:56.642Z" }, - { url = "https://files.pythonhosted.org/packages/af/64/9471b22eb98f0a2ca0b8e09393de048502111b2b5b14ab1bd9e39708aab5/pymongo-4.16.0-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = 
"sha256:f1c5f1f818b669875d191323a48912d3fcd2e4906410e8297bb09ac50c4d5ccc", size = 2207399, upload-time = "2026-01-07T18:04:58.255Z" }, - { url = "https://files.pythonhosted.org/packages/87/ac/47c4d50b25a02f21764f140295a2efaa583ee7f17992a5e5fa542b3a690f/pymongo-4.16.0-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:77cfd37a43a53b02b7bd930457c7994c924ad8bbe8dff91817904bcbf291b371", size = 2260595, upload-time = "2026-01-07T18:04:59.788Z" }, - { url = "https://files.pythonhosted.org/packages/ee/1b/0ce1ce9dd036417646b2fe6f63b58127acff3cf96eeb630c34ec9cd675ff/pymongo-4.16.0-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:36ef2fee50eee669587d742fb456e349634b4fcf8926208766078b089054b24b", size = 2366958, upload-time = "2026-01-07T18:05:01.942Z" }, - { url = "https://files.pythonhosted.org/packages/3e/3c/a5a17c0d413aa9d6c17bc35c2b472e9e79cda8068ba8e93433b5f43028e9/pymongo-4.16.0-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:55f8d5a6fe2fa0b823674db2293f92d74cd5f970bc0360f409a1fc21003862d3", size = 2346081, upload-time = "2026-01-07T18:05:03.576Z" }, - { url = "https://files.pythonhosted.org/packages/65/19/f815533d1a88fb8a3b6c6e895bb085ffdae68ccb1e6ed7102202a307f8e2/pymongo-4.16.0-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9caacac0dd105e2555521002e2d17afc08665187017b466b5753e84c016628e6", size = 2246053, upload-time = "2026-01-07T18:05:05.459Z" }, - { url = "https://files.pythonhosted.org/packages/c6/88/4be3ec78828dc64b212c123114bd6ae8db5b7676085a7b43cc75d0131bd2/pymongo-4.16.0-cp314-cp314-win32.whl", hash = "sha256:c789236366525c3ee3cd6e4e450a9ff629a7d1f4d88b8e18a0aea0615fd7ecf8", size = 989461, upload-time = "2026-01-07T18:05:07.018Z" }, - { url = "https://files.pythonhosted.org/packages/af/5a/ab8d5af76421b34db483c9c8ebc3a2199fb80ae63dc7e18f4cf1df46306a/pymongo-4.16.0-cp314-cp314-win_amd64.whl", hash = "sha256:2b0714d7764efb29bf9d3c51c964aed7c4c7237b341f9346f15ceaf8321fdb35", size = 1017803, upload-time = "2026-01-07T18:05:08.499Z" }, - { url = "https://files.pythonhosted.org/packages/f6/f4/98d68020728ac6423cf02d17cfd8226bf6cce5690b163d30d3f705e8297e/pymongo-4.16.0-cp314-cp314-win_arm64.whl", hash = "sha256:12762e7cc0f8374a8cae3b9f9ed8dabb5d438c7b33329232dd9b7de783454033", size = 997184, upload-time = "2026-01-07T18:05:09.944Z" }, - { url = "https://files.pythonhosted.org/packages/50/00/dc3a271daf06401825b9c1f4f76f018182c7738281ea54b9762aea0560c1/pymongo-4.16.0-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:1c01e8a7cd0ea66baf64a118005535ab5bf9f9eb63a1b50ac3935dccf9a54abe", size = 1083303, upload-time = "2026-01-07T18:05:11.702Z" }, - { url = "https://files.pythonhosted.org/packages/b8/4b/b5375ee21d12eababe46215011ebc63801c0d2c5ffdf203849d0d79f9852/pymongo-4.16.0-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:4c4872299ebe315a79f7f922051061634a64fda95b6b17677ba57ef00b2ba2a4", size = 1083233, upload-time = "2026-01-07T18:05:13.182Z" }, - { url = "https://files.pythonhosted.org/packages/ee/e3/52efa3ca900622c7dcb56c5e70f15c906816d98905c22d2ee1f84d9a7b60/pymongo-4.16.0-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:78037d02389745e247fe5ab0bcad5d1ab30726eaac3ad79219c7d6bbb07eec53", size = 2527438, upload-time = "2026-01-07T18:05:14.981Z" }, - { url = 
"https://files.pythonhosted.org/packages/cb/96/43b1be151c734e7766c725444bcbfa1de6b60cc66bfb406203746839dd25/pymongo-4.16.0-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c126fb72be2518395cc0465d4bae03125119136462e1945aea19840e45d89cfc", size = 2600399, upload-time = "2026-01-07T18:05:16.794Z" }, - { url = "https://files.pythonhosted.org/packages/e7/62/fa64a5045dfe3a1cd9217232c848256e7bc0136cffb7da4735c5e0d30e40/pymongo-4.16.0-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:f3867dc225d9423c245a51eaac2cfcd53dde8e0a8d8090bb6aed6e31bd6c2d4f", size = 2720960, upload-time = "2026-01-07T18:05:18.498Z" }, - { url = "https://files.pythonhosted.org/packages/54/7b/01577eb97e605502821273a5bc16ce0fb0be5c978fe03acdbff471471202/pymongo-4.16.0-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:f25001a955073b80510c0c3db0e043dbbc36904fd69e511c74e3d8640b8a5111", size = 2699344, upload-time = "2026-01-07T18:05:20.073Z" }, - { url = "https://files.pythonhosted.org/packages/55/68/6ef6372d516f703479c3b6cbbc45a5afd307173b1cbaccd724e23919bb1a/pymongo-4.16.0-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:9d9885aad05f82fd7ea0c9ca505d60939746b39263fa273d0125170da8f59098", size = 2577133, upload-time = "2026-01-07T18:05:22.052Z" }, - { url = "https://files.pythonhosted.org/packages/15/c7/b5337093bb01da852f945802328665f85f8109dbe91d81ea2afe5ff059b9/pymongo-4.16.0-cp314-cp314t-win32.whl", hash = "sha256:948152b30eddeae8355495f9943a3bf66b708295c0b9b6f467de1c620f215487", size = 1040560, upload-time = "2026-01-07T18:05:23.888Z" }, - { url = "https://files.pythonhosted.org/packages/96/8c/5b448cd1b103f3889d5713dda37304c81020ff88e38a826e8a75ddff4610/pymongo-4.16.0-cp314-cp314t-win_amd64.whl", hash = "sha256:f6e42c1bc985d9beee884780ae6048790eb4cd565c46251932906bdb1630034a", size = 1075081, upload-time = "2026-01-07T18:05:26.874Z" }, - { url = "https://files.pythonhosted.org/packages/32/cd/ddc794cdc8500f6f28c119c624252fb6dfb19481c6d7ed150f13cf468a6d/pymongo-4.16.0-cp314-cp314t-win_arm64.whl", hash = "sha256:6b2a20edb5452ac8daa395890eeb076c570790dfce6b7a44d788af74c2f8cf96", size = 1047725, upload-time = "2026-01-07T18:05:28.47Z" }, +sdist = { url = "https://files.pythonhosted.org/packages/24/a0/5c324fe6735b2bc189779ff46e981a59d495a74594f45542159125d77256/pymongo-4.15.5.tar.gz", hash = "sha256:3a8d6bf2610abe0c97c567cf98bf5bba3e90ccc93cc03c9dde75fa11e4267b42", size = 2471889, upload-time = "2025-12-02T18:44:30.992Z" } +wheels = [ + { url = "https://files.pythonhosted.org/packages/33/e4/d80061be4e53125597dd2916171c87986043b190e50c1834fff455e71d42/pymongo-4.15.5-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:a01a2054d50b50c121c720739a2216d855c48726b0002894de9b991cdd68a2a5", size = 811318, upload-time = "2025-12-02T18:42:12.09Z" }, + { url = "https://files.pythonhosted.org/packages/fb/b3/c499fe0814e4d3a84fa3ff5df5133bf847529d8b5a051e6108b5a25b75c7/pymongo-4.15.5-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:5e57968139d81367117ed7b75d921445a575d4d7e61536f5e860475df92ac0a9", size = 811676, upload-time = "2025-12-02T18:42:14.396Z" }, + { url = "https://files.pythonhosted.org/packages/62/71/8e21a8a680546b3a90afbb878a16fe2a7cb0f7d9652aa675c172e57856a1/pymongo-4.15.5-cp310-cp310-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:266aa37e3673e5dcfdd359a81d27131fc133e49cf8e5d9f9f27a5845fac2cd1f", size = 
1185485, upload-time = "2025-12-02T18:42:16.147Z" }, + { url = "https://files.pythonhosted.org/packages/03/56/bdc292a7b01aa2aba806883dbcacc3be837d65425453aa2bc27954ba5a55/pymongo-4.15.5-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2883da6bd0545cc2f12672f6a609b33d48e099a220872ca2bf9bf29fe96a32c3", size = 1203866, upload-time = "2025-12-02T18:42:18.018Z" }, + { url = "https://files.pythonhosted.org/packages/8b/e2/12bebc7e93a81c2f804ffcc94997f61f0e2cd2c11bf0f01da8e0e1425e5c/pymongo-4.15.5-cp310-cp310-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:2fc32b354a608ec748d89bbe236b74b967890667eea1af54e92dfd8fbf26df52", size = 1242550, upload-time = "2025-12-02T18:42:19.898Z" }, + { url = "https://files.pythonhosted.org/packages/0d/ac/c48f6f59a660ec44052ee448dea1c71da85cfaa4a0c17c726d4ee2db7716/pymongo-4.15.5-cp310-cp310-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:3c006cbaa4b40d296dd2bb8828976866c876ead4c39032b761dcf26f1ba56fde", size = 1232844, upload-time = "2025-12-02T18:42:21.709Z" }, + { url = "https://files.pythonhosted.org/packages/89/cc/6368befca7a2f3b51460755a373f78b72003aeee95e8e138cbd479c307f4/pymongo-4.15.5-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:ce21e3dc5939b83d03f871090d83ac29fef055bd057f8d3074b6cad10f86b04c", size = 1200192, upload-time = "2025-12-02T18:42:23.605Z" }, + { url = "https://files.pythonhosted.org/packages/9d/97/bc810a017ebb20e6e301fa8c5b21c5e53691fdde2cfd39bd9c450e957b14/pymongo-4.15.5-cp310-cp310-win32.whl", hash = "sha256:1b545dcf66a9f06e9b501bfb0438e1eb9af67336e8a5cf36c4bc0a5d3fbe7a37", size = 798338, upload-time = "2025-12-02T18:42:25.438Z" }, + { url = "https://files.pythonhosted.org/packages/46/17/3be0b476a6bfb3a51bf1750323b5eddf883dddb6482ccb8dbcab2c6c48ad/pymongo-4.15.5-cp310-cp310-win_amd64.whl", hash = "sha256:1ecc544f515f828f05d3c56cd98063ba3ef8b75f534c63de43306d59f1e93fcd", size = 808153, upload-time = "2025-12-02T18:42:26.889Z" }, + { url = "https://files.pythonhosted.org/packages/bf/0a/39f9daf16d695abd58987bb5e2c164b5a64e42b8d53d3c43bc06e4aa7dfc/pymongo-4.15.5-cp310-cp310-win_arm64.whl", hash = "sha256:1151968ab90db146f0591b6c7db27ce4f73c7ffa0bbddc1d7fb7cb14c9f0b967", size = 800943, upload-time = "2025-12-02T18:42:28.668Z" }, + { url = "https://files.pythonhosted.org/packages/0c/ea/e43387c2ed78a60ad917c45f4d4de4f6992929d63fe15af4c2e624f093a9/pymongo-4.15.5-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:57157a4b936e28e2fbe7017b2f6a751da5e284675cab371f2c596d4e0e4f58f3", size = 865894, upload-time = "2025-12-02T18:42:30.496Z" }, + { url = "https://files.pythonhosted.org/packages/5e/8c/f2c9c55adb9709a4b2244d8d8d9ec05e4abb274e03fe8388b58a34ae08b0/pymongo-4.15.5-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:e2a34a7391f4cc54fc584e49db6f7c3929221a9da08b3af2d2689884a5943843", size = 866235, upload-time = "2025-12-02T18:42:31.862Z" }, + { url = "https://files.pythonhosted.org/packages/5e/aa/bdf3553d7309b0ebc0c6edc23f43829b1758431f2f2f7385d2427b20563b/pymongo-4.15.5-cp311-cp311-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:be040c8cdaf9c2d5ae9ab60a67ecab453ec19d9ccd457a678053fdceab5ee4c8", size = 1429787, upload-time = "2025-12-02T18:42:33.829Z" }, + { url = 
"https://files.pythonhosted.org/packages/b3/55/80a8eefc88f578fde56489e5278ba5caa5ee9b6f285959ed2b98b44e2133/pymongo-4.15.5-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:defe93944526b1774265c16acf014689cb1b0b18eb84a7b370083b214f9e18cd", size = 1456747, upload-time = "2025-12-02T18:42:35.805Z" }, + { url = "https://files.pythonhosted.org/packages/1d/54/6a7ec290c7ab22aab117ab60e7375882ec5af7433eaf077f86e187a3a9e8/pymongo-4.15.5-cp311-cp311-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:816e66116f0ef868eff0463a8b28774af8b547466dbad30c8e82bf0325041848", size = 1514670, upload-time = "2025-12-02T18:42:37.737Z" }, + { url = "https://files.pythonhosted.org/packages/65/8a/5822aa20b274ee8a8821bf0284f131e7fc555b0758c3f2a82c51ae73a3c6/pymongo-4.15.5-cp311-cp311-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:66c7b332532e0f021d784d04488dbf7ed39b7e7d6d5505e282ec8e9cf1025791", size = 1500711, upload-time = "2025-12-02T18:42:39.61Z" }, + { url = "https://files.pythonhosted.org/packages/32/ca/63984e32b4d745a25445c9da1159dfe4568a03375f32bb1a9e009dccb023/pymongo-4.15.5-cp311-cp311-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:acc46a9e47efad8c5229e644a3774169013a46ee28ac72d1fa4edd67c0b7ee9b", size = 1452021, upload-time = "2025-12-02T18:42:41.323Z" }, + { url = "https://files.pythonhosted.org/packages/f1/23/0d6988f3fdfcacae2ac8d7b76eb24f80ebee9eb607c53bcebfad75b7fd85/pymongo-4.15.5-cp311-cp311-win32.whl", hash = "sha256:b9836c28ba350d8182a51f32ef9bb29f0c40e82ba1dfb9e4371cd4d94338a55d", size = 844483, upload-time = "2025-12-02T18:42:42.814Z" }, + { url = "https://files.pythonhosted.org/packages/8e/04/dedff8a5a9539e5b6128d8d2458b9c0c83ebd38b43389620a0d97223f114/pymongo-4.15.5-cp311-cp311-win_amd64.whl", hash = "sha256:3a45876c5c2ab44e2a249fb542eba2a026f60d6ab04c7ef3924eae338d9de790", size = 859194, upload-time = "2025-12-02T18:42:45.025Z" }, + { url = "https://files.pythonhosted.org/packages/67/e5/fb6f49bceffe183e66831c2eebd2ea14bd65e2816aeaf8e2fc018fd8c344/pymongo-4.15.5-cp311-cp311-win_arm64.whl", hash = "sha256:e4a48fc5c712b3db85c9987cfa7fde0366b7930018de262919afd9e52cfbc375", size = 848377, upload-time = "2025-12-02T18:42:47.19Z" }, + { url = "https://files.pythonhosted.org/packages/3c/4e/8f9fcb2dc9eab1fb0ed02da31e7f4847831d9c0ef08854a296588b97e8ed/pymongo-4.15.5-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:c33477af1a50d1b4d86555e098fc2cf5992d839ad538dea0c00a8682162b7a75", size = 920955, upload-time = "2025-12-02T18:42:48.812Z" }, + { url = "https://files.pythonhosted.org/packages/d2/b4/c0808bed1f82b3008909b9562615461e59c3b66f8977e502ea87c88b08a4/pymongo-4.15.5-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:e6b30defa4a52d3698cd84d608963a8932f7e9b6ec5130087e7082552ac685e5", size = 920690, upload-time = "2025-12-02T18:42:50.832Z" }, + { url = "https://files.pythonhosted.org/packages/12/f3/feea83150c6a0cd3b44d5f705b1c74bff298a36f82d665f597bf89d42b3f/pymongo-4.15.5-cp312-cp312-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:45fec063f5672e6173bcb09b492431e3641cc74399c2b996fcb995881c2cac61", size = 1690351, upload-time = "2025-12-02T18:42:53.402Z" }, + { url = "https://files.pythonhosted.org/packages/d7/4e/15924d33d8d429e4c41666090017c6ac5e7ccc4ce5e435a2df09e45220a8/pymongo-4.15.5-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = 
"sha256:b8c6813110c0d9fde18674b7262f47a2270ae46c0ddd05711e6770caa3c9a3fb", size = 1726089, upload-time = "2025-12-02T18:42:56.187Z" }, + { url = "https://files.pythonhosted.org/packages/a5/49/650ff29dc5f9cf090dfbd6fb248c56d8a10d268b6f46b10fb02fbda3c762/pymongo-4.15.5-cp312-cp312-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:e8ec48d1db9f44c737b13be4299a1782d5fde3e75423acbbbe927cb37ebbe87d", size = 1800637, upload-time = "2025-12-02T18:42:57.913Z" }, + { url = "https://files.pythonhosted.org/packages/7d/18/f34661ade670ee42331543f4aa229569ac7ef45907ecda41b777137b9f40/pymongo-4.15.5-cp312-cp312-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:1f410694fdd76631ead7df6544cdeadaf2407179196c3642fced8e48bb21d0a6", size = 1785480, upload-time = "2025-12-02T18:43:00.626Z" }, + { url = "https://files.pythonhosted.org/packages/10/b6/378bb26937f6b366754484145826aca2d2361ac05b0bacd45a35876abcef/pymongo-4.15.5-cp312-cp312-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:b8c46765d6ac5727a899190aacdeec7a57f8c93346124ddd7e12633b573e2e65", size = 1718548, upload-time = "2025-12-02T18:43:02.32Z" }, + { url = "https://files.pythonhosted.org/packages/58/79/31b8afba36f794a049633e105e45c30afaa0e1c0bab48332d999e87d4860/pymongo-4.15.5-cp312-cp312-win32.whl", hash = "sha256:647118a58dca7d3547714fc0b383aebf81f5852f4173dfd77dd34e80eea9d29b", size = 891319, upload-time = "2025-12-02T18:43:04.699Z" }, + { url = "https://files.pythonhosted.org/packages/c8/31/a7e6d8c5657d922872ac75ab1c0a1335bfb533d2b4dad082d5d04089abbb/pymongo-4.15.5-cp312-cp312-win_amd64.whl", hash = "sha256:099d3e2dddfc75760c6a8fadfb99c1e88824a99c2c204a829601241dff9da049", size = 910919, upload-time = "2025-12-02T18:43:06.555Z" }, + { url = "https://files.pythonhosted.org/packages/1c/b4/286c12fa955ae0597cd4c763d87c986e7ade681d4b11a81766f62f079c79/pymongo-4.15.5-cp312-cp312-win_arm64.whl", hash = "sha256:649cb906882c4058f467f334fb277083998ba5672ffec6a95d6700db577fd31a", size = 896357, upload-time = "2025-12-02T18:43:08.801Z" }, + { url = "https://files.pythonhosted.org/packages/9b/92/e70db1a53bc0bb5defe755dee66b5dfbe5e514882183ffb696d6e1d38aa2/pymongo-4.15.5-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:2b736226f9001bbbd02f822acb9b9b6d28319f362f057672dfae2851f7da6125", size = 975324, upload-time = "2025-12-02T18:43:11.074Z" }, + { url = "https://files.pythonhosted.org/packages/a4/90/dd78c059a031b942fa36d71796e94a0739ea9fb4251fcd971e9579192611/pymongo-4.15.5-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:60ea9f07fbbcc7c88f922082eb27436dce6756730fdef76a3a9b4c972d0a57a3", size = 975129, upload-time = "2025-12-02T18:43:13.345Z" }, + { url = "https://files.pythonhosted.org/packages/40/72/87cf1bb75ef296456912eb7c6d51ebe7a36dbbe9bee0b8a9cd02a62a8a6e/pymongo-4.15.5-cp313-cp313-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:20af63218ae42870eaee31fb8cc4ce9e3af7f04ea02fc98ad751fb7a9c8d7be3", size = 1950973, upload-time = "2025-12-02T18:43:15.225Z" }, + { url = "https://files.pythonhosted.org/packages/8c/68/dfa507c8e5cebee4e305825b436c34f5b9ba34488a224b7e112a03dbc01e/pymongo-4.15.5-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:20d9c11625392f1f8dec7688de5ce344e110ca695344efa313ae4839f13bd017", size = 1995259, upload-time = "2025-12-02T18:43:16.869Z" }, + { url = 
"https://files.pythonhosted.org/packages/85/9d/832578e5ed7f682a09441bbc0881ffd506b843396ef4b34ec53bd38b2fb2/pymongo-4.15.5-cp313-cp313-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:1202b3e5357b161acb7b7cc98e730288a5c15544e5ef7254b33931cb9a27c36e", size = 2086591, upload-time = "2025-12-02T18:43:19.559Z" }, + { url = "https://files.pythonhosted.org/packages/0a/99/ca8342a0cefd2bb1392187ef8fe01432855e3b5cd1e640495246bcd65542/pymongo-4.15.5-cp313-cp313-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:63af710e9700dbf91abccf119c5f5533b9830286d29edb073803d3b252862c0d", size = 2070200, upload-time = "2025-12-02T18:43:21.214Z" }, + { url = "https://files.pythonhosted.org/packages/3f/7d/f4a9c1fceaaf71524ff9ff964cece0315dcc93df4999a49f064564875bff/pymongo-4.15.5-cp313-cp313-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:f22eeb86861cf7b8ee6886361d52abb88e3cd96c6f6d102e45e2604fc6e9e316", size = 1985263, upload-time = "2025-12-02T18:43:23.415Z" }, + { url = "https://files.pythonhosted.org/packages/d8/15/f942535bcc6e22d3c26c7e730daf296ffe69d8ce474c430ea7e551f8cf33/pymongo-4.15.5-cp313-cp313-win32.whl", hash = "sha256:aad6efe82b085bf77cec2a047ded2c810e93eced3ccf1a8e3faec3317df3cd52", size = 938143, upload-time = "2025-12-02T18:43:26.081Z" }, + { url = "https://files.pythonhosted.org/packages/02/2a/c92a6927d676dd376d1ae05c680139c5cad068b22e5f0c8cb61014448894/pymongo-4.15.5-cp313-cp313-win_amd64.whl", hash = "sha256:ccc801f6d71ebee2ec2fb3acc64b218fa7cdb7f57933b2f8eee15396b662a0a0", size = 962603, upload-time = "2025-12-02T18:43:27.816Z" }, + { url = "https://files.pythonhosted.org/packages/3a/f0/cdf78e9ed9c26fb36b8d75561ebf3c7fe206ff1c3de2e1b609fccdf3a55b/pymongo-4.15.5-cp313-cp313-win_arm64.whl", hash = "sha256:f043abdf20845bf29a554e95e4fe18d7d7a463095d6a1547699a12f80da91e02", size = 944308, upload-time = "2025-12-02T18:43:29.371Z" }, + { url = "https://files.pythonhosted.org/packages/03/0c/49713e0f8f41110e8b2bcce7c88570b158cf43dd53a0d01d4e1c772c7ede/pymongo-4.15.5-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:ba0e75a390334221744e2666fd2d4c82419b580c9bc8d6e0d2d61459d263f3af", size = 1029996, upload-time = "2025-12-02T18:43:31.58Z" }, + { url = "https://files.pythonhosted.org/packages/23/de/1df5d7b49647e9e4511054f750c1109cb8e160763b286b96879917170618/pymongo-4.15.5-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:853ec7da97642eabaf94d3de4453a86365729327d920af167bf14b2e87b24dce", size = 1029612, upload-time = "2025-12-02T18:43:33.69Z" }, + { url = "https://files.pythonhosted.org/packages/8b/19/3a051228e5beb0b421d725bb2ab5207a260c718d9b5be5b85cfe963733e3/pymongo-4.15.5-cp314-cp314-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:7631304106487480ebbd8acbe44ff1e69d1fdc27e83d9753dc1fd227cea10761", size = 2211814, upload-time = "2025-12-02T18:43:35.769Z" }, + { url = "https://files.pythonhosted.org/packages/bf/b3/989531a056c4388ef18245d1a6d6b3ec5c538666b000764286119efbf194/pymongo-4.15.5-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:50505181365eba5d4d35c462870b3614c8eddd0b2407c89377c1a59380640dd9", size = 2264629, upload-time = "2025-12-02T18:43:37.479Z" }, + { url = "https://files.pythonhosted.org/packages/ea/5f/8b3339fec44d0ba6d9388a19340fb1534c85ab6aa9fd8fb9c1af146bb72a/pymongo-4.15.5-cp314-cp314-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = 
"sha256:3b75ec7006471299a571d6db1c5609ea4aa9c847a701e9b2953a8ede705d82db", size = 2371823, upload-time = "2025-12-02T18:43:39.866Z" }, + { url = "https://files.pythonhosted.org/packages/d4/7f/706bf45cf12990b6cb73e6290b048944a51592de7a597052a761eea90b8d/pymongo-4.15.5-cp314-cp314-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:c3fc24cb1f4ec60ed83162d4bba0c26abc6c9ae78c928805583673f3b3ea6984", size = 2351860, upload-time = "2025-12-02T18:43:42.002Z" }, + { url = "https://files.pythonhosted.org/packages/f3/c5/fdcc81c20c67a61ba1073122c9ab42c937dd6f914004747e9ceefa4cead3/pymongo-4.15.5-cp314-cp314-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:21d17bb2934b0640863361c08dd06991f128a97f9bee19425a499227be9ae6b4", size = 2251349, upload-time = "2025-12-02T18:43:43.924Z" }, + { url = "https://files.pythonhosted.org/packages/0c/1c/e540ccac0685b234a23574dce3c8e077cd59bcb73ab19bcab1915894d3a6/pymongo-4.15.5-cp314-cp314-win32.whl", hash = "sha256:5a3974236cb842b4ef50a5a6bfad9c7d83a713af68ea3592ba240bbcb863305a", size = 992901, upload-time = "2025-12-02T18:43:45.732Z" }, + { url = "https://files.pythonhosted.org/packages/89/31/eb72c53bc897cb50b57000d71ce9bdcfc9c84ba4c7f6d55348df47b241d8/pymongo-4.15.5-cp314-cp314-win_amd64.whl", hash = "sha256:73fa8a7eee44fd95ba7d5cf537340ff3ff34efeb1f7d6790532d0a6ed4dee575", size = 1021205, upload-time = "2025-12-02T18:43:47.756Z" }, + { url = "https://files.pythonhosted.org/packages/ea/4a/74a7cc350d60953d27b5636906b43b232b501cee07f70f6513ac603097e8/pymongo-4.15.5-cp314-cp314-win_arm64.whl", hash = "sha256:d41288ca2a3eb9ac7c8cad4ea86ef8d63b69dc46c9b65c2bbd35331ec2a0fc57", size = 1000616, upload-time = "2025-12-02T18:43:49.677Z" }, + { url = "https://files.pythonhosted.org/packages/1a/22/1e557868b9b207d7dbf7706412251b28a82d4b958e007b6f2569d59ada3d/pymongo-4.15.5-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:552670f0c8bff103656d4e4b1f2c018f789c9de03f7615ed5e547d5b1b83cda0", size = 1086723, upload-time = "2025-12-02T18:43:51.432Z" }, + { url = "https://files.pythonhosted.org/packages/aa/9c/2e24c2da289e1d3b9bc4e0850136a364473bddfbe8b19b33d2bb5d30ee0d/pymongo-4.15.5-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:41891b45f6ff1e23cfd1b7fbe40286664ad4507e2d2aa61c6d8c40eb6e11dded", size = 1086653, upload-time = "2025-12-02T18:43:53.131Z" }, + { url = "https://files.pythonhosted.org/packages/c6/be/4c2460c9ec91a891c754b91914ce700cc46009dae40183a85e26793dfae9/pymongo-4.15.5-cp314-cp314t-manylinux1_i686.manylinux_2_28_i686.manylinux_2_5_i686.whl", hash = "sha256:524a8a593ae2eb1ec6db761daf0c03f98824e9882ab7df3d458d0c76c7ade255", size = 2531627, upload-time = "2025-12-02T18:43:55.141Z" }, + { url = "https://files.pythonhosted.org/packages/a0/48/cea56d04eb6bbd8b8943ff73d7cf26b94f715fccb23cf7ef9a4f853725a0/pymongo-4.15.5-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:e7ceb35c41b86711a1b284c604e2b944a2d46cb1b8dd3f8b430a9155491378f2", size = 2603767, upload-time = "2025-12-02T18:43:57.188Z" }, + { url = "https://files.pythonhosted.org/packages/d9/ff/6743e351f8e0d5c3f388deb15f0cdbb77d2439eb3fba7ebcdf7878719517/pymongo-4.15.5-cp314-cp314t-manylinux2014_ppc64le.manylinux_2_17_ppc64le.manylinux_2_28_ppc64le.whl", hash = "sha256:3be2336715924be3a861b5e40c634376fd6bfe6dd1892d391566aa5a88a31307", size = 2725216, upload-time = "2025-12-02T18:43:59.463Z" }, + { url = 
"https://files.pythonhosted.org/packages/d4/90/fa532b6320b3ba61872110ff6f674bd54b54a592c0c64719e4f46852d0b6/pymongo-4.15.5-cp314-cp314t-manylinux2014_s390x.manylinux_2_17_s390x.manylinux_2_28_s390x.whl", hash = "sha256:d65df9c015e33f74ea9d1abf474971abca21e347a660384f8227dbdab75a33ca", size = 2704804, upload-time = "2025-12-02T18:44:01.415Z" }, + { url = "https://files.pythonhosted.org/packages/e1/84/1905c269aced043973b9528d94678e62e2eba249e70490c3c32dc70e2501/pymongo-4.15.5-cp314-cp314t-manylinux2014_x86_64.manylinux_2_17_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:83c05bea05e151754357f8e6bbb80d5accead5110dc58f64e283173c71ec9de2", size = 2582274, upload-time = "2025-12-02T18:44:03.427Z" }, + { url = "https://files.pythonhosted.org/packages/7e/af/78c13179961e418396ec6ef53c0f1c855f1e9f1176d10909e8345d65366a/pymongo-4.15.5-cp314-cp314t-win32.whl", hash = "sha256:7c285614a3e8570b03174a25db642e449b0e7f77a6c9e487b73b05c9bf228ee6", size = 1044015, upload-time = "2025-12-02T18:44:05.318Z" }, + { url = "https://files.pythonhosted.org/packages/b0/d5/49012f03418dce976124da339f3a6afbe6959cb0468ca6302596fe272926/pymongo-4.15.5-cp314-cp314t-win_amd64.whl", hash = "sha256:aae7d96f7b2b1a2753349130797543e61e93ee2ace8faa7fbe0565e2eb5d815f", size = 1078481, upload-time = "2025-12-02T18:44:07.215Z" }, + { url = "https://files.pythonhosted.org/packages/5e/fc/f352a070d8ff6f388ce344c5ddb82348a38e0d1c99346fa6bfdef07134fe/pymongo-4.15.5-cp314-cp314t-win_arm64.whl", hash = "sha256:576a7d4b99465d38112c72f7f3d345f9d16aeeff0f923a3b298c13e15ab4f0ad", size = 1051166, upload-time = "2025-12-02T18:44:09.048Z" }, ] [[package]] From 9b4ba06e68805f40feeb029fafaaa49eb7f0b314 Mon Sep 17 00:00:00 2001 From: Mohsin Ali Date: Wed, 8 Apr 2026 12:02:44 +0500 Subject: [PATCH 2/7] docs update --- docs/api/endpoints.md | 2 +- docs/changelog.md | 2 +- docs/deployment/environment.md | 10 +++++++++- docs/getting-started/configuration.md | 2 +- docs/guide/chat.md | 2 +- docs/integrations/langchain.md | 2 +- docs/security.md | 2 ++ 7 files changed, 16 insertions(+), 6 deletions(-) diff --git a/docs/api/endpoints.md b/docs/api/endpoints.md index 2c42e42..ff8d7ce 100644 --- a/docs/api/endpoints.md +++ b/docs/api/endpoints.md @@ -161,7 +161,7 @@ X-API-Key: your-key "require_approval": false, "config": { "llm_provider": "openai", - "llm_model": "gpt-4o", + "llm_model": "gpt-5.3", "top_k": 5 } } diff --git a/docs/changelog.md b/docs/changelog.md index 5e65701..9523c0c 100644 --- a/docs/changelog.md +++ b/docs/changelog.md @@ -54,7 +54,7 @@ for production RAG pipelines. via LangGraph `interrupt()` before embedding - **3-layer memory chat** — short-term turns + rolling summary + long-term facts, powered by LCEL chains -- **Multi-provider LLM support** — OpenAI (`gpt-4o`), Gemini (`gemini-2.0-flash`), +- **Multi-provider LLM support** — OpenAI (`gpt-5.3`), Gemini (`gemini-2.5`), Groq (`llama-3.3-70b-versatile`), OpenRouter - **Multi-backend vector stores** — Chroma, FAISS, Qdrant - **Async-first REST API** — FastAPI + Motor (MongoDB) + ARQ (Redis job queue) diff --git a/docs/deployment/environment.md b/docs/deployment/environment.md index 023c0d5..3245c88 100644 --- a/docs/deployment/environment.md +++ b/docs/deployment/environment.md @@ -7,7 +7,7 @@ Copy `.env.example` to `.env` and configure for your deployment. 
| Variable | Description |
|---|---|
| `LONGPARSER_API_KEY` | API key for server authentication |
-| `LONGPARSER_MONGO_URI` | MongoDB connection string |
+| `LONGPARSER_MONGO_URL` | MongoDB connection string |
 
 ## LLM
 
@@ -50,3 +50,11 @@ Copy `.env.example` to `.env` and configure for your deployment.
 |---|---|---|
 | `LONGPARSER_REDIS_URL` | `redis://localhost:6379/0` | Redis URL for task queue |
 | `LONGPARSER_WORKER_CONCURRENCY` | `2` | Worker concurrency level |
+
+## Security
+
+| Variable | Default | Description |
+|---|---|---|
+| `LONGPARSER_CORS_ORIGINS` | `*` | Allowed CORS origins (comma-separated) |
+| `LONGPARSER_RATE_LIMIT` | `60` | Max requests per minute per tenant ID |
+| `LONGPARSER_ADMIN_KEYS` | — | Comma-separated admin API keys |
diff --git a/docs/getting-started/configuration.md b/docs/getting-started/configuration.md
index 643129c..efd370f 100644
--- a/docs/getting-started/configuration.md
+++ b/docs/getting-started/configuration.md
@@ -15,7 +15,7 @@ cp .env.example .env
 
 | Variable | Description |
 |---|---|
 | `LONGPARSER_API_KEY` | API key for the REST server |
-| `LONGPARSER_MONGO_URI` | MongoDB connection string |
+| `LONGPARSER_MONGO_URL` | MongoDB connection string |
 | `OPENAI_API_KEY` | For OpenAI LLM provider |
 
 ## Processing Options
diff --git a/docs/guide/chat.md b/docs/guide/chat.md
index b686bfc..a3fb8e6 100644
--- a/docs/guide/chat.md
+++ b/docs/guide/chat.md
@@ -40,7 +40,7 @@ POST /chat
   "question": "What are the key findings?",
   "config": {
     "llm_provider": "openai",
-    "llm_model": "gpt-4o",
+    "llm_model": "gpt-5.3",
     "top_k": 5
   }
 }
diff --git a/docs/integrations/langchain.md b/docs/integrations/langchain.md
index c05dc2b..b2130d4 100644
--- a/docs/integrations/langchain.md
+++ b/docs/integrations/langchain.md
@@ -59,7 +59,7 @@ from langchain.chains import RetrievalQA
 from langchain_openai import ChatOpenAI
 
 qa = RetrievalQA.from_chain_type(
-    llm=ChatOpenAI(model="gpt-4o"),
+    llm=ChatOpenAI(model="gpt-5.3"),
     retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
 )
diff --git a/docs/security.md b/docs/security.md
index ba315cf..9932f71 100644
--- a/docs/security.md
+++ b/docs/security.md
@@ -35,6 +35,8 @@ Key risks:
 | **MongoDB injection** | Motor driver + typed Pydantic inputs prevent injection |
 | **SSRF via webhook** | No outbound HTTP made based on user input |
 | **Hallucinated citations** | Citation IDs validated against retrieved set before returning to client |
+| **DDoS / spam via API** | Per-tenant, route-level rate limiting enforced via Redis |
+| **Cross-origin attacks** | Configurable CORS origin allow-list and strict tenant isolation |
 
 ## Dependency Security
 

From a46be48c04086140d29f5c108e3d379bdafecc63 Mon Sep 17 00:00:00 2001
From: Mohsin Ali
Date: Wed, 8 Apr 2026 12:18:25 +0500
Subject: [PATCH 3/7] fix: include server and test dependencies in CI pipeline

---
 .github/workflows/ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index cc886c0..278e954 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -37,7 +37,7 @@ jobs:
           key: pip-${{ matrix.python-version }}-${{ hashFiles('pyproject.toml') }}

       - name: Install package and test deps
-        run: pip install -e "." 
pytest pytest-cov + run: pip install -e ".[dev,server]" - name: Run tests run: pytest tests/ -v --tb=short --cov=longparser --cov-report=term-missing From 12ac0e0ba507bca43d323baf7283ffbe7694e729 Mon Sep 17 00:00:00 2001 From: Mohsin Ali Date: Mon, 13 Apr 2026 10:01:13 +0500 Subject: [PATCH 4/7] version update --- .github/workflows/ci.yml | 2 +- CHANGELOG.md | 20 ++++++++++++++++++ CONTRIBUTING.md | 2 +- README.md | 8 +++---- SECURITY.md | 2 ++ docs/changelog.md | 20 ++++++++++++++++++ docs/contributing.md | 2 +- docs/deployment/docker.md | 2 +- docs/deployment/environment.md | 2 +- docs/getting-started/configuration.md | 2 +- docs/getting-started/installation.md | 4 ++-- docs/getting-started/quickstart.md | 8 +++---- docs/guide/chat.md | 2 +- docs/guide/parsing.md | 8 +++---- docs/index.md | 7 ++++--- docs/reference/pipeline.md | 28 +++++++++++++++++-------- docs/reference/schemas.md | 2 +- pyproject.toml | 2 +- src/longparser/__init__.py | 12 +++++++---- src/longparser/pipeline/__init__.py | 4 ++++ src/longparser/server/chat/engine.py | 4 ++-- src/longparser/server/chat/llm_chain.py | 2 +- src/longparser/server/chat/schemas.py | 2 +- src/longparser/server/embeddings.py | 2 +- 24 files changed, 105 insertions(+), 44 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 278e954..dee8694 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -23,7 +23,7 @@ jobs: strategy: fail-fast: false matrix: - python-version: ["3.10", "3.11", "3.12"] + python-version: ["3.10", "3.11", "3.12", "3.13"] steps: - uses: actions/checkout@v4 - uses: actions/setup-python@v5 diff --git a/CHANGELOG.md b/CHANGELOG.md index 5e65701..8a8237a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,6 +5,26 @@ All notable changes to **LongParser** are documented here. This project follows [Semantic Versioning](https://semver.org/) and [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). +## [0.1.3] — 2026-04-13 + +### Fixed + +- **Source code**: Added `DocumentPipeline` as a public alias for `PipelineOrchestrator` — + docs, quickstart, and all examples now use this name consistently +- **Documentation**: Fixed wrong coverage path `long_parser` → `longparser` in `CONTRIBUTING.md` +- **Documentation**: Replaced stale `cleanrag-api` reference in Docker deployment docs +- **Documentation**: Standardized Gemini API key env var to `GOOGLE_API_KEY` across all docs +- **Source code**: Updated default LLM model fallback from `gpt-4o` to `gpt-5.3` in + `schemas.py`, `llm_chain.py`, and `engine.py` +- **Source code**: Renamed stale `cleanrag:` Redis key prefix to `longparser:` in embeddings + +### Changed + +- Python 3.13 added to CI matrix, badges, and installation docs +- `SECURITY.md` updated with Redis rate-limiting and CORS threat mitigations + +--- + ## [0.1.2] — 2026-04-05 ### Changed diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index f44546e..06acdab 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -84,7 +84,7 @@ Use Python 3.10+ type hints. All public API must be fully annotated. 
uv run pytest tests/unit/ -v
 
 # With coverage:
-uv run pytest tests/unit/ --cov=src/long_parser --cov-report=term-missing
+uv run pytest tests/unit/ --cov=src/longparser --cov-report=term-missing
 
 # Full test suite (requires MongoDB + Redis):
 uv run pytest tests/ -v
diff --git a/README.md b/README.md
index 3b4f72a..dce377d 100644
--- a/README.md
+++ b/README.md
@@ -18,7 +18,7 @@
 Monthly Downloads
 
- Python
+ Python
 
 MIT License
@@ -105,9 +105,9 @@ pip install "longparser[cpu]"
 
 ### Python SDK
 
 ```python
-from longparser import PipelineOrchestrator, ProcessingConfig
+from longparser import DocumentPipeline, ProcessingConfig
 
-pipeline = PipelineOrchestrator()
+pipeline = DocumentPipeline(ProcessingConfig())
 result = pipeline.process_file("document.pdf")
 
 print(f"Pages: {result.document.metadata.total_pages}")
@@ -186,7 +186,7 @@ src/longparser/
 ├── schemas.py ← core Pydantic models (Document, Block, Chunk, …)
 ├── extractors/ ← Docling, LaTeX OCR backends
 ├── chunkers/ ← HybridChunker
-├── pipeline/ ← PipelineOrchestrator
+├── pipeline/ ← DocumentPipeline
 ├── integrations/ ← LangChain loader & LlamaIndex reader
 ├── utils/ ← shared helpers (RTL detection, …)
 └── server/ ← REST API layer
diff --git a/SECURITY.md b/SECURITY.md
index ba315cf..9932f71 100644
--- a/SECURITY.md
+++ b/SECURITY.md
@@ -35,6 +35,8 @@ Key risks:
 | **MongoDB injection** | Motor driver + typed Pydantic inputs prevent injection |
 | **SSRF via webhook** | No outbound HTTP made based on user input |
 | **Hallucinated citations** | Citation IDs validated against retrieved set before returning to client |
+| **DDoS / spam via API** | Per-tenant, route-level rate limiting enforced via Redis |
+| **Cross-origin attacks** | Configurable CORS origin allow-list and strict tenant isolation |
 
 ## Dependency Security
 
diff --git a/docs/changelog.md b/docs/changelog.md
index 9523c0c..2fa3957 100644
--- a/docs/changelog.md
+++ b/docs/changelog.md
@@ -5,6 +5,26 @@ All notable changes to **LongParser** are documented here.
 This project follows [Semantic Versioning](https://semver.org/) and
 [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
 
+## [0.1.3] — 2026-04-13
+
+### Fixed
+
+- **Source code**: Added `DocumentPipeline` as a public alias for `PipelineOrchestrator` —
+  docs, quickstart, and all examples now use this name consistently
+- **Documentation**: Fixed wrong coverage path `long_parser` → `longparser` in `CONTRIBUTING.md`
+- **Documentation**: Replaced stale `cleanrag-api` reference in Docker deployment docs
+- **Documentation**: Standardized Gemini API key env var to `GOOGLE_API_KEY` across all docs
+- **Source code**: Updated default LLM model fallback from `gpt-4o` to `gpt-5.3` in
+  `schemas.py`, `llm_chain.py`, and `engine.py`
+- **Source code**: Renamed stale `cleanrag:` Redis key prefix to `longparser:` in embeddings
+
+### Changed
+
+- Python 3.13 added to CI matrix, badges, and installation docs
+- `SECURITY.md` updated with Redis rate-limiting and CORS threat mitigations
+
+---
+
 ## [0.1.2] — 2026-04-05
 
 ### Changed
diff --git a/docs/contributing.md b/docs/contributing.md
index e8b7196..72727c9 100644
--- a/docs/contributing.md
+++ b/docs/contributing.md
@@ -84,7 +84,7 @@ Use Python 3.10+ type hints. All public API must be fully annotated. 
uv run pytest tests/unit/ -v # With coverage: -uv run pytest tests/unit/ --cov=src/long_parser --cov-report=term-missing +uv run pytest tests/unit/ --cov=src/longparser --cov-report=term-missing # Full test suite (requires MongoDB + Redis): uv run pytest tests/ -v diff --git a/docs/deployment/docker.md b/docs/deployment/docker.md index e462ce5..8ffeac7 100644 --- a/docs/deployment/docker.md +++ b/docs/deployment/docker.md @@ -49,5 +49,5 @@ docker compose up --scale longparser=3 ```bash curl http://localhost:8000/health -# {"status": "ok", "service": "cleanrag-api"} +# {"status": "ok", "service": "longparser-api"} ``` diff --git a/docs/deployment/environment.md b/docs/deployment/environment.md index 3245c88..0d8d28c 100644 --- a/docs/deployment/environment.md +++ b/docs/deployment/environment.md @@ -16,7 +16,7 @@ Copy `.env.example` to `.env` and configure for your deployment. | `LONGPARSER_LLM_PROVIDER` | `openai` | LLM provider | | `LONGPARSER_LLM_MODEL` | _(provider default)_ | Model name | | `OPENAI_API_KEY` | — | OpenAI API key | -| `GEMINI_API_KEY` | — | Google Gemini API key | +| `GOOGLE_API_KEY` | — | Google Gemini API key | | `GROQ_API_KEY` | — | Groq API key | | `OPENROUTER_API_KEY` | — | OpenRouter API key | diff --git a/docs/getting-started/configuration.md b/docs/getting-started/configuration.md index efd370f..859c2c1 100644 --- a/docs/getting-started/configuration.md +++ b/docs/getting-started/configuration.md @@ -33,7 +33,7 @@ cp .env.example .env |---|---| | `LONGPARSER_LLM_PROVIDER` | `openai` / `gemini` / `groq` / `openrouter` | | `LONGPARSER_LLM_MODEL` | Model name (uses provider default if unset) | -| `GEMINI_API_KEY` | For Google Gemini | +| `GOOGLE_API_KEY` | For Google Gemini | | `GROQ_API_KEY` | For Groq | ## Vector Store diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md index 908f659..5356c04 100644 --- a/docs/getting-started/installation.md +++ b/docs/getting-started/installation.md @@ -2,7 +2,7 @@ ## Requirements -- Python 3.10, 3.11, or 3.12 +- Python 3.10, 3.11, 3.12, or 3.13 - Tesseract OCR (`brew install tesseract` / `apt install tesseract-ocr`) --- @@ -104,5 +104,5 @@ The server starts on `http://localhost:8000`. ```python import longparser -print(longparser.__version__) # 0.1.2 +print(longparser.__version__) # 0.1.3 ``` diff --git a/docs/getting-started/quickstart.md b/docs/getting-started/quickstart.md index b779f4b..e501288 100644 --- a/docs/getting-started/quickstart.md +++ b/docs/getting-started/quickstart.md @@ -17,11 +17,11 @@ from longparser import DocumentPipeline, ProcessingConfig pipeline = DocumentPipeline(ProcessingConfig()) # Parse a PDF -doc = pipeline.process("research_paper.pdf") +result = pipeline.process_file("research_paper.pdf") -print(f"Pages: {len(doc.pages)}") -print(f"Blocks: {len(doc.blocks)}") -print(f"Chunks: {len(doc.chunks)}") +print(f"Pages: {result.document.metadata.total_pages}") +print(f"Chunks: {len(result.chunks)}") +print(result.chunks[0].text) ``` ## 3. Inspect Chunks diff --git a/docs/guide/chat.md b/docs/guide/chat.md index a3fb8e6..7ddc175 100644 --- a/docs/guide/chat.md +++ b/docs/guide/chat.md @@ -70,6 +70,6 @@ Every answer's `cited_chunk_ids` are validated against the retrieved set. 
IDs no | Provider | Key | |---|---| | OpenAI | `OPENAI_API_KEY` | -| Google Gemini | `GEMINI_API_KEY` | +| Google Gemini | `GOOGLE_API_KEY` | | Groq | `GROQ_API_KEY` | | OpenRouter | `OPENROUTER_API_KEY` | diff --git a/docs/guide/parsing.md b/docs/guide/parsing.md index 171c5b9..93c6386 100644 --- a/docs/guide/parsing.md +++ b/docs/guide/parsing.md @@ -18,7 +18,7 @@ LongParser uses **Docling** with Tesseract CLI OCR as its extraction engine — from longparser import DocumentPipeline, ProcessingConfig pipeline = DocumentPipeline(ProcessingConfig()) -doc = pipeline.process("paper.pdf") +result = pipeline.process_file("paper.pdf") ``` ## Formula Modes @@ -36,15 +36,15 @@ config = ProcessingConfig(formula_mode="smart") ```python # Pages -for page in doc.pages: +for page in result.document.pages: print(f"Page {page.page_number}: {page.width}x{page.height}") # Blocks (semantic units) -for block in doc.blocks: +for block in result.document.blocks: print(f"[{block.type}] p={block.provenance.page_number}: {block.text[:80]}") # Chunks (RAG-ready) -for chunk in doc.chunks: +for chunk in result.chunks: print(f"{chunk.chunk_type} | {chunk.token_count} tokens | pages={chunk.page_numbers}") ``` diff --git a/docs/index.md b/docs/index.md index 650ed63..4e7ff6e 100644 --- a/docs/index.md +++ b/docs/index.md @@ -16,7 +16,7 @@ Monthly Downloads   - Python + Python   MIT License @@ -57,9 +57,10 @@ pip install longparser from longparser import DocumentPipeline, ProcessingConfig pipeline = DocumentPipeline(ProcessingConfig()) -doc = pipeline.process("report.pdf") +result = pipeline.process_file("report.pdf") -print(f"Extracted {len(doc.blocks)} blocks, {len(doc.chunks)} chunks") +print(f"Chunks: {len(result.chunks)}") +print(result.chunks[0].text) ``` --- diff --git a/docs/reference/pipeline.md b/docs/reference/pipeline.md index 8f3e5a4..7cdfbf9 100644 --- a/docs/reference/pipeline.md +++ b/docs/reference/pipeline.md @@ -7,39 +7,49 @@ The `DocumentPipeline` is the main entry point for LongParser's extraction pipel ```python from longparser import DocumentPipeline, ProcessingConfig -pipeline = DocumentPipeline(config=ProcessingConfig()) -doc = pipeline.process("document.pdf") +pipeline = DocumentPipeline(ProcessingConfig()) +result = pipeline.process_file("document.pdf") ``` ### Constructor ```python -DocumentPipeline(config: ProcessingConfig) +DocumentPipeline(config: ProcessingConfig | None = None) ``` | Parameter | Type | Description | |---|---|---| -| `config` | `ProcessingConfig` | Extraction and chunking configuration | +| `config` | `ProcessingConfig \| None` | Extraction and chunking configuration (uses defaults if `None`) | ### Methods -#### `process(file_path)` +#### `process_file(file_path)` Process a document end-to-end through Extract → Validate → Chunk. ```python -doc = pipeline.process("report.pdf") -# Returns: longparser.schemas.Document +result = pipeline.process_file("report.pdf") +# Returns: longparser.pipeline.PipelineResult ``` -**Returns:** `Document` with `.pages`, `.blocks`, `.chunks` populated. +**Returns:** `PipelineResult` with `.document` and `.chunks` populated. + +#### `process(request)` + +Process a document from a `JobRequest` object. + +```python +from longparser import JobRequest +request = JobRequest(file_path="report.pdf") +result = pipeline.process(request) +``` #### `process_batch(file_paths)` Process multiple documents sequentially. 
```python -docs = pipeline.process_batch(["a.pdf", "b.docx", "c.pptx"]) +results = pipeline.process_batch(["a.pdf", "b.docx", "c.pptx"]) ``` ## ProcessingConfig diff --git a/docs/reference/schemas.md b/docs/reference/schemas.md index 7e33ac6..e4dda21 100644 --- a/docs/reference/schemas.md +++ b/docs/reference/schemas.md @@ -4,7 +4,7 @@ Core data models used throughout LongParser. ## Document -Top-level container returned by `DocumentPipeline.process()`. +Top-level container returned by `DocumentPipeline.process_file()`. ```python class Document: diff --git a/pyproject.toml b/pyproject.toml index 38330da..afea16d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "longparser" -version = "0.1.2" +version = "0.1.3" description = "Privacy-first document intelligence engine — converts PDFs, DOCX, PPTX, XLSX, and CSV into AI-ready Markdown + structured JSON for RAG pipelines." readme = {file = "README.md", content-type = "text/markdown"} requires-python = ">=3.10" diff --git a/src/longparser/__init__.py b/src/longparser/__init__.py index 5de272e..7d00c7e 100755 --- a/src/longparser/__init__.py +++ b/src/longparser/__init__.py @@ -9,9 +9,9 @@ Quick start:: - from longparser import PipelineOrchestrator, ProcessingConfig + from longparser import DocumentPipeline, ProcessingConfig - pipeline = PipelineOrchestrator() + pipeline = DocumentPipeline(ProcessingConfig()) result = pipeline.process_file("document.pdf") print(result.chunks[0].text) @@ -19,13 +19,13 @@ uv run uvicorn longparser.server.app:app --reload --port 8000 -See :class:`~longparser.pipeline.PipelineOrchestrator` for the main SDK entry +See :class:`~longparser.pipeline.DocumentPipeline` for the main SDK entry point and :mod:`longparser.server` for the REST API layer. 
""" from __future__ import annotations -__version__ = "0.1.2" +__version__ = "0.1.3" __author__ = "ENDEVSOLS Team" __license__ = "MIT" @@ -62,6 +62,9 @@ def __getattr__(name: str): if name == "PipelineOrchestrator": from .pipeline import PipelineOrchestrator return PipelineOrchestrator + if name == "DocumentPipeline": + from .pipeline import DocumentPipeline + return DocumentPipeline if name == "PipelineResult": from .pipeline import PipelineResult return PipelineResult @@ -99,6 +102,7 @@ def __getattr__(name: str): # Lazily imported (require extras) "DoclingExtractor", "PipelineOrchestrator", + "DocumentPipeline", "PipelineResult", "HybridChunker", ] diff --git a/src/longparser/pipeline/__init__.py b/src/longparser/pipeline/__init__.py index 6b775d9..710800e 100755 --- a/src/longparser/pipeline/__init__.py +++ b/src/longparser/pipeline/__init__.py @@ -2,7 +2,11 @@ from .orchestrator import PipelineOrchestrator, PipelineResult +# Public alias — docs and quickstart use this name +DocumentPipeline = PipelineOrchestrator + __all__ = [ "PipelineOrchestrator", + "DocumentPipeline", "PipelineResult", ] diff --git a/src/longparser/server/chat/engine.py b/src/longparser/server/chat/engine.py index b55b7cf..d50a7af 100755 --- a/src/longparser/server/chat/engine.py +++ b/src/longparser/server/chat/engine.py @@ -76,7 +76,7 @@ # Token Counting (model-aware) — kept as custom logic # --------------------------------------------------------------------------- -def count_tokens(text: str, model: str = "gpt-4o") -> int: +def count_tokens(text: str, model: str = "gpt-5.3") -> int: """Count tokens — exact for OpenAI models, conservative approx for others.""" try: import tiktoken @@ -96,7 +96,7 @@ def budget_trim( recent_turns: list[dict], rolling_summary: str, long_term_facts: list[dict], - model: str = "gpt-4o", + model: str = "gpt-5.3", max_prompt_tokens: int = 6000, ) -> dict: """Priority-ordered truncation of prompt variables to fit token budget. 
diff --git a/src/longparser/server/chat/llm_chain.py b/src/longparser/server/chat/llm_chain.py index f2cb8e7..b32bb2f 100755 --- a/src/longparser/server/chat/llm_chain.py +++ b/src/longparser/server/chat/llm_chain.py @@ -115,7 +115,7 @@ def get_chat_model( """ config = config or ChatConfig() provider = provider or config.llm_provider - model = model or config.llm_model or DEFAULT_MODELS.get(provider, "gpt-4o") + model = model or config.llm_model or DEFAULT_MODELS.get(provider, "gpt-5.3") max_tokens = max_tokens or config.max_output_tokens creator = _CREATORS.get(provider) diff --git a/src/longparser/server/chat/schemas.py b/src/longparser/server/chat/schemas.py index 0405a84..0479cf7 100755 --- a/src/longparser/server/chat/schemas.py +++ b/src/longparser/server/chat/schemas.py @@ -33,7 +33,7 @@ class ChatConfig(BaseModel): default_factory=lambda: os.getenv("LONGPARSER_LLM_PROVIDER", "openai") ) llm_model: str = Field( - default_factory=lambda: os.getenv("LONGPARSER_LLM_MODEL", "gpt-4o") + default_factory=lambda: os.getenv("LONGPARSER_LLM_MODEL", "gpt-5.3") ) max_input_tokens: int = Field( default_factory=lambda: int(os.getenv("LONGPARSER_CHAT_MAX_INPUT_TOKENS", "1000")) diff --git a/src/longparser/server/embeddings.py b/src/longparser/server/embeddings.py index e59f513..e0b2bbc 100755 --- a/src/longparser/server/embeddings.py +++ b/src/longparser/server/embeddings.py @@ -108,7 +108,7 @@ def dim(self) -> int: return self._dim fp = self.get_fingerprint() - cache_key = f"cleanrag:embed_dim:{fp}" + cache_key = f"longparser:embed_dim:{fp}" # 1) Try Redis cross-process cache if available try: From ce34496489167c9fdeb0251482532d17c8d7eec9 Mon Sep 17 00:00:00 2001 From: Mohsin Ali Date: Wed, 22 Apr 2026 17:44:32 +0500 Subject: [PATCH 5/7] Release v0.1.4: Add fast PDF extractor, auto-language detection, AGPL safety checks, and fix LangChain/LlamaIndex adapters --- .github/workflows/license-check.yml | 50 ++ FEATURE_ROADMAP.md | 150 ++++++ LICENSE-THIRD-PARTY.md | 50 ++ docs/getting-started/installation.md | 2 +- pyproject.toml | 17 +- src/longparser/__init__.py | 7 +- .../extractors/pymupdf_extractor.py | 493 ++++++++++++++++++ src/longparser/integrations/__init__.py | 6 +- src/longparser/integrations/langchain.py | 5 +- src/longparser/integrations/llamaindex.py | 5 +- src/longparser/pipeline/orchestrator.py | 207 +++++++- src/longparser/schemas.py | 22 +- src/longparser/utils/__init__.py | 11 +- src/longparser/utils/lang_detect.py | 193 +++++++ src/longparser/utils/ocr_router.py | 148 ++++++ tests/benchmarks/benchmark_pipeline.py | 98 ++++ tests/unit/test_backward_compat.py | 142 +++++ tests/unit/test_license_safety.py | 82 +++ 18 files changed, 1652 insertions(+), 36 deletions(-) create mode 100644 .github/workflows/license-check.yml create mode 100644 FEATURE_ROADMAP.md create mode 100644 LICENSE-THIRD-PARTY.md create mode 100644 src/longparser/extractors/pymupdf_extractor.py create mode 100644 src/longparser/utils/lang_detect.py create mode 100644 src/longparser/utils/ocr_router.py create mode 100644 tests/benchmarks/benchmark_pipeline.py create mode 100644 tests/unit/test_backward_compat.py create mode 100644 tests/unit/test_license_safety.py diff --git a/.github/workflows/license-check.yml b/.github/workflows/license-check.yml new file mode 100644 index 0000000..39b5031 --- /dev/null +++ b/.github/workflows/license-check.yml @@ -0,0 +1,50 @@ +name: License Safety Check + +on: [push, pull_request] + +jobs: + license-check: + name: Ensure no GPL/AGPL imports in core + runs-on: ubuntu-latest + 
steps: + - uses: actions/checkout@v4 + + - name: Check core files for GPL/AGPL imports + run: | + echo "=== License Safety Check ===" + echo "Verifying no GPL/AGPL package is imported in core code..." + echo "" + + FAIL=0 + + # List of GPL/AGPL package import patterns to block + BLOCKED_PATTERNS="pymupdf4llm|pymupdf|import marker\.|from marker\.|import surya|from surya" + + # Files that ARE allowed to import these (isolated backends) + ALLOWED_FILES=( + "src/longparser/extractors/pymupdf_extractor.py" + "src/longparser/extractors/marker_extractor.py" + ) + + # Build grep exclude args + EXCLUDE_ARGS="" + for f in "${ALLOWED_FILES[@]}"; do + EXCLUDE_ARGS="$EXCLUDE_ARGS --exclude=$f" + done + + # Search all Python files in src/longparser EXCEPT allowed files + MATCHES=$(grep -rnE "$BLOCKED_PATTERNS" src/longparser/ \ + --include='*.py' $EXCLUDE_ARGS || true) + + if [ -n "$MATCHES" ]; then + echo "❌ FAIL: GPL/AGPL imports found in core code!" + echo "" + echo "$MATCHES" + echo "" + echo "These packages must ONLY be imported in their isolated extractor files." + FAIL=1 + else + echo "✅ PASS: No GPL/AGPL imports in core code." + fi + + exit $FAIL diff --git a/FEATURE_ROADMAP.md b/FEATURE_ROADMAP.md new file mode 100644 index 0000000..2ea7b1a --- /dev/null +++ b/FEATURE_ROADMAP.md @@ -0,0 +1,150 @@ +# LongParser — Product & Feature Roadmap + +> This roadmap reflects the current development direction based on community trends, +> competitor analysis, and the RAG ecosystem in 2025–2026. Items are ordered by +> priority within each phase. All dates are targets, not guarantees. + +--- + +## Current State — v0.1.x ✅ + +- 5-stage extraction pipeline (Extract → Validate → HITL → Chunk → Embed → Index) +- Multi-format support: PDF, DOCX, PPTX, XLSX, CSV via Docling +- `HybridChunker` — 6-strategy token-aware, hierarchy-aware, table-aware chunking +- Human-in-the-Loop (HITL) review via LangGraph `interrupt()` +- 3-layer memory chat engine (short-term + rolling summary + long-term facts) +- Multi-provider LLM: OpenAI, Gemini, Groq, OpenRouter +- Multi-backend vector stores: Chroma, FAISS, Qdrant +- FastAPI REST server + ARQ/Redis job queue + Motor/MongoDB +- LangChain `BaseRetriever` + LlamaIndex `BaseReader` adapters +- CPU / GPU install separation via extras + +--- + +## Phase 1 — Accuracy & Quality (v0.2.x) — Q2 2026 + +### Parser Enhancements + +- [ ] **Marker backend** — add `marker-pdf` as an optional extraction backend for higher-fidelity Markdown output on complex academic PDFs +- [ ] **PyMuPDF4LLM backend** — lightweight, fast alternative for speed-critical pipelines (10× faster than Docling for simple PDFs) +- [ ] **Scanned PDF fast path** — route documents to Tesseract vs pix2tex vs Surya automatically based on page complexity score +- [ ] **Multi-column layout detection** — prevent reading-order errors in newspaper/journal-style layouts +- [ ] **Image extraction** — export embedded figures with captions into separate chunks with `type: figure` +- [ ] **Document language auto-detection** — select OCR model automatically based on detected script + +### Chunking Improvements + +- [ ] **Semantic chunking** — optional embedding-based boundary detection (split at semantic shifts, not just token counts) +- [ ] **Sliding window overlap** — configurable overlap strategy per chunk type (more overlap for tables, less for headings) +- [ ] **Cross-reference resolution** — link `(see Figure 3)` and `(Table 2)` references to their target blocks +- [ ] **Summary chunks** — auto-generate a 1–2 sentence summary 
chunk per section for hierarchical retrieval + +### Quality & Validation + +- [ ] **Chunk quality scorer** — assign a confidence score per chunk based on OCR confidence, completeness, and structural integrity +- [ ] **PII detection** — flag and optionally redact personal information (names, emails, phone numbers) before embedding +- [ ] **Duplicate block detection** — suppress repeated headers/footers that appear on every page + +--- + +## Phase 2 — Agentic & Multimodal (v0.3.x) — Q3 2026 + +### Agentic RAG + +- [ ] **Agentic retrieval loop** — implement query rewriting + iterative retrieval + self-reflection before answer generation +- [ ] **Multi-hop question answering** — chain retrieval steps for questions that span multiple sections or documents +- [ ] **Tool-calling integration** — expose document pipeline as a LangChain/LangGraph tool callable by autonomous agents +- [ ] **Hypothetical Document Embeddings (HyDE)** — generate hypothetical answers to queries for improved retrieval recall + +### Multimodal + +- [ ] **Vision-Language Model (VLM) integration** — use GPT-4o / Gemini Vision to describe figures, charts, and diagrams as text chunks +- [ ] **Chart data extraction** — parse bar/line/pie charts into structured data tables +- [ ] **Slide layout understanding** — treat PPTX slides as visual units with spatial layout context, not just text extraction + +### Reranking & Retrieval + +- [ ] **Cross-encoder reranker** — add optional `sentence-transformers` cross-encoder reranking step after initial retrieval +- [ ] **Hybrid search** — combine dense vector search with BM25 sparse retrieval (reciprocal rank fusion) +- [ ] **Maximum Marginal Relevance (MMR)** — reduce redundancy in retrieved chunks +- [ ] **Metadata filtering** — filter chunks by `page_number`, `section`, `doc_type`, `date` at query time + +--- + +## Phase 3 — Enterprise & Observability (v0.4.x) — Q4 2026 + +### Knowledge Graph + +- [ ] **Entity extraction** — extract named entities (people, organizations, dates, locations) from chunks +- [ ] **Relationship mapping** — build entity relationship graphs from document content +- [ ] **Graph-based retrieval** — traverse the entity graph for multi-hop retrieval (GraphRAG pattern) +- [ ] **Neo4j / NetworkX integration** — persist the knowledge graph to a graph database + +### Evaluation Framework + +- [ ] **Built-in RAG evaluator** — measure retrieval recall@k, answer faithfulness, and context adherence +- [ ] **Chunk attribution** — trace every answer sentence back to the source chunk and page +- [ ] **RAGAS integration** — plug into the RAGAS evaluation framework +- [ ] **Benchmark suite** — reproducible benchmarks against Unstructured, LlamaParse, Docling standalone + +### Observability & Compliance + +- [ ] **LangSmith integration** — trace every pipeline run end-to-end +- [ ] **OpenTelemetry support** — emit spans/traces to any OTel-compatible backend +- [ ] **Audit log** — immutable log of every HITL decision (approve/reject/edit) with timestamps and user IDs +- [ ] **GDPR compliance mode** — PII redaction + right-to-erasure support (delete all chunks for a document) +- [ ] **Role-based access control (RBAC)** — multi-tenant document access in the REST API + +--- + +## Phase 4 — Scale & Ecosystem (v0.5.x+) — 2027 + +### Performance & Scale + +- [ ] **Async parallel extraction** — process multiple documents concurrently in the background worker +- [ ] **Streaming extraction** — yield blocks as they are extracted (no need to wait for full document) +- [ ] **Incremental 
indexing** — update only changed pages/sections on re-upload +- [ ] **S3 / GCS / Azure Blob** — native cloud storage input (not just local files) +- [ ] **Kubernetes Helm chart** — one-command production deployment + +### New Integrations + +- [ ] **Weaviate** vector store adapter +- [ ] **Pinecone** vector store adapter +- [ ] **Milvus** vector store adapter +- [ ] **DSPy** integration — use DSPy to auto-optimize retrieval prompts +- [ ] **Haystack `DocumentConverter`** component +- [ ] **Flowise / Langflow** node — drag-and-drop visual pipeline builder support + +### Developer Experience + +- [ ] **LongParser CLI** — `longparser parse document.pdf --output chunks.json` +- [ ] **Web UI (HITL Dashboard)** — visual interface for reviewing and editing blocks before embedding +- [ ] **VS Code extension** — preview parsed chunks directly from the editor +- [ ] **Webhook support** — notify external systems when a job completes or requires HITL review + +--- + +## Competitive Positioning + +| Capability | LongParser | Unstructured | LlamaParse | Docling | +|---|---|---|---|---| +| Privacy-first (fully local) | ✅ | ⚠️ (cloud option) | ❌ (API-only) | ✅ | +| HITL review workflow | ✅ | ❌ | ❌ | ❌ | +| Bundled REST API server | ✅ | ✅ (paid) | ✅ (cloud) | ❌ | +| Table-aware chunking | ✅ | ⚠️ | ✅ | ✅ | +| LaTeX / equation OCR | ✅ | ❌ | ⚠️ | ⚠️ | +| LangChain + LlamaIndex | ✅ | ✅ | ✅ | ⚠️ | +| Open source (MIT) | ✅ | ⚠️ (core only) | ❌ | ✅ | +| Knowledge graph (planned) | 🔜 | ❌ | ❌ | ❌ | +| Agentic retrieval (planned) | 🔜 | ❌ | ⚠️ | ❌ | + +--- + +## Guiding Principles + +1. **Privacy by default** — all processing runs locally; no data leaves user infrastructure +2. **Human oversight** — HITL is a first-class citizen, not an afterthought +3. **Composable** — every stage is independently usable; no forced lock-in to the full stack +4. **Production-grade** — async, typed, tested, documented from day one +5. **Ecosystem-native** — LangChain, LlamaIndex, and HuggingFace are first-class integration targets diff --git a/LICENSE-THIRD-PARTY.md b/LICENSE-THIRD-PARTY.md new file mode 100644 index 0000000..257709f --- /dev/null +++ b/LICENSE-THIRD-PARTY.md @@ -0,0 +1,50 @@ +# Third-Party Licenses + +LongParser core is licensed under the **MIT License**. + +Some **optional** backends and integrations use different licenses. +These packages are **never loaded by default** — they are only imported +when you explicitly install them and select them in your configuration. + +## Optional Backend Licenses + +| Package | License | Install Command | When Loaded | +|---------|---------|-----------------|-------------| +| `pymupdf4llm` | AGPL-3.0 or Artifex Commercial | `pip install "longparser[pymupdf]"` | Only when you set `backend="pymupdf"` | +| `marker-pdf` | GPL-3.0-or-later | `pip install "longparser[marker]"` | Only when you set `backend="marker"` *(future)* | +| `surya-ocr` | GPL-3.0-or-later | `pip install "longparser[surya]"` | Only when explicitly imported *(future)* | + +## Core Dependency Licenses (always installed) + +| Package | License | Purpose | +|---------|---------|---------| +| `pydantic` | MIT | Schema validation | +| `docling` | MIT | Default PDF extraction engine | +| `docling-core` | MIT | Docling data models | +| `fast-langdetect` | Apache-2.0 | Document language detection | + +## What This Means for You + +- **If you only use `pip install longparser`** — everything is MIT or Apache-2.0. + You can use LongParser in any project (commercial, proprietary, open source). 
+ +- **If you install `longparser[pymupdf]`** — the `pymupdf4llm` library is + AGPL-3.0 licensed. You must comply with AGPL terms for the PyMuPDF component, + OR purchase a commercial license from [Artifex](https://artifex.com). + LongParser core code remains MIT. + +- **If you install `longparser[marker]`** *(future)* — the `marker-pdf` library + is GPL-3.0 licensed. You must comply with GPL terms for the Marker component. + LongParser core code remains MIT. + +## License Isolation Guarantee + +LongParser uses **lazy imports** to ensure GPL/AGPL packages are never loaded +unless explicitly requested. The following guarantees hold: + +1. `import longparser` does NOT import any GPL/AGPL package +2. `from longparser import DocumentPipeline` does NOT import any GPL/AGPL package +3. `DocumentPipeline().process_file("doc.pdf")` does NOT import any GPL/AGPL + package (uses Docling, which is MIT) +4. GPL/AGPL code is only loaded when you explicitly set `backend="pymupdf"` or + `backend="marker"` in `ProcessingConfig` diff --git a/docs/getting-started/installation.md b/docs/getting-started/installation.md index 5356c04..4ee1d42 100644 --- a/docs/getting-started/installation.md +++ b/docs/getting-started/installation.md @@ -104,5 +104,5 @@ The server starts on `http://localhost:8000`. ```python import longparser -print(longparser.__version__) # 0.1.3 +print(longparser.__version__) # 0.1.4 ``` diff --git a/pyproject.toml b/pyproject.toml index afea16d..dbb7cbe 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "longparser" -version = "0.1.3" +version = "0.1.4" description = "Privacy-first document intelligence engine — converts PDFs, DOCX, PPTX, XLSX, and CSV into AI-ready Markdown + structured JSON for RAG pipelines." readme = {file = "README.md", content-type = "text/markdown"} requires-python = ">=3.10" @@ -36,6 +36,7 @@ dependencies = [ "docling>=2.14", "docling-core>=2.13", "langgraph-checkpoint-mongodb>=0.3.1", + "fast-langdetect>=0.3,<1.0", # Apache-2.0 — document language detection ] [project.optional-dependencies] @@ -51,6 +52,20 @@ langchain = [ llamaindex = [ "llama-index-core>=0.10", ] +# ----------- v0.1.4: Optional extraction backends ----------- +# ⚠️ pymupdf4llm is AGPL-3.0 licensed. See LICENSE-THIRD-PARTY.md. +# Only loaded when user sets backend="pymupdf". +pymupdf = [ + "pymupdf4llm>=1.27", +] +# ⚠️ marker-pdf is GPL-3.0. GPU recommended. Future release. +# marker = [ +# "marker-pdf", +# ] +# ⚠️ surya-ocr is GPL-3.0. GPU recommended. Future release. 
+# surya = [ +# "surya-ocr>=0.17", +# ] # FastAPI REST server + MongoDB + job queue + LangChain chat engine server = [ "fastapi>=0.115", diff --git a/src/longparser/__init__.py b/src/longparser/__init__.py index 7d00c7e..b1b9794 100755 --- a/src/longparser/__init__.py +++ b/src/longparser/__init__.py @@ -25,7 +25,7 @@ from __future__ import annotations -__version__ = "0.1.3" +__version__ = "0.1.4" __author__ = "ENDEVSOLS Team" __license__ = "MIT" @@ -59,6 +59,10 @@ def __getattr__(name: str): if name == "DoclingExtractor": from .extractors import DoclingExtractor return DoclingExtractor + if name == "PyMuPDFExtractor": + # AGPL-isolated — only loaded when explicitly requested + from .extractors.pymupdf_extractor import PyMuPDFExtractor + return PyMuPDFExtractor if name == "PipelineOrchestrator": from .pipeline import PipelineOrchestrator return PipelineOrchestrator @@ -101,6 +105,7 @@ def __getattr__(name: str): "JobResult", # Lazily imported (require extras) "DoclingExtractor", + "PyMuPDFExtractor", "PipelineOrchestrator", "DocumentPipeline", "PipelineResult", diff --git a/src/longparser/extractors/pymupdf_extractor.py b/src/longparser/extractors/pymupdf_extractor.py new file mode 100644 index 0000000..aecf375 --- /dev/null +++ b/src/longparser/extractors/pymupdf_extractor.py @@ -0,0 +1,493 @@ +"""PyMuPDF4LLM-based extractor for fast, CPU-native PDF extraction. + +⚠️ LICENSE NOTICE — AGPL-3.0 + pymupdf4llm is dual-licensed under AGPL-3.0 or Artifex Commercial License. + By using this backend, you agree to the terms of the AGPL-3.0 license + unless you have purchased a commercial license from Artifex Software, Inc. + + This module is NOT imported by default — users must explicitly opt in + via ``pip install longparser[pymupdf]`` and ``backend='pymupdf'``. + +⚠️ ISOLATION RULES (do NOT violate) + 1. This file must NEVER be imported by ``extractors/__init__.py`` + 2. This file must NEVER be imported at module level by ``orchestrator.py`` + 3. This file must ONLY be imported behind ``if backend == "pymupdf":`` + 4. ``import longparser`` must NEVER trigger loading this file + +Best for: + - Native PDFs with embedded text (not scanned) + - Speed-critical pipelines (10-50× faster than Docling) + - CPU-only environments (no GPU, no ML models) + +NOT suitable for: + - Scanned PDFs (no OCR capability) + - Complex tables with merged cells + - Documents needing deep heading hierarchy detection + +Usage:: + + from longparser import ProcessingConfig, DocumentPipeline + + pipeline = DocumentPipeline( + config=ProcessingConfig(backend="pymupdf") + ) + result = pipeline.process_file("report.pdf") +""" + +from __future__ import annotations + +import hashlib +import logging +import uuid +from pathlib import Path +from typing import Optional, List, Tuple + +from ..schemas import ( + Document, Page, Block, Table, TableCell, + BlockType, ExtractorType, ProcessingConfig, + BoundingBox, Provenance, Confidence, BlockFlags, + DocumentMetadata, PageProfile, ExtractionMetadata, +) +from .base import BaseExtractor + +logger = logging.getLogger(__name__) + + +def _require_pymupdf(): + """Check that pymupdf4llm is installed; raise clear error if not. + + Returns the ``pymupdf4llm`` module on success. + """ + try: + import pymupdf4llm + return pymupdf4llm + except ImportError: + raise ImportError( + "\n" + "╔══════════════════════════════════════════════════════════╗\n" + "║ pymupdf4llm is not installed. 
║\n" + "║ ║\n" + "║ Install: pip install 'longparser[pymupdf]' ║\n" + "║ ║\n" + "║ ⚠️ pymupdf4llm is licensed under AGPL-3.0. ║\n" + "║ By installing it, you agree to AGPL terms for that ║\n" + "║ component. LongParser core remains MIT-licensed. ║\n" + "║ ║\n" + "║ For commercial use without AGPL obligations, purchase ║\n" + "║ a license from https://artifex.com ║\n" + "╚══════════════════════════════════════════════════════════╝\n" + ) + + +def _require_pymupdf_fitz(): + """Import the fitz (PyMuPDF) module for page-level operations.""" + try: + import pymupdf as fitz + return fitz + except ImportError: + try: + import fitz + return fitz + except ImportError: + raise ImportError( + "PyMuPDF (fitz) is required for the pymupdf backend. " + "Install with: pip install 'longparser[pymupdf]'" + ) + + +class PyMuPDFExtractor(BaseExtractor): + """Fast, CPU-native PDF extractor using PyMuPDF4LLM. + + Converts PDFs to structured Markdown and maps the output to + LongParser's ``Document`` / ``Block`` model. Uses no ML models, + no GPU — pure C-based PDF parsing via MuPDF. + + Attributes + ---------- + extractor_type : ExtractorType + Always ``ExtractorType.NATIVE_PDF``. + version : str + Extractor version string. + """ + + extractor_type = ExtractorType.NATIVE_PDF + version = "1.0.0" + + def __init__(self): + """Initialize and verify pymupdf4llm is available.""" + _require_pymupdf() + self._images: list = [] + logger.info( + "PyMuPDF4LLM backend initialized (CPU-native, no OCR, no GPU)" + ) + + def extract( + self, + file_path: Path, + config: ProcessingConfig, + page_numbers: Optional[List[int]] = None, + ) -> Tuple[Document, ExtractionMetadata]: + """Extract a PDF using PyMuPDF4LLM. + + Parameters + ---------- + file_path: + Path to the PDF file. + config: + Processing configuration. + page_numbers: + Optional list of 0-indexed page numbers to extract. + + Returns + ------- + tuple[Document, ExtractionMetadata] + Extracted document and metadata. 
+ """ + import pymupdf4llm + + file_path = Path(file_path) + logger.info("Extracting with PyMuPDF4LLM: %s", file_path.name) + + # Validate file type + if file_path.suffix.lower() != ".pdf": + raise ValueError( + f"PyMuPDF4LLM backend only supports PDF files, got: {file_path.suffix}" + ) + + # File hash + file_hash = hashlib.sha256(file_path.read_bytes()).hexdigest()[:16] + + # Extract with pymupdf4llm + kwargs = {"show_progress": False} + if page_numbers is not None: + kwargs["pages"] = page_numbers + + md_text = pymupdf4llm.to_markdown(str(file_path), **kwargs) + + # Get page-level info using PyMuPDF directly + fitz = _require_pymupdf_fitz() + pdf_doc = fitz.open(str(file_path)) + total_pages = len(pdf_doc) + + # Extract images if config.export_images + self._images = [] + if config.export_images: + self._extract_images(pdf_doc, config) + + # Build Document from Markdown + document = self._markdown_to_document( + md_text=md_text, + pdf_doc=pdf_doc, + file_path=file_path, + file_hash=file_hash, + total_pages=total_pages, + config=config, + ) + + pdf_doc.close() + + meta = ExtractionMetadata( + strategy_used="pymupdf4llm", + ocr_backend_used="none (native text)", + ) + + logger.info( + "PyMuPDF4LLM extraction complete: %d pages, %d blocks", + total_pages, len(document.all_blocks), + ) + + return document, meta + + def _markdown_to_document( + self, + md_text: str, + pdf_doc, + file_path: Path, + file_hash: str, + total_pages: int, + config: ProcessingConfig, + ) -> Document: + """Convert Markdown text to a LongParser Document model.""" + metadata = DocumentMetadata( + source_file=str(file_path), + file_hash=file_hash, + total_pages=total_pages, + ) + + pages: list[Page] = [] + + # Split markdown by page breaks (pymupdf4llm uses "---" or form feeds) + page_chunks = self._split_by_pages(md_text, total_pages) + + for page_idx, page_md in enumerate(page_chunks): + page_no = page_idx + 1 + + # Get page dimensions from PyMuPDF + if page_idx < len(pdf_doc): + rect = pdf_doc[page_idx].rect + width, height = rect.width, rect.height + else: + width, height = 612.0, 792.0 # Letter default + + # Parse markdown blocks + blocks = self._parse_markdown_blocks(page_md, page_no, file_path) + + # Build page profile + profile = PageProfile( + page_number=page_no, + layout_confidence=0.9, # PyMuPDF is reliable for native PDFs + ) + + pages.append(Page( + page_number=page_no, + width=width, + height=height, + blocks=blocks, + profile=profile, + )) + + return Document(metadata=metadata, pages=pages) + + def _split_by_pages(self, md_text: str, total_pages: int) -> list[str]: + """Split markdown text into per-page chunks.""" + import re + + # pymupdf4llm inserts page separators + # Common patterns: "-----" (5+ dashes), or form feed characters + parts = re.split(r'\n-{3,}\n|\f', md_text) + + # If splitting didn't work, put everything on page 1 + if len(parts) <= 1: + return [md_text] + + # Pad to total_pages if needed + while len(parts) < total_pages: + parts.append("") + + return parts[:total_pages] + + def _parse_markdown_blocks( + self, + page_md: str, + page_no: int, + file_path: Path, + ) -> list[Block]: + """Parse markdown text into Block objects.""" + blocks: list[Block] = [] + lines = page_md.strip().split("\n") + order_idx = 0 + + i = 0 + while i < len(lines): + line = lines[i] + stripped = line.strip() + + if not stripped: + i += 1 + continue + + # Detect block type + if stripped.startswith("#"): + # Heading + level = len(stripped) - len(stripped.lstrip("#")) + text = stripped.lstrip("#").strip() + block = 
self._make_block( + BlockType.HEADING, text, order_idx, page_no, + file_path, heading_level=min(level, 6), + ) + blocks.append(block) + + elif stripped.startswith("|") and "|" in stripped[1:]: + # Table — collect all table lines + table_lines = [stripped] + i += 1 + while i < len(lines) and lines[i].strip().startswith("|"): + table_lines.append(lines[i].strip()) + i += 1 + table_md = "\n".join(table_lines) + table_obj = self._parse_table(table_lines) + block = self._make_block( + BlockType.TABLE, table_md, order_idx, page_no, + file_path, table=table_obj, + ) + blocks.append(block) + order_idx += 1 + continue # Already incremented i + + elif stripped.startswith(("- ", "* ", "+ ")) or ( + len(stripped) > 2 and stripped[0].isdigit() and stripped[1] in ".)" + ): + # List item + text = stripped.lstrip("-*+ ").lstrip("0123456789.)").strip() + block = self._make_block( + BlockType.LIST_ITEM, text, order_idx, page_no, file_path, + ) + blocks.append(block) + + elif stripped.startswith("```"): + # Code block + code_lines = [] + i += 1 + while i < len(lines) and not lines[i].strip().startswith("```"): + code_lines.append(lines[i]) + i += 1 + code_text = "\n".join(code_lines) + block = self._make_block( + BlockType.CODE, code_text, order_idx, page_no, file_path, + ) + blocks.append(block) + i += 1 # Skip closing ``` + order_idx += 1 + continue + + elif stripped.startswith("$$") or stripped.startswith("\\["): + # Equation block + eq_lines = [stripped] + if not (stripped.endswith("$$") and len(stripped) > 2): + i += 1 + while i < len(lines): + eq_line = lines[i].strip() + eq_lines.append(eq_line) + if eq_line.endswith("$$") or eq_line.endswith("\\]"): + break + i += 1 + eq_text = "\n".join(eq_lines) + block = self._make_block( + BlockType.EQUATION, eq_text, order_idx, page_no, file_path, + ) + blocks.append(block) + + else: + # Regular paragraph + block = self._make_block( + BlockType.PARAGRAPH, stripped, order_idx, page_no, file_path, + ) + blocks.append(block) + + order_idx += 1 + i += 1 + + return blocks + + def _make_block( + self, + block_type: BlockType, + text: str, + order_index: int, + page_no: int, + file_path: Path, + heading_level: Optional[int] = None, + table: Optional[Table] = None, + ) -> Block: + """Create a Block with standard provenance.""" + return Block( + type=block_type, + text=text, + order_index=order_index, + heading_level=heading_level, + provenance=Provenance( + source_file=str(file_path), + page_number=page_no, + bbox=BoundingBox(x0=0, y0=0, x1=0, y1=0), + extractor=self.extractor_type, + extractor_version=self.version, + ), + confidence=Confidence(overall=0.9), + table=table, + ) + + def _parse_table(self, table_lines: list[str]) -> Table: + """Parse a Markdown table into a Table object.""" + # Filter out separator lines (|---|---|) + data_lines = [ + line for line in table_lines + if line.strip() and not all(c in "|-: " for c in line.strip()) + ] + + if not data_lines: + return Table(n_rows=0, n_cols=0) + + cells: list[TableCell] = [] + n_cols = 0 + + for row_idx, line in enumerate(data_lines): + parts = [p.strip() for p in line.strip("|").split("|")] + n_cols = max(n_cols, len(parts)) + for col_idx, cell_text in enumerate(parts): + cells.append(TableCell( + r0=row_idx, c0=col_idx, text=cell_text + )) + + return Table( + n_rows=len(data_lines), + n_cols=n_cols, + cells=cells, + table_confidence=0.85, + ) + + def _extract_images(self, pdf_doc, config: ProcessingConfig): + """Extract images from PDF pages.""" + for page_idx in range(len(pdf_doc)): + page = pdf_doc[page_idx] 
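+            # get_images(full=True) returns one tuple per image placement on the
+            # page; item [0] is the xref that extract_image() resolves to raw bytes.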
+ image_list = page.get_images(full=True) + for img_idx, img in enumerate(image_list): + try: + xref = img[0] + base_image = pdf_doc.extract_image(xref) + if base_image: + self._images.append({ + "page": page_idx + 1, + "index": img_idx, + "data": base_image["image"], + "ext": base_image.get("ext", "png"), + }) + except Exception as e: + logger.debug("Failed to extract image on page %d: %s", page_idx + 1, e) + + def save_images(self, output_dir: Path) -> list[Path]: + """Save extracted images to disk. + + Parameters + ---------- + output_dir: + Directory to save images to. + + Returns + ------- + list[Path] + Paths to saved image files. + """ + output_dir = Path(output_dir) + output_dir.mkdir(parents=True, exist_ok=True) + saved = [] + + for img_info in self._images: + fname = f"page_{img_info['page']:03d}_img_{img_info['index']:02d}.{img_info['ext']}" + fpath = output_dir / fname + with open(fpath, "wb") as f: + f.write(img_info["data"]) + saved.append(fpath) + + logger.info("Saved %d images to %s", len(saved), output_dir) + return saved + + def to_markdown(self, document: Document) -> str: + """Convert Document back to Markdown.""" + parts = [] + for page in document.pages: + for block in page.blocks: + if block.type == BlockType.HEADING: + level = block.heading_level or 1 + parts.append(f"{'#' * level} {block.text}") + elif block.type == BlockType.TABLE: + parts.append(block.text) + elif block.type == BlockType.LIST_ITEM: + parts.append(f"- {block.text}") + elif block.type == BlockType.CODE: + parts.append(f"```\n{block.text}\n```") + elif block.type == BlockType.EQUATION: + parts.append(f"$$\n{block.text}\n$$") + else: + parts.append(block.text) + parts.append("") + return "\n".join(parts) diff --git a/src/longparser/integrations/__init__.py b/src/longparser/integrations/__init__.py index b8eae82..44055de 100755 --- a/src/longparser/integrations/__init__.py +++ b/src/longparser/integrations/__init__.py @@ -2,9 +2,9 @@ Install the extras to use these adapters:: - pip install clean_rag[langchain] - pip install clean_rag[llamaindex] - pip install clean_rag[all] + pip install longparser[langchain] + pip install longparser[llamaindex] + pip install longparser[all] """ from __future__ import annotations diff --git a/src/longparser/integrations/langchain.py b/src/longparser/integrations/langchain.py index 59bdba0..7848c31 100755 --- a/src/longparser/integrations/langchain.py +++ b/src/longparser/integrations/langchain.py @@ -5,7 +5,7 @@ Install the extra to use this adapter:: - pip install clean_rag[langchain] + pip install longparser[langchain] Usage:: @@ -27,7 +27,7 @@ _INSTALL_MSG = ( "langchain-core is required for the LangChain adapter. " - "Install it with: pip install clean_rag[langchain]" + "Install it with: pip install longparser[langchain]" ) @@ -95,6 +95,7 @@ def lazy_load(self) -> Iterator["LCDocument"]: from ..pipeline import PipelineOrchestrator pipeline = PipelineOrchestrator( + config=self.config, tesseract_lang=self.tesseract_lang, tessdata_path=self.tessdata_path, ) diff --git a/src/longparser/integrations/llamaindex.py b/src/longparser/integrations/llamaindex.py index a8d4344..d5437b9 100755 --- a/src/longparser/integrations/llamaindex.py +++ b/src/longparser/integrations/llamaindex.py @@ -5,7 +5,7 @@ Install the extra to use this adapter:: - pip install clean_rag[llamaindex] + pip install longparser[llamaindex] Usage:: @@ -27,7 +27,7 @@ _INSTALL_MSG = ( "llama-index-core is required for the LlamaIndex adapter. 
" - "Install it with: pip install clean_rag[llamaindex]" + "Install it with: pip install longparser[llamaindex]" ) @@ -105,6 +105,7 @@ def load_data( file = Path(file) pipeline = PipelineOrchestrator( + config=self.config, tesseract_lang=self.tesseract_lang, tessdata_path=self.tessdata_path, ) diff --git a/src/longparser/pipeline/orchestrator.py b/src/longparser/pipeline/orchestrator.py index 202be9e..5062a48 100755 --- a/src/longparser/pipeline/orchestrator.py +++ b/src/longparser/pipeline/orchestrator.py @@ -1,4 +1,13 @@ -"""Simple pipeline orchestrator for LongParser.""" +"""Simple pipeline orchestrator for LongParser. + +Supports multiple extraction backends: + +- ``"docling"`` (default) — Docling with Tesseract CLI OCR (MIT) +- ``"pymupdf"`` — PyMuPDF4LLM for fast native PDF extraction (AGPL, optional) +- ``"auto"`` — Automatic backend selection based on document properties + +Language detection runs before OCR to set the correct Tesseract language. +""" from pathlib import Path from dataclasses import dataclass, field @@ -11,6 +20,7 @@ from ..extractors import DoclingExtractor from ..extractors.docling_extractor import HierarchyChunk from ..chunkers import HybridChunker +from ..utils.lang_detect import detect_language, get_tesseract_langs, extract_sample_text logger = logging.getLogger(__name__) @@ -30,43 +40,189 @@ def total_blocks(self) -> int: class PipelineOrchestrator: """ - Simple pipeline orchestrator using Docling. + Pipeline orchestrator with backend selection and language detection. Flow: - 1. Docling extracts with Tesseract CLI OCR - 2. Layout analysis detects structure - 3. HierarchicalChunker preserves heading hierarchy + 1. (Optional) Auto-detect document language + 2. Select backend: Docling, PyMuPDF, or auto-route + 3. Extract with chosen backend + 4. HierarchicalChunker preserves heading hierarchy + + Parameters + ---------- + config: + Processing configuration with backend, language, and layout settings. + Only used for backend selection during init. Per-file config is passed + to ``process_file()``. + tesseract_lang: + Languages for Tesseract OCR (default: ``["eng"]``). Overridden by + ``config.languages`` or auto-detection if enabled. + tessdata_path: + Path to tessdata directory with language models and configs. + force_full_page_ocr: + If True, OCR entire page even if embedded text exists. """ - def __init__(self, tesseract_lang: List[str] = None, tessdata_path: str = None, force_full_page_ocr: bool = False): - """ - Initialize pipeline. - - Args: - tesseract_lang: Languages for Tesseract OCR (default: ["eng"]) - tessdata_path: Path to tessdata directory with language models and configs. - force_full_page_ocr: If True, OCR entire page even if embedded text exists. 
+ def __init__( + self, + config: Optional[ProcessingConfig] = None, + tesseract_lang: List[str] = None, + tessdata_path: str = None, + force_full_page_ocr: bool = False, + ): + self._config = config or ProcessingConfig() + self._tessdata_path = tessdata_path + self._force_full_page_ocr = force_full_page_ocr + self._base_tesseract_lang = tesseract_lang + + # Determine backend from config + backend = self._config.backend + + if backend == "pymupdf": + # Lazy import — only loaded when user explicitly requests it + from ..extractors.pymupdf_extractor import PyMuPDFExtractor + self.extractor = PyMuPDFExtractor() + self._backend_name = "pymupdf" + logger.info("Pipeline initialized with PyMuPDF4LLM backend (CPU-native, fast)") + + elif backend == "auto": + # Auto mode: start with Docling (safe default), route at process time + self.extractor = DoclingExtractor( + tesseract_lang=tesseract_lang, + tessdata_path=tessdata_path, + force_full_page_ocr=force_full_page_ocr, + ) + self._backend_name = "auto" + logger.info("Pipeline initialized in auto mode (will choose backend per document)") + + else: + # Default: Docling (MIT, always available) + self.extractor = DoclingExtractor( + tesseract_lang=tesseract_lang, + tessdata_path=tessdata_path, + force_full_page_ocr=force_full_page_ocr, + ) + self._backend_name = "docling" + logger.info("Pipeline initialized with Docling backend (default)") + + def _resolve_languages( + self, + file_path: Path, + config: ProcessingConfig, + ) -> list[str]: + """Resolve OCR languages via user override or auto-detection. + + Priority order: + 1. ``config.languages`` (explicit user override — always wins) + 2. ``self._base_tesseract_lang`` (constructor param) + 3. Auto-detection via ``fast-langdetect`` (if enabled) + 4. Default: ``["eng"]`` """ - self.extractor = DoclingExtractor( - tesseract_lang=tesseract_lang, - tessdata_path=tessdata_path, - force_full_page_ocr=force_full_page_ocr, - ) - + # 1. Explicit user override + if config.languages: + logger.info("Using user-specified languages: %s", config.languages) + return config.languages + + # 2. Constructor param + if self._base_tesseract_lang: + # If auto-detect is enabled, try to improve on constructor default + if config.auto_detect_language: + detected_langs = self._auto_detect(file_path) + if detected_langs: + return detected_langs + return self._base_tesseract_lang + + # 3. Auto-detect + if config.auto_detect_language: + detected_langs = self._auto_detect(file_path) + if detected_langs: + return detected_langs + + # 4. 
Default + return ["eng"] + + def _auto_detect(self, file_path: Path) -> Optional[list[str]]: + """Run language detection and return Tesseract codes, or None.""" + sample = extract_sample_text(file_path) + if not sample or len(sample.strip()) < 20: + return None + + lang_code, confidence = detect_language(sample) + if confidence > 0.0: + tess_langs = get_tesseract_langs(lang_code) + logger.info( + "Auto-detected language: %s (%.0f%%) → Tesseract: %s", + lang_code, confidence * 100, tess_langs, + ) + # Store for later use in document metadata + self._detected_lang = lang_code + self._detected_lang_confidence = confidence + return tess_langs + + return None + + def _should_use_pymupdf(self, file_path: Path) -> bool: + """Check if PyMuPDF is a better choice for this file (auto mode).""" + ext = file_path.suffix.lower() + + # PyMuPDF only handles PDFs + if ext != ".pdf": + return False + + # Check if PDF has a text layer (= native, not scanned) + sample = extract_sample_text(file_path, max_chars=500) + if sample and len(sample.strip()) > 100: + # Has text → native PDF → PyMuPDF is faster + try: + from ..extractors.pymupdf_extractor import PyMuPDFExtractor + return True + except ImportError: + # pymupdf4llm not installed — fall back to Docling + logger.debug("Auto mode: pymupdf4llm not installed, using Docling") + return False + + # Scanned PDF or too little text → use Docling (has OCR) + return False + def process(self, request: JobRequest) -> PipelineResult: """Process a document.""" start_time = time.time() file_path = Path(request.file_path) config = request.config + + # Initialize language detection state + self._detected_lang = None + self._detected_lang_confidence = 0.0 logger.info(f"Processing: {file_path.name}") - + + # Auto-mode: decide backend per document + if self._backend_name == "auto" and self._should_use_pymupdf(file_path): + from ..extractors.pymupdf_extractor import PyMuPDFExtractor + extractor = PyMuPDFExtractor() + logger.info("Auto mode selected: PyMuPDF4LLM (native PDF detected)") + else: + extractor = self.extractor + + # Resolve languages for Docling backend + if isinstance(extractor, DoclingExtractor): + resolved_langs = self._resolve_languages(file_path, config) + extractor._languages = resolved_langs + # Extract document - document, meta = self.extractor.extract(file_path, config) - - # Get hierarchy - hierarchy = self.extractor.get_hierarchy(file_path, config) + document, meta = extractor.extract(file_path, config) + + # Inject language detection results into metadata + if self._detected_lang: + document.metadata.detected_language = self._detected_lang + document.metadata.language_confidence = self._detected_lang_confidence + + # Get hierarchy (only DoclingExtractor has this) + if isinstance(extractor, DoclingExtractor): + hierarchy = extractor.get_hierarchy(file_path, config) + else: + hierarchy = [] processing_time = time.time() - start_time logger.info(f"Completed in {processing_time:.2f}s") @@ -164,6 +320,8 @@ def export_results(self, result: PipelineResult, output_dir: Path) -> dict: "total_blocks": len(all_blocks), "total_tables": total_tables, "processing_time_seconds": result.processing_time_seconds, + "detected_language": result.document.metadata.detected_language, + "language_confidence": result.document.metadata.language_confidence, "stages_completed": [ "stage1_extraction", "stage2_validation", @@ -228,3 +386,4 @@ def export_chunks(self, result: PipelineResult, output_dir: Path) -> Path: def save_images(self, output_dir: Path) -> List[Path]: """Save 
extracted images.""" return self.extractor.save_images(output_dir) + diff --git a/src/longparser/schemas.py b/src/longparser/schemas.py index 60bd47f..6e54f1e 100755 --- a/src/longparser/schemas.py +++ b/src/longparser/schemas.py @@ -118,6 +118,8 @@ class PageProfile(BaseModel): table_confidence: Optional[float] = None has_rtl: bool = False has_math: bool = False + detected_columns: int = Field(default=1, description="Number of text columns detected on page") + reading_order_confidence: float = Field(default=1.0, ge=0.0, le=1.0, description="Confidence of reading-order reconstruction") class Page(BaseModel): @@ -135,6 +137,8 @@ class DocumentMetadata(BaseModel): source_file: str file_hash: str = "" language: Optional[str] = None + detected_language: Optional[str] = Field(default=None, description="Auto-detected language code (ISO 639-1) via fast-langdetect") + language_confidence: float = Field(default=0.0, ge=0.0, le=1.0, description="Confidence of auto-detected language") total_pages: int = 0 academic_mode: bool = False rtl_hint: bool = False @@ -163,6 +167,17 @@ def all_tables(self) -> list[Table]: class ProcessingConfig(BaseModel): """Configuration for pipeline execution.""" + # --- v0.1.4: Backend selection --- + backend: str = Field(default="docling", description="Extraction backend: 'docling' | 'pymupdf' | 'auto'") + + # --- v0.1.4: Language detection --- + languages: Optional[list[str]] = Field(default=None, description="Explicit Tesseract language codes, e.g. ['eng','ara']. Overrides auto-detect.") + auto_detect_language: bool = Field(default=True, description="Auto-detect document language before OCR (uses fast-langdetect)") + + # --- v0.1.4: Multi-column layout --- + column_count_hint: Optional[int] = Field(default=None, description="Manual column count hint. 
None = auto-detect by Docling") + force_left_to_right: bool = Field(default=False, description="Force left-to-right top-to-bottom reading order") + academic_mode: bool = False rtl_hint: bool = False do_ocr: bool = True @@ -202,6 +217,10 @@ class ExtractionMetadata(BaseModel): reprocessed_pages: list[int] = Field(default_factory=list) ocr_backend_used: Optional[str] = None reasons: list[str] = Field(default_factory=list) + # --- v0.1.4: OCR routing metadata --- + ocr_strategy: str = Field(default="standard", description="OCR strategy used: 'standard' | 'math' | 'full_ocr'") + is_scanned: bool = Field(default=False, description="Whether the document was detected as scanned (no text layer)") + page_complexity_scores: dict[int, int] = Field(default_factory=dict, description="Per-page complexity scores used for OCR routing") class ChunkingConfig(BaseModel): @@ -222,12 +241,13 @@ class Chunk(BaseModel): chunk_id: str = Field(default_factory=lambda: str(uuid.uuid4())) text: str token_count: int - chunk_type: str # "section" | "table" | "table_schema" | "list" | "equation" | "continuation" + chunk_type: str # "section" | "table" | "table_schema" | "list" | "equation" | "figure" | "continuation" section_path: list[str] = Field(default_factory=list) page_numbers: list[int] = Field(default_factory=list) block_ids: list[str] = Field(default_factory=list) overlap_with_previous: bool = False equation_detected: bool = False + image_path: Optional[str] = Field(default=None, description="Path to figure image if chunk_type == 'figure'") metadata: dict = Field(default_factory=dict) # row_start, row_end, sheet, col_band diff --git a/src/longparser/utils/__init__.py b/src/longparser/utils/__init__.py index c642b45..7c7ea22 100755 --- a/src/longparser/utils/__init__.py +++ b/src/longparser/utils/__init__.py @@ -1,5 +1,14 @@ """Utility modules for LongParser.""" from .rtl_detector import detect_rtl_language +from .lang_detect import detect_language, get_tesseract_langs +from .ocr_router import is_page_scanned, score_page_complexity, get_ocr_strategy -__all__ = ["detect_rtl_language"] +__all__ = [ + "detect_rtl_language", + "detect_language", + "get_tesseract_langs", + "is_page_scanned", + "score_page_complexity", + "get_ocr_strategy", +] diff --git a/src/longparser/utils/lang_detect.py b/src/longparser/utils/lang_detect.py new file mode 100644 index 0000000..b544d4b --- /dev/null +++ b/src/longparser/utils/lang_detect.py @@ -0,0 +1,193 @@ +"""Language detection for document text samples. + +Uses ``fast-langdetect`` (Apache-2.0, Facebook FastText model) to detect +the primary language of a text sample and map it to Tesseract language codes. 
+ +This module is designed for zero-failure operation: +- Falls back to English if ``fast-langdetect`` is not installed +- Falls back to English if detection confidence is too low +- Falls back to English on any unexpected error +- Never raises exceptions that would break the pipeline + +Usage:: + + from longparser.utils.lang_detect import detect_language, get_tesseract_langs + + lang, confidence = detect_language("هذا نص عربي") # ("ar", 0.99) + tess_codes = get_tesseract_langs("ar") # ["ara"] +""" + +from __future__ import annotations + +import logging +from typing import Optional + +logger = logging.getLogger(__name__) + +# --------------------------------------------------------------------------- +# Mapping: ISO 639-1 code (fast-langdetect) → Tesseract language code(s) +# --------------------------------------------------------------------------- +_LANG_TO_TESSERACT: dict[str, list[str]] = { + "af": ["afr"], "am": ["amh"], "ar": ["ara"], "az": ["aze"], + "be": ["bel"], "bg": ["bul"], "bn": ["ben"], "bs": ["bos"], + "ca": ["cat"], "cs": ["ces"], "cy": ["cym"], "da": ["dan"], + "de": ["deu"], "el": ["ell"], "en": ["eng"], "es": ["spa"], + "et": ["est"], "eu": ["eus"], "fa": ["fas"], "fi": ["fin"], + "fr": ["fra"], "ga": ["gle"], "gl": ["glg"], "gu": ["guj"], + "ha": ["hau"], "he": ["heb"], "hi": ["hin"], "hr": ["hrv"], + "hu": ["hun"], "hy": ["hye"], "id": ["ind"], "is": ["isl"], + "it": ["ita"], "ja": ["jpn"], "jv": ["jav"], "ka": ["kat"], + "kk": ["kaz"], "km": ["khm"], "kn": ["kan"], "ko": ["kor"], + "la": ["lat"], "lt": ["lit"], "lv": ["lav"], "mk": ["mkd"], + "ml": ["mal"], "mn": ["mon"], "mr": ["mar"], "ms": ["msa"], + "my": ["mya"], "ne": ["nep"], "nl": ["nld"], "no": ["nor"], + "pa": ["pan"], "pl": ["pol"], "pt": ["por"], "ro": ["ron"], + "ru": ["rus"], "si": ["sin"], "sk": ["slk"], "sl": ["slv"], + "sq": ["sqi"], "sr": ["srp"], "sv": ["swe"], "sw": ["swa"], + "ta": ["tam"], "te": ["tel"], "th": ["tha"], "tl": ["tgl"], + "tr": ["tur"], "uk": ["ukr"], "ur": ["urd"], "uz": ["uzb"], + "vi": ["vie"], "yo": ["yor"], + # Chinese variants + "zh": ["chi_sim", "chi_tra"], +} + + +def detect_language( + text: str, + min_confidence: float = 0.5, +) -> tuple[str, float]: + """Detect the primary language of a text sample. + + Parameters + ---------- + text: + Text sample to analyze. At least 20 characters recommended. + min_confidence: + Minimum confidence threshold. Below this, falls back to ``"en"``. + + Returns + ------- + tuple[str, float] + ``(language_code, confidence)`` — e.g. ``("ar", 0.99)``. + Falls back to ``("en", 0.0)`` on any failure. + """ + if not text or len(text.strip()) < 20: + logger.debug("Text too short for language detection, defaulting to English") + return "en", 0.0 + + try: + from fast_langdetect import detect + result = detect(text) + lang = result.get("lang", "en") + score = result.get("score", 0.0) + + if score < min_confidence: + logger.info( + "Language detection low confidence (%.2f for '%s'), " + "defaulting to English", score, lang + ) + return "en", score + + logger.info("Detected language: %s (confidence: %.2f)", lang, score) + return lang, score + + except ImportError: + logger.warning( + "fast-langdetect is not installed. Language detection disabled. 
" + "Install with: pip install fast-langdetect" + ) + return "en", 0.0 + except Exception as e: + logger.warning("Language detection failed: %s — defaulting to English", e) + return "en", 0.0 + + +def get_tesseract_langs(lang_code: str) -> list[str]: + """Map a detected language code to Tesseract language code(s). + + Parameters + ---------- + lang_code: + ISO 639-1 language code (e.g. ``"ar"``, ``"en"``). + + Returns + ------- + list[str] + Tesseract language codes (e.g. ``["ara"]``, ``["eng"]``). + """ + return _LANG_TO_TESSERACT.get(lang_code, ["eng"]) + + +def extract_sample_text(file_path, max_chars: int = 2000) -> str: + """Extract a sample of text from a document for language detection. + + Uses a lightweight approach: reads first few KB of the file and + extracts printable text. For PDFs, attempts to use PyMuPDF if + available, otherwise falls back to reading raw bytes. + + Parameters + ---------- + file_path: + Path to the document file. + max_chars: + Maximum characters to extract. + + Returns + ------- + str + Extracted text sample, or empty string if extraction fails. + """ + from pathlib import Path + file_path = Path(file_path) + + if not file_path.exists(): + return "" + + ext = file_path.suffix.lower() + + # For PDFs: try lightweight text extraction + if ext == ".pdf": + return _extract_pdf_sample(file_path, max_chars) + + # For text-like files: read directly + if ext in (".csv", ".txt", ".md"): + try: + with open(file_path, "r", encoding="utf-8", errors="ignore") as f: + return f.read(max_chars) + except Exception: + return "" + + # For other formats: return empty (language detection will use + # text extracted by Docling later) + return "" + + +def _extract_pdf_sample(file_path, max_chars: int) -> str: + """Extract text sample from a PDF using the lightest method available.""" + # Try pdfplumber (lightweight, often available) + try: + import pdfplumber + with pdfplumber.open(str(file_path)) as pdf: + text = "" + for page in pdf.pages[:3]: # First 3 pages + page_text = page.extract_text() or "" + text += page_text + "\n" + if len(text) >= max_chars: + break + return text[:max_chars] + except ImportError: + pass + except Exception: + pass + + # Fallback: read raw bytes and extract printable chars + try: + with open(file_path, "rb") as f: + raw = f.read(max_chars * 4) # Read more bytes since not all are text + # Extract ASCII/Unicode text from raw bytes + text = raw.decode("utf-8", errors="ignore") + # Filter to printable characters + printable = "".join(c for c in text if c.isprintable() or c in "\n\t ") + return printable[:max_chars] + except Exception: + return "" diff --git a/src/longparser/utils/ocr_router.py b/src/longparser/utils/ocr_router.py new file mode 100644 index 0000000..dd3586d --- /dev/null +++ b/src/longparser/utils/ocr_router.py @@ -0,0 +1,148 @@ +"""Smart OCR routing for scanned PDFs. + +Routes pages to the best OCR strategy based on content complexity: + +- **standard** — Tesseract with default settings (fast, CPU-native) +- **math** — Tesseract for text + pix2tex for equations +- **full_ocr** — Tesseract with ``force_full_page_ocr=True`` + +All strategies are CPU-friendly. No GPU-dependent engines (Surya, Marker) +are used in the routing — those are available as separate optional backends. 
+
+Usage::
+
+    from longparser.utils.ocr_router import (
+        is_page_scanned, score_page_complexity, get_ocr_strategy,
+    )
+
+    if is_page_scanned(page_text):
+        score = score_page_complexity(page_text, num_blocks=15, has_tables=True)
+        strategy = get_ocr_strategy(score)
+        # strategy = "full_ocr" for score >= 5
+"""
+
+from __future__ import annotations
+
+import logging
+import re
+
+logger = logging.getLogger(__name__)
+
+# Pattern to detect math symbols and simple equations in text.
+# Matches Unicode math symbols and simple algebraic patterns like "x = 5".
+_MATH_RE = re.compile(
+    r'[\u2211\u220F\u222B\u221A\u00B1\u2264\u2265\u2248\u2260\u03B1-\u03C9\u03A3]'
+    r'|[a-z]\s*=\s*[a-z0-9]',
+    re.IGNORECASE,
+)
+
+
+def is_page_scanned(page_text: str, min_chars: int = 30) -> bool:
+    """Check if a page is likely scanned (no usable text layer).
+
+    Parameters
+    ----------
+    page_text:
+        Extracted text from the page.
+    min_chars:
+        Minimum character count to consider the page as having a text layer.
+
+    Returns
+    -------
+    bool
+        ``True`` if the page has fewer than ``min_chars`` printable characters
+        (indicating it's likely a scanned image with no embedded text).
+    """
+    clean = page_text.strip()
+    return len(clean) < min_chars
+
+
+def has_math_content(text: str) -> bool:
+    """Check if text contains mathematical symbols or equation patterns.
+
+    Parameters
+    ----------
+    text:
+        Text to check for math content.
+
+    Returns
+    -------
+    bool
+        ``True`` if math symbols or equation patterns are found.
+    """
+    return bool(_MATH_RE.search(text))
+
+
+def score_page_complexity(
+    page_text: str,
+    num_blocks: int = 0,
+    has_tables: bool = False,
+) -> int:
+    """Score page complexity on a scale of 0-10.
+
+    Used to decide which OCR strategy to apply:
+
+    - **0-2** → ``"standard"`` — Simple page, Tesseract is enough
+    - **3-4** → ``"math"`` — Has equations, add pix2tex
+    - **5+** → ``"full_ocr"`` — Complex layout, use full-page OCR
+
+    Parameters
+    ----------
+    page_text:
+        Extracted text from the page.
+    num_blocks:
+        Number of content blocks on the page.
+    has_tables:
+        Whether the page contains tables.
+
+    Returns
+    -------
+    int
+        Complexity score from 0 to 10.
+    """
+    score = 0
+
+    # Tables add significant complexity
+    if has_tables:
+        score += 3
+
+    # Math content needs pix2tex
+    if has_math_content(page_text):
+        score += 2
+
+    # Many blocks suggest a dense/complex layout
+    if num_blocks > 20:
+        score += 2
+    elif num_blocks > 10:
+        score += 1
+
+    # Very short text on a page with blocks = likely OCR issues
+    if page_text and len(page_text.strip()) < 100 and num_blocks > 5:
+        score += 1
+
+    return min(score, 10)
+
+
+def get_ocr_strategy(complexity_score: int) -> str:
+    """Pick OCR strategy based on page complexity score.
+
+    Parameters
+    ----------
+    complexity_score:
+        Score from :func:`score_page_complexity` (0-10).
+
+    Returns
+    -------
+    str
+        One of:
+
+        - ``"standard"`` — Tesseract with default settings
+        - ``"math"`` — Tesseract + pix2tex for equations
+        - ``"full_ocr"`` — Tesseract with ``force_full_page_ocr=True``
+    """
+    if complexity_score <= 2:
+        return "standard"
+    elif complexity_score <= 4:
+        return "math"
+    else:
+        return "full_ocr"
diff --git a/tests/benchmarks/benchmark_pipeline.py b/tests/benchmarks/benchmark_pipeline.py
new file mode 100644
index 0000000..716ee44
--- /dev/null
+++ b/tests/benchmarks/benchmark_pipeline.py
@@ -0,0 +1,98 @@
+"""Pipeline performance benchmark for regression testing.
+
+Run this BEFORE and AFTER the v0.1.4 changes to prove no speed regression.
+
+Usage:
+    # Save baseline (v0.1.3)
+    python tests/benchmarks/benchmark_pipeline.py > benchmark_v013.txt
+
+    # After the v0.1.4 changes
+    python tests/benchmarks/benchmark_pipeline.py > benchmark_v014.txt
+
+    # Compare
+    diff benchmark_v013.txt benchmark_v014.txt
+"""
+
+import time
+import sys
+from pathlib import Path
+
+
+def benchmark_file(file_path: str) -> dict:
+    """Benchmark a single file through the pipeline."""
+    from longparser import DocumentPipeline, ProcessingConfig
+
+    path = Path(file_path)
+    if not path.exists():
+        return {"file": file_path, "status": "SKIPPED (file not found)"}
+
+    pipeline = DocumentPipeline()
+    config = ProcessingConfig()
+
+    t0 = time.time()
+    try:
+        result = pipeline.process_file(path, config=config)
+        elapsed = time.time() - t0
+
+        return {
+            "file": path.name,
+            "time_seconds": round(elapsed, 2),
+            "total_blocks": result.total_blocks,
+            "total_pages": result.document.metadata.total_pages,
+            "status": "OK",
+        }
+    except Exception as e:
+        elapsed = time.time() - t0
+        return {
+            "file": path.name,
+            "time_seconds": round(elapsed, 2),
+            "status": f"ERROR: {e}",
+        }
+
+
+def main():
+    """Run benchmark on all available test fixtures."""
+    # Look for test PDFs in common locations
+    fixture_dirs = [
+        Path("tests/fixtures"),
+        Path("tests"),
+        Path("uploads"),
+    ]
+
+    test_files = []
+    for d in fixture_dirs:
+        if d.exists():
+            test_files.extend(sorted(d.glob("*.pdf")))
+
+    if not test_files:
+        print("No PDF test files found in tests/fixtures/ or uploads/")
+        print("Place some PDFs there and re-run.")
+        sys.exit(1)
+
+    print("=" * 60)
+    print("LongParser Pipeline Benchmark")
+    print("=" * 60)
+    print(f"Files found: {len(test_files)}")
+    print()
+
+    results = []
+    for f in test_files[:5]:  # Cap at 5 files for reasonable benchmark time
+        print(f"Benchmarking: {f.name} ...", end=" ", flush=True)
+        result = benchmark_file(str(f))
+        results.append(result)
+        print(f"{result.get('time_seconds', '?')}s — {result['status']}")
+
+    print()
+    print("-" * 60)
+    print(f"{'File':<30} {'Time':>8} {'Blocks':>8} {'Pages':>6}")
+    print("-" * 60)
+    for r in results:
+        if r["status"] == "OK":
+            print(f"{r['file']:<30} {r['time_seconds']:>7.2f}s {r['total_blocks']:>8} {r['total_pages']:>6}")
+        else:
+            print(f"{r['file']:<30} {r['status']}")
+    print("-" * 60)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/unit/test_backward_compat.py b/tests/unit/test_backward_compat.py
new file mode 100644
index 0000000..fae7d49
--- /dev/null
+++ b/tests/unit/test_backward_compat.py
@@ -0,0 +1,142 @@
+"""Backward compatibility tests for the v0.1.4 changes.
+
+Ensures that users who wrote code against v0.1.3 can upgrade to v0.1.4
+without changing a single line of their code. Every new field must have
+a default that matches the v0.1.3 behavior.
+""" + +import pytest + + +class TestProcessingConfigCompat: + """ProcessingConfig() with no args must behave exactly like v0.1.3.""" + + def test_default_values_match_v013(self): + from longparser.schemas import ProcessingConfig + config = ProcessingConfig() + + # v0.1.3 defaults — these must NEVER change + assert config.academic_mode is False + assert config.rtl_hint is False + assert config.do_ocr is True + assert config.formula_ocr is True + assert config.do_table_structure is True + assert config.export_images is True + assert config.formula_mode == "smart" + assert config.smart_max_equations == 25 + assert config.smart_max_ocr_seconds == 300.0 + assert config.exclude_page_headers_footers is True + + def test_new_fields_have_safe_defaults(self): + """New v0.2.x fields must default to values that don't change behavior.""" + from longparser.schemas import ProcessingConfig + config = ProcessingConfig() + + # backend must default to docling (existing behavior) + backend = getattr(config, "backend", "docling") + assert backend == "docling" + + # auto_detect_language defaults to True but only runs if languages=None + auto_detect = getattr(config, "auto_detect_language", True) + assert auto_detect is True + + # languages=None means "use existing tesseract_lang param" + languages = getattr(config, "languages", None) + assert languages is None + + +class TestDocumentMetadataCompat: + """DocumentMetadata must keep all v0.1.3 fields.""" + + def test_v013_fields_exist(self): + from longparser.schemas import DocumentMetadata + meta = DocumentMetadata(source_file="test.pdf") + + assert meta.source_file == "test.pdf" + assert meta.file_hash == "" + assert meta.language is None + assert meta.total_pages == 0 + assert meta.academic_mode is False + assert meta.rtl_hint is False + + +class TestBlockCompat: + """Block schema must keep all v0.1.3 fields and types.""" + + def test_block_type_values_unchanged(self): + from longparser.schemas import BlockType + + # All v0.1.3 values must still exist + assert BlockType.HEADING == "heading" + assert BlockType.PARAGRAPH == "paragraph" + assert BlockType.LIST_ITEM == "list_item" + assert BlockType.TABLE == "table" + assert BlockType.FIGURE == "figure" + assert BlockType.CAPTION == "caption" + assert BlockType.FOOTER == "footer" + assert BlockType.HEADER == "header" + assert BlockType.EQUATION == "equation" + assert BlockType.CODE == "code" + + def test_extractor_type_values_unchanged(self): + from longparser.schemas import ExtractorType + + # All v0.1.3 values must still exist + assert ExtractorType.DOCLING == "docling" + assert ExtractorType.SURYA == "surya" + assert ExtractorType.MARKER == "marker" + assert ExtractorType.NATIVE_PDF == "native_pdf" + assert ExtractorType.PADDLE == "paddle" + + +class TestChunkCompat: + """Chunk schema must keep all v0.1.3 fields.""" + + def test_chunk_fields_exist(self): + from longparser.schemas import Chunk + chunk = Chunk(text="test", token_count=1, chunk_type="section") + + assert chunk.text == "test" + assert chunk.token_count == 1 + assert chunk.chunk_type == "section" + assert chunk.section_path == [] + assert chunk.page_numbers == [] + assert chunk.block_ids == [] + assert chunk.overlap_with_previous is False + assert chunk.equation_detected is False + + +class TestPublicAPICompat: + """All v0.1.3 public names must still be importable.""" + + def test_all_v013_exports_available(self): + from longparser import ( # noqa: F401 + __version__, + Document, + Page, + Block, + Table, + TableCell, + BlockType, + ExtractorType, + 
+            BoundingBox,
+            Provenance,
+            Confidence,
+            BlockFlags,
+            DocumentMetadata,
+            PageProfile,
+            ExtractionMetadata,
+            ChunkingConfig,
+            Chunk,
+            JobRequest,
+            JobResult,
+        )
+
+    def test_lazy_imports_still_work(self):
+        """Lazy imports from v0.1.3 must still resolve."""
+        from longparser import DocumentPipeline  # noqa: F401
+        from longparser import PipelineOrchestrator  # noqa: F401
+        from longparser import PipelineResult  # noqa: F401
+        from longparser import HybridChunker  # noqa: F401
+        from longparser import DoclingExtractor  # noqa: F401
diff --git a/tests/unit/test_license_safety.py b/tests/unit/test_license_safety.py
new file mode 100644
index 0000000..8afac8b
--- /dev/null
+++ b/tests/unit/test_license_safety.py
@@ -0,0 +1,82 @@
+"""License safety tests — ensure GPL/AGPL packages are never loaded by default.
+
+These tests verify that importing ``longparser`` and using its default
+pipeline does NOT load any GPL/AGPL-licensed package (pymupdf4llm, marker,
+surya). This is critical to maintain LongParser's MIT license.
+"""
+
+import sys
+
+
+# Packages that must NEVER appear in sys.modules after a default import
+_BLOCKED_MODULES = [
+    "pymupdf4llm",
+    "pymupdf",
+    "fitz",  # PyMuPDF's internal module name
+    "marker",
+    "marker.converters",
+    "surya",
+    "surya.ocr",
+]
+
+
+def _clear_blocked_modules():
+    """Remove any pre-loaded blocked modules from sys.modules."""
+    for mod_name in list(sys.modules):
+        for blocked in _BLOCKED_MODULES:
+            if mod_name == blocked or mod_name.startswith(blocked + "."):
+                del sys.modules[mod_name]
+                break  # entries overlap ("marker", "marker.converters"); a second del would raise KeyError
+
+
+class TestLicenseSafety:
+    """Verify that core imports do not load GPL/AGPL dependencies."""
+
+    def test_import_longparser_does_not_load_agpl(self):
+        """``import longparser`` must not load any GPL/AGPL module."""
+        _clear_blocked_modules()
+
+        import longparser  # noqa: F401
+
+        for mod_name in _BLOCKED_MODULES:
+            assert mod_name not in sys.modules, (
+                f"GPL/AGPL module '{mod_name}' was loaded by 'import longparser'. "
+                "This violates the MIT license isolation. "
+                "Check __init__.py and extractors/__init__.py for stray imports."
+            )
+
+    def test_import_schemas_does_not_load_agpl(self):
+        """``from longparser.schemas import ...`` must not load GPL/AGPL."""
+        _clear_blocked_modules()
+
+        from longparser.schemas import (  # noqa: F401
+            ProcessingConfig, Document, Block, Chunk
+        )
+
+        for mod_name in _BLOCKED_MODULES:
+            assert mod_name not in sys.modules, (
+                f"GPL/AGPL module '{mod_name}' was loaded by schema import."
+            )
+
+    def test_processing_config_default_backend_is_docling(self):
+        """Default backend must be 'docling' (MIT), not a GPL/AGPL backend."""
+        from longparser.schemas import ProcessingConfig
+        config = ProcessingConfig()
+
+        # If backend field exists, it must default to docling
+        backend = getattr(config, "backend", "docling")
+        assert backend == "docling", (
+            f"Default backend is '{backend}', expected 'docling'. "
+            "Defaulting to a GPL/AGPL backend would violate MIT license."
+        )
+
+    def test_pymupdf_extractor_not_in_extractors_init(self):
+        """PyMuPDFExtractor must NOT be exported from extractors/__init__.py."""
+        from longparser import extractors
+
+        public_names = getattr(extractors, "__all__", dir(extractors))
+
+        assert "PyMuPDFExtractor" not in public_names, (
+            "PyMuPDFExtractor must NOT be in extractors/__init__.py. "
+            "It must only be imported lazily when backend='pymupdf' is set."
+        )

From 26897dbfc0b12c455cdb926a3ca0dde4cc4ce85f Mon Sep 17 00:00:00 2001
From: Mohsin Ali
Date: Thu, 23 Apr 2026 10:03:06 +0500
Subject: [PATCH 6/7] ci: fix license safety check regex

---
 .github/workflows/license-check.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/license-check.yml b/.github/workflows/license-check.yml
index 39b5031..784fda2 100644
--- a/.github/workflows/license-check.yml
+++ b/.github/workflows/license-check.yml
@@ -18,7 +18,7 @@ jobs:
           FAIL=0
 
           # List of GPL/AGPL package import patterns to block
-          BLOCKED_PATTERNS="pymupdf4llm|pymupdf|import marker\.|from marker\.|import surya|from surya"
+          BLOCKED_PATTERNS="import[[:space:]]+pymupdf|from[[:space:]]+pymupdf|import[[:space:]]+marker\.|from[[:space:]]+marker\.|import[[:space:]]+surya|from[[:space:]]+surya"
 
           # Files that ARE allowed to import these (isolated backends)
           ALLOWED_FILES=(

From c5f3bd3dbd6d1bedb72b28da728cb30a697b0971 Mon Sep 17 00:00:00 2001
From: Mohsin Ali
Date: Thu, 23 Apr 2026 10:06:59 +0500
Subject: [PATCH 7/7] ci: fix grep exclude syntax for license check

---
 .github/workflows/license-check.yml | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/license-check.yml b/.github/workflows/license-check.yml
index 784fda2..63f9217 100644
--- a/.github/workflows/license-check.yml
+++ b/.github/workflows/license-check.yml
@@ -22,8 +22,8 @@ jobs:
 
           # Files that ARE allowed to import these (isolated backends)
           ALLOWED_FILES=(
-            "src/longparser/extractors/pymupdf_extractor.py"
-            "src/longparser/extractors/marker_extractor.py"
+            "pymupdf_extractor.py"
+            "marker_extractor.py"
           )
 
           # Build grep exclude args