Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,33 @@
# ============================================================

# ── Database ─────────────────────────────────────────────────
# Local dev (no auth):
LONGPARSER_MONGO_URL=mongodb://localhost:27017
# Docker Compose (auth handled by docker-compose.yml override):
# No need to change — docker-compose sets the authenticated URL automatically.
# Production (with auth):
# LONGPARSER_MONGO_URL=mongodb://USER:PASSWORD@host:27017/longparser?authSource=admin
LONGPARSER_DB_NAME=longparser

# ── Job Queue (Redis / ARQ) ───────────────────────────────────
# Local dev (no auth):
LONGPARSER_REDIS_URL=redis://localhost:6379
# Production (with auth):
# LONGPARSER_REDIS_URL=redis://:PASSWORD@host:6379

# ── Docker Auth Credentials (used by docker-compose.yml) ──────
# Change these before deploying. Defaults are for local dev only.
MONGO_USER=longparser
MONGO_PASS=longparser
REDIS_PASS=longparser

# ── File Storage ──────────────────────────────────────────────
LONGPARSER_UPLOAD_DIR=./uploads

# ── LLM Provider ─────────────────────────────────────────────
# One of: openai | gemini | groq | openrouter
LONGPARSER_LLM_PROVIDER=openai
LONGPARSER_LLM_MODEL=gpt-4o
LONGPARSER_LLM_MODEL=gpt-5.3

# ── API Keys ──────────────────────────────────────────────────
OPENAI_API_KEY=sk-...
Expand All @@ -41,3 +55,10 @@ QDRANT_API_KEY= # Required only for Qdrant Cloud
LONGPARSER_OCR_BACKEND=easyocr
LONGPARSER_OCR_USE_GPU=false

# ── Security (added by audit) ────────────────────────────────
# CORS allowed origins (comma-separated). Default: * (all origins)
# LONGPARSER_CORS_ORIGINS=https://app.example.com,https://admin.example.com
# Rate limit: max requests per minute per tenant. Default: 60
# LONGPARSER_RATE_LIMIT=60
# Admin API keys (comma-separated). If left empty, admin checks are disabled
# and every authenticated user is treated as an admin — set this in production.
# LONGPARSER_ADMIN_KEYS=key1,key2
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
key: pip-${{ matrix.python-version }}-${{ hashFiles('pyproject.toml') }}

- name: Install package and test deps
run: pip install -e "." pytest pytest-cov
run: pip install -e ".[dev,server]"

- name: Run tests
run: pytest tests/ -v --tb=short --cov=longparser --cov-report=term-missing
Expand Down
9 changes: 8 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -60,4 +60,11 @@ MANIFEST.in
.env

# IDE / Gemini agent
.gemini/
.gemini/

# Logs
*.log

# Temporary test files
test_hack.csv
tests_temp/
6 changes: 3 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ COPY pyproject.toml uv.lock ./
# 2) install only dependencies (not project) — cache-friendly
# Use --frozen to respect lockfile, skip CUDA/NVIDIA packages (installed as CPU-only later)
ENV UV_HTTP_TIMEOUT=300
RUN uv sync --no-cache --frozen --no-install-project --extra api --extra embeddings --extra chroma --extra latex-ocr \
RUN uv sync --no-cache --frozen --no-install-project --extra server --extra embeddings --extra chroma --extra latex-ocr \
--no-install-package torch \
--no-install-package torchvision \
--no-install-package nvidia-cublas-cu12 \
Expand All @@ -54,7 +54,7 @@ RUN uv sync --no-cache --frozen --no-install-project --extra api --extra embeddi
COPY . .

# 4) install the project itself (skip torch/CUDA, installed as CPU-only next)
RUN uv sync --no-cache --frozen --extra api --extra embeddings --extra chroma --extra latex-ocr \
RUN uv sync --no-cache --frozen --extra server --extra embeddings --extra chroma --extra latex-ocr \
--no-install-package torch \
--no-install-package torchvision \
--no-install-package nvidia-cublas-cu12 \
Expand Down Expand Up @@ -88,4 +88,4 @@ USER appuser

EXPOSE 8000

CMD [".venv/bin/uvicorn", "clean_rag.api.app:app", "--host", "0.0.0.0", "--port", "8000"]
CMD [".venv/bin/uvicorn", "longparser.server.app:app", "--host", "0.0.0.0", "--port", "8000"]
12 changes: 8 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,12 @@
| **Multi-format extraction** | PDF, DOCX, PPTX, XLSX, CSV via Docling |
| **Hybrid chunking** | Token-aware, heading-hierarchy-aware, table-aware |
| **HITL review** | Human-in-the-Loop block & chunk editing before embedding |
| **LangGraph HITL** | `approve / edit / reject` workflow with LangGraph `interrupt()` |
| **LangGraph HITL** | `approve / edit / reject` workflow with LangGraph `interrupt()` and MongoDB checkpointer |
| **3-layer memory** | Short-term turns + rolling summary + long-term facts |
| **Multi-provider LLM** | OpenAI, Gemini, Groq, OpenRouter |
| **Multi-backend vectors** | Chroma, FAISS, Qdrant |
| **Async-first API** | FastAPI + Motor (MongoDB) + ARQ (Redis) |
| **Production-ready API** | FastAPI + Motor (MongoDB) + ARQ + Redis (Queue & Rate Limiting) |
| **Enterprise Security** | Tenant isolation, Role-Based Access Control (RBAC), and CORS |
| **LangChain adapters** | Drop-in `BaseRetriever` and LlamaIndex `QueryEngine` |
| **Privacy-first** | All processing runs locally; no data leaves your infra |

Expand Down Expand Up @@ -233,11 +234,14 @@ Copy `.env.example` to `.env` and set:
| Variable | Default | Description |
|----------|---------|-------------|
| `LONGPARSER_MONGO_URL` | `mongodb://localhost:27017` | MongoDB connection |
| `LONGPARSER_REDIS_URL` | `redis://localhost:6379` | Redis for job queue |
| `LONGPARSER_REDIS_URL` | `redis://localhost:6379` | Redis for job queue & rate limits |
| `LONGPARSER_LLM_PROVIDER` | `openai` | LLM provider |
| `LONGPARSER_LLM_MODEL` | `gpt-4o` | Model name |
| `LONGPARSER_LLM_MODEL` | `gpt-5.3` | Model name |
| `LONGPARSER_EMBED_PROVIDER` | `huggingface` | Embedding provider |
| `LONGPARSER_VECTOR_DB` | `chroma` | Vector store backend |
| `LONGPARSER_CORS_ORIGINS` | `*` | Allowed CORS origins |
| `LONGPARSER_RATE_LIMIT` | `60` | Max RPM per tenant |
| `LONGPARSER_ADMIN_KEYS` | (empty) | Comma-separated admin API keys |

---

Expand Down
22 changes: 15 additions & 7 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
services:
api:
build: .
container_name: cleanrag-api
container_name: longparser-api
command: [ ".venv/bin/uvicorn", "longparser.server.app:app", "--host", "0.0.0.0", "--port", "8000" ]
env_file: .env
environment:
- LONGPARSER_MFD_MODEL_DIR=/app/models/mfd
# ── For Docker networking, override the localhost URLs from .env ──
- LONGPARSER_MONGO_URL=mongodb://${MONGO_USER:-longparser}:${MONGO_PASS:-longparser}@mongo:27017/longparser?authSource=admin
- LONGPARSER_REDIS_URL=redis://:${REDIS_PASS:-longparser}@redis:6379
ports:
- "8000:8000"
volumes:
Expand All @@ -27,11 +30,13 @@ services:

worker:
build: .
container_name: cleanrag-worker
container_name: longparser-worker
command: [ ".venv/bin/arq", "longparser.server.worker.WorkerSettings" ]
env_file: .env
environment:
- LONGPARSER_MFD_MODEL_DIR=/app/models/mfd
- LONGPARSER_MONGO_URL=mongodb://${MONGO_USER:-longparser}:${MONGO_PASS:-longparser}@mongo:27017/longparser?authSource=admin
- LONGPARSER_REDIS_URL=redis://:${REDIS_PASS:-longparser}@redis:6379
volumes:
- uploads:/app/uploads
- ./models:/app/models
Expand All @@ -51,25 +56,28 @@ services:

redis:
image: redis:7
container_name: cleanrag-redis
command: [ "redis-server", "--appendonly", "yes" ]
container_name: longparser-redis
command: [ "redis-server", "--appendonly", "yes", "--requirepass", "${REDIS_PASS:-longparser}" ]
volumes:
- redis-data:/data
restart: unless-stopped
healthcheck:
test: [ "CMD", "redis-cli", "ping" ]
test: [ "CMD", "redis-cli", "-a", "${REDIS_PASS:-longparser}", "ping" ]
interval: 30s
timeout: 5s
retries: 3

mongo:
image: mongo:7
container_name: cleanrag-mongo
container_name: longparser-mongo
environment:
MONGO_INITDB_ROOT_USERNAME: ${MONGO_USER:-longparser}
MONGO_INITDB_ROOT_PASSWORD: ${MONGO_PASS:-longparser}
volumes:
- mongo-data:/data/db
restart: unless-stopped
healthcheck:
test: [ "CMD", "mongosh", "--quiet", "--eval", "db.adminCommand('ping').ok" ]
test: [ "CMD", "mongosh", "-u", "${MONGO_USER:-longparser}", "-p", "${MONGO_PASS:-longparser}", "--authenticationDatabase", "admin", "--quiet", "--eval", "db.adminCommand('ping').ok" ]
interval: 30s
timeout: 5s
retries: 3
Expand Down
2 changes: 1 addition & 1 deletion docs/api/endpoints.md
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ X-API-Key: your-key
"require_approval": false,
"config": {
"llm_provider": "openai",
"llm_model": "gpt-4o",
"llm_model": "gpt-5.3",
"top_k": 5
}
}
Expand Down
2 changes: 1 addition & 1 deletion docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ for production RAG pipelines.
via LangGraph `interrupt()` before embedding
- **3-layer memory chat** — short-term turns + rolling summary + long-term facts,
powered by LCEL chains
- **Multi-provider LLM support** — OpenAI (`gpt-4o`), Gemini (`gemini-2.0-flash`),
- **Multi-provider LLM support** — OpenAI (`gpt-5.3`), Gemini (`gemini-2.5`),
Groq (`llama-3.3-70b-versatile`), OpenRouter
- **Multi-backend vector stores** — Chroma, FAISS, Qdrant
- **Async-first REST API** — FastAPI + Motor (MongoDB) + ARQ (Redis job queue)
Expand Down
10 changes: 9 additions & 1 deletion docs/deployment/environment.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Copy `.env.example` to `.env` and configure for your deployment.
| Variable | Description |
|---|---|
| `LONGPARSER_API_KEY` | API key for server authentication |
| `LONGPARSER_MONGO_URI` | MongoDB connection string |
| `LONGPARSER_MONGO_URL` | MongoDB connection string |

## LLM

Expand Down Expand Up @@ -50,3 +50,11 @@ Copy `.env.example` to `.env` and configure for your deployment.
|---|---|---|
| `LONGPARSER_REDIS_URL` | `redis://localhost:6379/0` | Redis URL for task queue |
| `LONGPARSER_WORKER_CONCURRENCY` | `2` | Worker concurrency level |

## Security

| Variable | Default | Description |
|---|---|---|
| `LONGPARSER_CORS_ORIGINS` | `*` | Allowed CORS origins (comma-separated) |
| `LONGPARSER_RATE_LIMIT` | `60` | Max requests per minute per tenant ID |
| `LONGPARSER_ADMIN_KEYS` | — | Comma-separated admin API keys |
2 changes: 1 addition & 1 deletion docs/getting-started/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ cp .env.example .env
| Variable | Description |
|---|---|
| `LONGPARSER_API_KEY` | API key for the REST server |
| `LONGPARSER_MONGO_URI` | MongoDB connection string |
| `LONGPARSER_MONGO_URL` | MongoDB connection string |
| `OPENAI_API_KEY` | For OpenAI LLM provider |

## Processing Options
Expand Down
2 changes: 1 addition & 1 deletion docs/guide/chat.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ POST /chat
"question": "What are the key findings?",
"config": {
"llm_provider": "openai",
"llm_model": "gpt-4o",
"llm_model": "gpt-5.3",
"top_k": 5
}
}
Expand Down
2 changes: 1 addition & 1 deletion docs/integrations/langchain.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

qa = RetrievalQA.from_chain_type(
llm=ChatOpenAI(model="gpt-4o"),
llm=ChatOpenAI(model="gpt-5.3"),
retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
)

Expand Down
2 changes: 2 additions & 0 deletions docs/security.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ Key risks:
| **MongoDB injection** | Motor driver + typed Pydantic inputs prevent injection |
| **SSRF via webhook** | No outbound HTTP made based on user input |
| **Hallucinated citations** | Citation IDs validated against retrieved set before returning to client |
| **DDoS / Spam via API** | Route-level Rate Limiting strictly isolated per tenant via Redis |
| **Cross-Origin attacks** | Configurable CORS restrictions and strict Tenant Isolation |

## Dependency Security

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ dependencies = [
"pydantic>=2.0,<3",
"docling>=2.14",
"docling-core>=2.13",
"langgraph-checkpoint-mongodb>=0.3.1",
]

[project.optional-dependencies]
Expand Down
4 changes: 2 additions & 2 deletions src/longparser/chunkers/hybrid_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,10 +345,10 @@ def _generate_schema_chunk(
sample_rows.append(f" Row {r_idx}: " + "; ".join(parts))

lines = [
f"[TABLE SCHEMA]",
"[TABLE SCHEMA]",
f"Table ID: {block.block_id}",
f"Rows: {n_data} (data rows), Columns: {n_cols}",
f"Columns:",
"Columns:",
]
lines.extend(col_profiles)
lines.append(f"Sample Rows ({sample_count}):")
Expand Down
31 changes: 16 additions & 15 deletions src/longparser/extractors/docling_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ def _run_docling(self, file_path: Path, config: ProcessingConfig):
# Order-based substitution with alignment gate
injected = 0
_non_omml = 0
for block, latex in zip(formula_blocks, latex_eqs):
for block, latex in zip(formula_blocks, latex_eqs, strict=False):
orig_len = len(block.text.strip()) if block.text else 0
latex_len = len(latex.strip())

Expand Down Expand Up @@ -431,7 +431,8 @@ def _run_docling(self, file_path: Path, config: ProcessingConfig):
page_img = None
try:
page_img = page_obj.image.pil_image
except Exception:
except Exception as e:
logger.warning("Failed to extract image for formula scanning: %s", e)
continue
if page_img is None:
continue
Expand Down Expand Up @@ -527,8 +528,8 @@ def _run_docling(self, file_path: Path, config: ProcessingConfig):
# Update label to formula so downstream sees it correctly
try:
item.label = type(item.label)("formula")
except Exception:
pass
except Exception as e:
logger.debug(f"Failed to update formula label: {e}")
replaced = True
logger.debug(f"MFD: replaced garbled block on page {page_no}")
break
Expand Down Expand Up @@ -1023,15 +1024,15 @@ def _get_item_text(self, item, docling_doc=None) -> str:
if isinstance(item, TableItem) and hasattr(item, 'export_to_markdown'):
try:
return item.export_to_markdown(doc=docling_doc)
except Exception:
pass
except Exception as e:
logger.debug(f"Failed to export table item to markdown: {e}")
if hasattr(item, 'text') and item.text:
return item.text
if hasattr(item, 'export_to_markdown'):
try:
return item.export_to_markdown()
except Exception:
pass
except Exception as e:
logger.debug(f"Failed to export item to markdown: {e}")
return ""

def _get_item_confidence(self, item) -> float:
Expand Down Expand Up @@ -1080,10 +1081,10 @@ def _build_pptx_text_map(self, file_path: Path) -> Dict[int, Dict[str, PptxParaI
if s.placeholder_format.type == PP_PH.SUBTITLE:
has_subtitle_placeholder = True
break
except Exception:
pass
except ImportError:
pass
except Exception as e:
logger.debug(f"Failed to check PPTX subtitle placeholder format: {e}")
except ImportError as e:
logger.debug(f"Failed to import python-pptx: {e}")

for shape in slide.shapes:
found_title = self._extract_pptx_shape_info(
Expand Down Expand Up @@ -1160,8 +1161,8 @@ def _extract_pptx_shape_info(self, shape, slide_map: Dict[str, PptxParaInfo],
is_subtitle_shape = True
elif ph_type in (PP_PLACEHOLDER.DATE, PP_PLACEHOLDER.FOOTER, PP_PLACEHOLDER.SLIDE_NUMBER):
is_footer_shape = True
except Exception:
pass
except Exception as e:
logger.debug(f"Failed to check PPTX placeholder format type: {e}")

# Skip footer/date/slide-number shapes entirely
if is_footer_shape:
Expand Down Expand Up @@ -1267,7 +1268,7 @@ def extract(

# Calculate file hash
with open(file_path, "rb") as f:
file_hash = hashlib.md5(f.read()).hexdigest()
file_hash = hashlib.sha256(f.read()).hexdigest()

# Get conversion result (cached or new)
result = self._run_docling(file_path, config)
Expand Down
Loading
Loading