Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 22 additions & 1 deletion .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -5,19 +5,33 @@
# ============================================================

# ── Database ─────────────────────────────────────────────────
# Local dev (no auth):
LONGPARSER_MONGO_URL=mongodb://localhost:27017
# Docker Compose (auth handled by docker-compose.yml override):
# No need to change — docker-compose sets the authenticated URL automatically.
# Production (with auth):
# LONGPARSER_MONGO_URL=mongodb://USER:PASSWORD@host:27017/longparser?authSource=admin
LONGPARSER_DB_NAME=longparser

# ── Job Queue (Redis / ARQ) ───────────────────────────────────
# Local dev (no auth):
LONGPARSER_REDIS_URL=redis://localhost:6379
# Production (with auth):
# LONGPARSER_REDIS_URL=redis://:PASSWORD@host:6379

# ── Docker Auth Credentials (used by docker-compose.yml) ──────
# Change these before deploying. Defaults are for local dev only.
MONGO_USER=longparser
MONGO_PASS=longparser
REDIS_PASS=longparser

# ── File Storage ──────────────────────────────────────────────
LONGPARSER_UPLOAD_DIR=./uploads

# ── LLM Provider ─────────────────────────────────────────────
# One of: openai | gemini | groq | openrouter
LONGPARSER_LLM_PROVIDER=openai
LONGPARSER_LLM_MODEL=gpt-4o
LONGPARSER_LLM_MODEL=gpt-5.3

# ── API Keys ──────────────────────────────────────────────────
OPENAI_API_KEY=sk-...
Expand All @@ -41,3 +55,10 @@ QDRANT_API_KEY= # Required only for Qdrant Cloud
LONGPARSER_OCR_BACKEND=easyocr
LONGPARSER_OCR_USE_GPU=false

# ── Security (added by audit) ────────────────────────────────
# CORS allowed origins (comma-separated). Default: * (all origins)
# LONGPARSER_CORS_ORIGINS=https://app.example.com,https://admin.example.com
# Rate limit: max requests per minute per tenant. Default: 60
# LONGPARSER_RATE_LIMIT=60
# Admin API keys (comma-separated). If left empty, admin checks are disabled
# and every authenticated user is treated as an admin — set this in production.
# LONGPARSER_ADMIN_KEYS=key1,key2
2 changes: 1 addition & 1 deletion .github/workflows/ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,7 @@ jobs:
key: pip-${{ matrix.python-version }}-${{ hashFiles('pyproject.toml') }}

- name: Install package and test deps
run: pip install -e "." pytest pytest-cov
run: pip install -e ".[dev,server]"

- name: Run tests
run: pytest tests/ -v --tb=short --cov=longparser --cov-report=term-missing
Expand Down
9 changes: 8 additions & 1 deletion .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -60,4 +60,11 @@ MANIFEST.in
.env

# IDE / Gemini agent
.gemini/
.gemini/

# Logs
*.log

# Temporary test files
test_hack.csv
tests_temp/
6 changes: 3 additions & 3 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -28,7 +28,7 @@ COPY pyproject.toml uv.lock ./
# 2) install only dependencies (not project) — cache-friendly
# Use --frozen to respect lockfile, skip CUDA/NVIDIA packages (installed as CPU-only later)
ENV UV_HTTP_TIMEOUT=300
RUN uv sync --no-cache --frozen --no-install-project --extra api --extra embeddings --extra chroma --extra latex-ocr \
RUN uv sync --no-cache --frozen --no-install-project --extra server --extra embeddings --extra chroma --extra latex-ocr \
--no-install-package torch \
--no-install-package torchvision \
--no-install-package nvidia-cublas-cu12 \
Expand All @@ -54,7 +54,7 @@ RUN uv sync --no-cache --frozen --no-install-project --extra api --extra embeddi
COPY . .

# 4) install the project itself (skip torch/CUDA, installed as CPU-only next)
RUN uv sync --no-cache --frozen --extra api --extra embeddings --extra chroma --extra latex-ocr \
RUN uv sync --no-cache --frozen --extra server --extra embeddings --extra chroma --extra latex-ocr \
--no-install-package torch \
--no-install-package torchvision \
--no-install-package nvidia-cublas-cu12 \
Expand Down Expand Up @@ -88,4 +88,4 @@ USER appuser

EXPOSE 8000

CMD [".venv/bin/uvicorn", "clean_rag.api.app:app", "--host", "0.0.0.0", "--port", "8000"]
CMD [".venv/bin/uvicorn", "longparser.server.app:app", "--host", "0.0.0.0", "--port", "8000"]
12 changes: 8 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,12 @@
| **Multi-format extraction** | PDF, DOCX, PPTX, XLSX, CSV via Docling |
| **Hybrid chunking** | Token-aware, heading-hierarchy-aware, table-aware |
| **HITL review** | Human-in-the-Loop block & chunk editing before embedding |
| **LangGraph HITL** | `approve / edit / reject` workflow with LangGraph `interrupt()` |
| **LangGraph HITL** | `approve / edit / reject` workflow with LangGraph `interrupt()` and MongoDB checkpointer |
| **3-layer memory** | Short-term turns + rolling summary + long-term facts |
| **Multi-provider LLM** | OpenAI, Gemini, Groq, OpenRouter |
| **Multi-backend vectors** | Chroma, FAISS, Qdrant |
| **Async-first API** | FastAPI + Motor (MongoDB) + ARQ (Redis) |
| **Production-ready API** | FastAPI + Motor (MongoDB) + ARQ + Redis (Queue & Rate Limiting) |
| **Enterprise Security** | Tenant isolation, Role-Based Access Control (RBAC), and CORS |
| **LangChain adapters** | Drop-in `BaseRetriever` and LlamaIndex `QueryEngine` |
| **Privacy-first** | All processing runs locally; no data leaves your infra |

Expand Down Expand Up @@ -233,11 +234,14 @@ Copy `.env.example` to `.env` and set:
| Variable | Default | Description |
|----------|---------|-------------|
| `LONGPARSER_MONGO_URL` | `mongodb://localhost:27017` | MongoDB connection |
| `LONGPARSER_REDIS_URL` | `redis://localhost:6379` | Redis for job queue |
| `LONGPARSER_REDIS_URL` | `redis://localhost:6379` | Redis for job queue & rate limits |
| `LONGPARSER_LLM_PROVIDER` | `openai` | LLM provider |
| `LONGPARSER_LLM_MODEL` | `gpt-4o` | Model name |
| `LONGPARSER_LLM_MODEL` | `gpt-5.3` | Model name |
| `LONGPARSER_EMBED_PROVIDER` | `huggingface` | Embedding provider |
| `LONGPARSER_VECTOR_DB` | `chroma` | Vector store backend |
| `LONGPARSER_CORS_ORIGINS` | `*` | Allowed CORS origins |
| `LONGPARSER_RATE_LIMIT` | `60` | Max RPM per tenant |
| `LONGPARSER_ADMIN_KEYS` | (empty) | Comma-separated admin API keys |

---

Expand Down
22 changes: 15 additions & 7 deletions docker-compose.yml
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
services:
api:
build: .
container_name: cleanrag-api
container_name: longparser-api
command: [ ".venv/bin/uvicorn", "longparser.server.app:app", "--host", "0.0.0.0", "--port", "8000" ]
env_file: .env
environment:
- LONGPARSER_MFD_MODEL_DIR=/app/models/mfd
# ── For Docker networking, override the localhost URLs from .env ──
- LONGPARSER_MONGO_URL=mongodb://${MONGO_USER:-longparser}:${MONGO_PASS:-longparser}@mongo:27017/longparser?authSource=admin
- LONGPARSER_REDIS_URL=redis://:${REDIS_PASS:-longparser}@redis:6379
ports:
- "8000:8000"
volumes:
Expand All @@ -27,11 +30,13 @@ services:

worker:
build: .
container_name: cleanrag-worker
container_name: longparser-worker
command: [ ".venv/bin/arq", "longparser.server.worker.WorkerSettings" ]
env_file: .env
environment:
- LONGPARSER_MFD_MODEL_DIR=/app/models/mfd
- LONGPARSER_MONGO_URL=mongodb://${MONGO_USER:-longparser}:${MONGO_PASS:-longparser}@mongo:27017/longparser?authSource=admin
- LONGPARSER_REDIS_URL=redis://:${REDIS_PASS:-longparser}@redis:6379
volumes:
- uploads:/app/uploads
- ./models:/app/models
Expand All @@ -51,25 +56,28 @@ services:

redis:
image: redis:7
container_name: cleanrag-redis
command: [ "redis-server", "--appendonly", "yes" ]
container_name: longparser-redis
command: [ "redis-server", "--appendonly", "yes", "--requirepass", "${REDIS_PASS:-longparser}" ]
volumes:
- redis-data:/data
restart: unless-stopped
healthcheck:
test: [ "CMD", "redis-cli", "ping" ]
test: [ "CMD", "redis-cli", "-a", "${REDIS_PASS:-longparser}", "ping" ]
interval: 30s
timeout: 5s
retries: 3

mongo:
image: mongo:7
container_name: cleanrag-mongo
container_name: longparser-mongo
environment:
MONGO_INITDB_ROOT_USERNAME: ${MONGO_USER:-longparser}
MONGO_INITDB_ROOT_PASSWORD: ${MONGO_PASS:-longparser}
volumes:
- mongo-data:/data/db
restart: unless-stopped
healthcheck:
test: [ "CMD", "mongosh", "--quiet", "--eval", "db.adminCommand('ping').ok" ]
test: [ "CMD", "mongosh", "-u", "${MONGO_USER:-longparser}", "-p", "${MONGO_PASS:-longparser}", "--authenticationDatabase", "admin", "--quiet", "--eval", "db.adminCommand('ping').ok" ]
interval: 30s
timeout: 5s
retries: 3
Expand Down
2 changes: 1 addition & 1 deletion docs/api/endpoints.md
Original file line number Diff line number Diff line change
Expand Up @@ -161,7 +161,7 @@ X-API-Key: your-key
"require_approval": false,
"config": {
"llm_provider": "openai",
"llm_model": "gpt-4o",
"llm_model": "gpt-5.3",
"top_k": 5
}
}
Expand Down
2 changes: 1 addition & 1 deletion docs/changelog.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ for production RAG pipelines.
via LangGraph `interrupt()` before embedding
- **3-layer memory chat** — short-term turns + rolling summary + long-term facts,
powered by LCEL chains
- **Multi-provider LLM support** — OpenAI (`gpt-4o`), Gemini (`gemini-2.0-flash`),
- **Multi-provider LLM support** — OpenAI (`gpt-5.3`), Gemini (`gemini-2.5`),
Groq (`llama-3.3-70b-versatile`), OpenRouter
- **Multi-backend vector stores** — Chroma, FAISS, Qdrant
- **Async-first REST API** — FastAPI + Motor (MongoDB) + ARQ (Redis job queue)
Expand Down
10 changes: 9 additions & 1 deletion docs/deployment/environment.md
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ Copy `.env.example` to `.env` and configure for your deployment.
| Variable | Description |
|---|---|
| `LONGPARSER_API_KEY` | API key for server authentication |
| `LONGPARSER_MONGO_URI` | MongoDB connection string |
| `LONGPARSER_MONGO_URL` | MongoDB connection string |

## LLM

Expand Down Expand Up @@ -50,3 +50,11 @@ Copy `.env.example` to `.env` and configure for your deployment.
|---|---|---|
| `LONGPARSER_REDIS_URL` | `redis://localhost:6379/0` | Redis URL for task queue |
| `LONGPARSER_WORKER_CONCURRENCY` | `2` | Worker concurrency level |

## Security

| Variable | Default | Description |
|---|---|---|
| `LONGPARSER_CORS_ORIGINS` | `*` | Allowed CORS origins (comma-separated) |
| `LONGPARSER_RATE_LIMIT` | `60` | Max requests per minute per tenant ID |
| `LONGPARSER_ADMIN_KEYS` | — | Comma-separated admin API keys |
2 changes: 1 addition & 1 deletion docs/getting-started/configuration.md
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ cp .env.example .env
| Variable | Description |
|---|---|
| `LONGPARSER_API_KEY` | API key for the REST server |
| `LONGPARSER_MONGO_URI` | MongoDB connection string |
| `LONGPARSER_MONGO_URL` | MongoDB connection string |
| `OPENAI_API_KEY` | For OpenAI LLM provider |

## Processing Options
Expand Down
2 changes: 1 addition & 1 deletion docs/guide/chat.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ POST /chat
"question": "What are the key findings?",
"config": {
"llm_provider": "openai",
"llm_model": "gpt-4o",
"llm_model": "gpt-5.3",
"top_k": 5
}
}
Expand Down
2 changes: 1 addition & 1 deletion docs/integrations/langchain.md
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@ from langchain.chains import RetrievalQA
from langchain_openai import ChatOpenAI

qa = RetrievalQA.from_chain_type(
llm=ChatOpenAI(model="gpt-4o"),
llm=ChatOpenAI(model="gpt-5.3"),
retriever=vectorstore.as_retriever(search_kwargs={"k": 5}),
)

Expand Down
2 changes: 2 additions & 0 deletions docs/security.md
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,8 @@ Key risks:
| **MongoDB injection** | Motor driver + typed Pydantic inputs prevent injection |
| **SSRF via webhook** | No outbound HTTP made based on user input |
| **Hallucinated citations** | Citation IDs validated against retrieved set before returning to client |
| **DDoS / Spam via API** | Route-level Rate Limiting strictly isolated per tenant via Redis |
| **Cross-Origin attacks** | Configurable CORS restrictions and strict Tenant Isolation |

## Dependency Security

Expand Down
1 change: 1 addition & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -35,6 +35,7 @@ dependencies = [
"pydantic>=2.0,<3",
"docling>=2.14",
"docling-core>=2.13",
"langgraph-checkpoint-mongodb>=0.3.1",
]

[project.optional-dependencies]
Expand Down
4 changes: 2 additions & 2 deletions src/longparser/chunkers/hybrid_chunker.py
Original file line number Diff line number Diff line change
Expand Up @@ -345,10 +345,10 @@ def _generate_schema_chunk(
sample_rows.append(f" Row {r_idx}: " + "; ".join(parts))

lines = [
f"[TABLE SCHEMA]",
"[TABLE SCHEMA]",
f"Table ID: {block.block_id}",
f"Rows: {n_data} (data rows), Columns: {n_cols}",
f"Columns:",
"Columns:",
]
lines.extend(col_profiles)
lines.append(f"Sample Rows ({sample_count}):")
Expand Down
31 changes: 16 additions & 15 deletions src/longparser/extractors/docling_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -254,7 +254,7 @@ def _run_docling(self, file_path: Path, config: ProcessingConfig):
# Order-based substitution with alignment gate
injected = 0
_non_omml = 0
for block, latex in zip(formula_blocks, latex_eqs):
for block, latex in zip(formula_blocks, latex_eqs, strict=False):
orig_len = len(block.text.strip()) if block.text else 0
latex_len = len(latex.strip())

Expand Down Expand Up @@ -431,7 +431,8 @@ def _run_docling(self, file_path: Path, config: ProcessingConfig):
page_img = None
try:
page_img = page_obj.image.pil_image
except Exception:
except Exception as e:
logger.warning("Failed to extract image for formula scanning: %s", e)
continue
if page_img is None:
continue
Expand Down Expand Up @@ -527,8 +528,8 @@ def _run_docling(self, file_path: Path, config: ProcessingConfig):
# Update label to formula so downstream sees it correctly
try:
item.label = type(item.label)("formula")
except Exception:
pass
except Exception as e:
logger.debug(f"Failed to update formula label: {e}")
replaced = True
logger.debug(f"MFD: replaced garbled block on page {page_no}")
break
Expand Down Expand Up @@ -1023,15 +1024,15 @@ def _get_item_text(self, item, docling_doc=None) -> str:
if isinstance(item, TableItem) and hasattr(item, 'export_to_markdown'):
try:
return item.export_to_markdown(doc=docling_doc)
except Exception:
pass
except Exception as e:
logger.debug(f"Failed to export table item to markdown: {e}")
if hasattr(item, 'text') and item.text:
return item.text
if hasattr(item, 'export_to_markdown'):
try:
return item.export_to_markdown()
except Exception:
pass
except Exception as e:
logger.debug(f"Failed to export item to markdown: {e}")
return ""

def _get_item_confidence(self, item) -> float:
Expand Down Expand Up @@ -1080,10 +1081,10 @@ def _build_pptx_text_map(self, file_path: Path) -> Dict[int, Dict[str, PptxParaI
if s.placeholder_format.type == PP_PH.SUBTITLE:
has_subtitle_placeholder = True
break
except Exception:
pass
except ImportError:
pass
except Exception as e:
logger.debug(f"Failed to check PPTX subtitle placeholder format: {e}")
except ImportError as e:
logger.debug(f"Failed to import python-pptx: {e}")

for shape in slide.shapes:
found_title = self._extract_pptx_shape_info(
Expand Down Expand Up @@ -1160,8 +1161,8 @@ def _extract_pptx_shape_info(self, shape, slide_map: Dict[str, PptxParaInfo],
is_subtitle_shape = True
elif ph_type in (PP_PLACEHOLDER.DATE, PP_PLACEHOLDER.FOOTER, PP_PLACEHOLDER.SLIDE_NUMBER):
is_footer_shape = True
except Exception:
pass
except Exception as e:
logger.debug(f"Failed to check PPTX placeholder format type: {e}")

# Skip footer/date/slide-number shapes entirely
if is_footer_shape:
Expand Down Expand Up @@ -1267,7 +1268,7 @@ def extract(

# Calculate file hash
with open(file_path, "rb") as f:
file_hash = hashlib.md5(f.read()).hexdigest()
file_hash = hashlib.sha256(f.read()).hexdigest()

# Get conversion result (cached or new)
result = self._run_docling(file_path, config)
Expand Down
Loading
Loading