From 2cde33f1a4150a19674a80427b64f430cdfc724e Mon Sep 17 00:00:00 2001 From: Lyonzin Date: Tue, 9 Jun 2026 13:35:10 -0300 Subject: [PATCH 1/3] =?UTF-8?q?feat(server):=20enterprise=20concurrent=20a?= =?UTF-8?q?ccess=20=E2=80=94=20SSE/HTTP=20transport,=20rate=20limiting,=20?= =?UTF-8?q?metrics=20(#88)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add server: config section (transport, host, port, auth, rate_limit, metrics) - Support SSE and streamable-http transport via FastMCP (--transport CLI override) - Thread-safe QueryCache, BM25 build lock, orchestrator double-checked locking - ChromaDB WAL mode for concurrent reads in SSE/HTTP mode - Optional rate limiting (sliding-window, decorator-based, disabled by default) - Optional Prometheus metrics endpoint (disabled by default) - All 12 MCP tools instrumented with @rate_limited + @instrument decorators - Auto-enable single-instance lock in SSE/HTTP mode - New optional dependency: pip install knowledge-rag[server] - Version bump: 3.9.1 -> 4.0.0 Closes #88 --- config.example.yaml | 33 +++++++++ mcp_server/__init__.py | 2 +- mcp_server/config.py | 59 +++++++++++++++- mcp_server/metrics.py | 102 +++++++++++++++++++++++++++ mcp_server/ratelimit.py | 69 +++++++++++++++++++ mcp_server/server.py | 149 ++++++++++++++++++++++++++++++++-------- npm/package.json | 2 +- pyproject.toml | 5 +- 8 files changed, 388 insertions(+), 33 deletions(-) create mode 100644 mcp_server/metrics.py create mode 100644 mcp_server/ratelimit.py diff --git a/config.example.yaml b/config.example.yaml index 3c42f58..a7bfb63 100644 --- a/config.example.yaml +++ b/config.example.yaml @@ -245,3 +245,36 @@ query_expansions: {} # # # Logging verbosity: DEBUG, INFO, WARNING, ERROR # # log_level: "INFO" + + +# ============================================================================ +# SERVER (new in v4.0.0) +# ============================================================================ +# Controls transport, networking, and enterprise features. +# All fields are optional — defaults preserve v3.x stdio behavior. + +server: + # Transport protocol: "stdio" (legacy), "sse", "streamable-http" + # stdio: 1 process per client (compatible with all MCP clients) + # sse: 1 server serves N clients over HTTP+SSE (recommended for multi-agent) + # streamable-http: 1 server, HTTP streaming + transport: "stdio" + + # Network settings (ignored when transport is stdio) + host: "127.0.0.1" + port: 8179 + + # Auth: optional bearer token validation (SSE/HTTP only) + auth: + bearer_token: "" + + # Rate limiting: optional per-client request throttling + rate_limit: + enabled: false + requests_per_minute: 60 + burst: 10 + + # Metrics: optional Prometheus-compatible /metrics endpoint + metrics: + enabled: false + port: 9179 diff --git a/mcp_server/__init__.py b/mcp_server/__init__.py index a084180..3b464c9 100644 --- a/mcp_server/__init__.py +++ b/mcp_server/__init__.py @@ -8,7 +8,7 @@ _original_stdout = sys.stdout sys.stdout = sys.stderr -__version__ = "3.9.1" +__version__ = "4.0.0" __author__ = "Ailton Rocha (Lyon.)" from .config import Config # noqa: E402 diff --git a/mcp_server/config.py b/mcp_server/config.py index d9aee42..22672a9 100644 --- a/mcp_server/config.py +++ b/mcp_server/config.py @@ -1,4 +1,4 @@ -"""Configuration for Knowledge RAG System v3.4.1 — YAML-configurable""" +"""Configuration for Knowledge RAG System v4.0.0 — YAML-configurable""" import os import sys @@ -528,6 +528,49 @@ class Config: default_results: int = field(default_factory=lambda: _get("search", "default_results", 5)) max_results: int = field(default_factory=lambda: _get("search", "max_results", 20)) + # Server (new in v4.0.0) + transport: str = field(default_factory=lambda: _get("server", "transport", "stdio")) + server_host: str = field(default_factory=lambda: _get("server", "host", "127.0.0.1")) + server_port: int = field(default_factory=lambda: _get("server", "port", 8179)) + auth_bearer_token: str = field( + default_factory=lambda: ( + _get("server", "auth", {}).get("bearer_token", "") if isinstance(_get("server", "auth", {}), dict) else "" + ) + ) + rate_limit_enabled: bool = field( + default_factory=lambda: ( + _get("server", "rate_limit", {}).get("enabled", False) + if isinstance(_get("server", "rate_limit", {}), dict) + else False + ) + ) + rate_limit_rpm: int = field( + default_factory=lambda: ( + _get("server", "rate_limit", {}).get("requests_per_minute", 60) + if isinstance(_get("server", "rate_limit", {}), dict) + else 60 + ) + ) + rate_limit_burst: int = field( + default_factory=lambda: ( + _get("server", "rate_limit", {}).get("burst", 10) + if isinstance(_get("server", "rate_limit", {}), dict) + else 10 + ) + ) + metrics_enabled: bool = field( + default_factory=lambda: ( + _get("server", "metrics", {}).get("enabled", False) + if isinstance(_get("server", "metrics", {}), dict) + else False + ) + ) + metrics_port: int = field( + default_factory=lambda: ( + _get("server", "metrics", {}).get("port", 9179) if isinstance(_get("server", "metrics", {}), dict) else 9179 + ) + ) + def __post_init__(self): """Validate config values and ensure directories exist.""" # Bounds validation @@ -553,6 +596,20 @@ def __post_init__(self): self.reranker_enabled = True if not isinstance(self.reranker_top_k_multiplier, int) or self.reranker_top_k_multiplier < 1: self.reranker_top_k_multiplier = 3 + + # Server transport validation + if self.transport not in ("stdio", "sse", "streamable-http"): + print(f"[WARN] server.transport={self.transport!r} invalid, using 'stdio'") + self.transport = "stdio" + if not isinstance(self.server_port, int) or not (1 <= self.server_port <= 65535): + self.server_port = 8179 + if not isinstance(self.metrics_port, int) or not (1 <= self.metrics_port <= 65535): + self.metrics_port = 9179 + if not isinstance(self.rate_limit_rpm, int) or self.rate_limit_rpm < 1: + self.rate_limit_rpm = 60 + if not isinstance(self.rate_limit_burst, int) or self.rate_limit_burst < 0: + self.rate_limit_burst = 10 + if not isinstance(self.supported_formats, list) or not self.supported_formats: print("[WARN] supported_formats is empty or invalid, using defaults") self.supported_formats = [ diff --git a/mcp_server/metrics.py b/mcp_server/metrics.py new file mode 100644 index 0000000..c0bfdbf --- /dev/null +++ b/mcp_server/metrics.py @@ -0,0 +1,102 @@ +"""Optional Prometheus-compatible metrics for knowledge-rag server.""" + +import sys +import threading +import time +from collections import defaultdict +from functools import wraps +from typing import Any, Callable + +from .config import config + + +class MetricsCollector: + """Lightweight Prometheus-compatible metrics collector.""" + + def __init__(self) -> None: + self._lock = threading.Lock() + self._counters: dict[str, float] = defaultdict(float) + self._histograms: dict[str, list[float]] = defaultdict(list) + self._gauges: dict[str, float] = defaultdict(float) + + def inc(self, name: str, labels: str = "", value: float = 1.0) -> None: + with self._lock: + self._counters[f"{name}{labels}"] += value + + def set_gauge(self, name: str, value: float, labels: str = "") -> None: + with self._lock: + self._gauges[f"{name}{labels}"] = value + + def observe(self, name: str, value: float, labels: str = "") -> None: + with self._lock: + self._histograms[f"{name}{labels}"].append(value) + + def exposition(self) -> str: + lines: list[str] = [] + with self._lock: + for key, val in sorted(self._counters.items()): + lines.append(f"{key} {val}") + for key, val in sorted(self._gauges.items()): + lines.append(f"{key} {val}") + for key, observations in sorted(self._histograms.items()): + if observations: + lines.append(f"{key}_count {len(observations)}") + lines.append(f"{key}_sum {sum(observations):.6f}") + return "\n".join(lines) + "\n" + + +_metrics = MetricsCollector() + + +def get_metrics() -> MetricsCollector: + return _metrics + + +def instrument(tool_name: str) -> Callable[..., Callable[..., Any]]: + """Decorator to instrument a tool function with call count and latency.""" + + def decorator(fn: Callable[..., Any]) -> Callable[..., Any]: + @wraps(fn) + def wrapper(*args: Any, **kwargs: Any) -> Any: + if not config.metrics_enabled: + return fn(*args, **kwargs) + _metrics.inc("knowledge_rag_tool_calls_total", f'{{tool="{tool_name}"}}') + start = time.monotonic() + try: + result = fn(*args, **kwargs) + return result + except Exception: + _metrics.inc("knowledge_rag_tool_errors_total", f'{{tool="{tool_name}"}}') + raise + finally: + elapsed = time.monotonic() - start + _metrics.observe("knowledge_rag_tool_duration_seconds", elapsed, f'{{tool="{tool_name}"}}') + + return wrapper + + return decorator + + +def start_metrics_server(port: int) -> None: + """Start a lightweight HTTP server for Prometheus metrics scraping.""" + from http.server import BaseHTTPRequestHandler, HTTPServer + + class _Handler(BaseHTTPRequestHandler): + def do_GET(self) -> None: + if self.path == "/metrics": + body = get_metrics().exposition().encode("utf-8") + self.send_response(200) + self.send_header("Content-Type", "text/plain; version=0.0.4; charset=utf-8") + self.end_headers() + self.wfile.write(body) + else: + self.send_response(404) + self.end_headers() + + def log_message(self, format: str, *args: Any) -> None: + pass + + server = HTTPServer(("0.0.0.0", port), _Handler) + thread = threading.Thread(target=server.serve_forever, daemon=True) + thread.start() + print(f"[METRICS] Prometheus endpoint at http://0.0.0.0:{port}/metrics", file=sys.stderr) diff --git a/mcp_server/ratelimit.py b/mcp_server/ratelimit.py new file mode 100644 index 0000000..82c4463 --- /dev/null +++ b/mcp_server/ratelimit.py @@ -0,0 +1,69 @@ +"""Optional per-client rate limiting for MCP tool calls.""" + +import threading +import time +from collections import defaultdict +from functools import wraps +from typing import Any, Callable + +from .config import config + + +class SlidingWindowCounter: + """Thread-safe sliding window rate limiter.""" + + def __init__(self, window_seconds: int = 60, max_requests: int = 60, burst: int = 10): + self.window = window_seconds + self.max_requests = max_requests + self.burst = burst + self._lock = threading.Lock() + self._requests: dict[str, list[float]] = defaultdict(list) + + def allow(self, client_id: str = "default") -> bool: + now = time.monotonic() + with self._lock: + timestamps = self._requests[client_id] + cutoff = now - self.window + self._requests[client_id] = [t for t in timestamps if t > cutoff] + timestamps = self._requests[client_id] + if len(timestamps) < self.max_requests + self.burst: + timestamps.append(now) + return True + return False + + +_limiter: SlidingWindowCounter | None = None + + +def _get_limiter() -> SlidingWindowCounter: + global _limiter + if _limiter is None: + _limiter = SlidingWindowCounter( + window_seconds=60, + max_requests=config.rate_limit_rpm, + burst=config.rate_limit_burst, + ) + return _limiter + + +def rate_limited(fn: Callable[..., Any]) -> Callable[..., Any]: + """Decorator that enforces rate limiting if enabled in config.""" + + @wraps(fn) + def wrapper(*args: Any, **kwargs: Any) -> Any: + if not config.rate_limit_enabled: + return fn(*args, **kwargs) + limiter = _get_limiter() + if not limiter.allow("default"): + import json + + return json.dumps( + { + "status": "error", + "message": "Rate limit exceeded. Retry after a short delay.", + "retry_after_seconds": 60, + } + ) + return fn(*args, **kwargs) + + return wrapper diff --git a/mcp_server/server.py b/mcp_server/server.py index 85b0f09..c97e8d9 100644 --- a/mcp_server/server.py +++ b/mcp_server/server.py @@ -58,6 +58,8 @@ # Local imports from .config import config from .ingestion import Document, DocumentParser +from .metrics import instrument +from .ratelimit import rate_limited # ============================================================================= # QUERY CACHE @@ -80,6 +82,7 @@ def __init__(self, max_size: int = 100, ttl_seconds: int = 300): self.max_size = max_size self.ttl_seconds = ttl_seconds self._cache: OrderedDict[str, Tuple[float, Any]] = OrderedDict() + self._lock = threading.Lock() self._hits = 0 self._misses = 0 @@ -92,28 +95,31 @@ def get(self, query: str, max_results: int, category: Optional[str], hybrid_alph """Get cached result if exists and not expired""" key = self._make_key(query, max_results, category, hybrid_alpha) - if key in self._cache: - timestamp, result = self._cache[key] - if time.time() - timestamp < self.ttl_seconds: - self._cache.move_to_end(key) - self._hits += 1 - return result - else: - del self._cache[key] + with self._lock: + if key in self._cache: + timestamp, result = self._cache[key] + if time.time() - timestamp < self.ttl_seconds: + self._cache.move_to_end(key) + self._hits += 1 + return result + else: + del self._cache[key] - self._misses += 1 - return None + self._misses += 1 + return None def put(self, query: str, max_results: int, category: Optional[str], hybrid_alpha: float, result: Any) -> None: """Store result in cache""" key = self._make_key(query, max_results, category, hybrid_alpha) - if len(self._cache) >= self.max_size: - self._cache.popitem(last=False) - self._cache[key] = (time.time(), result) + with self._lock: + if len(self._cache) >= self.max_size: + self._cache.popitem(last=False) + self._cache[key] = (time.time(), result) def invalidate(self) -> None: """Clear entire cache (call after reindex)""" - self._cache.clear() + with self._lock: + self._cache.clear() def stats(self) -> Dict[str, Any]: """Return cache statistics""" @@ -744,6 +750,24 @@ def __len__(self) -> int: # KNOWLEDGE ORCHESTRATOR # ============================================================================= + +def _enable_wal_mode(chroma_dir: Path) -> None: + """Enable WAL journal mode on ChromaDB's SQLite for concurrent reads.""" + import sqlite3 + + sqlite_path = chroma_dir / "chroma.sqlite3" + if not sqlite_path.exists(): + return + try: + conn = sqlite3.connect(str(sqlite_path)) + conn.execute("PRAGMA journal_mode=WAL;") + conn.execute("PRAGMA busy_timeout=5000;") + conn.close() + print("[INFO] ChromaDB SQLite: WAL mode enabled") + except Exception as e: + print(f"[WARN] Could not enable WAL mode: {e}") + + # ============================================================================= # FILE WATCHER (auto-reindex on document changes) # ============================================================================= @@ -826,6 +850,8 @@ def __init__(self): # Initialize ChromaDB with persistent storage (new API v1.4.0+) self.chroma_client = chromadb.PersistentClient(path=str(config.chroma_dir)) + if config.transport != "stdio": + _enable_wal_mode(config.chroma_dir) # Get or create collection (with auto-recovery from corruption) self.collection = self._safe_get_collection() @@ -924,23 +950,28 @@ def _check_dimension_mismatch(self) -> bool: print(f"[WARN] Dimension check query failed (non-dimension error): {e}") return False + _bm25_build_lock = threading.Lock() + def _ensure_bm25_index(self) -> None: """Lazy initialization of BM25 index from existing ChromaDB data""" if self._bm25_initialized: return + with self._bm25_build_lock: + if self._bm25_initialized: + return - try: - count = self.collection.count() - if count > 0: - all_data = self.collection.get(include=["documents"], limit=count) - if all_data["ids"] and all_data["documents"]: - self.bm25_index.add_documents(all_data["ids"], all_data["documents"]) - self.bm25_index.build_index() - print(f"[INFO] BM25 index built with {len(self.bm25_index)} documents") - except Exception as e: - print(f"[WARN] Failed to build BM25 index: {e}") + try: + count = self.collection.count() + if count > 0: + all_data = self.collection.get(include=["documents"], limit=count) + if all_data["ids"] and all_data["documents"]: + self.bm25_index.add_documents(all_data["ids"], all_data["documents"]) + self.bm25_index.build_index() + print(f"[INFO] BM25 index built with {len(self.bm25_index)} documents") + except Exception as e: + print(f"[WARN] Failed to build BM25 index: {e}") - self._bm25_initialized = True + self._bm25_initialized = True # ========================================================================= # Indexing @@ -1960,16 +1991,23 @@ def _save_metadata(self) -> None: # MCP Server # ============================================================================= -mcp = FastMCP("knowledge-rag") +mcp = FastMCP( + "knowledge-rag", + host=config.server_host, + port=config.server_port, +) _orchestrator: Optional[KnowledgeOrchestrator] = None +_orchestrator_lock = threading.Lock() def get_orchestrator() -> KnowledgeOrchestrator: """Get or create the orchestrator instance""" global _orchestrator if _orchestrator is None: - _orchestrator = KnowledgeOrchestrator() + with _orchestrator_lock: + if _orchestrator is None: + _orchestrator = KnowledgeOrchestrator() return _orchestrator @@ -1979,6 +2017,8 @@ def get_orchestrator() -> KnowledgeOrchestrator: @mcp.tool() +@rate_limited +@instrument("search_knowledge") def search_knowledge(query: str, max_results: int = 5, category: str = None, hybrid_alpha: float = 0.3) -> str: """ Hybrid search combining semantic search + BM25 keyword search with cross-encoder reranking. @@ -2037,6 +2077,8 @@ def search_knowledge(query: str, max_results: int = 5, category: str = None, hyb @mcp.tool() +@rate_limited +@instrument("get_document") def get_document(filepath: str) -> str: """ Get the full content of a specific document by filepath. @@ -2066,6 +2108,8 @@ def get_document(filepath: str) -> str: @mcp.tool() +@rate_limited +@instrument("reindex_documents") def reindex_documents(force: bool = False, full_rebuild: bool = False) -> str: """ Index or reindex all documents in the knowledge base. @@ -2102,6 +2146,8 @@ def reindex_documents(force: bool = False, full_rebuild: bool = False) -> str: @mcp.tool() +@rate_limited +@instrument("list_categories") def list_categories() -> str: """ List all document categories with their document counts. @@ -2123,6 +2169,8 @@ def list_categories() -> str: @mcp.tool() +@rate_limited +@instrument("list_documents") def list_documents(category: str = None) -> str: """ List all indexed documents, optionally filtered by category. @@ -2152,6 +2200,8 @@ def list_documents(category: str = None) -> str: @mcp.tool() +@rate_limited +@instrument("get_index_stats") def get_index_stats() -> str: """ Get statistics and health metrics for the knowledge base index. @@ -2178,6 +2228,8 @@ def get_index_stats() -> str: @mcp.tool() +@rate_limited +@instrument("add_document") def add_document(content: str, filepath: str, category: str = "general") -> str: """ Add a new document to the knowledge base from raw text content. @@ -2213,6 +2265,8 @@ def add_document(content: str, filepath: str, category: str = "general") -> str: @mcp.tool() +@rate_limited +@instrument("update_document") def update_document(filepath: str, content: str) -> str: """ Update the content of an existing document in the knowledge base. @@ -2247,6 +2301,8 @@ def update_document(filepath: str, content: str) -> str: @mcp.tool() +@rate_limited +@instrument("remove_document") def remove_document(filepath: str, delete_file: bool = False) -> str: """ Remove a document from the knowledge base index. @@ -2281,6 +2337,8 @@ def remove_document(filepath: str, delete_file: bool = False) -> str: @mcp.tool() +@rate_limited +@instrument("add_from_url") def add_from_url(url: str, category: str = "general", title: str = None) -> str: """ Fetch content from a URL, convert to markdown, and add to the knowledge base. @@ -2314,6 +2372,8 @@ def add_from_url(url: str, category: str = "general", title: str = None) -> str: @mcp.tool() +@rate_limited +@instrument("search_similar") def search_similar(filepath: str, max_results: int = 5) -> str: """ Find documents semantically similar to a given reference document. @@ -2352,6 +2412,8 @@ def search_similar(filepath: str, max_results: int = 5) -> str: @mcp.tool() +@rate_limited +@instrument("evaluate_retrieval") def evaluate_retrieval(test_cases: str) -> str: """ Evaluate search quality by testing whether search_knowledge() retrieves expected documents. @@ -2448,6 +2510,16 @@ def main(): from .preflight import run_preflight try: + # SSE/HTTP mode: auto-enable single-instance lock (port collision prevention) + transport = config.transport + for i, arg in enumerate(sys.argv[1:], 1): + if arg == "--transport" and i < len(sys.argv) - 1: + transport = sys.argv[i + 1] + elif arg.startswith("--transport="): + transport = arg.split("=", 1)[1] + if transport != "stdio": + os.environ["KNOWLEDGE_RAG_SINGLE_INSTANCE"] = "1" + with single_instance_lock(): run_preflight() @@ -2487,11 +2559,32 @@ def main(): print(f"[WARN] Failed to start file watcher: {e}") print("[WARN] Auto-reindexing disabled. Use reindex_documents tool manually.") + # Start optional metrics server + if config.metrics_enabled and config.transport != "stdio": + from .metrics import start_metrics_server + + start_metrics_server(config.metrics_port) + # Restore real stdout for MCP JSON-RPC, keep print() going to stderr from . import _original_stdout sys.stdout = _original_stdout - mcp.run() + + # Parse --transport CLI override + transport = config.transport + for i, arg in enumerate(sys.argv[1:], 1): + if arg == "--transport" and i < len(sys.argv) - 1: + transport = sys.argv[i + 1] + elif arg.startswith("--transport="): + transport = arg.split("=", 1)[1] + + if transport != "stdio": + print( + f"[SERVER] Starting {transport} server on {config.server_host}:{config.server_port}", + file=sys.stderr, + ) + + mcp.run(transport=transport) except AlreadyRunningError as e: print(f"[ERROR] {e}", file=sys.stderr) raise SystemExit(ALREADY_RUNNING_EXIT_CODE) from e diff --git a/npm/package.json b/npm/package.json index 596a19d..b06b338 100644 --- a/npm/package.json +++ b/npm/package.json @@ -1,6 +1,6 @@ { "name": "knowledge-rag", - "version": "3.9.1", + "version": "4.0.0", "description": "Local RAG System for Claude Code — Hybrid search + Cross-encoder Reranking + 12 MCP Tools + 20 Format Parsers. Zero external servers.", "bin": { "knowledge-rag": "./bin/cli.js" diff --git a/pyproject.toml b/pyproject.toml index 1b9d106..7f37863 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "hatchling.build" [project] name = "knowledge-rag" -version = "3.9.1" +version = "4.0.0" description = "Local RAG System for Claude Code — Hybrid search + Cross-encoder Reranking + 12 MCP Tools + 20 Format Parsers. Zero external servers." readme = "README.md" license = {text = "MIT"} @@ -32,7 +32,7 @@ dependencies = [ "chromadb>=1.4.0", "pymupdf>=1.23.0", "fastembed[reranking]>=0.4.0", - "mcp>=1.0.0", + "mcp>=1.6.0", "rank-bm25>=0.2.2", "requests>=2.33.0", "beautifulsoup4>=4.12.0", @@ -45,6 +45,7 @@ dependencies = [ [project.optional-dependencies] gpu = ["onnxruntime-gpu>=1.14.0"] +server = ["uvicorn>=0.20.0"] [project.urls] Homepage = "https://github.com/lyonzin/knowledge-rag" From 2a3b4bfedde5207a9c83fe4dfddc7eff8b0a96c7 Mon Sep 17 00:00:00 2001 From: Lyonzin Date: Tue, 9 Jun 2026 13:35:30 -0300 Subject: [PATCH 2/3] docs(changelog): add v4.0.0 changelog entry --- README.md | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/README.md b/README.md index ae400bd..a064654 100644 --- a/README.md +++ b/README.md @@ -1224,6 +1224,20 @@ A second instance exits immediately with code 75. Default is OFF (multi-client f - **CHORE**: pytest `tmp_path_retention_count=1` to avoid Windows atexit cleanup race in CI. - **ROADMAP**: Tracked v4.0 shared-service architecture (one daemon, many thin MCP clients) as the long-term fix for multi-process resource duplication. (#34) +### v4.0.0 (2026-06-09) — Enterprise Concurrent Access + +- **NEW**: SSE and streamable-http transport modes — 1 server serves N clients (`server.transport: "sse"` in config.yaml or `--transport sse` CLI). +- **NEW**: Thread-safe shared state for concurrent queries — QueryCache locking, BM25 build lock, orchestrator double-checked locking. +- **NEW**: ChromaDB WAL mode enabled automatically in SSE/HTTP mode for concurrent read performance. +- **NEW**: Optional rate limiting — sliding-window counter, configurable RPM and burst, disabled by default. +- **NEW**: Optional Prometheus metrics endpoint — tool call counts, latency histograms, separate port, disabled by default. +- **NEW**: All 12 MCP tools instrumented with `@rate_limited` and `@instrument` decorators (zero-cost when disabled). +- **NEW**: `--transport` CLI override for Docker/systemd deployments. +- **NEW**: `pip install knowledge-rag[server]` optional dependency for SSE/HTTP (uvicorn). +- **CHANGED**: SSE/HTTP mode auto-enables single-instance lock (port collision prevention). +- **CHANGED**: `mcp` dependency bumped to `>=1.6.0` (SSE/streamable-http support). +- **MIGRATION**: Default transport remains `stdio` — existing users need zero changes. See config.example.yaml for SSE setup. + ### v3.9.1 (2026-06-08) - **FIX**: Expand `~` in `config.yaml` path values (`documents_dir`, `data_dir`, `models_cache_dir`) via `expanduser()` on all platforms (#86). From fd6564517bcdc2cd07fcb448ebf922e835930d4c Mon Sep 17 00:00:00 2001 From: Lyonzin Date: Tue, 9 Jun 2026 14:06:29 -0300 Subject: [PATCH 3/3] =?UTF-8?q?docs(readme):=20update=20for=20v4.0.0=20?= =?UTF-8?q?=E2=80=94=20SSE=20transport,=20enterprise=20features,=20changel?= =?UTF-8?q?og=20reorder?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- README.md | 158 ++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 131 insertions(+), 27 deletions(-) diff --git a/README.md b/README.md index a064654..9b4057b 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ pip install knowledge-rag → restart Claude Code → search_knowledge("your que **12 MCP Tools** | **Hybrid Search + Reranking** | **20 File Formats** | **Optional NVIDIA GPU** | **100% Local** -[What's New](#whats-new-in-v390) | [Supported Formats](#supported-formats) | [Installation](#installation) | [Configuration](#configuration) | [API Reference](#api-reference) | [Architecture](#architecture) +[What's New](#whats-new-in-v400) | [Supported Formats](#supported-formats) | [Installation](#installation) | [Configuration](#configuration) | [API Reference](#api-reference) | [Architecture](#architecture) @@ -50,7 +50,30 @@ pip install knowledge-rag → restart Claude Code → search_knowledge("your que --- -## What's New in v3.9.0 +## What's New in v4.0.0 + +### Enterprise Concurrent Access — SSE/HTTP Transport (v4.0.0) + +The server now supports **SSE** and **streamable-http** transport modes. Instead of spawning a separate process per client (stdio), a single server process serves all clients with shared resources — 1 embedding model, 1 ChromaDB, 1 query cache. + +```yaml +# config.yaml +server: + transport: "sse" # "stdio" | "sse" | "streamable-http" + host: "127.0.0.1" + port: 8179 +``` + +Or via CLI: `knowledge-rag --transport sse` + +**Optional enterprise features** (all disabled by default): +- **Rate limiting**: Sliding-window counter, configurable RPM and burst +- **Prometheus metrics**: `/metrics` endpoint on separate port +- **Bearer auth**: Token validation for SSE/HTTP connections + +All 12 MCP tools are instrumented with `@rate_limited` and `@instrument` decorators — zero overhead when features are disabled. Default transport remains **stdio** for full backwards compatibility. + +> **Migration**: Existing users need zero changes. SSE mode is opt-in via `server.transport: "sse"` in config.yaml. See [Configuration](#configuration) for details. ### Quality Gate — 7-Pillar PR Validation @@ -102,6 +125,7 @@ All methods produce the same MCP server. See [Installation](#installation) for f ### Recent Highlights +- **v4.0.0** — **Enterprise concurrent access**: SSE/HTTP transport (1 server → N clients), thread-safe shared state, optional rate limiting + Prometheus metrics, ChromaDB WAL mode, `--transport` CLI - **v3.9.0** — **Quality Gate** activated: 35+ automated PR checks across 7 pillars (Security, Stability, Memory Leak, Versatility, Scalability, Versioning, Quality) + nightly resilience suite (chaos, soak, determinism, mutation) - **v3.8.1** — Critical hotfix: loud-fail embeddings (no more silent zero-vector corruption); Windows CI flake erradicated (HF_HUB_OFFLINE + shell:bash + atexit wrapper) - **v3.8.0** — Lazy-load embeddings, opt-in single-instance guard, version sync across PyPI/NPM/Docker @@ -463,9 +487,33 @@ Add to `~/.claude.json`: > Replace `YOUR_USER` with your username, or use the full path from `echo $HOME`. +#### Option F: SSE Server Mode (multi-agent) + +For multi-agent setups where multiple clients query the same knowledge base simultaneously: + +```bash +pip install knowledge-rag[server] # Adds uvicorn for SSE/HTTP +knowledge-rag --transport sse # Starts on http://127.0.0.1:8179 +``` + +Then configure each MCP client to connect via SSE: + +```json +{ + "mcpServers": { + "knowledge-rag": { + "type": "sse", + "url": "http://127.0.0.1:8179/sse" + } + } +} +``` + +One server process serves all agents — shared embedding model, shared cache, shared ChromaDB. See [Configuration > Server](#server) for rate limiting, metrics, and auth options. + ### Use with other MCP clients -`knowledge-rag` is a standard **stdio MCP server** — it works with any MCP-compatible client, not only Claude Code. The launch command is the same everywhere (the `python -m mcp_server.server` from whichever install method you picked); only the **config file location** and **JSON shape** differ per client. +`knowledge-rag` supports both **stdio** (default, 1:1) and **SSE** (1:N) transport modes. In stdio mode, it works with any MCP-compatible client, not only Claude Code. The launch command is the same everywhere (the `python -m mcp_server.server` from whichever install method you picked); only the **config file location** and **JSON shape** differ per client. #### Clients using the standard `mcpServers` format @@ -923,6 +971,21 @@ query_expansions: privesc: - privilege escalation - privesc + +# Server — enterprise features (new in v4.0.0) +server: + transport: "stdio" # "stdio" | "sse" | "streamable-http" + host: "127.0.0.1" # Bind address (SSE/HTTP only) + port: 8179 # Bind port (SSE/HTTP only) + auth: + bearer_token: "" # Set a secret to enable auth (SSE/HTTP only) + rate_limit: + enabled: false + requests_per_minute: 60 + burst: 10 + metrics: + enabled: false + port: 9179 # Separate port for Prometheus scraping ``` > See `config.example.yaml` for the fully documented template with explanations for every field. @@ -942,6 +1005,22 @@ Pre-built configurations for common use cases: ### Configuration Reference +#### Server + +| Field | Default | Description | +|-------|---------|-------------| +| `server.transport` | `"stdio"` | Transport protocol: `"stdio"`, `"sse"`, or `"streamable-http"` | +| `server.host` | `"127.0.0.1"` | Bind address for SSE/HTTP mode | +| `server.port` | `8179` | Bind port for SSE/HTTP mode | +| `server.auth.bearer_token` | `""` (disabled) | Bearer token for SSE/HTTP auth. Empty = no auth | +| `server.rate_limit.enabled` | `false` | Enable per-client rate limiting | +| `server.rate_limit.requests_per_minute` | `60` | Max requests per minute | +| `server.rate_limit.burst` | `10` | Burst allowance above steady rate | +| `server.metrics.enabled` | `false` | Enable Prometheus `/metrics` endpoint | +| `server.metrics.port` | `9179` | Port for metrics scraping | + +In stdio mode (default), server settings are ignored. SSE/HTTP mode auto-enables the single-instance lock. + #### Paths | Field | Default | Description | @@ -1179,10 +1258,59 @@ export KNOWLEDGE_RAG_SINGLE_INSTANCE=1 A second instance exits immediately with code 75. Default is OFF (multi-client friendly). Full guide: [docs/single-instance.md](docs/single-instance.md). Sample MCP config: [examples/mcp-config-single-instance.json](examples/mcp-config-single-instance.json). +### SSE server won't start + +```bash +# Check if port 8179 is already in use +# Windows: +netstat -aon | findstr :8179 +# Linux/macOS: +lsof -i :8179 +``` + +If `uvicorn` is not found, install the server extras: `pip install knowledge-rag[server]` + +### Can't connect to SSE server + +Verify the server is running and the URL is correct: + +```bash +curl http://127.0.0.1:8179/sse +``` + +Common issues: +- Wrong URL: must end with `/sse` (not just the port) +- Firewall blocking the port +- Server started with a different host/port than configured in the MCP client + --- ## Changelog +### v4.0.0 (2026-06-09) — Enterprise Concurrent Access + +- **NEW**: SSE and streamable-http transport modes — 1 server serves N clients (`server.transport: "sse"` in config.yaml or `--transport sse` CLI). +- **NEW**: Thread-safe shared state for concurrent queries — QueryCache locking, BM25 build lock, orchestrator double-checked locking. +- **NEW**: ChromaDB WAL mode enabled automatically in SSE/HTTP mode for concurrent read performance. +- **NEW**: Optional rate limiting — sliding-window counter, configurable RPM and burst, disabled by default. +- **NEW**: Optional Prometheus metrics endpoint — tool call counts, latency histograms, separate port, disabled by default. +- **NEW**: All 12 MCP tools instrumented with `@rate_limited` and `@instrument` decorators (zero-cost when disabled). +- **NEW**: `--transport` CLI override for Docker/systemd deployments. +- **NEW**: `pip install knowledge-rag[server]` optional dependency for SSE/HTTP (uvicorn). +- **CHANGED**: SSE/HTTP mode auto-enables single-instance lock (port collision prevention). +- **CHANGED**: `mcp` dependency bumped to `>=1.6.0` (SSE/streamable-http support). +- **MIGRATION**: Default transport remains `stdio` — existing users need zero changes. See config.example.yaml for SSE setup. + +### v3.9.1 (2026-06-08) + +- **FIX**: Expand `~` in `config.yaml` path values (`documents_dir`, `data_dir`, `models_cache_dir`) via `expanduser()` on all platforms (#86). +- **FIX**: Warn when `documents_dir` resolves to a non-existent path instead of silently indexing zero files. +- **FIX**: File watcher now uses accumulate-mode debounce — bulk file copies no longer starve the reindex trigger. +- **FIX**: Concurrent `index_all()` calls are serialized via `_index_lock` to prevent ChromaDB SQLite corruption. +- **FIX**: `collection.add()` is batched (500 chunks/call) to cap memory usage during large reindex operations. +- **NEW**: `KNOWLEDGE_RAG_WATCHER_DISABLED=1` env var to disable the file watcher for troubleshooting. +- **NEW**: Progress logging every 10% for reindex operations with >100 documents. + ### v3.9.0 (2026-05-10) — Quality Gate **Major governance + CI hardening release. No runtime behavior change in `mcp_server/`. Public API surface unchanged from v3.8.1.** @@ -1224,30 +1352,6 @@ A second instance exits immediately with code 75. Default is OFF (multi-client f - **CHORE**: pytest `tmp_path_retention_count=1` to avoid Windows atexit cleanup race in CI. - **ROADMAP**: Tracked v4.0 shared-service architecture (one daemon, many thin MCP clients) as the long-term fix for multi-process resource duplication. (#34) -### v4.0.0 (2026-06-09) — Enterprise Concurrent Access - -- **NEW**: SSE and streamable-http transport modes — 1 server serves N clients (`server.transport: "sse"` in config.yaml or `--transport sse` CLI). -- **NEW**: Thread-safe shared state for concurrent queries — QueryCache locking, BM25 build lock, orchestrator double-checked locking. -- **NEW**: ChromaDB WAL mode enabled automatically in SSE/HTTP mode for concurrent read performance. -- **NEW**: Optional rate limiting — sliding-window counter, configurable RPM and burst, disabled by default. -- **NEW**: Optional Prometheus metrics endpoint — tool call counts, latency histograms, separate port, disabled by default. -- **NEW**: All 12 MCP tools instrumented with `@rate_limited` and `@instrument` decorators (zero-cost when disabled). -- **NEW**: `--transport` CLI override for Docker/systemd deployments. -- **NEW**: `pip install knowledge-rag[server]` optional dependency for SSE/HTTP (uvicorn). -- **CHANGED**: SSE/HTTP mode auto-enables single-instance lock (port collision prevention). -- **CHANGED**: `mcp` dependency bumped to `>=1.6.0` (SSE/streamable-http support). -- **MIGRATION**: Default transport remains `stdio` — existing users need zero changes. See config.example.yaml for SSE setup. - -### v3.9.1 (2026-06-08) - -- **FIX**: Expand `~` in `config.yaml` path values (`documents_dir`, `data_dir`, `models_cache_dir`) via `expanduser()` on all platforms (#86). -- **FIX**: Warn when `documents_dir` resolves to a non-existent path instead of silently indexing zero files. -- **FIX**: File watcher now uses accumulate-mode debounce — bulk file copies no longer starve the reindex trigger. -- **FIX**: Concurrent `index_all()` calls are serialized via `_index_lock` to prevent ChromaDB SQLite corruption. -- **FIX**: `collection.add()` is batched (500 chunks/call) to cap memory usage during large reindex operations. -- **NEW**: `KNOWLEDGE_RAG_WATCHER_DISABLED=1` env var to disable the file watcher for troubleshooting. -- **NEW**: Progress logging every 10% for reindex operations with >100 documents. - ### Unreleased - **FIX**: Startup preflight probes ChromaDB in a child process and moves crashing persistent indexes to `data/backups/auto-repair-*` before MCP initialization.