From 2cde33f1a4150a19674a80427b64f430cdfc724e Mon Sep 17 00:00:00 2001
From: Lyonzin <lyonzin@users.noreply.github.com>
Date: Tue, 9 Jun 2026 13:35:10 -0300
Subject: [PATCH 1/3] =?UTF-8?q?feat(server):=20enterprise=20concurrent=20a?=
 =?UTF-8?q?ccess=20=E2=80=94=20SSE/HTTP=20transport,=20rate=20limiting,=20?=
 =?UTF-8?q?metrics=20(#88)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Add server: config section (transport, host, port, auth, rate_limit, metrics)
- Support SSE and streamable-http transport via FastMCP (--transport CLI override)
- Thread-safe QueryCache, BM25 build lock, orchestrator double-checked locking
- ChromaDB WAL mode for concurrent reads in SSE/HTTP mode
- Optional rate limiting (sliding-window, decorator-based, disabled by default)
- Optional Prometheus metrics endpoint (disabled by default)
- All 12 MCP tools instrumented with @rate_limited + @instrument decorators
- Auto-enable single-instance lock in SSE/HTTP mode
- New optional dependency: pip install knowledge-rag[server]
- Version bump: 3.9.1 -> 4.0.0

Closes #88
---
 config.example.yaml     |  33 +++++++++
 mcp_server/__init__.py  |   2 +-
 mcp_server/config.py    |  59 +++++++++++++++-
 mcp_server/metrics.py   | 102 +++++++++++++++++++++++++++
 mcp_server/ratelimit.py |  69 +++++++++++++++++++
 mcp_server/server.py    | 149 ++++++++++++++++++++++++++++++++--------
 npm/package.json        |   2 +-
 pyproject.toml          |   5 +-
 8 files changed, 388 insertions(+), 33 deletions(-)
 create mode 100644 mcp_server/metrics.py
 create mode 100644 mcp_server/ratelimit.py

diff --git a/config.example.yaml b/config.example.yaml
index 3c42f58..a7bfb63 100644
--- a/config.example.yaml
+++ b/config.example.yaml
@@ -245,3 +245,36 @@ query_expansions: {}
 #
 #   # Logging verbosity: DEBUG, INFO, WARNING, ERROR
 #   # log_level: "INFO"
+
+
+# ============================================================================
+# SERVER (new in v4.0.0)
+# ============================================================================
+# Controls transport, networking, and enterprise features.
+# All fields are optional — defaults preserve v3.x stdio behavior.
+
+server:
+  # Transport protocol: "stdio" (legacy), "sse", "streamable-http"
+  # stdio: 1 process per client (compatible with all MCP clients)
+  # sse: 1 server serves N clients over HTTP+SSE (recommended for multi-agent)
+  # streamable-http: 1 server, HTTP streaming
+  transport: "stdio"
+
+  # Network settings (ignored when transport is stdio)
+  host: "127.0.0.1"
+  port: 8179
+
+  # Auth: optional bearer token validation (SSE/HTTP only)
+  auth:
+    bearer_token: ""
+
+  # Rate limiting: optional per-client request throttling
+  rate_limit:
+    enabled: false
+    requests_per_minute: 60
+    burst: 10
+
+  # Metrics: optional Prometheus-compatible /metrics endpoint
+  metrics:
+    enabled: false
+    port: 9179
diff --git a/mcp_server/__init__.py b/mcp_server/__init__.py
index a084180..3b464c9 100644
--- a/mcp_server/__init__.py
+++ b/mcp_server/__init__.py
@@ -8,7 +8,7 @@
 _original_stdout = sys.stdout
 sys.stdout = sys.stderr
 
-__version__ = "3.9.1"
+__version__ = "4.0.0"
 __author__ = "Ailton Rocha (Lyon.)"
 
 from .config import Config  # noqa: E402
diff --git a/mcp_server/config.py b/mcp_server/config.py
index d9aee42..22672a9 100644
--- a/mcp_server/config.py
+++ b/mcp_server/config.py
@@ -1,4 +1,4 @@
-"""Configuration for Knowledge RAG System v3.4.1 — YAML-configurable"""
+"""Configuration for Knowledge RAG System v4.0.0 — YAML-configurable"""
 
 import os
 import sys
@@ -528,6 +528,49 @@ class Config:
     default_results: int = field(default_factory=lambda: _get("search", "default_results", 5))
     max_results: int = field(default_factory=lambda: _get("search", "max_results", 20))
 
+    # Server (new in v4.0.0)
+    transport: str = field(default_factory=lambda: _get("server", "transport", "stdio"))
+    server_host: str = field(default_factory=lambda: _get("server", "host", "127.0.0.1"))
+    server_port: int = field(default_factory=lambda: _get("server", "port", 8179))
+    auth_bearer_token: str = field(
+        default_factory=lambda: (
+            _get("server", "auth", {}).get("bearer_token", "") if isinstance(_get("server", "auth", {}), dict) else ""
+        )
+    )
+    rate_limit_enabled: bool = field(
+        default_factory=lambda: (
+            _get("server", "rate_limit", {}).get("enabled", False)
+            if isinstance(_get("server", "rate_limit", {}), dict)
+            else False
+        )
+    )
+    rate_limit_rpm: int = field(
+        default_factory=lambda: (
+            _get("server", "rate_limit", {}).get("requests_per_minute", 60)
+            if isinstance(_get("server", "rate_limit", {}), dict)
+            else 60
+        )
+    )
+    rate_limit_burst: int = field(
+        default_factory=lambda: (
+            _get("server", "rate_limit", {}).get("burst", 10)
+            if isinstance(_get("server", "rate_limit", {}), dict)
+            else 10
+        )
+    )
+    metrics_enabled: bool = field(
+        default_factory=lambda: (
+            _get("server", "metrics", {}).get("enabled", False)
+            if isinstance(_get("server", "metrics", {}), dict)
+            else False
+        )
+    )
+    metrics_port: int = field(
+        default_factory=lambda: (
+            _get("server", "metrics", {}).get("port", 9179) if isinstance(_get("server", "metrics", {}), dict) else 9179
+        )
+    )
+
     def __post_init__(self):
         """Validate config values and ensure directories exist."""
         # Bounds validation
@@ -553,6 +596,20 @@ def __post_init__(self):
             self.reranker_enabled = True
         if not isinstance(self.reranker_top_k_multiplier, int) or self.reranker_top_k_multiplier < 1:
             self.reranker_top_k_multiplier = 3
+
+        # Server transport validation
+        if self.transport not in ("stdio", "sse", "streamable-http"):
+            print(f"[WARN] server.transport={self.transport!r} invalid, using 'stdio'")
+            self.transport = "stdio"
+        if not isinstance(self.server_port, int) or not (1 <= self.server_port <= 65535):
+            self.server_port = 8179
+        if not isinstance(self.metrics_port, int) or not (1 <= self.metrics_port <= 65535):
+            self.metrics_port = 9179
+        if not isinstance(self.rate_limit_rpm, int) or self.rate_limit_rpm < 1:
+            self.rate_limit_rpm = 60
+        if not isinstance(self.rate_limit_burst, int) or self.rate_limit_burst < 0:
+            self.rate_limit_burst = 10
+
         if not isinstance(self.supported_formats, list) or not self.supported_formats:
             print("[WARN] supported_formats is empty or invalid, using defaults")
             self.supported_formats = [
diff --git a/mcp_server/metrics.py b/mcp_server/metrics.py
new file mode 100644
index 0000000..c0bfdbf
--- /dev/null
+++ b/mcp_server/metrics.py
@@ -0,0 +1,102 @@
+"""Optional Prometheus-compatible metrics for knowledge-rag server."""
+
+import sys
+import threading
+import time
+from collections import defaultdict
+from functools import wraps
+from typing import Any, Callable
+
+from .config import config
+
+
+class MetricsCollector:
+    """Lightweight Prometheus-compatible metrics collector."""
+
+    def __init__(self) -> None:
+        self._lock = threading.Lock()
+        self._counters: dict[str, float] = defaultdict(float)
+        self._histograms: dict[str, list[float]] = defaultdict(list)
+        self._gauges: dict[str, float] = defaultdict(float)
+
+    def inc(self, name: str, labels: str = "", value: float = 1.0) -> None:
+        with self._lock:
+            self._counters[f"{name}{labels}"] += value
+
+    def set_gauge(self, name: str, value: float, labels: str = "") -> None:
+        with self._lock:
+            self._gauges[f"{name}{labels}"] = value
+
+    def observe(self, name: str, value: float, labels: str = "") -> None:
+        with self._lock:
+            self._histograms[f"{name}{labels}"].append(value)
+
+    def exposition(self) -> str:
+        lines: list[str] = []
+        with self._lock:
+            for key, val in sorted(self._counters.items()):
+                lines.append(f"{key} {val}")
+            for key, val in sorted(self._gauges.items()):
+                lines.append(f"{key} {val}")
+            for key, observations in sorted(self._histograms.items()):
+                if observations:
+                    lines.append(f"{key}_count {len(observations)}")
+                    lines.append(f"{key}_sum {sum(observations):.6f}")
+        return "\n".join(lines) + "\n"
+
+
+_metrics = MetricsCollector()
+
+
+def get_metrics() -> MetricsCollector:
+    return _metrics
+
+
+def instrument(tool_name: str) -> Callable[..., Callable[..., Any]]:
+    """Decorator to instrument a tool function with call count and latency."""
+
+    def decorator(fn: Callable[..., Any]) -> Callable[..., Any]:
+        @wraps(fn)
+        def wrapper(*args: Any, **kwargs: Any) -> Any:
+            if not config.metrics_enabled:
+                return fn(*args, **kwargs)
+            _metrics.inc("knowledge_rag_tool_calls_total", f'{{tool="{tool_name}"}}')
+            start = time.monotonic()
+            try:
+                result = fn(*args, **kwargs)
+                return result
+            except Exception:
+                _metrics.inc("knowledge_rag_tool_errors_total", f'{{tool="{tool_name}"}}')
+                raise
+            finally:
+                elapsed = time.monotonic() - start
+                _metrics.observe("knowledge_rag_tool_duration_seconds", elapsed, f'{{tool="{tool_name}"}}')
+
+        return wrapper
+
+    return decorator
+
+
+def start_metrics_server(port: int) -> None:
+    """Start a lightweight HTTP server for Prometheus metrics scraping."""
+    from http.server import BaseHTTPRequestHandler, HTTPServer
+
+    class _Handler(BaseHTTPRequestHandler):
+        def do_GET(self) -> None:
+            if self.path == "/metrics":
+                body = get_metrics().exposition().encode("utf-8")
+                self.send_response(200)
+                self.send_header("Content-Type", "text/plain; version=0.0.4; charset=utf-8")
+                self.end_headers()
+                self.wfile.write(body)
+            else:
+                self.send_response(404)
+                self.end_headers()
+
+        def log_message(self, format: str, *args: Any) -> None:
+            pass
+
+    server = HTTPServer(("0.0.0.0", port), _Handler)
+    thread = threading.Thread(target=server.serve_forever, daemon=True)
+    thread.start()
+    print(f"[METRICS] Prometheus endpoint at http://0.0.0.0:{port}/metrics", file=sys.stderr)
diff --git a/mcp_server/ratelimit.py b/mcp_server/ratelimit.py
new file mode 100644
index 0000000..82c4463
--- /dev/null
+++ b/mcp_server/ratelimit.py
@@ -0,0 +1,69 @@
+"""Optional per-client rate limiting for MCP tool calls."""
+
+import threading
+import time
+from collections import defaultdict
+from functools import wraps
+from typing import Any, Callable
+
+from .config import config
+
+
+class SlidingWindowCounter:
+    """Thread-safe sliding window rate limiter."""
+
+    def __init__(self, window_seconds: int = 60, max_requests: int = 60, burst: int = 10):
+        self.window = window_seconds
+        self.max_requests = max_requests
+        self.burst = burst
+        self._lock = threading.Lock()
+        self._requests: dict[str, list[float]] = defaultdict(list)
+
+    def allow(self, client_id: str = "default") -> bool:
+        now = time.monotonic()
+        with self._lock:
+            timestamps = self._requests[client_id]
+            cutoff = now - self.window
+            self._requests[client_id] = [t for t in timestamps if t > cutoff]
+            timestamps = self._requests[client_id]
+            if len(timestamps) < self.max_requests + self.burst:
+                timestamps.append(now)
+                return True
+            return False
+
+
+_limiter: SlidingWindowCounter | None = None
+
+
+def _get_limiter() -> SlidingWindowCounter:
+    global _limiter
+    if _limiter is None:
+        _limiter = SlidingWindowCounter(
+            window_seconds=60,
+            max_requests=config.rate_limit_rpm,
+            burst=config.rate_limit_burst,
+        )
+    return _limiter
+
+
+def rate_limited(fn: Callable[..., Any]) -> Callable[..., Any]:
+    """Decorator that enforces rate limiting if enabled in config."""
+
+    @wraps(fn)
+    def wrapper(*args: Any, **kwargs: Any) -> Any:
+        if not config.rate_limit_enabled:
+            return fn(*args, **kwargs)
+        limiter = _get_limiter()
+        if not limiter.allow("default"):
+            import json
+
+            return json.dumps(
+                {
+                    "status": "error",
+                    "message": "Rate limit exceeded. Retry after a short delay.",
+                    "retry_after_seconds": 60,
+                }
+            )
+        return fn(*args, **kwargs)
+
+    return wrapper
diff --git a/mcp_server/server.py b/mcp_server/server.py
index 85b0f09..c97e8d9 100644
--- a/mcp_server/server.py
+++ b/mcp_server/server.py
@@ -58,6 +58,8 @@
 # Local imports
 from .config import config
 from .ingestion import Document, DocumentParser
+from .metrics import instrument
+from .ratelimit import rate_limited
 
 # =============================================================================
 # QUERY CACHE
@@ -80,6 +82,7 @@ def __init__(self, max_size: int = 100, ttl_seconds: int = 300):
         self.max_size = max_size
         self.ttl_seconds = ttl_seconds
         self._cache: OrderedDict[str, Tuple[float, Any]] = OrderedDict()
+        self._lock = threading.Lock()
         self._hits = 0
         self._misses = 0
 
@@ -92,28 +95,31 @@ def get(self, query: str, max_results: int, category: Optional[str], hybrid_alph
         """Get cached result if exists and not expired"""
         key = self._make_key(query, max_results, category, hybrid_alpha)
 
-        if key in self._cache:
-            timestamp, result = self._cache[key]
-            if time.time() - timestamp < self.ttl_seconds:
-                self._cache.move_to_end(key)
-                self._hits += 1
-                return result
-            else:
-                del self._cache[key]
+        with self._lock:
+            if key in self._cache:
+                timestamp, result = self._cache[key]
+                if time.time() - timestamp < self.ttl_seconds:
+                    self._cache.move_to_end(key)
+                    self._hits += 1
+                    return result
+                else:
+                    del self._cache[key]
 
-        self._misses += 1
-        return None
+            self._misses += 1
+            return None
 
     def put(self, query: str, max_results: int, category: Optional[str], hybrid_alpha: float, result: Any) -> None:
         """Store result in cache"""
         key = self._make_key(query, max_results, category, hybrid_alpha)
-        if len(self._cache) >= self.max_size:
-            self._cache.popitem(last=False)
-        self._cache[key] = (time.time(), result)
+        with self._lock:
+            if len(self._cache) >= self.max_size:
+                self._cache.popitem(last=False)
+            self._cache[key] = (time.time(), result)
 
     def invalidate(self) -> None:
         """Clear entire cache (call after reindex)"""
-        self._cache.clear()
+        with self._lock:
+            self._cache.clear()
 
     def stats(self) -> Dict[str, Any]:
         """Return cache statistics"""
@@ -744,6 +750,24 @@ def __len__(self) -> int:
 # KNOWLEDGE ORCHESTRATOR
 # =============================================================================
 
+
+def _enable_wal_mode(chroma_dir: Path) -> None:
+    """Enable WAL journal mode on ChromaDB's SQLite for concurrent reads."""
+    import sqlite3
+
+    sqlite_path = chroma_dir / "chroma.sqlite3"
+    if not sqlite_path.exists():
+        return
+    try:
+        conn = sqlite3.connect(str(sqlite_path))
+        conn.execute("PRAGMA journal_mode=WAL;")
+        conn.execute("PRAGMA busy_timeout=5000;")
+        conn.close()
+        print("[INFO] ChromaDB SQLite: WAL mode enabled")
+    except Exception as e:
+        print(f"[WARN] Could not enable WAL mode: {e}")
+
+
 # =============================================================================
 # FILE WATCHER (auto-reindex on document changes)
 # =============================================================================
@@ -826,6 +850,8 @@ def __init__(self):
 
         # Initialize ChromaDB with persistent storage (new API v1.4.0+)
         self.chroma_client = chromadb.PersistentClient(path=str(config.chroma_dir))
+        if config.transport != "stdio":
+            _enable_wal_mode(config.chroma_dir)
 
         # Get or create collection (with auto-recovery from corruption)
         self.collection = self._safe_get_collection()
@@ -924,23 +950,28 @@ def _check_dimension_mismatch(self) -> bool:
             print(f"[WARN] Dimension check query failed (non-dimension error): {e}")
             return False
 
+    _bm25_build_lock = threading.Lock()
+
     def _ensure_bm25_index(self) -> None:
         """Lazy initialization of BM25 index from existing ChromaDB data"""
         if self._bm25_initialized:
             return
+        with self._bm25_build_lock:
+            if self._bm25_initialized:
+                return
 
-        try:
-            count = self.collection.count()
-            if count > 0:
-                all_data = self.collection.get(include=["documents"], limit=count)
-                if all_data["ids"] and all_data["documents"]:
-                    self.bm25_index.add_documents(all_data["ids"], all_data["documents"])
-                    self.bm25_index.build_index()
-                    print(f"[INFO] BM25 index built with {len(self.bm25_index)} documents")
-        except Exception as e:
-            print(f"[WARN] Failed to build BM25 index: {e}")
+            try:
+                count = self.collection.count()
+                if count > 0:
+                    all_data = self.collection.get(include=["documents"], limit=count)
+                    if all_data["ids"] and all_data["documents"]:
+                        self.bm25_index.add_documents(all_data["ids"], all_data["documents"])
+                        self.bm25_index.build_index()
+                        print(f"[INFO] BM25 index built with {len(self.bm25_index)} documents")
+            except Exception as e:
+                print(f"[WARN] Failed to build BM25 index: {e}")
 
-        self._bm25_initialized = True
+            self._bm25_initialized = True
 
     # =========================================================================
     # Indexing
@@ -1960,16 +1991,23 @@ def _save_metadata(self) -> None:
 # MCP Server
 # =============================================================================
 
-mcp = FastMCP("knowledge-rag")
+mcp = FastMCP(
+    "knowledge-rag",
+    host=config.server_host,
+    port=config.server_port,
+)
 
 _orchestrator: Optional[KnowledgeOrchestrator] = None
+_orchestrator_lock = threading.Lock()
 
 
 def get_orchestrator() -> KnowledgeOrchestrator:
     """Get or create the orchestrator instance"""
     global _orchestrator
     if _orchestrator is None:
-        _orchestrator = KnowledgeOrchestrator()
+        with _orchestrator_lock:
+            if _orchestrator is None:
+                _orchestrator = KnowledgeOrchestrator()
     return _orchestrator
 
 
@@ -1979,6 +2017,8 @@ def get_orchestrator() -> KnowledgeOrchestrator:
 
 
 @mcp.tool()
+@rate_limited
+@instrument("search_knowledge")
 def search_knowledge(query: str, max_results: int = 5, category: str = None, hybrid_alpha: float = 0.3) -> str:
     """
     Hybrid search combining semantic search + BM25 keyword search with cross-encoder reranking.
@@ -2037,6 +2077,8 @@ def search_knowledge(query: str, max_results: int = 5, category: str = None, hyb
 
 
 @mcp.tool()
+@rate_limited
+@instrument("get_document")
 def get_document(filepath: str) -> str:
     """
     Get the full content of a specific document by filepath.
@@ -2066,6 +2108,8 @@ def get_document(filepath: str) -> str:
 
 
 @mcp.tool()
+@rate_limited
+@instrument("reindex_documents")
 def reindex_documents(force: bool = False, full_rebuild: bool = False) -> str:
     """
     Index or reindex all documents in the knowledge base.
@@ -2102,6 +2146,8 @@ def reindex_documents(force: bool = False, full_rebuild: bool = False) -> str:
 
 
 @mcp.tool()
+@rate_limited
+@instrument("list_categories")
 def list_categories() -> str:
     """
     List all document categories with their document counts.
@@ -2123,6 +2169,8 @@ def list_categories() -> str:
 
 
 @mcp.tool()
+@rate_limited
+@instrument("list_documents")
 def list_documents(category: str = None) -> str:
     """
     List all indexed documents, optionally filtered by category.
@@ -2152,6 +2200,8 @@ def list_documents(category: str = None) -> str:
 
 
 @mcp.tool()
+@rate_limited
+@instrument("get_index_stats")
 def get_index_stats() -> str:
     """
     Get statistics and health metrics for the knowledge base index.
@@ -2178,6 +2228,8 @@ def get_index_stats() -> str:
 
 
 @mcp.tool()
+@rate_limited
+@instrument("add_document")
 def add_document(content: str, filepath: str, category: str = "general") -> str:
     """
     Add a new document to the knowledge base from raw text content.
@@ -2213,6 +2265,8 @@ def add_document(content: str, filepath: str, category: str = "general") -> str:
 
 
 @mcp.tool()
+@rate_limited
+@instrument("update_document")
 def update_document(filepath: str, content: str) -> str:
     """
     Update the content of an existing document in the knowledge base.
@@ -2247,6 +2301,8 @@ def update_document(filepath: str, content: str) -> str:
 
 
 @mcp.tool()
+@rate_limited
+@instrument("remove_document")
 def remove_document(filepath: str, delete_file: bool = False) -> str:
     """
     Remove a document from the knowledge base index.
@@ -2281,6 +2337,8 @@ def remove_document(filepath: str, delete_file: bool = False) -> str:
 
 
 @mcp.tool()
+@rate_limited
+@instrument("add_from_url")
 def add_from_url(url: str, category: str = "general", title: str = None) -> str:
     """
     Fetch content from a URL, convert to markdown, and add to the knowledge base.
@@ -2314,6 +2372,8 @@ def add_from_url(url: str, category: str = "general", title: str = None) -> str:
 
 
 @mcp.tool()
+@rate_limited
+@instrument("search_similar")
 def search_similar(filepath: str, max_results: int = 5) -> str:
     """
     Find documents semantically similar to a given reference document.
@@ -2352,6 +2412,8 @@ def search_similar(filepath: str, max_results: int = 5) -> str:
 
 
 @mcp.tool()
+@rate_limited
+@instrument("evaluate_retrieval")
 def evaluate_retrieval(test_cases: str) -> str:
     """
     Evaluate search quality by testing whether search_knowledge() retrieves expected documents.
@@ -2448,6 +2510,16 @@ def main():
     from .preflight import run_preflight
 
     try:
+        # SSE/HTTP mode: auto-enable single-instance lock (port collision prevention)
+        transport = config.transport
+        for i, arg in enumerate(sys.argv[1:], 1):
+            if arg == "--transport" and i < len(sys.argv) - 1:
+                transport = sys.argv[i + 1]
+            elif arg.startswith("--transport="):
+                transport = arg.split("=", 1)[1]
+        if transport != "stdio":
+            os.environ["KNOWLEDGE_RAG_SINGLE_INSTANCE"] = "1"
+
         with single_instance_lock():
             run_preflight()
 
@@ -2487,11 +2559,32 @@ def main():
                     print(f"[WARN] Failed to start file watcher: {e}")
                     print("[WARN] Auto-reindexing disabled. Use reindex_documents tool manually.")
 
+            # Start optional metrics server
+            if config.metrics_enabled and config.transport != "stdio":
+                from .metrics import start_metrics_server
+
+                start_metrics_server(config.metrics_port)
+
             # Restore real stdout for MCP JSON-RPC, keep print() going to stderr
             from . import _original_stdout
 
             sys.stdout = _original_stdout
-            mcp.run()
+
+            # Parse --transport CLI override
+            transport = config.transport
+            for i, arg in enumerate(sys.argv[1:], 1):
+                if arg == "--transport" and i < len(sys.argv) - 1:
+                    transport = sys.argv[i + 1]
+                elif arg.startswith("--transport="):
+                    transport = arg.split("=", 1)[1]
+
+            if transport != "stdio":
+                print(
+                    f"[SERVER] Starting {transport} server on {config.server_host}:{config.server_port}",
+                    file=sys.stderr,
+                )
+
+            mcp.run(transport=transport)
     except AlreadyRunningError as e:
         print(f"[ERROR] {e}", file=sys.stderr)
         raise SystemExit(ALREADY_RUNNING_EXIT_CODE) from e
diff --git a/npm/package.json b/npm/package.json
index 596a19d..b06b338 100644
--- a/npm/package.json
+++ b/npm/package.json
@@ -1,6 +1,6 @@
 {
   "name": "knowledge-rag",
-  "version": "3.9.1",
+  "version": "4.0.0",
   "description": "Local RAG System for Claude Code — Hybrid search + Cross-encoder Reranking + 12 MCP Tools + 20 Format Parsers. Zero external servers.",
   "bin": {
     "knowledge-rag": "./bin/cli.js"
diff --git a/pyproject.toml b/pyproject.toml
index 1b9d106..7f37863 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -4,7 +4,7 @@ build-backend = "hatchling.build"
 
 [project]
 name = "knowledge-rag"
-version = "3.9.1"
+version = "4.0.0"
 description = "Local RAG System for Claude Code — Hybrid search + Cross-encoder Reranking + 12 MCP Tools + 20 Format Parsers. Zero external servers."
 readme = "README.md"
 license = {text = "MIT"}
@@ -32,7 +32,7 @@ dependencies = [
     "chromadb>=1.4.0",
     "pymupdf>=1.23.0",
     "fastembed[reranking]>=0.4.0",
-    "mcp>=1.0.0",
+    "mcp>=1.6.0",
     "rank-bm25>=0.2.2",
     "requests>=2.33.0",
     "beautifulsoup4>=4.12.0",
@@ -45,6 +45,7 @@ dependencies = [
 
 [project.optional-dependencies]
 gpu = ["onnxruntime-gpu>=1.14.0"]
+server = ["uvicorn>=0.20.0"]
 
 [project.urls]
 Homepage = "https://github.com/lyonzin/knowledge-rag"

From 2a3b4bfedde5207a9c83fe4dfddc7eff8b0a96c7 Mon Sep 17 00:00:00 2001
From: Lyonzin <lyonzin@users.noreply.github.com>
Date: Tue, 9 Jun 2026 13:35:30 -0300
Subject: [PATCH 2/3] docs(changelog): add v4.0.0 changelog entry

---
 README.md | 14 ++++++++++++++
 1 file changed, 14 insertions(+)

diff --git a/README.md b/README.md
index ae400bd..a064654 100644
--- a/README.md
+++ b/README.md
@@ -1224,6 +1224,20 @@ A second instance exits immediately with code 75. Default is OFF (multi-client f
 - **CHORE**: pytest `tmp_path_retention_count=1` to avoid Windows atexit cleanup race in CI.
 - **ROADMAP**: Tracked v4.0 shared-service architecture (one daemon, many thin MCP clients) as the long-term fix for multi-process resource duplication. (#34)
 
+### v4.0.0 (2026-06-09) — Enterprise Concurrent Access
+
+- **NEW**: SSE and streamable-http transport modes — 1 server serves N clients (`server.transport: "sse"` in config.yaml or `--transport sse` CLI).
+- **NEW**: Thread-safe shared state for concurrent queries — QueryCache locking, BM25 build lock, orchestrator double-checked locking.
+- **NEW**: ChromaDB WAL mode enabled automatically in SSE/HTTP mode for concurrent read performance.
+- **NEW**: Optional rate limiting — sliding-window counter, configurable RPM and burst, disabled by default.
+- **NEW**: Optional Prometheus metrics endpoint — tool call counts, latency histograms, separate port, disabled by default.
+- **NEW**: All 12 MCP tools instrumented with `@rate_limited` and `@instrument` decorators (zero-cost when disabled).
+- **NEW**: `--transport` CLI override for Docker/systemd deployments.
+- **NEW**: `pip install knowledge-rag[server]` optional dependency for SSE/HTTP (uvicorn).
+- **CHANGED**: SSE/HTTP mode auto-enables single-instance lock (port collision prevention).
+- **CHANGED**: `mcp` dependency bumped to `>=1.6.0` (SSE/streamable-http support).
+- **MIGRATION**: Default transport remains `stdio` — existing users need zero changes. See config.example.yaml for SSE setup.
+
 ### v3.9.1 (2026-06-08)
 
 - **FIX**: Expand `~` in `config.yaml` path values (`documents_dir`, `data_dir`, `models_cache_dir`) via `expanduser()` on all platforms (#86).

From fd6564517bcdc2cd07fcb448ebf922e835930d4c Mon Sep 17 00:00:00 2001
From: Lyonzin <lyonzin@users.noreply.github.com>
Date: Tue, 9 Jun 2026 14:06:29 -0300
Subject: [PATCH 3/3] =?UTF-8?q?docs(readme):=20update=20for=20v4.0.0=20?=
 =?UTF-8?q?=E2=80=94=20SSE=20transport,=20enterprise=20features,=20changel?=
 =?UTF-8?q?og=20reorder?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 README.md | 158 ++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 131 insertions(+), 27 deletions(-)

diff --git a/README.md b/README.md
index a064654..9b4057b 100644
--- a/README.md
+++ b/README.md
@@ -28,7 +28,7 @@ pip install knowledge-rag → restart Claude Code → search_knowledge("your que
 
 **12 MCP Tools** | **Hybrid Search + Reranking** | **20 File Formats** | **Optional NVIDIA GPU** | **100% Local**
 
-[What's New](#whats-new-in-v390) | [Supported Formats](#supported-formats) | [Installation](#installation) | [Configuration](#configuration) | [API Reference](#api-reference) | [Architecture](#architecture)
+[What's New](#whats-new-in-v400) | [Supported Formats](#supported-formats) | [Installation](#installation) | [Configuration](#configuration) | [API Reference](#api-reference) | [Architecture](#architecture)
 
 </div>
 
@@ -50,7 +50,30 @@ pip install knowledge-rag → restart Claude Code → search_knowledge("your que
 
 ---
 
-## What's New in v3.9.0
+## What's New in v4.0.0
+
+### Enterprise Concurrent Access — SSE/HTTP Transport (v4.0.0)
+
+The server now supports **SSE** and **streamable-http** transport modes. Instead of spawning a separate process per client (stdio), a single server process serves all clients with shared resources — 1 embedding model, 1 ChromaDB, 1 query cache.
+
+```yaml
+# config.yaml
+server:
+  transport: "sse"        # "stdio" | "sse" | "streamable-http"
+  host: "127.0.0.1"
+  port: 8179
+```
+
+Or via CLI: `knowledge-rag --transport sse`
+
+**Optional enterprise features** (all disabled by default):
+- **Rate limiting**: Sliding-window counter, configurable RPM and burst
+- **Prometheus metrics**: `/metrics` endpoint on separate port
+- **Bearer auth**: Token validation for SSE/HTTP connections
+
+All 12 MCP tools are instrumented with `@rate_limited` and `@instrument` decorators — zero overhead when features are disabled. Default transport remains **stdio** for full backwards compatibility.
+
+> **Migration**: Existing users need zero changes. SSE mode is opt-in via `server.transport: "sse"` in config.yaml. See [Configuration](#configuration) for details.
 
 ### Quality Gate — 7-Pillar PR Validation
 
@@ -102,6 +125,7 @@ All methods produce the same MCP server. See [Installation](#installation) for f
 
 ### Recent Highlights
 
+- **v4.0.0** — **Enterprise concurrent access**: SSE/HTTP transport (1 server → N clients), thread-safe shared state, optional rate limiting + Prometheus metrics, ChromaDB WAL mode, `--transport` CLI
 - **v3.9.0** — **Quality Gate** activated: 35+ automated PR checks across 7 pillars (Security, Stability, Memory Leak, Versatility, Scalability, Versioning, Quality) + nightly resilience suite (chaos, soak, determinism, mutation)
 - **v3.8.1** — Critical hotfix: loud-fail embeddings (no more silent zero-vector corruption); Windows CI flake erradicated (HF_HUB_OFFLINE + shell:bash + atexit wrapper)
 - **v3.8.0** — Lazy-load embeddings, opt-in single-instance guard, version sync across PyPI/NPM/Docker
@@ -463,9 +487,33 @@ Add to `~/.claude.json`:
 > Replace `YOUR_USER` with your username, or use the full path from `echo $HOME`.
 </details>
 
+#### Option F: SSE Server Mode (multi-agent)
+
+For multi-agent setups where multiple clients query the same knowledge base simultaneously:
+
+```bash
+pip install knowledge-rag[server]    # Adds uvicorn for SSE/HTTP
+knowledge-rag --transport sse        # Starts on http://127.0.0.1:8179
+```
+
+Then configure each MCP client to connect via SSE:
+
+```json
+{
+  "mcpServers": {
+    "knowledge-rag": {
+      "type": "sse",
+      "url": "http://127.0.0.1:8179/sse"
+    }
+  }
+}
+```
+
+One server process serves all agents — shared embedding model, shared cache, shared ChromaDB. See [Configuration > Server](#server) for rate limiting, metrics, and auth options.
+
 ### Use with other MCP clients
 
-`knowledge-rag` is a standard **stdio MCP server** — it works with any MCP-compatible client, not only Claude Code. The launch command is the same everywhere (the `python -m mcp_server.server` from whichever install method you picked); only the **config file location** and **JSON shape** differ per client.
+`knowledge-rag` supports both **stdio** (default, 1:1) and **SSE** (1:N) transport modes. In stdio mode, it works with any MCP-compatible client, not only Claude Code. The launch command is the same everywhere (the `python -m mcp_server.server` from whichever install method you picked); only the **config file location** and **JSON shape** differ per client.
 
 #### Clients using the standard `mcpServers` format
 
@@ -923,6 +971,21 @@ query_expansions:
   privesc:
     - privilege escalation
     - privesc
+
+# Server — enterprise features (new in v4.0.0)
+server:
+  transport: "stdio"              # "stdio" | "sse" | "streamable-http"
+  host: "127.0.0.1"              # Bind address (SSE/HTTP only)
+  port: 8179                      # Bind port (SSE/HTTP only)
+  auth:
+    bearer_token: ""              # Set a secret to enable auth (SSE/HTTP only)
+  rate_limit:
+    enabled: false
+    requests_per_minute: 60
+    burst: 10
+  metrics:
+    enabled: false
+    port: 9179                    # Separate port for Prometheus scraping
 ```
 
 > See `config.example.yaml` for the fully documented template with explanations for every field.
@@ -942,6 +1005,22 @@ Pre-built configurations for common use cases:
 
 ### Configuration Reference
 
+#### Server
+
+| Field | Default | Description |
+|-------|---------|-------------|
+| `server.transport` | `"stdio"` | Transport protocol: `"stdio"`, `"sse"`, or `"streamable-http"` |
+| `server.host` | `"127.0.0.1"` | Bind address for SSE/HTTP mode |
+| `server.port` | `8179` | Bind port for SSE/HTTP mode |
+| `server.auth.bearer_token` | `""` (disabled) | Bearer token for SSE/HTTP auth. Empty = no auth |
+| `server.rate_limit.enabled` | `false` | Enable per-client rate limiting |
+| `server.rate_limit.requests_per_minute` | `60` | Max requests per minute |
+| `server.rate_limit.burst` | `10` | Burst allowance above steady rate |
+| `server.metrics.enabled` | `false` | Enable Prometheus `/metrics` endpoint |
+| `server.metrics.port` | `9179` | Port for metrics scraping |
+
+In stdio mode (default), server settings are ignored. SSE/HTTP mode auto-enables the single-instance lock.
+
 #### Paths
 
 | Field | Default | Description |
@@ -1179,10 +1258,59 @@ export KNOWLEDGE_RAG_SINGLE_INSTANCE=1
 
 A second instance exits immediately with code 75. Default is OFF (multi-client friendly). Full guide: [docs/single-instance.md](docs/single-instance.md). Sample MCP config: [examples/mcp-config-single-instance.json](examples/mcp-config-single-instance.json).
 
+### SSE server won't start
+
+```bash
+# Check if port 8179 is already in use
+# Windows:
+netstat -aon | findstr :8179
+# Linux/macOS:
+lsof -i :8179
+```
+
+If `uvicorn` is not found, install the server extras: `pip install knowledge-rag[server]`
+
+### Can't connect to SSE server
+
+Verify the server is running and the URL is correct:
+
+```bash
+curl http://127.0.0.1:8179/sse
+```
+
+Common issues:
+- Wrong URL: must end with `/sse` (not just the port)
+- Firewall blocking the port
+- Server started with a different host/port than configured in the MCP client
+
 ---
 
 ## Changelog
 
+### v4.0.0 (2026-06-09) — Enterprise Concurrent Access
+
+- **NEW**: SSE and streamable-http transport modes — 1 server serves N clients (`server.transport: "sse"` in config.yaml or `--transport sse` CLI).
+- **NEW**: Thread-safe shared state for concurrent queries — QueryCache locking, BM25 build lock, orchestrator double-checked locking.
+- **NEW**: ChromaDB WAL mode enabled automatically in SSE/HTTP mode for concurrent read performance.
+- **NEW**: Optional rate limiting — sliding-window counter, configurable RPM and burst, disabled by default.
+- **NEW**: Optional Prometheus metrics endpoint — tool call counts, latency histograms, separate port, disabled by default.
+- **NEW**: All 12 MCP tools instrumented with `@rate_limited` and `@instrument` decorators (zero-cost when disabled).
+- **NEW**: `--transport` CLI override for Docker/systemd deployments.
+- **NEW**: `pip install knowledge-rag[server]` optional dependency for SSE/HTTP (uvicorn).
+- **CHANGED**: SSE/HTTP mode auto-enables single-instance lock (port collision prevention).
+- **CHANGED**: `mcp` dependency bumped to `>=1.6.0` (SSE/streamable-http support).
+- **MIGRATION**: Default transport remains `stdio` — existing users need zero changes. See config.example.yaml for SSE setup.
+
+### v3.9.1 (2026-06-08)
+
+- **FIX**: Expand `~` in `config.yaml` path values (`documents_dir`, `data_dir`, `models_cache_dir`) via `expanduser()` on all platforms (#86).
+- **FIX**: Warn when `documents_dir` resolves to a non-existent path instead of silently indexing zero files.
+- **FIX**: File watcher now uses accumulate-mode debounce — bulk file copies no longer starve the reindex trigger.
+- **FIX**: Concurrent `index_all()` calls are serialized via `_index_lock` to prevent ChromaDB SQLite corruption.
+- **FIX**: `collection.add()` is batched (500 chunks/call) to cap memory usage during large reindex operations.
+- **NEW**: `KNOWLEDGE_RAG_WATCHER_DISABLED=1` env var to disable the file watcher for troubleshooting.
+- **NEW**: Progress logging every 10% for reindex operations with >100 documents.
+
 ### v3.9.0 (2026-05-10) — Quality Gate
 
 **Major governance + CI hardening release. No runtime behavior change in `mcp_server/`. Public API surface unchanged from v3.8.1.**
@@ -1224,30 +1352,6 @@ A second instance exits immediately with code 75. Default is OFF (multi-client f
 - **CHORE**: pytest `tmp_path_retention_count=1` to avoid Windows atexit cleanup race in CI.
 - **ROADMAP**: Tracked v4.0 shared-service architecture (one daemon, many thin MCP clients) as the long-term fix for multi-process resource duplication. (#34)
 
-### v4.0.0 (2026-06-09) — Enterprise Concurrent Access
-
-- **NEW**: SSE and streamable-http transport modes — 1 server serves N clients (`server.transport: "sse"` in config.yaml or `--transport sse` CLI).
-- **NEW**: Thread-safe shared state for concurrent queries — QueryCache locking, BM25 build lock, orchestrator double-checked locking.
-- **NEW**: ChromaDB WAL mode enabled automatically in SSE/HTTP mode for concurrent read performance.
-- **NEW**: Optional rate limiting — sliding-window counter, configurable RPM and burst, disabled by default.
-- **NEW**: Optional Prometheus metrics endpoint — tool call counts, latency histograms, separate port, disabled by default.
-- **NEW**: All 12 MCP tools instrumented with `@rate_limited` and `@instrument` decorators (zero-cost when disabled).
-- **NEW**: `--transport` CLI override for Docker/systemd deployments.
-- **NEW**: `pip install knowledge-rag[server]` optional dependency for SSE/HTTP (uvicorn).
-- **CHANGED**: SSE/HTTP mode auto-enables single-instance lock (port collision prevention).
-- **CHANGED**: `mcp` dependency bumped to `>=1.6.0` (SSE/streamable-http support).
-- **MIGRATION**: Default transport remains `stdio` — existing users need zero changes. See config.example.yaml for SSE setup.
-
-### v3.9.1 (2026-06-08)
-
-- **FIX**: Expand `~` in `config.yaml` path values (`documents_dir`, `data_dir`, `models_cache_dir`) via `expanduser()` on all platforms (#86).
-- **FIX**: Warn when `documents_dir` resolves to a non-existent path instead of silently indexing zero files.
-- **FIX**: File watcher now uses accumulate-mode debounce — bulk file copies no longer starve the reindex trigger.
-- **FIX**: Concurrent `index_all()` calls are serialized via `_index_lock` to prevent ChromaDB SQLite corruption.
-- **FIX**: `collection.add()` is batched (500 chunks/call) to cap memory usage during large reindex operations.
-- **NEW**: `KNOWLEDGE_RAG_WATCHER_DISABLED=1` env var to disable the file watcher for troubleshooting.
-- **NEW**: Progress logging every 10% for reindex operations with >100 documents.
-
 ### Unreleased
 
 - **FIX**: Startup preflight probes ChromaDB in a child process and moves crashing persistent indexes to `data/backups/auto-repair-*` before MCP initialization.