From cbc1dac12ec47c93502e10838a618058c1779955 Mon Sep 17 00:00:00 2001 From: Linh Ngo Date: Sat, 30 May 2026 15:09:57 +0700 Subject: [PATCH 1/6] feat(mcp): expand MCP server with learn, status, session_list tools (#717) - mcp-server.py: add learn tool (writes knowledge entry via subprocess), status tool (DB health snapshot), session_list tool (last N sessions) - Wire all 3 in _handle_tools_call() with proper input schemas - test_fixes.py: +35 tests (I717) Closes #717 Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- mcp-server.py | 235 ++++++++++++++++++++++++++++++ test_fixes.py | 388 ++++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 623 insertions(+) diff --git a/mcp-server.py b/mcp-server.py index b55a7514..58df9eda 100644 --- a/mcp-server.py +++ b/mcp-server.py @@ -6,13 +6,20 @@ - briefing(task, mode?, limit?, agent_tag?, msg_tag?) - query_session(query, semantic?, limit?, agent_tag?, msg_tag?) - query_memory(query?, category?, agent_tag?, msg_tag?, limit?, token?) # issue #404 + +Write tools (issue #717): +- learn(category, title, description, tags?) +- status() +- session_list(limit?) """ import importlib.util import io import json import os +import re import sqlite3 +import subprocess import sys from contextlib import redirect_stderr, redirect_stdout from pathlib import Path @@ -38,6 +45,7 @@ JSONRPC_INTERNAL_ERROR = -32603 VALID_BRIEFING_MODES = {"auto", "implement", "debug", "review", "plan", "test"} +VALID_LEARN_CATEGORIES = {"mistake", "pattern", "decision", "tool", "feature", "refactor", "discovery"} # Auth error code for query_memory token failures (issue #404, fails closed) _MCP_AUTH_ERROR = -32600 # reuse INVALID_REQUEST for auth failures @@ -173,6 +181,55 @@ def _load_script_module(module_name: str, filename: str): "additionalProperties": False, }, }, + { + "name": "learn", + "description": "Write a knowledge entry (mistake, pattern, feature, discovery, etc.) to the local knowledge base. Issue #717.", + "inputSchema": { + "type": "object", + "properties": { + "category": { + "type": "string", + "enum": sorted(VALID_LEARN_CATEGORIES), + "description": "Knowledge category.", + }, + "title": {"type": "string", "description": "Short title for the knowledge entry."}, + "description": {"type": "string", "description": "Content / body of the knowledge entry."}, + "tags": { + "type": "string", + "description": "Comma-separated tags (optional).", + }, + }, + "required": ["category", "title", "description"], + "additionalProperties": False, + }, + }, + { + "name": "status", + "description": "Return a JSON health snapshot: session count, entry count, watcher status. Issue #717.", + "inputSchema": { + "type": "object", + "properties": {}, + "required": [], + "additionalProperties": False, + }, + }, + { + "name": "session_list", + "description": "Return the most recent sessions from the local knowledge base. Issue #717.", + "inputSchema": { + "type": "object", + "properties": { + "limit": { + "type": "integer", + "minimum": 1, + "maximum": 100, + "description": "Maximum sessions to return (default 20).", + }, + }, + "required": [], + "additionalProperties": False, + }, + }, ] @@ -408,6 +465,178 @@ def _run_query_memory(arguments: dict[str, Any]) -> dict[str, Any]: } +# --------------------------------------------------------------------------- +# learn — write a knowledge entry via learn.py subprocess (issue #717) +# --------------------------------------------------------------------------- + + +def _run_learn(arguments: dict[str, Any]) -> dict[str, Any]: + """Write a knowledge entry by calling learn.py as a subprocess.""" + category = _require_string(arguments, "category") + if category not in VALID_LEARN_CATEGORIES: + raise JsonRpcError( + JSONRPC_INVALID_PARAMS, + f"'category' must be one of: {', '.join(sorted(VALID_LEARN_CATEGORIES))}", + ) + title = _require_string(arguments, "title") + description = _require_string(arguments, "description") + tags = _optional_string(arguments, "tags", max_length=500) + + learn_py = TOOLS_DIR / "learn.py" + if not learn_py.exists(): + raise JsonRpcError(JSONRPC_INTERNAL_ERROR, "learn.py not found") + + flag = f"--{category}" + cmd = [sys.executable, str(learn_py), flag, title, description] + if tags: + cmd += ["--tags", tags] + + try: + result = subprocess.run(cmd, capture_output=True, text=True, timeout=30) + except subprocess.TimeoutExpired as exc: + raise JsonRpcError(JSONRPC_INTERNAL_ERROR, "learn.py timed out") from exc + except Exception as exc: + raise JsonRpcError(JSONRPC_INTERNAL_ERROR, f"learn.py subprocess error: {exc}") from exc + + if result.returncode != 0: + msg = result.stderr.strip() or result.stdout.strip() or "learn.py failed" + raise JsonRpcError(JSONRPC_INTERNAL_ERROR, msg) + + combined = result.stdout + result.stderr + entry_id = None + m = re.search(r"#(\d+)", combined) + if m: + entry_id = int(m.group(1)) + + body = {"status": "ok", "message": "Entry recorded", "id": entry_id} + return { + "content": [{"type": "text", "text": json.dumps(body, ensure_ascii=False)}], + "structuredContent": body, + } + + +# --------------------------------------------------------------------------- +# status — return DB health snapshot (issue #717) +# --------------------------------------------------------------------------- + + +def _is_watcher_pid_running(pid: int) -> bool: + try: + os.kill(pid, 0) + return True + except (ProcessLookupError, PermissionError): + return False + except Exception: + return False + + +def _run_status(_arguments: dict[str, Any]) -> dict[str, Any]: + """Return a JSON health snapshot: session_count, entry_count, watcher.""" + session_count = 0 + entry_count = 0 + watcher = "stopped" + + # Check watcher via lock file + lock_file = _DB_PATH.parent / ".watcher.lock" + if lock_file.exists(): + try: + pid = int(lock_file.read_text(encoding="utf-8").strip()) + if _is_watcher_pid_running(pid): + watcher = "running" + except Exception: + pass + + if not _DB_PATH.exists(): + body = { + "session_count": 0, + "entry_count": 0, + "watcher": watcher, + "db_path": str(_DB_PATH), + } + return { + "content": [{"type": "text", "text": json.dumps(body, ensure_ascii=False)}], + "structuredContent": body, + } + + try: + db_uri = _DB_PATH.as_uri() + "?mode=ro" + db = sqlite3.connect(db_uri, uri=True) + try: + row = db.execute("SELECT COUNT(*) FROM sessions").fetchone() + session_count = row[0] if row else 0 + except sqlite3.OperationalError: + session_count = 0 + try: + row = db.execute("SELECT COUNT(*) FROM knowledge_entries").fetchone() + entry_count = row[0] if row else 0 + except sqlite3.OperationalError: + entry_count = 0 + db.close() + except Exception: + pass + + body = { + "session_count": session_count, + "entry_count": entry_count, + "watcher": watcher, + "db_path": str(_DB_PATH), + } + return { + "content": [{"type": "text", "text": json.dumps(body, ensure_ascii=False)}], + "structuredContent": body, + } + + +# --------------------------------------------------------------------------- +# session_list — return last N sessions (issue #717) +# --------------------------------------------------------------------------- + + +def _run_session_list(arguments: dict[str, Any]) -> dict[str, Any]: + """Return the most recent sessions from the local DB.""" + limit = _optional_int(arguments, "limit", default=20, minimum=1, maximum=100) + + if not _DB_PATH.exists(): + body: dict[str, Any] = {"sessions": [], "count": 0} + return { + "content": [{"type": "text", "text": json.dumps(body, ensure_ascii=False)}], + "structuredContent": body, + } + + try: + db_uri = _DB_PATH.as_uri() + "?mode=ro" + db = sqlite3.connect(db_uri, uri=True) + db.row_factory = sqlite3.Row + try: + rows = db.execute( + "SELECT id, summary, source, indexed_at FROM sessions ORDER BY indexed_at DESC LIMIT ?", + (limit,), + ).fetchall() + except sqlite3.OperationalError as exc: + db.close() + raise JsonRpcError(JSONRPC_INTERNAL_ERROR, f"Query error: {exc}") from exc + db.close() + except JsonRpcError: + raise + except Exception as exc: + raise JsonRpcError(JSONRPC_INTERNAL_ERROR, f"DB error: {exc}") from exc + + sessions = [ + { + "id": dict(r).get("id"), + "summary": dict(r).get("summary", ""), + "source": dict(r).get("source", ""), + "indexed_at": dict(r).get("indexed_at"), + } + for r in rows + ] + body = {"sessions": sessions, "count": len(sessions)} + return { + "content": [{"type": "text", "text": json.dumps(body, ensure_ascii=False)}], + "structuredContent": body, + } + + def _handle_tools_call(params: dict[str, Any]) -> dict[str, Any]: name = params.get("name") if not isinstance(name, str) or not name: @@ -423,6 +652,12 @@ def _handle_tools_call(params: dict[str, Any]) -> dict[str, Any]: return _run_query_session(arguments) if name == "query_memory": return _run_query_memory(arguments) + if name == "learn": + return _run_learn(arguments) + if name == "status": + return _run_status(arguments) + if name == "session_list": + return _run_session_list(arguments) raise JsonRpcError(JSONRPC_INVALID_PARAMS, f"Unknown tool: {name}") diff --git a/test_fixes.py b/test_fixes.py index ed14c1ed..8adc296a 100755 --- a/test_fixes.py +++ b/test_fixes.py @@ -10084,3 +10084,391 @@ def _make_i720_db(db_path: Path) -> None: for _fn in FAIL_NAMES: print(f" ❌ {_fn}") sys.exit(0 if FAIL == 0 else 1) + +# === I717: MCP Write Tools === + +print("\n✍️ I717: MCP Write Tools") + +_mcp717_src = (REPO / "mcp-server.py").read_text(encoding="utf-8") + +# I717-01: TOOLS list includes 'learn' +try: + test("I717-01: TOOLS list contains learn tool", + '"name": "learn"' in _mcp717_src or "'name': 'learn'" in _mcp717_src, + "learn tool not found in TOOLS list") +except Exception as _e717_01: + test("I717-01: learn in TOOLS", False, str(_e717_01)) + +# I717-02: TOOLS list includes 'status' +try: + test("I717-02: TOOLS list contains status tool", + '"name": "status"' in _mcp717_src or "'name': 'status'" in _mcp717_src, + "status tool not found in TOOLS list") +except Exception as _e717_02: + test("I717-02: status in TOOLS", False, str(_e717_02)) + +# I717-03: TOOLS list includes 'session_list' +try: + test("I717-03: TOOLS list contains session_list tool", + '"name": "session_list"' in _mcp717_src or "'name': 'session_list'" in _mcp717_src, + "session_list tool not found in TOOLS list") +except Exception as _e717_03: + test("I717-03: session_list in TOOLS", False, str(_e717_03)) + +# I717-04: _run_learn function exists +try: + test("I717-04: _run_learn function defined", + "def _run_learn(" in _mcp717_src, + "_run_learn not found") +except Exception as _e717_04: + test("I717-04: _run_learn defined", False, str(_e717_04)) + +# I717-05: _run_status function exists +try: + test("I717-05: _run_status function defined", + "def _run_status(" in _mcp717_src, + "_run_status not found") +except Exception as _e717_05: + test("I717-05: _run_status defined", False, str(_e717_05)) + +# I717-06: _run_session_list function exists +try: + test("I717-06: _run_session_list function defined", + "def _run_session_list(" in _mcp717_src, + "_run_session_list not found") +except Exception as _e717_06: + test("I717-06: _run_session_list defined", False, str(_e717_06)) + +# I717-07: dispatch learn, status, session_list in _handle_tools_call +try: + _dispatch_body = _mcp717_src.split("def _handle_tools_call(")[1].split("def _read_exact(")[0] + test("I717-07a: dispatch learn in _handle_tools_call", + "_run_learn" in _dispatch_body, + "_run_learn not dispatched") + test("I717-07b: dispatch status in _handle_tools_call", + "_run_status" in _dispatch_body, + "_run_status not dispatched") + test("I717-07c: dispatch session_list in _handle_tools_call", + "_run_session_list" in _dispatch_body, + "_run_session_list not dispatched") +except Exception as _e717_07: + test("I717-07: dispatch check", False, str(_e717_07)) + +# I717-08: learn tool has category enum in schema +try: + test("I717-08: learn schema has category enum", + "VALID_LEARN_CATEGORIES" in _mcp717_src or '"mistake"' in _mcp717_src, + "category enum missing from learn schema") +except Exception as _e717_08: + test("I717-08: learn category enum", False, str(_e717_08)) + +# I717-09: learn uses subprocess (not importlib) for calling learn.py +try: + _learn_fn = _mcp717_src.split("def _run_learn(")[1].split("def _run_status(")[0] + test("I717-09: _run_learn uses subprocess.run", + "subprocess.run(" in _learn_fn, + "subprocess.run not used in _run_learn") +except Exception as _e717_09: + test("I717-09: _run_learn subprocess", False, str(_e717_09)) + +# I717-10: no SQL injection in session_list (uses ? placeholder) +try: + _sl_fn = _mcp717_src.split("def _run_session_list(")[1].split("def _handle_tools_call(")[0] + test("I717-10: session_list uses ? placeholder", + "?" in _sl_fn and "f\"SELECT" not in _sl_fn and "f'SELECT" not in _sl_fn, + "session_list may use string-interpolated SQL") +except Exception as _e717_10: + test("I717-10: session_list SQL safety", False, str(_e717_10)) + +# I717-11: status returns watcher field +try: + _status_fn = _mcp717_src.split("def _run_status(")[1].split("def _run_session_list(")[0] + test("I717-11: status returns watcher field", + '"watcher"' in _status_fn or "'watcher'" in _status_fn, + "watcher field missing from status output") +except Exception as _e717_11: + test("I717-11: status watcher field", False, str(_e717_11)) + +# I717-12: status returns db_path field +try: + _status_fn12 = _mcp717_src.split("def _run_status(")[1].split("def _run_session_list(")[0] + test("I717-12: status returns db_path field", + '"db_path"' in _status_fn12 or "'db_path'" in _status_fn12, + "db_path missing from status output") +except Exception as _e717_12: + test("I717-12: status db_path field", False, str(_e717_12)) + +# I717-13: VALID_LEARN_CATEGORIES constant defined +try: + test("I717-13: VALID_LEARN_CATEGORIES constant defined", + "VALID_LEARN_CATEGORIES" in _mcp717_src, + "VALID_LEARN_CATEGORIES not found") +except Exception as _e717_13: + test("I717-13: VALID_LEARN_CATEGORIES", False, str(_e717_13)) + +# I717-14: learn schema requires category, title, description +try: + _learn_schema = _mcp717_src[_mcp717_src.find('"name": "learn"'):_mcp717_src.find('"name": "status"')] + test("I717-14: learn required fields include title and description", + '"category"' in _learn_schema and '"title"' in _learn_schema and '"description"' in _learn_schema, + "learn schema missing required fields") +except Exception as _e717_14: + test("I717-14: learn required fields", False, str(_e717_14)) + +# I717-15..22: Integration tests via MCP subprocess +try: + import sqlite3 as _sq717 + import tempfile as _tmp717 + import subprocess as _sp717 + import json as _json717 + import pathlib as _pl717 + + with _tmp717.TemporaryDirectory(prefix="mcp717-test-") as _tmp717_dir: + _home717 = _pl717.Path(_tmp717_dir) + _state717 = _home717 / ".copilot" / "session-state" + _state717.mkdir(parents=True, exist_ok=True) + _db717 = _sq717.connect(_state717 / "knowledge.db") + _db717.executescript(""" + CREATE TABLE sessions ( + id TEXT PRIMARY KEY, + path TEXT NOT NULL, + summary TEXT DEFAULT '', + source TEXT DEFAULT 'copilot', + indexed_at TEXT + ); + CREATE TABLE knowledge_entries ( + id INTEGER PRIMARY KEY AUTOINCREMENT, + session_id TEXT NOT NULL DEFAULT '', + document_id INTEGER, + category TEXT NOT NULL, + title TEXT NOT NULL, + stable_id TEXT, + content TEXT NOT NULL DEFAULT '', + tags TEXT DEFAULT '', + confidence REAL DEFAULT 1.0, + occurrence_count INTEGER DEFAULT 1, + first_seen TEXT, + last_seen TEXT, + source TEXT DEFAULT 'copilot', + topic_key TEXT, + revision_count INTEGER DEFAULT 1, + content_hash TEXT, + wing TEXT DEFAULT '', + room TEXT DEFAULT '', + facts TEXT DEFAULT '[]', + est_tokens INTEGER DEFAULT 0, + task_id TEXT DEFAULT '', + affected_files TEXT DEFAULT '[]', + source_section TEXT DEFAULT '', + source_file TEXT DEFAULT '', + start_line INTEGER DEFAULT 0, + end_line INTEGER DEFAULT 0, + code_language TEXT DEFAULT '', + code_snippet TEXT DEFAULT '', + error_type TEXT DEFAULT '', + root_cause TEXT DEFAULT '', + severity TEXT DEFAULT 'medium', + is_resolved INTEGER DEFAULT 0, + fix_steps TEXT DEFAULT '', + prevention_hook TEXT DEFAULT '', + recurrence_after_briefing INTEGER DEFAULT 0, + valence TEXT DEFAULT '', + intensity REAL DEFAULT 0.5, + priority TEXT DEFAULT 'P2', + project_id TEXT DEFAULT '', + agent_id TEXT DEFAULT '' + ); + CREATE VIRTUAL TABLE knowledge_fts USING fts5(title, section_name, content, doc_type UNINDEXED, session_id UNINDEXED, document_id UNINDEXED); + CREATE VIRTUAL TABLE ke_fts USING fts5(title, content); + CREATE VIRTUAL TABLE sessions_fts USING fts5(session_id UNINDEXED, title, user_messages, assistant_messages, tool_names); + CREATE TABLE documents (id INTEGER PRIMARY KEY, session_id TEXT NOT NULL, doc_type TEXT NOT NULL, title TEXT NOT NULL, file_path TEXT DEFAULT '', seq INTEGER DEFAULT 1, size_bytes INTEGER DEFAULT 0, source TEXT DEFAULT 'copilot'); + CREATE TABLE sections (id INTEGER PRIMARY KEY, document_id INTEGER NOT NULL, section_name TEXT DEFAULT '', content TEXT DEFAULT ''); + """) + _db717.execute("INSERT INTO sessions (id, path, summary, source, indexed_at) VALUES (?, ?, ?, ?, ?)", + ("sess-i717-1", "/a/b", "Session I717 Alpha", "copilot", "2025-01-01T10:00:00")) + _db717.execute("INSERT INTO sessions (id, path, summary, source, indexed_at) VALUES (?, ?, ?, ?, ?)", + ("sess-i717-2", "/c/d", "Session I717 Beta", "copilot", "2025-01-02T12:00:00")) + _db717.commit() + _db717.close() + + _env717 = os.environ.copy() + _env717["HOME"] = str(_home717) + _env717["USERPROFILE"] = str(_home717) + + def _mcp717_roundtrip(method, params): + proc = _sp717.Popen( + [sys.executable, str(REPO / "mcp-server.py")], + stdin=_sp717.PIPE, stdout=_sp717.PIPE, stderr=_sp717.PIPE, + env=_env717, + ) + try: + # initialize + init_msg = _json717.dumps({"jsonrpc": "2.0", "id": 1, "method": "initialize", + "params": {"protocolVersion": "2024-11-05", "capabilities": {}}}).encode() + proc.stdin.write(f"Content-Length: {len(init_msg)}\r\n\r\n".encode() + init_msg) + notif = _json717.dumps({"jsonrpc": "2.0", "method": "notifications/initialized"}).encode() + proc.stdin.write(f"Content-Length: {len(notif)}\r\n\r\n".encode() + notif) + # send actual request + req = _json717.dumps({"jsonrpc": "2.0", "id": 2, "method": method, "params": params}).encode() + proc.stdin.write(f"Content-Length: {len(req)}\r\n\r\n".encode() + req) + # shutdown + shutdown_msg = _json717.dumps({"jsonrpc": "2.0", "id": 3, "method": "shutdown"}).encode() + proc.stdin.write(f"Content-Length: {len(shutdown_msg)}\r\n\r\n".encode() + shutdown_msg) + proc.stdin.flush() + proc.stdin.close() + out = proc.stdout.read() + proc.wait(timeout=15) + finally: + try: + proc.kill() + except Exception: + pass + # Parse all JSON-RPC messages from output + responses = [] + remaining = out + while remaining: + if b"Content-Length:" not in remaining: + break + hdr_end = remaining.find(b"\r\n\r\n") + if hdr_end == -1: + break + hdr = remaining[:hdr_end].decode("ascii", errors="replace") + cl = int([l.split(":")[1].strip() for l in hdr.split("\r\n") if "content-length" in l.lower()][0]) + body_start = hdr_end + 4 + body = remaining[body_start:body_start + cl] + remaining = remaining[body_start + cl:] + try: + responses.append(_json717.loads(body)) + except Exception: + pass + return [r for r in responses if r.get("id") == 2] + + # I717-15: tools/list includes all 6 tools + try: + _r717_list = _mcp717_roundtrip("tools/list", {}) + _names717 = [t.get("name") for t in (_r717_list[0].get("result", {}).get("tools", []) if _r717_list else [])] + test("I717-15: tools/list includes learn", "learn" in _names717, str(_names717)) + test("I717-16: tools/list includes status", "status" in _names717, str(_names717)) + test("I717-17: tools/list includes session_list", "session_list" in _names717, str(_names717)) + except Exception as _e717_15: + test("I717-15: tools/list learn", False, str(_e717_15)) + test("I717-16: tools/list status", False, str(_e717_15)) + test("I717-17: tools/list session_list", False, str(_e717_15)) + + # I717-18: status tool returns JSON with expected fields + try: + _r717_status = _mcp717_roundtrip("tools/call", {"name": "status", "arguments": {}}) + if _r717_status: + _body717_s = _json717.loads(_r717_status[0].get("result", {}).get("content", [{}])[0].get("text", "{}")) + test("I717-18a: status has session_count key", "session_count" in _body717_s, str(_body717_s)) + test("I717-18b: status has entry_count key", "entry_count" in _body717_s, str(_body717_s)) + test("I717-18c: status has watcher key", "watcher" in _body717_s, str(_body717_s)) + test("I717-18d: status has db_path key", "db_path" in _body717_s, str(_body717_s)) + test("I717-18e: status session_count is int", isinstance(_body717_s.get("session_count"), int), str(_body717_s)) + test("I717-18f: status watcher is running or stopped", + _body717_s.get("watcher") in ("running", "stopped"), str(_body717_s)) + else: + for sfx in ["a", "b", "c", "d", "e", "f"]: + test(f"I717-18{sfx}: status (no response)", False, "no MCP response") + except Exception as _e717_18: + for sfx in ["a", "b", "c", "d", "e", "f"]: + test(f"I717-18{sfx}: status tool", False, str(_e717_18)) + + # I717-19: session_list returns sessions array + try: + _r717_sl = _mcp717_roundtrip("tools/call", {"name": "session_list", "arguments": {"limit": 10}}) + if _r717_sl: + _body717_sl = _json717.loads(_r717_sl[0].get("result", {}).get("content", [{}])[0].get("text", "{}")) + test("I717-19a: session_list has sessions key", "sessions" in _body717_sl, str(_body717_sl)) + test("I717-19b: session_list has count key", "count" in _body717_sl, str(_body717_sl)) + test("I717-19c: session_list count matches len", + _body717_sl.get("count") == len(_body717_sl.get("sessions", [])), str(_body717_sl)) + test("I717-19d: session_list returns 2 sessions", + _body717_sl.get("count") == 2, f"count={_body717_sl.get('count')}") + _sess0 = _body717_sl.get("sessions", [{}])[0] if _body717_sl.get("sessions") else {} + test("I717-19e: session entry has id field", "id" in _sess0, str(_sess0)) + test("I717-19f: session entry has summary field", "summary" in _sess0, str(_sess0)) + else: + for sfx in ["a", "b", "c", "d", "e", "f"]: + test(f"I717-19{sfx}: session_list (no response)", False, "no MCP response") + except Exception as _e717_19: + for sfx in ["a", "b", "c", "d", "e", "f"]: + test(f"I717-19{sfx}: session_list tool", False, str(_e717_19)) + + # I717-20: session_list default limit (no args) + try: + _r717_sl20 = _mcp717_roundtrip("tools/call", {"name": "session_list", "arguments": {}}) + if _r717_sl20: + _body717_sl20 = _json717.loads(_r717_sl20[0].get("result", {}).get("content", [{}])[0].get("text", "{}")) + test("I717-20: session_list default limit works", + "sessions" in _body717_sl20, + str(_body717_sl20)) + else: + test("I717-20: session_list default limit", False, "no response") + except Exception as _e717_20: + test("I717-20: session_list default limit", False, str(_e717_20)) + + # I717-21: learn tool records entry (subprocess roundtrip) + try: + _r717_learn = _mcp717_roundtrip( + "tools/call", + {"name": "learn", "arguments": { + "category": "pattern", + "title": "I717 MCP test pattern", + "description": "Test pattern recorded via MCP learn tool", + "tags": "mcp,i717", + }}, + ) + if _r717_learn: + _r717_lr = _r717_learn[0] + if "error" in _r717_lr: + test("I717-21: learn tool records entry", False, + _r717_lr["error"].get("message", str(_r717_lr["error"]))) + else: + _body717_l = _json717.loads(_r717_lr.get("result", {}).get("content", [{}])[0].get("text", "{}")) + test("I717-21a: learn returns status ok", _body717_l.get("status") == "ok", str(_body717_l)) + test("I717-21b: learn returns message field", "message" in _body717_l, str(_body717_l)) + test("I717-21c: learn returns id field", "id" in _body717_l, str(_body717_l)) + else: + for sfx in ["a", "b", "c"]: + test(f"I717-21{sfx}: learn (no response)", False, "no MCP response") + except Exception as _e717_21: + for sfx in ["a", "b", "c"]: + test(f"I717-21{sfx}: learn tool", False, str(_e717_21)) + + # I717-22: learn rejects invalid category + try: + _r717_inv = _mcp717_roundtrip( + "tools/call", + {"name": "learn", "arguments": { + "category": "invalid_cat", + "title": "Test", + "description": "Should fail", + }}, + ) + if _r717_inv: + _r717_iv = _r717_inv[0] + test("I717-22: learn rejects invalid category", + "error" in _r717_iv or _r717_iv.get("result", {}).get("isError"), + str(_r717_inv)) + else: + test("I717-22: learn invalid category", False, "no response") + except Exception as _e717_22: + test("I717-22: learn invalid category", False, str(_e717_22)) + +except Exception as _e717_outer: + for _sfx in ["15", "16", "17", "18a", "18b", "18c", "18d", "18e", "18f", + "19a", "19b", "19c", "19d", "19e", "19f", "20", "21a", "21b", "21c", "22"]: + test(f"I717-{_sfx}: MCP write tools (setup error)", False, str(_e717_outer)) + +# I718-17: briefing.py 📌 badge source check +try: + _br_src718 = (REPO / "briefing.py").read_text(encoding="utf-8") + test("I718-17a: briefing.py has pinned_badge for P0", "pinned_badge" in _br_src718 or "📌" in _br_src718, + "pinned_badge or 📌 not found in briefing.py") + test("I718-17b: briefing.py checks priority P0 for badge", "P0" in _br_src718 and ("pinned_badge" in _br_src718 or "📌" in _br_src718), + "P0 badge logic not found in briefing.py") +except Exception as _e718_br: + test("I718-17: briefing.py badge source check", False, str(_e718_br)) + +# --------------------------------------------------------------------------- \ No newline at end of file From f5f50bb7b4fbd36be5a4df7c81abacc74708421f Mon Sep 17 00:00:00 2001 From: Linh Ngo Date: Sat, 30 May 2026 15:30:05 +0700 Subject: [PATCH 2/6] fix(tests): move sys.exit() after I717 tests so they actually run The 35 I717 test cases were placed after sys.exit() and never executed. Move the terminal exit to after the I717 block and add a proper final summary block. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- test_fixes.py | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/test_fixes.py b/test_fixes.py index 8adc296a..867dd545 100755 --- a/test_fixes.py +++ b/test_fixes.py @@ -10076,17 +10076,7 @@ def _make_i720_db(db_path: Path) -> None: except Exception as _e722: test("I722: bulk-tag test setup", False, str(_e722)) -# --------------------------------------------------------------------------- -if FAIL == 0: - print("🎉 All tests passed!") -else: - print(f"⚠️ {FAIL} test(s) need attention") - for _fn in FAIL_NAMES: - print(f" ❌ {_fn}") -sys.exit(0 if FAIL == 0 else 1) - # === I717: MCP Write Tools === - print("\n✍️ I717: MCP Write Tools") _mcp717_src = (REPO / "mcp-server.py").read_text(encoding="utf-8") @@ -10471,4 +10461,11 @@ def _mcp717_roundtrip(method, params): except Exception as _e718_br: test("I718-17: briefing.py badge source check", False, str(_e718_br)) -# --------------------------------------------------------------------------- \ No newline at end of file +# --------------------------------------------------------------------------- +if FAIL == 0: + print("🎉 All tests passed!") +else: + print(f"⚠️ {FAIL} test(s) need attention") + for _fn in FAIL_NAMES: + print(f" ❌ {_fn}") +sys.exit(0 if FAIL == 0 else 1) From 1e9ede8071f7c57a9652d04facbf0a6418baea19 Mon Sep 17 00:00:00 2001 From: Linh Ngo Date: Sat, 30 May 2026 15:59:26 +0700 Subject: [PATCH 3/6] fix(lint): ruff fix in test_fixes.py --- test_fixes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test_fixes.py b/test_fixes.py index 867dd545..dcf03d9a 100755 --- a/test_fixes.py +++ b/test_fixes.py @@ -10207,11 +10207,11 @@ def _make_i720_db(db_path: Path) -> None: # I717-15..22: Integration tests via MCP subprocess try: - import sqlite3 as _sq717 - import tempfile as _tmp717 - import subprocess as _sp717 import json as _json717 import pathlib as _pl717 + import sqlite3 as _sq717 + import subprocess as _sp717 + import tempfile as _tmp717 with _tmp717.TemporaryDirectory(prefix="mcp717-test-") as _tmp717_dir: _home717 = _pl717.Path(_tmp717_dir) From 1ec75c16f7c5467d134fe8b8c9903e5cd447b3dc Mon Sep 17 00:00:00 2001 From: Linh Ngo Date: Sat, 30 May 2026 16:04:33 +0700 Subject: [PATCH 4/6] fix(format): ruff format test_fixes.py --- test_fixes.py | 266 +++++++++++++++++++++++++++++++++----------------- 1 file changed, 174 insertions(+), 92 deletions(-) diff --git a/test_fixes.py b/test_fixes.py index dcf03d9a..0b69ad12 100755 --- a/test_fixes.py +++ b/test_fixes.py @@ -10083,125 +10083,143 @@ def _make_i720_db(db_path: Path) -> None: # I717-01: TOOLS list includes 'learn' try: - test("I717-01: TOOLS list contains learn tool", - '"name": "learn"' in _mcp717_src or "'name': 'learn'" in _mcp717_src, - "learn tool not found in TOOLS list") + test( + "I717-01: TOOLS list contains learn tool", + '"name": "learn"' in _mcp717_src or "'name': 'learn'" in _mcp717_src, + "learn tool not found in TOOLS list", + ) except Exception as _e717_01: test("I717-01: learn in TOOLS", False, str(_e717_01)) # I717-02: TOOLS list includes 'status' try: - test("I717-02: TOOLS list contains status tool", - '"name": "status"' in _mcp717_src or "'name': 'status'" in _mcp717_src, - "status tool not found in TOOLS list") + test( + "I717-02: TOOLS list contains status tool", + '"name": "status"' in _mcp717_src or "'name': 'status'" in _mcp717_src, + "status tool not found in TOOLS list", + ) except Exception as _e717_02: test("I717-02: status in TOOLS", False, str(_e717_02)) # I717-03: TOOLS list includes 'session_list' try: - test("I717-03: TOOLS list contains session_list tool", - '"name": "session_list"' in _mcp717_src or "'name': 'session_list'" in _mcp717_src, - "session_list tool not found in TOOLS list") + test( + "I717-03: TOOLS list contains session_list tool", + '"name": "session_list"' in _mcp717_src or "'name': 'session_list'" in _mcp717_src, + "session_list tool not found in TOOLS list", + ) except Exception as _e717_03: test("I717-03: session_list in TOOLS", False, str(_e717_03)) # I717-04: _run_learn function exists try: - test("I717-04: _run_learn function defined", - "def _run_learn(" in _mcp717_src, - "_run_learn not found") + test("I717-04: _run_learn function defined", "def _run_learn(" in _mcp717_src, "_run_learn not found") except Exception as _e717_04: test("I717-04: _run_learn defined", False, str(_e717_04)) # I717-05: _run_status function exists try: - test("I717-05: _run_status function defined", - "def _run_status(" in _mcp717_src, - "_run_status not found") + test("I717-05: _run_status function defined", "def _run_status(" in _mcp717_src, "_run_status not found") except Exception as _e717_05: test("I717-05: _run_status defined", False, str(_e717_05)) # I717-06: _run_session_list function exists try: - test("I717-06: _run_session_list function defined", - "def _run_session_list(" in _mcp717_src, - "_run_session_list not found") + test( + "I717-06: _run_session_list function defined", + "def _run_session_list(" in _mcp717_src, + "_run_session_list not found", + ) except Exception as _e717_06: test("I717-06: _run_session_list defined", False, str(_e717_06)) # I717-07: dispatch learn, status, session_list in _handle_tools_call try: _dispatch_body = _mcp717_src.split("def _handle_tools_call(")[1].split("def _read_exact(")[0] - test("I717-07a: dispatch learn in _handle_tools_call", - "_run_learn" in _dispatch_body, - "_run_learn not dispatched") - test("I717-07b: dispatch status in _handle_tools_call", - "_run_status" in _dispatch_body, - "_run_status not dispatched") - test("I717-07c: dispatch session_list in _handle_tools_call", - "_run_session_list" in _dispatch_body, - "_run_session_list not dispatched") + test("I717-07a: dispatch learn in _handle_tools_call", "_run_learn" in _dispatch_body, "_run_learn not dispatched") + test( + "I717-07b: dispatch status in _handle_tools_call", "_run_status" in _dispatch_body, "_run_status not dispatched" + ) + test( + "I717-07c: dispatch session_list in _handle_tools_call", + "_run_session_list" in _dispatch_body, + "_run_session_list not dispatched", + ) except Exception as _e717_07: test("I717-07: dispatch check", False, str(_e717_07)) # I717-08: learn tool has category enum in schema try: - test("I717-08: learn schema has category enum", - "VALID_LEARN_CATEGORIES" in _mcp717_src or '"mistake"' in _mcp717_src, - "category enum missing from learn schema") + test( + "I717-08: learn schema has category enum", + "VALID_LEARN_CATEGORIES" in _mcp717_src or '"mistake"' in _mcp717_src, + "category enum missing from learn schema", + ) except Exception as _e717_08: test("I717-08: learn category enum", False, str(_e717_08)) # I717-09: learn uses subprocess (not importlib) for calling learn.py try: _learn_fn = _mcp717_src.split("def _run_learn(")[1].split("def _run_status(")[0] - test("I717-09: _run_learn uses subprocess.run", - "subprocess.run(" in _learn_fn, - "subprocess.run not used in _run_learn") + test( + "I717-09: _run_learn uses subprocess.run", + "subprocess.run(" in _learn_fn, + "subprocess.run not used in _run_learn", + ) except Exception as _e717_09: test("I717-09: _run_learn subprocess", False, str(_e717_09)) # I717-10: no SQL injection in session_list (uses ? placeholder) try: _sl_fn = _mcp717_src.split("def _run_session_list(")[1].split("def _handle_tools_call(")[0] - test("I717-10: session_list uses ? placeholder", - "?" in _sl_fn and "f\"SELECT" not in _sl_fn and "f'SELECT" not in _sl_fn, - "session_list may use string-interpolated SQL") + test( + "I717-10: session_list uses ? placeholder", + "?" in _sl_fn and 'f"SELECT' not in _sl_fn and "f'SELECT" not in _sl_fn, + "session_list may use string-interpolated SQL", + ) except Exception as _e717_10: test("I717-10: session_list SQL safety", False, str(_e717_10)) # I717-11: status returns watcher field try: _status_fn = _mcp717_src.split("def _run_status(")[1].split("def _run_session_list(")[0] - test("I717-11: status returns watcher field", - '"watcher"' in _status_fn or "'watcher'" in _status_fn, - "watcher field missing from status output") + test( + "I717-11: status returns watcher field", + '"watcher"' in _status_fn or "'watcher'" in _status_fn, + "watcher field missing from status output", + ) except Exception as _e717_11: test("I717-11: status watcher field", False, str(_e717_11)) # I717-12: status returns db_path field try: _status_fn12 = _mcp717_src.split("def _run_status(")[1].split("def _run_session_list(")[0] - test("I717-12: status returns db_path field", - '"db_path"' in _status_fn12 or "'db_path'" in _status_fn12, - "db_path missing from status output") + test( + "I717-12: status returns db_path field", + '"db_path"' in _status_fn12 or "'db_path'" in _status_fn12, + "db_path missing from status output", + ) except Exception as _e717_12: test("I717-12: status db_path field", False, str(_e717_12)) # I717-13: VALID_LEARN_CATEGORIES constant defined try: - test("I717-13: VALID_LEARN_CATEGORIES constant defined", - "VALID_LEARN_CATEGORIES" in _mcp717_src, - "VALID_LEARN_CATEGORIES not found") + test( + "I717-13: VALID_LEARN_CATEGORIES constant defined", + "VALID_LEARN_CATEGORIES" in _mcp717_src, + "VALID_LEARN_CATEGORIES not found", + ) except Exception as _e717_13: test("I717-13: VALID_LEARN_CATEGORIES", False, str(_e717_13)) # I717-14: learn schema requires category, title, description try: - _learn_schema = _mcp717_src[_mcp717_src.find('"name": "learn"'):_mcp717_src.find('"name": "status"')] - test("I717-14: learn required fields include title and description", - '"category"' in _learn_schema and '"title"' in _learn_schema and '"description"' in _learn_schema, - "learn schema missing required fields") + _learn_schema = _mcp717_src[_mcp717_src.find('"name": "learn"') : _mcp717_src.find('"name": "status"')] + test( + "I717-14: learn required fields include title and description", + '"category"' in _learn_schema and '"title"' in _learn_schema and '"description"' in _learn_schema, + "learn schema missing required fields", + ) except Exception as _e717_14: test("I717-14: learn required fields", False, str(_e717_14)) @@ -10274,10 +10292,14 @@ def _make_i720_db(db_path: Path) -> None: CREATE TABLE documents (id INTEGER PRIMARY KEY, session_id TEXT NOT NULL, doc_type TEXT NOT NULL, title TEXT NOT NULL, file_path TEXT DEFAULT '', seq INTEGER DEFAULT 1, size_bytes INTEGER DEFAULT 0, source TEXT DEFAULT 'copilot'); CREATE TABLE sections (id INTEGER PRIMARY KEY, document_id INTEGER NOT NULL, section_name TEXT DEFAULT '', content TEXT DEFAULT ''); """) - _db717.execute("INSERT INTO sessions (id, path, summary, source, indexed_at) VALUES (?, ?, ?, ?, ?)", - ("sess-i717-1", "/a/b", "Session I717 Alpha", "copilot", "2025-01-01T10:00:00")) - _db717.execute("INSERT INTO sessions (id, path, summary, source, indexed_at) VALUES (?, ?, ?, ?, ?)", - ("sess-i717-2", "/c/d", "Session I717 Beta", "copilot", "2025-01-02T12:00:00")) + _db717.execute( + "INSERT INTO sessions (id, path, summary, source, indexed_at) VALUES (?, ?, ?, ?, ?)", + ("sess-i717-1", "/a/b", "Session I717 Alpha", "copilot", "2025-01-01T10:00:00"), + ) + _db717.execute( + "INSERT INTO sessions (id, path, summary, source, indexed_at) VALUES (?, ?, ?, ?, ?)", + ("sess-i717-2", "/c/d", "Session I717 Beta", "copilot", "2025-01-02T12:00:00"), + ) _db717.commit() _db717.close() @@ -10288,13 +10310,21 @@ def _make_i720_db(db_path: Path) -> None: def _mcp717_roundtrip(method, params): proc = _sp717.Popen( [sys.executable, str(REPO / "mcp-server.py")], - stdin=_sp717.PIPE, stdout=_sp717.PIPE, stderr=_sp717.PIPE, + stdin=_sp717.PIPE, + stdout=_sp717.PIPE, + stderr=_sp717.PIPE, env=_env717, ) try: # initialize - init_msg = _json717.dumps({"jsonrpc": "2.0", "id": 1, "method": "initialize", - "params": {"protocolVersion": "2024-11-05", "capabilities": {}}}).encode() + init_msg = _json717.dumps( + { + "jsonrpc": "2.0", + "id": 1, + "method": "initialize", + "params": {"protocolVersion": "2024-11-05", "capabilities": {}}, + } + ).encode() proc.stdin.write(f"Content-Length: {len(init_msg)}\r\n\r\n".encode() + init_msg) notif = _json717.dumps({"jsonrpc": "2.0", "method": "notifications/initialized"}).encode() proc.stdin.write(f"Content-Length: {len(notif)}\r\n\r\n".encode() + notif) @@ -10325,8 +10355,8 @@ def _mcp717_roundtrip(method, params): hdr = remaining[:hdr_end].decode("ascii", errors="replace") cl = int([l.split(":")[1].strip() for l in hdr.split("\r\n") if "content-length" in l.lower()][0]) body_start = hdr_end + 4 - body = remaining[body_start:body_start + cl] - remaining = remaining[body_start + cl:] + body = remaining[body_start : body_start + cl] + remaining = remaining[body_start + cl :] try: responses.append(_json717.loads(body)) except Exception: @@ -10336,7 +10366,9 @@ def _mcp717_roundtrip(method, params): # I717-15: tools/list includes all 6 tools try: _r717_list = _mcp717_roundtrip("tools/list", {}) - _names717 = [t.get("name") for t in (_r717_list[0].get("result", {}).get("tools", []) if _r717_list else [])] + _names717 = [ + t.get("name") for t in (_r717_list[0].get("result", {}).get("tools", []) if _r717_list else []) + ] test("I717-15: tools/list includes learn", "learn" in _names717, str(_names717)) test("I717-16: tools/list includes status", "status" in _names717, str(_names717)) test("I717-17: tools/list includes session_list", "session_list" in _names717, str(_names717)) @@ -10354,9 +10386,16 @@ def _mcp717_roundtrip(method, params): test("I717-18b: status has entry_count key", "entry_count" in _body717_s, str(_body717_s)) test("I717-18c: status has watcher key", "watcher" in _body717_s, str(_body717_s)) test("I717-18d: status has db_path key", "db_path" in _body717_s, str(_body717_s)) - test("I717-18e: status session_count is int", isinstance(_body717_s.get("session_count"), int), str(_body717_s)) - test("I717-18f: status watcher is running or stopped", - _body717_s.get("watcher") in ("running", "stopped"), str(_body717_s)) + test( + "I717-18e: status session_count is int", + isinstance(_body717_s.get("session_count"), int), + str(_body717_s), + ) + test( + "I717-18f: status watcher is running or stopped", + _body717_s.get("watcher") in ("running", "stopped"), + str(_body717_s), + ) else: for sfx in ["a", "b", "c", "d", "e", "f"]: test(f"I717-18{sfx}: status (no response)", False, "no MCP response") @@ -10371,10 +10410,16 @@ def _mcp717_roundtrip(method, params): _body717_sl = _json717.loads(_r717_sl[0].get("result", {}).get("content", [{}])[0].get("text", "{}")) test("I717-19a: session_list has sessions key", "sessions" in _body717_sl, str(_body717_sl)) test("I717-19b: session_list has count key", "count" in _body717_sl, str(_body717_sl)) - test("I717-19c: session_list count matches len", - _body717_sl.get("count") == len(_body717_sl.get("sessions", [])), str(_body717_sl)) - test("I717-19d: session_list returns 2 sessions", - _body717_sl.get("count") == 2, f"count={_body717_sl.get('count')}") + test( + "I717-19c: session_list count matches len", + _body717_sl.get("count") == len(_body717_sl.get("sessions", [])), + str(_body717_sl), + ) + test( + "I717-19d: session_list returns 2 sessions", + _body717_sl.get("count") == 2, + f"count={_body717_sl.get('count')}", + ) _sess0 = _body717_sl.get("sessions", [{}])[0] if _body717_sl.get("sessions") else {} test("I717-19e: session entry has id field", "id" in _sess0, str(_sess0)) test("I717-19f: session entry has summary field", "summary" in _sess0, str(_sess0)) @@ -10389,10 +10434,10 @@ def _mcp717_roundtrip(method, params): try: _r717_sl20 = _mcp717_roundtrip("tools/call", {"name": "session_list", "arguments": {}}) if _r717_sl20: - _body717_sl20 = _json717.loads(_r717_sl20[0].get("result", {}).get("content", [{}])[0].get("text", "{}")) - test("I717-20: session_list default limit works", - "sessions" in _body717_sl20, - str(_body717_sl20)) + _body717_sl20 = _json717.loads( + _r717_sl20[0].get("result", {}).get("content", [{}])[0].get("text", "{}") + ) + test("I717-20: session_list default limit works", "sessions" in _body717_sl20, str(_body717_sl20)) else: test("I717-20: session_list default limit", False, "no response") except Exception as _e717_20: @@ -10402,18 +10447,24 @@ def _mcp717_roundtrip(method, params): try: _r717_learn = _mcp717_roundtrip( "tools/call", - {"name": "learn", "arguments": { - "category": "pattern", - "title": "I717 MCP test pattern", - "description": "Test pattern recorded via MCP learn tool", - "tags": "mcp,i717", - }}, + { + "name": "learn", + "arguments": { + "category": "pattern", + "title": "I717 MCP test pattern", + "description": "Test pattern recorded via MCP learn tool", + "tags": "mcp,i717", + }, + }, ) if _r717_learn: _r717_lr = _r717_learn[0] if "error" in _r717_lr: - test("I717-21: learn tool records entry", False, - _r717_lr["error"].get("message", str(_r717_lr["error"]))) + test( + "I717-21: learn tool records entry", + False, + _r717_lr["error"].get("message", str(_r717_lr["error"])), + ) else: _body717_l = _json717.loads(_r717_lr.get("result", {}).get("content", [{}])[0].get("text", "{}")) test("I717-21a: learn returns status ok", _body717_l.get("status") == "ok", str(_body717_l)) @@ -10430,34 +10481,65 @@ def _mcp717_roundtrip(method, params): try: _r717_inv = _mcp717_roundtrip( "tools/call", - {"name": "learn", "arguments": { - "category": "invalid_cat", - "title": "Test", - "description": "Should fail", - }}, + { + "name": "learn", + "arguments": { + "category": "invalid_cat", + "title": "Test", + "description": "Should fail", + }, + }, ) if _r717_inv: _r717_iv = _r717_inv[0] - test("I717-22: learn rejects invalid category", - "error" in _r717_iv or _r717_iv.get("result", {}).get("isError"), - str(_r717_inv)) + test( + "I717-22: learn rejects invalid category", + "error" in _r717_iv or _r717_iv.get("result", {}).get("isError"), + str(_r717_inv), + ) else: test("I717-22: learn invalid category", False, "no response") except Exception as _e717_22: test("I717-22: learn invalid category", False, str(_e717_22)) except Exception as _e717_outer: - for _sfx in ["15", "16", "17", "18a", "18b", "18c", "18d", "18e", "18f", - "19a", "19b", "19c", "19d", "19e", "19f", "20", "21a", "21b", "21c", "22"]: + for _sfx in [ + "15", + "16", + "17", + "18a", + "18b", + "18c", + "18d", + "18e", + "18f", + "19a", + "19b", + "19c", + "19d", + "19e", + "19f", + "20", + "21a", + "21b", + "21c", + "22", + ]: test(f"I717-{_sfx}: MCP write tools (setup error)", False, str(_e717_outer)) # I718-17: briefing.py 📌 badge source check try: _br_src718 = (REPO / "briefing.py").read_text(encoding="utf-8") - test("I718-17a: briefing.py has pinned_badge for P0", "pinned_badge" in _br_src718 or "📌" in _br_src718, - "pinned_badge or 📌 not found in briefing.py") - test("I718-17b: briefing.py checks priority P0 for badge", "P0" in _br_src718 and ("pinned_badge" in _br_src718 or "📌" in _br_src718), - "P0 badge logic not found in briefing.py") + test( + "I718-17a: briefing.py has pinned_badge for P0", + "pinned_badge" in _br_src718 or "📌" in _br_src718, + "pinned_badge or 📌 not found in briefing.py", + ) + test( + "I718-17b: briefing.py checks priority P0 for badge", + "P0" in _br_src718 and ("pinned_badge" in _br_src718 or "📌" in _br_src718), + "P0 badge logic not found in briefing.py", + ) except Exception as _e718_br: test("I718-17: briefing.py badge source check", False, str(_e718_br)) From 6bfb7b484945d7b150ca602e8afb089922a6036b Mon Sep 17 00:00:00 2001 From: Linh Ngo Date: Sat, 30 May 2026 16:20:21 +0700 Subject: [PATCH 5/6] =?UTF-8?q?fix(tests):=20update=20mcp=20tool=20count?= =?UTF-8?q?=20from=203=E2=86=926=20after=20wave=208=20adds=20learn/status/?= =?UTF-8?q?session=5Flist?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- tests/test_mcp_server.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/tests/test_mcp_server.py b/tests/test_mcp_server.py index 5aa3a562..699bdab0 100644 --- a/tests/test_mcp_server.py +++ b/tests/test_mcp_server.py @@ -102,8 +102,8 @@ def setUp(self): self.tools = {t["name"]: t for t in mcp.TOOLS} def test_exactly_two_tools(self): - # Updated: query_memory added in issue #404; now 3 tools total - self.assertEqual(len(mcp.TOOLS), 3) + # Updated: wave 8 added learn, status, session_list; now 6 tools total + self.assertEqual(len(mcp.TOOLS), 6) def test_briefing_tool_present(self): self.assertIn("briefing", self.tools) @@ -1059,10 +1059,11 @@ def test_query_memory_auth_error_propagates(self): mcp._run_query_memory({"token": "bad"}) self.assertEqual(ctx.exception.code, mcp._MCP_AUTH_ERROR) - # -- three tools total --- + # -- six tools total --- def test_exactly_three_tools(self): - self.assertEqual(len(mcp.TOOLS), 3) + # Updated: wave 8 added learn, status, session_list; now 6 tools total + self.assertEqual(len(mcp.TOOLS), 6) # --------------------------------------------------------------------------- From 2552679d7f04c1aef45117cfe354dfc9b45c682b Mon Sep 17 00:00:00 2001 From: Linh Ngo Date: Sat, 30 May 2026 19:02:27 +0700 Subject: [PATCH 6/6] =?UTF-8?q?fix(mcp):=20reviewer=20issues=20=E2=80=94?= =?UTF-8?q?=20token=20gating=20+=20length=20caps=20+=20try/finally=20+=20J?= =?UTF-8?q?SON=20ID=20parsing?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Add _check_auth() call to _run_learn for COPILOT_MCP_TOKEN gating (matching _run_query_memory) - Add max_length=500 to title and max_length=10000 to description in _run_learn - Update _require_string to accept optional max_length parameter - Fix fragile #(\d+) regex: pass --json to learn.py and parse structured JSON output for entry_id; anchored fallback regex 'Added new \w+ #(\d+)' used only if JSON missing - Fix connection leak in _run_status: use try/finally around db.close() - Fix connection leak in _run_session_list: use try/finally around db.close() Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- .../skills/tentacle-orchestration/SKILL.md | 656 +++++++----------- mcp-server.py | 61 +- 2 files changed, 286 insertions(+), 431 deletions(-) mode change 100755 => 100644 .github/skills/tentacle-orchestration/SKILL.md diff --git a/.github/skills/tentacle-orchestration/SKILL.md b/.github/skills/tentacle-orchestration/SKILL.md old mode 100755 new mode 100644 index db1991a2..1d458df0 --- a/.github/skills/tentacle-orchestration/SKILL.md +++ b/.github/skills/tentacle-orchestration/SKILL.md @@ -1,132 +1,29 @@ --- name: tentacle-orchestration -description: Break complex tasks into scoped parallel work units for multi-agent execution. Always use task-step-generator first as a reviewed planning scaffold, then adapt the reviewed steps into tentacles. Use when a task spans multiple modules or layers, needs agent delegation, or the user says "orchestrate", "multi-agent", "parallel agents", "tentacle", or "swarm". Each implementation/fix tentacle runs strict-tdd-workflow internally. Features Opus Leader Council for quality-first multi-platform work. +description: Break complex tasks into scoped parallel work units for multi-agent execution. Always use task-step-generator first as a reviewed planning scaffold, then adapt the reviewed steps into tentacles. Use when a task spans multiple modules or layers, needs agent delegation, or the user says "orchestrate", "multi-agent", "parallel agents", "tentacle", or "swarm". Each implementation/fix tentacle runs strict-tdd-workflow internally. --- -# Tentacle Orchestration — copilot-session-knowledge +# Tentacle Orchestration Break a complex task into scoped work units ("tentacles"), enrich each with context, then dispatch agents in parallel. Results persist in files so nothing is lost between agent boundaries. -Adapted from the [OctoGent](https://github.com/hesamsheikh/octogent) tentacle pattern. Customized for **copilot-session-knowledge**: a multi-platform hybrid of Python stdlib tools, Next.js/React browse-ui, and a Rust binary — where **quality outweighs speed**. +Adapted from the [OctoGent](https://github.com/hesamsheikh/octogent) tentacle pattern. > **Relationship with strict-tdd-workflow**: Tentacle is the **orchestrator** (splits work), strict-tdd is the **executor** (runs inside each implementation/fix tentacle). For single-module tasks, skip tentacle and use strict-tdd directly. > > **Relationship with task-step-generator**: `task-step-generator` is the **planning scaffold**. Run it before creating tentacles, then review and edit the generated steps. Do not copy generated steps blindly. ---- - -## ⚡ Opus Leader Council (Project-Specific Pattern) - -This project uses **Opus-class Leader agents** instead of flat swarming. Each leader owns a domain and cannot be bypassed. Leaders discuss before acting and escalate to peers when stuck. - -### Leader Roster - -| Leader | Model | Domain | Scope | -|--------|-------|--------|-------| -| **dev-leader** | `claude-opus-4.7` | Python tools, hooks, Rust binary | `*.py`, `hooks/**/*`, `crates/**/*` | -| **test-leader** | `claude-opus-4.7` | All test surfaces (Python + TypeScript + Rust) | `test_*.py`, `run_all_tests.py`, `browse-ui/src/**/*.test.*`, `browse-ui/e2e/**/*` | -| **qa-leader** | `claude-opus-4.7` | Verification gates, cross-surface sync, security | All changed surfaces | -| **browse-leader** | `claude-opus-4.7` | Next.js/React browse-ui frontend | `browse-ui/src/**/*`, `browse-ui/e2e/**/*` | -| **research-leader** | `claude-opus-4.7` | Any question with confidence < 1.0 | Read-only, all surfaces | - -### Leader Dispatch Pattern - -```bash -# Each leader gets its own tentacle with --model claude-opus-4.7 -sk tentacle create dev- --scope "*.py hooks/**/*" --desc "Python implementation" --briefing -sk tentacle create test- --scope "test_*.py run_all_tests.py" --desc "Test coverage" --briefing -sk tentacle create qa- --scope "." --desc "Verification + cross-surface sync" --briefing - -# Dispatch leaders in parallel — all use opus -sk tentacle swarm dev- --agent-type general-purpose --model claude-opus-4.7 --briefing -sk tentacle swarm test-leader --agent-type general-purpose --model claude-opus-4.7 --briefing -sk tentacle swarm qa-leader --agent-type verification-gate --model claude-opus-4.7 --briefing -``` - -### Leader Peer-Discussion Protocol - -When a leader is stuck or has confidence < 1.0, it does NOT stop — it requests peer input: - -1. **dev-leader stuck on architecture** → write question to `handoff.md` with `STATUS: PEER_DISCUSS`, dispatch `research-leader` tentacle with the question -2. **test-leader unsure of coverage strategy** → write question, dispatch `dev-leader` review, loop until both agree -3. **qa-leader finds gate failure** → write failure report, dispatch `dev-leader` fix tentacle, loop back through qa-leader - -```bash -# Peer discussion: dev-leader writes question, research-leader answers -sk tentacle handoff dev- "Architecture question: [question]" --status AMBIGUOUS -sk tentacle create research- --scope "." --desc "Research: [question]" --briefing -sk tentacle swarm research- --agent-type research-planner --model claude-opus-4.7 --briefing -# After research-leader answers → continue dev-leader -sk tentacle resume dev- -``` - ---- - -## ♾️ Infinite Confidence Loop Protocol - -**This project never marks BLOCKED when confidence < 1.0.** Instead, it loops until certainty. - -### The Loop - -``` -CONFIDENCE CHECK - │ - ▼ - ≥ 1.0? ──YES──► Proceed to execution - │ - NO - │ - ▼ - Split ambiguity into atomic questions - │ - ▼ - Dispatch research-leader tentacle(s) on claude-opus-4.7 - │ - ▼ - Research completes, evidence recorded - │ - ▼ - Re-evaluate confidence ───────────────► back to top - │ - (loop forever until ≥ 1.0 or user explicitly overrides) -``` - -### Implementation - -```bash -# Step 1: Spawn research tentacle for each ambiguous question -sk tentacle create research-q1 --scope "." --desc "Research: " --briefing -sk tentacle swarm research-q1 --agent-type research-planner --model claude-opus-4.7 --briefing - -# Step 2: After research, record evidence and re-evaluate -sk tentacle goal gate pass research-q1 --reason "Evidence: " - -# Step 3: Only proceed when ALL research gates pass -sk tentacle goal criteria check -# If not met: loop back to Step 1 with remaining questions -# If met: proceed to Plan phase -``` - -**Override rule**: Only the human operator can exit the loop early. Record the override with rationale: -```bash -sk tentacle goal gate pass override --reason "Human override: " -``` - ---- - ## Planning Discipline Use this sequence before creating any tentacle: -1. Generate a step file with `task-step-generator` (`.github/steps/.md`). +1. Generate a step file with `task-step-generator` (`.github/steps/.md` when the project uses `.github/`, otherwise `STEPS.md` or the path requested by the user). 2. Review the generated step file with `references/decomposition-review.md`. 3. Record what was accepted, edited, and rejected before dispatching agents. 4. Convert only the reviewed steps into non-overlapping tentacles and atomic todos. Why: decomposition and checklists reduce avoidable cognitive load, but generated plans can anchor on the first plausible split. Treat the step file as a draft planning artifact, not as authority. ---- - ## Decision Confidence Gate Before creating, dispatching, merging, deleting, or closing tentacles, verify the routing/plan @@ -137,15 +34,18 @@ or validation agents first. Required behavior when confidence `< 1.0`: 1. Stop implementation/deletion/merge decisions for the uncertain scope. -2. Split the ambiguity into atomic questions: task type, scope boundaries, dependencies, acceptance evidence, and affected systems. -3. Dispatch research/validation tentacles on **`claude-opus-4.7`** (opus-class mandatory, no exceptions). -4. Record the evidence and rejected alternatives in the tentacle `handoff.md`. -5. Continue only after the synthesized decision reaches confidence `1.0`, or after an explicit user override is recorded with its rationale. -6. **Never mark BLOCKED** — instead create a `research-` tentacle, loop, and resume when certain. - -Why this gate exists: low-confidence orchestration creates the worst kind of parallelism — many agents confidently doing the wrong work. Research-first decomposition is cheaper than unwinding a bad swarm. - ---- +2. Split the ambiguity into atomic questions: task type, scope boundaries, dependencies, + acceptance evidence, and affected systems. +3. Dispatch research/validation tentacles or sub-agents on the strongest available model + (`claude-opus-4.7` when available; otherwise the newest opus-class model). +4. Record the evidence and rejected alternatives in the tentacle `handoff.md` or the + conductor `research_gate` artifact. +5. Continue only after the synthesized decision reaches confidence `1.0`, or after an + explicit user override is recorded with its rationale. + +Why this gate exists: low-confidence orchestration creates the worst kind of parallelism — +many agents confidently doing the wrong work. Research-first decomposition is cheaper than +unwinding a bad swarm. ## When to use @@ -156,22 +56,44 @@ Why this gate exists: low-confidence orchestration creates the worst kind of par | 3+ files, multiple modules | **Tentacle required** — decompose into scoped units | | Multi-phase with agent delegation | **Tentacle required** — each delegated agent gets a tentacle | | Bug investigation, multiple hypotheses | Tentacle recommended — one tentacle per hypothesis | -| Cross-surface (Python + browse-ui + Rust) | **Opus Leader Council required** — one leader per surface | -| Confidence < 1.0 at any point | **Research tentacle required** — loop until certain | **Not a good fit:** strictly sequential single-file tasks, limited token budget, trivial edits. ---- - ## Sub-agent Guardrails These guardrails apply to dispatched sub-agents. The **commit restriction is enforced at the -git level** when hooks are installed. +git level** when hooks are installed; all other items are conventions reinforced by prompt +context. + +**Why git hooks, not preToolUse alone:** When the orchestrator dispatches a sub-agent via the +`task()` tool, the platform does not guarantee that `preToolUse` hooks from the parent +`hooks.json` propagate into the sub-agent's context window. Git hooks (`pre-commit`, +`pre-push`) are filesystem-level and fire for any git process regardless of which agent spawned +it — they are the reliable enforcement surface. **How enforcement works:** 1. `tentacle.py create` generates a UUID `tentacle_id` stored in the tentacle's `meta.json`. -2. `hooks/pre-commit` and `hooks/pre-push` call `hooks/check_subagent_marker.py`, which blocks git operations while the marker is fresh. -3. `sk tentacle complete ` removes the matching marker entry. + If the requested name directory already exists, `create` auto-resolves the collision by + creating `-` — the slug is printed and must be used for all subsequent + commands. `tentacle.py swarm` reads `tentacle_id` from `meta.json` and writes an HMAC-signed + marker file at `~/.copilot/markers/dispatched-subagent-active` containing `active_tentacles` + entries of the form `{"name": ..., "ts": ..., "git_root": ..., "tentacle_id": ...}`. + **Primary deduplication key: `tentacle_id`** (when present) — two instances with the same + logical name in the same repo each get a separate entry. Fallback for legacy entries without + `tentacle_id`: `(name, git_root)`. +2. `hooks/pre-commit` and `hooks/pre-push` call `hooks/check_subagent_marker.py`, which blocks + the git operation when the marker is present, auth-valid, within its 4-hour TTL, **and the + entry's `git_root` matches the repo running the git command**. A marker from a different repo + does not block commits there — this prevents cross-repo false positives when tentacles are + active in other repos concurrently. Entries without `git_root` (old format) conservatively + block all repos. + > **Upgrade migration:** Cross-repo isolation is not retroactive. In-flight old-format marker + > entries (no `git_root`) continue to block all repos until completed, cleared, or expired (4h + > TTL). To get isolation immediately: `sk tentacle complete ` then re-dispatch. +3. `hooks/rules/subagent_guard.py` provides a secondary `preToolUse` intercept for the + orchestrator session (defense-in-depth only — not the primary path). +4. `sk tentacle complete ` reads `tentacle_id` from `meta.json` and removes only the + matching marker entry; the marker is deleted when `active_tentacles` becomes empty. **Install the git hooks** (once per repository): @@ -180,39 +102,52 @@ sk install --install-git-hooks # fallback: python3 ~/.copilot/tools/install.py --install-git-hooks ``` +**Enforcement scope and known limitations:** + +- **Local-only.** Cloud-delegated or remote agent runs are not covered. +- **`preToolUse` non-inheritance.** The `preToolUse` guard in the main session is + defense-in-depth — it does not replace git hooks. Whether `preToolUse` propagates into + `task()`-spawned subagents is undefined by the platform. +- **Same-repo multi-session supported (phase 5) — with working-tree caveat.** `tentacle_id` + isolation at the marker/runtime layer means two instances with the same logical name in the + same repo each hold a separate entry and `complete` only clears the matching one. However, + the working tree and git index are shared — concurrent tentacles with overlapping file scopes + will produce conflicts. Keep scopes non-overlapping. +- **Collision-resolved slug names:** When `create` auto-resolves a directory collision, the + printed `-` slug must be used for all subsequent commands. +- **After tool updates,** `auto-update-tools.py` does NOT auto-reinstall git hooks. Re-run + `install.py --install-git-hooks` in each protected repo after relevant updates. + | Convention | What to do | |------------|-----------| -| **Commit restriction** | Do not run `git commit` or `git push`. Both are blocked at the filesystem level while `dispatched-subagent-active` marker is fresh. | -| **Stay in scope** | Do not edit files outside your tentacle's declared `scope`. | -| **Escalate, don't expand** | If scope is insufficient, write the gap to `handoff.md` and stop. | -| **No over-implementation** | Implement only what your todos specify. | -| **Handoff before stopping** | Always write a structured handoff: `sk tentacle handoff "" --status DONE --changed-file --learn` | -| **No platform `create` for reports** | Use `tentacle.py handoff` to persist output to `handoff.md`. | - ---- +| **Commit restriction** | Do not run `git commit` or `git push`. When git hooks are installed, both are blocked at the filesystem level while the `dispatched-subagent-active` marker is fresh. Even without hooks, committing from a subagent mid-run risks corrupting the orchestrator's merge flow. | +| **Stay in scope** | Do not edit files outside your tentacle's declared `scope`. If you discover that more files are needed, escalate — do not expand unilaterally. | +| **Escalate, don't expand** | If your scope is insufficient to complete the task, write the gap to `handoff.md` (e.g. "blocked: need changes in `src/db/` which is outside my scope") and stop. The orchestrator decides whether to create a new tentacle or adjust scope. | +| **No over-implementation** | Implement only what your todos specify. Do not add features, refactors, or improvements that are not in your todo list — even if they seem obvious. | +| **Handoff before stopping** | Always write a structured handoff before marking your work done — even if the session ends early. Use `tentacle.py handoff "" --status --changed-file --learn`. Required fields: a prose summary and `--status` (one of `DONE`, `BLOCKED`, `TOO_BIG`, `AMBIGUOUS`, `REGRESSED`). Add one `--changed-file` per modified file; omit it when no files changed (common for `BLOCKED`, `TOO_BIG`, or `AMBIGUOUS`). Old form `handoff "" --learn` still works when no structured status is needed. The orchestrator reads `STATUS:` and `Changed:` receipts to decide next steps and triage. | +| **No platform `create` for reports** | Do **not** use the runtime platform's `create` file-creation tool to save research output, investigation findings, or final reports. The `create` tool is a platform capability that is **not available in all agent runtimes** (cloud agents, Copilot cloud runs, background tasks). Use `tentacle.py handoff` to persist agent output to `handoff.md` — this is always available when `tentacle.py` is on disk. If even `tentacle.py` is unavailable, print the report to chat so the orchestrator can capture it. Orchestrators must not assume sub-agents can create arbitrary files. | ## Anti-patterns -- ❌ SQL/markdown todos only for multi-agent work → agents lose scope isolation -- ❌ Launching sub-agents without `swarm` prompt → no scope, constraints, or key files -- ❌ Skipping `--briefing` → past mistakes not injected into CONTEXT.md +- ❌ SQL/markdown todos only for multi-agent work → agents lose scope isolation and CONTEXT.md +- ❌ Launching sub-agents without `swarm` prompt → agent gets no scope, constraints, or key files +- ❌ Skipping `--briefing` on create → past mistakes not injected into CONTEXT.md - ❌ Skipping `complete` before `delete` → learnings from handoff.md lost permanently - ❌ Overlapping tentacle scopes → agents overwrite each other's work -- ❌ Creating tentacles from intuition without a generated-and-reviewed step file -- ❌ Copying `task-step-generator` output blindly without checking dependencies -- ❌ Skipping the runtime bundle on multi-agent work → no `recall-pack.json` -- ❌ Sub-agent commits or pushes → blocked by git hooks when installed -- ❌ Sub-agent edits files outside declared scope → silent conflicts -- ❌ Treating confidence `< 1.0` as acceptable → use opus research loop, never skip -- ❌ **Marking BLOCKED when stuck** → loop via peer leader discussion instead -- ❌ **Using haiku/sonnet for leader agents** → all leaders must use `claude-opus-4.7` -- ❌ **Skipping test-leader** → every feature/fix needs test coverage verified by test-leader -- ❌ **qa-leader bypassed on cross-surface changes** → Python + browse-ui + Rust changes always need qa-leader -- ❌ Accepting sub-agent claims of "tests pass" without running commands → unverified claims are not evidence -- ❌ Closing a tentacle `DONE` with no verification evidence → treated as `AMBIGUOUS` -- ❌ Sub-agent uses the platform `create` file-creation tool for research output → use `tentacle.py handoff` - ---- +- ❌ Creating tentacles directly from intuition without a generated-and-reviewed step file +- ❌ Copying `task-step-generator` output blindly without checking dependencies, ownership, work-in-progress limits, evidence, and agent fit +- ❌ Skipping the runtime bundle on multi-agent work → agents lose file-backed context and `recall-pack.json` +- ❌ Using `--briefing --output json --no-bundle` → briefing cannot be represented without the bundle +- ❌ Sub-agent commits or pushes → blocked by git hooks when installed (and risky regardless: corrupts orchestrator's merge/verify flow) +- ❌ Sub-agent edits files outside declared scope → silent conflicts with other parallel agents +- ❌ Sub-agent silently expands scope instead of escalating → orchestrator loses visibility +- ❌ Treating confidence `< 1.0` as acceptable → split ambiguity and run opus-class research + before implementation, deletion, merge, or routing decisions +- ❌ Skipping `install.py --install-git-hooks` → git-level commit/push guard is inactive; enforcement falls back to preToolUse only (not guaranteed in subagent contexts) +- ❌ Accepting sub-agent claims of "tests pass" / "lint clean" / "CI green" without running the commands → unverified claims are not evidence; always run the gates yourself and record output +- ❌ Closing a tentacle `DONE` with no verification evidence → treated as `AMBIGUOUS`; requires triage before proceeding +- ❌ Closing an issue without per-criterion evidence → acceptance criteria are unproven until commands run and output is recorded +- ❌ Sub-agent uses the platform `create` file-creation tool to save research or investigation output → use `tentacle.py handoff` instead. The `create` tool is a runtime-platform capability and is **not guaranteed in all agent contexts** (cloud agents, Copilot cloud runs, background tasks). `tentacle.py handoff` writes to `handoff.md` in the tentacle directory and is always available as long as `tentacle.py` is on disk. If `tentacle.py` is also unavailable, fall back to printing the report to chat. Orchestrators should not assume sub-agents can create arbitrary files. ## Core concept @@ -236,44 +171,33 @@ A **tentacle** is a scoped work unit stored as files: The octopus metaphor: one orchestrator (you), multiple tentacles (agents), each handling a distinct code region. -**Task:** Add token cost display to browse-ui (multi-surface: Python API + React component + Vitest tests) - -**Confidence check:** Python API shape? → confidence 0.8 → spawn research-leader tentacle first +**Task:** Add dark mode support to a Next.js app -**After research (confidence 1.0):** +**Decomposition:** +- `theme-tokens` tentacle — scope: `src/styles/tokens.css`, `tailwind.config.ts` — create CSS variables for dark/light palettes +- `component-update` tentacle — scope: `src/components/**/*` — apply `dark:` Tailwind classes to all components +- `test-suite` tentacle — scope: `tests/**/*` — write Playwright visual regression tests for dark mode -**Opus Leader decomposition:** -- `dev-leader-cost-api` — scope: `browse/routes/*.py`, `browse/api/*.py` — add /api/session/cost endpoint -- `browse-leader-cost-ui` — scope: `browse-ui/src/components/**/*` — React cost display component -- `test-leader-cost` — scope: `test_*.py`, `browse-ui/src/**/*.test.*` — Python unit tests + Vitest tests -- `qa-leader-cost` — scope: `.` — verify cross-surface sync, run all gates - -**Dispatch order:** dev-leader + browse-leader in parallel → test-leader → qa-leader → commit - -Each leader uses `--model claude-opus-4.7`. +Each tentacle is independent, non-overlapping, and completable in isolation. The orchestrator merges results after all three pass verification gates. ---- - ## Internal workflow The workflow has 5 phases: **Clarify → Plan → Execute → Verify → Close**. -Clarification is the most important phase. A bug found in spec costs 1x to fix. Found in code: 10x. Found in production: 100x. Never skip this phase. +Clarification is the most important phase. A bug found in spec costs 1x to fix. Found in code: 10x. Found in production: 100x. Never skip this phase — time invested here prevents entire categories of downstream waste. ### Phase 0: Clarify Spec (Steps 0.0–0.5) This phase takes a raw specification and makes it implementation-ready through iterative Q&A. No planning or coding happens until the spec is CLEAN. -- **Step 0.0** (optional): Co-author the spec when user has no written spec +- **Step 0.0** (optional): Co-author the spec when user has no written spec — structured context gathering + iterative drafting - **Steps 0.1–0.4**: Analyze spec against 8 quality dimensions, generate Spec Health Report, iterative refinement until CLEAN - **Step 0.5**: Reader Testing — verify a fresh agent (no context) can correctly understand the spec For the full process, see `references/spec-clarification.md`. -**Gate**: Never proceed to Phase 1 until the spec is CLEAN and reader-tested. - -**Multi-platform note**: For cross-surface tasks, clarification must identify which surfaces are affected (Python / browse-ui / Rust) so the leader mapping is correct before any planning. +**Gate**: Planning on an unclear spec produces incorrect decomposition, wasted agent work, and rework. Never proceed to Phase 1 until the spec is CLEAN and reader-tested. ### Phase 1: Plan @@ -284,103 +208,103 @@ Use the CLEAN spec and its Impact Analysis / Risk Assessment to inform decomposi Use `task-step-generator` before creating tentacles: ```text -Generate a step file for this task. Include CLARIFY, RED evidence/test strategy, BUILD, TEST, REVIEW, LOOP-EVAL, and COMMIT/CLOSE. +Generate a step file for this task. Include CLARIFY, RED evidence/test strategy for implementation or fixes, BUILD, TEST, REVIEW, LOOP-EVAL when iteration is likely, and COMMIT/CLOSE. ``` +The output may say the task is too large for a single step file. That is acceptable: use the step file as a top-level scaffold, then split reviewed steps into tentacles. + #### Plan B: Review and edit the generated steps -Apply `references/decomposition-review.md`. Verify: +Apply `references/decomposition-review.md`. At minimum, verify: + - acceptance signal is observable, - RED evidence/test strategy exists before implementation, - dependencies are ordered before parallel work, - steps are small enough to review in one context, -- each step maps to the correct **Opus Leader** agent type. +- only independent work is parallelized, +- each evidence-producing step names logs/screenshots/hashes or equivalent artifacts, +- each step maps to the correct agent type/model available in the project. -#### Plan C: Confidence check before decomposition +Do not proceed until the reviewed plan clearly states accepted, edited, and rejected steps. -```bash -# For each ambiguous point in the step file: -sk tentacle create research- --scope "." --desc "Research: " --briefing -sk tentacle swarm research- --agent-type research-planner --model claude-opus-4.7 --briefing -# Wait for handoff, evaluate evidence, repeat until confidence = 1.0 -sk tentacle goal gate pass research- --reason "Evidence: " -``` +#### Plan C: Decompose the reviewed task into modules -#### Plan D: Map steps to Opus Leaders +Read the task description and identify independent code regions. Each region becomes one tentacle. -For each tentacle, assign an appropriate leader model: +Each code tentacle must declare: -| Tentacle type | agent_type | Model | Scope pattern | -|--------------|-----------|-------|--------------| -| Python dev (tools/hooks) | general-purpose | `claude-opus-4.7` | `*.py`, `hooks/**/*` | -| Python dev (browse backend) | python-browse-backend | `claude-opus-4.7` | `browse/**/*.py` | -| browse-ui frontend | browse-ui-host-state | `claude-opus-4.7` | `browse-ui/src/**/*` | -| Browse-ui tests (Vitest/Playwright) | general-purpose | `claude-opus-4.7` | `browse-ui/src/**/*.test.*`, `browse-ui/e2e/**/*` | -| Python tests | general-purpose | `claude-opus-4.7` | `test_*.py`, `run_all_tests.py` | -| Security/auth review | browser-security-reviewer | `claude-opus-4.7` | `browse/**/*`, `browse-ui/src/**/*` | -| Research / architecture decisions | research-planner | `claude-opus-4.7` | Read-only, all | -| Cross-surface verification | verification-gate | `claude-opus-4.7` | All changed surfaces | -| Whole-app impact audit | whole-app-impact-auditor | `claude-opus-4.7` | All | -| Docs / skills / hooks | general-purpose | `claude-opus-4.7` | `docs/**/*`, `.github/**/*` | +- source step file and accepted/edited/rejected step numbers, +- decision confidence (`1.0` required; otherwise create a research/validation tentacle first), +- dependency order, +- test/evidence owner, +- implementation owner, +- acceptance signal, +- verification/evidence paths. -#### Plan E: Create tentacles +#### Plan D: Create tentacles ```bash -sk tentacle create \ +sk tentacle create \ --scope "" \ - --profile "" \ + --profile "" \ --desc "" \ --briefing +# fallback: python3 ~/.copilot/tools/tentacle.py create ... ``` -Available profiles (from `.github/agents/`): -- `browse-ui-host-state` — browse-ui host/provider work -- `python-browse-backend` — Python backend routes -- `browser-security-reviewer` — security review -- `research-planner` — research and architecture -- `verification-gate` — gate verification -- `whole-app-impact-auditor` — cross-surface impact +The `--profile` flag loads a specialist `.agent.md` contract (role, domain, +quality gates, escalation rules, and evidence requirements) into CONTEXT.md and +meta.json. The `--briefing` flag injects past mistakes and patterns from +session-knowledge into CONTEXT.md — use both when a suitable profile exists. -#### Plan F: Add todos +#### Plan E: Add todos ```bash sk tentacle todo add "" +# fallback: python3 ~/.copilot/tools/tentacle.py todo add "" ``` -#### Plan G: Enrich CONTEXT.md +Each todo should be one deliverable — testable, reviewable, and completable in isolation. -Add to each tentacle's CONTEXT.md: -- **Step-plan review**: accepted/edited/rejected steps, confidence evidence -- **Multi-platform context**: which surfaces are touched and why -- **Key files**: full paths to reference files -- **Constraints**: project rules (stdlib-only Python, parameterized SQL, no pickle, etc.) -- **Verification requirement**: what commands prove this tentacle DONE +#### Plan F: Enrich CONTEXT.md ---- +Read reference files with `view`, then edit CONTEXT.md to add: +- **Step-plan review**: source step file, accepted/edited/rejected steps, dependency order, and evidence contract +- **What exists**: describe the current code in the scope area +- **Key files**: full paths to reference files the agent needs +- **Constraints**: rules specific to this code region + +This is the most important step. Agent quality is directly proportional to CONTEXT.md quality. ### Phase 2: Execute (Steps 5–6) -#### Step 5: Dispatch Opus Leaders (swarm) +#### Step 5: Dispatch agents (swarm) ```bash -# Always use claude-opus-4.7 for all leaders -sk tentacle swarm --agent-type --model claude-opus-4.7 --briefing +sk tentacle swarm --agent-type --model --briefing sk tentacle swarm --output parallel --briefing +sk tentacle dispatch --agent-type --model --briefing +# fallback: python3 ~/.copilot/tools/tentacle.py swarm/dispatch ... +``` -# Example: multi-surface feature -# Wave 1 (parallel): dev + browse -sk tentacle swarm dev- --agent-type general-purpose --model claude-opus-4.7 --briefing & -sk tentacle swarm browse- --agent-type browse-ui-host-state --model claude-opus-4.7 --briefing & -wait +`swarm` and `dispatch` materialize a runtime bundle by default. If the tentacle +was created with `--profile`, dispatch also injects a compact Specialist Profile +section before the task list so the worker sees the expert role, gates, escalation +rules, and evidence requirements. The dispatch prompt stays token-lean and points +agents at `.octogent/tentacles//bundle/manifest.json` first. +The bundle carries the full `CONTEXT.md`, todos, latest checkpoint, instruction snippets, +skills catalogue, and `recall-pack.json`. -# Wave 2 (after Wave 1): tests -sk tentacle swarm test- --agent-type general-purpose --model claude-opus-4.7 --briefing +`--briefing` fetches live past knowledge from session-knowledge at dispatch time. With the +default bundle, briefing is stored in `briefing.md` and machine-readable recall is stored in +`recall-pack.json` by trying `briefing.py --task --json` first and falling back to +`briefing.py "" --pack --limit 3`. Use `--no-bundle` only for tiny/manual prompts; if +you combine `--output json --briefing`, keep the default bundle enabled so JSON output can +surface `bundle_path`. -# Wave 3 (after Wave 2): QA gate -sk tentacle swarm qa- --agent-type verification-gate --model claude-opus-4.7 --briefing -``` +Use the output as the prompt for `task()`. Launch independent tentacles in parallel. -Every implementation tentacle must execute strict-TDD internally: define failing evidence first, make the smallest change, prove criterion turns green. +Every implementation or bug-fix tentacle must execute the strict-TDD loop internally: define or reproduce the failing evidence first, make the smallest change, then prove the same criterion turns green. Research-only, documentation-only, and review-only tentacles still need explicit evidence gates, but they do not fabricate code tests just to satisfy the pattern. #### Step 6: Monitor progress @@ -389,245 +313,159 @@ sk tentacle status sk tentacle show ``` -If a leader reports `AMBIGUOUS` or `PEER_DISCUSS`, dispatch a research or peer tentacle immediately — do not wait: +### Phase 3: Verify (Steps 7–12) -```bash -# Leader is stuck: dispatch research immediately -sk tentacle create research- --scope "." --desc "Research: " --briefing -sk tentacle swarm research- --agent-type research-planner --model claude-opus-4.7 --briefing -# After research resolves: resume the stuck leader -sk tentacle resume -``` +Every step here catches a different class of agent error. For detailed gate descriptions (build, lint, test, review, docs, QA audit), see `references/verification-gates.md`. ---- +Summary: -### Phase 3: Verify (Steps 7–12) +| Gate | What it catches | Skip when | +|------|----------------|-----------| +| **Build** | Syntax errors, type mismatches, import failures | Never skip | +| **Lint** | Style violations, unused imports, formatting | Never skip | +| **Test** | Logic bugs, regressions, broken contracts | Never skip | +| **Review** | Security issues, design flaws, scope creep | Never skip | +| **Docs** | Stale README, outdated JSDoc, missing CHANGELOG | Internal refactors only | +| **QA audit** | Hallucinated tests, spec mismatches, blind spots | Low-risk changes only | -Every step catches a different class of agent error. - -| Gate | What it catches | Command | Skip when | -|------|----------------|---------|-----------| -| **Python build** | Syntax, import failures | `python3 -c "import ast; ast.parse(open('file.py').read())"` | Never | -| **Python lint** | Ruff violations | `ruff check *.py hooks/**/*.py` | Never | -| **Python tests** | Logic bugs, regressions | `python3 test_security.py && python3 test_fixes.py` | Never | -| **browse-ui typecheck** | TypeScript errors | `cd browse-ui && pnpm typecheck` | Never (if browse-ui touched) | -| **browse-ui lint** | ESLint violations | `cd browse-ui && pnpm lint` | Never (if browse-ui touched) | -| **browse-ui format** | Prettier violations | `cd browse-ui && pnpm format:check` | Never (if browse-ui touched) | -| **browse-ui test** | Vitest failures | `cd browse-ui && pnpm test` | Never (if browse-ui touched) | -| **browse-ui build** | Next.js build | `cd browse-ui && pnpm build` | Never (if browse-ui touched) | -| **Rust** | Compile + clippy | `cargo fmt --check && cargo clippy -- -D warnings && cargo test` | If Rust untouched | -| **Review** | Security, design flaws | `code-reviewer` agent | Never | -| **Docs sync** | Stale docs | Check `docs/` matches changed behavior | Internal refactors only | -| **QA audit** | Hallucinated tests, blind spots | `qa-leader` tentacle | Low-risk only | - -**Verification surface matrix** (from `copilot-instructions.md`): - -| Surface | Required evidence | -|---------|-------------------| -| Python | `python3 test_security.py && python3 test_fixes.py` | -| Hooks/docs/skills | `python3 tests/test_quality_gates.py` | -| browse-ui | `pnpm typecheck && pnpm lint && pnpm format:check && pnpm test && pnpm build` | -| Rust | `cargo fmt --all -- --check && cargo clippy -- -D warnings && cargo test` | -| remote-terminal | `npm test && npm run lint && npm run lint:clean` | - -**Evidence requirement:** Run commands yourself. Record output. "Tests pass" is not evidence — the command output is. +The first 4 gates are mandatory. Skipping any of them means you don't know if the agent output is correct. ---- +**Evidence requirement:** Each gate must produce concrete, recorded output before being marked as passed. Do not rely on agent claims that "lint is clean" or "tests pass" — run the commands yourself and attach or reference the output. A gate is only passed when you hold the proof, not when the sub-agent says it is. See Rule 9 (Claims Require Evidence) in `docs/AGENT-RULES.md`. ### Phase 3.5: Goal Evaluation Loop -After all verification gates pass, evaluate whether the overarching goal is met. This is the **loop-until-verified** phase. +After all verification gates pass, evaluate whether the overarching goal is met before proceeding to commit and close. This is the **loop-until-verified** phase — the orchestrator decides whether to iterate or close. + +**Step: Record goal-eval evidence** ```bash -# Run success-criteria check and persist result +# Run the goal's success-criteria check and persist the result sk tentacle verify "" --label "goal-eval" -sk tentacle goal criteria check -sk tentacle goal eval --decision continue|complete +# fallback: python3 ~/.copilot/tools/tentacle.py verify ... ``` +**Decision logic:** + | Result | Action | |--------|--------| -| Goal met — all success criteria satisfied | Proceed to Phase 4 | -| Goal partially met — gaps identified | Return to Phase 1, create new leader tentacles for gaps | -| Goal blocked by external dependency | Surface to user. **Do not exit loop without user decision.** | +| Goal met — all success criteria satisfied | Proceed to Phase 4 (Commit + Close) | +| Goal partially met — remaining gaps identified | Return to Phase 1 (Plan), create new tentacles for gaps | +| Goal blocked — external dependency or scope issue | Write gap to handoff, surface to user, decide whether to continue | -**Loop rules:** -1. Success criteria defined in Phase 1, not invented during evaluation. -2. The orchestrator owns the loop — sub-agents report via handoff and stop. -3. Create **new tentacles** for remaining gaps; do not re-open completed tentacles. -4. Record evidence for every evaluation using `tentacle.py verify`. -5. Do not infer goal status from handoff prose — run the command. +**Rules:** +1. Success criteria must be defined **before** dispatching tentacles (in Phase 1), not invented during evaluation. +2. Evaluation is the **orchestrator's responsibility** — sub-agents do not loop. They report via handoff and stop. +3. When looping, create **new tentacles** for remaining gaps; do not re-open completed tentacles. +4. Record evidence for every evaluation using `tentacle.py verify` so the decision is auditable. Closing without recorded evidence is an anti-pattern — it removes the audit trail. +5. Do not infer goal status from handoff prose alone. Run the success-criteria command and record its output. ---- +**Example loop iteration:** + +``` +Goal: "All 137 tests pass and benchmark score ≥ 90" + +Wave 1 results: 130/137 tests pass, score = 85 +→ Eval: NOT MET. Gaps: 7 failing tests, score delta = 5 pts +→ Create tentacle "fix-failing-tests" (scope: tests/), tentacle "benchmark-perf" (scope: embed.py) +→ Dispatch Wave 2 + +Wave 2 results: 137/137 pass, score = 92 +→ Eval: MET. Proceed to Phase 4. +``` ### Phase 4: Commit + Close (Steps 13–17) -#### Step 13: Commit (orchestrator only) +#### Step 13: Commit after each completed phase (orchestrator only) -```bash -git add -A && git commit -m "feat(): +Commit working code after completing each major phase — not just at the end. +If a later phase fails or the session crashes, earlier work is preserved and rollback is possible. -Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com>" +```bash +git add -A && git commit -m "feat(): " ``` -Commit cadence: -- After Phase 1 foundation tentacles + build passes → commit -- After each parallel batch + build passes → commit -- After Phase 3 verification → commit -- Final integration → commit +**Commit cadence:** +- After Phase 1 shared/foundation tentacles complete + build passes → commit +- After each Phase 2 parallel batch completes + build passes → commit +- After Phase 3 verification passes → commit +- Final integration wiring → commit + +**Commit restriction:** Sub-agents must not run `git commit` or `git push`. When git hooks +are installed (`install.py --install-git-hooks`), both operations are blocked at the git level +while the `dispatched-subagent-active` marker is fresh. Even without hooks, this is a hard +convention: the orchestrator commits after merging and verifying all tentacle results. +This enforcement is **local-only** — cloud-delegated or remote agent runs are not covered. #### Step 14: Runtime verification -```bash -# Python browse backend -python3 browse.py --dev & -curl -s http://localhost:8765/healthz | jq . +Build passing ≠ app works. After all tentacles are merged, run the app: -# browse-ui -cd browse-ui && pnpm dev & -# Check browser: http://localhost:3000 +```bash +# Desktop: ./gradlew :composeApp:jvmRun +# Mobile: deploy to emulator/simulator +# Web: npm run dev / python manage.py runserver ``` +DI frameworks (Koin, Dagger, Spring) crash at runtime if bindings are missing — the compiler +won't catch this. A 30-second launch test catches what build+test cannot. + #### Step 15: Complete and learn ```bash sk tentacle complete ``` -Always call `complete` before `delete`. `complete` auto-extracts learnings into long-term knowledge. +Only call `complete` after all verification gates pass. This marks all todos done and auto-extracts learnings from handoff.md into long-term knowledge. -#### Step 16: Resume an interrupted tentacle +#### Step 16: Resume a tentacle (when picking up interrupted work) ```bash sk tentacle resume # Refresh briefing, mark active -sk tentacle resume --no-briefing # Skip if briefing is fresh +sk tentacle resume --no-briefing # Skip briefing injection +# fallback: python3 ~/.copilot/tools/tentacle.py resume [--no-briefing] ``` +`resume` refreshes the live briefing in CONTEXT.md and marks the tentacle active again. Use it when returning to a tentacle after an interruption or session boundary. Pass `--no-briefing` only when the briefing is already fresh and re-fetching would be wasteful. + #### Step 17: Cleanup ```bash sk tentacle delete ``` ---- - -## Verification summary - -Full gate table (mirrors Phase 3 gates): - -| Gate | Command | Surface | -|------|---------|---------| -| Python security | `python3 test_security.py` | Python | -| Python fixes | `python3 test_fixes.py` | Python | -| Python all | `python3 run_all_tests.py` | Python | -| Quality gates | `python3 tests/test_quality_gates.py` | Hooks/docs/skills | -| TS typecheck | `cd browse-ui && pnpm typecheck` | browse-ui | -| TS lint | `cd browse-ui && pnpm lint` | browse-ui | -| TS format | `cd browse-ui && pnpm format:check` | browse-ui | -| Vitest | `cd browse-ui && pnpm test` | browse-ui | -| Next.js build | `cd browse-ui && pnpm build` | browse-ui | -| Rust | `cargo fmt --all -- --check && cargo clippy -- -D warnings && cargo test` | Rust | -| Code review | `code-reviewer` agent | Any | - ---- - ## CLI reference -```bash -# Planning -sk tentacle create --scope "" --desc "" --briefing [--profile ] -sk tentacle todo add "" - -# Dispatch (always use --model claude-opus-4.7 for leaders) -sk tentacle swarm --agent-type --model claude-opus-4.7 --briefing -sk tentacle swarm --output parallel --briefing -sk tentacle dispatch --agent-type --model claude-opus-4.7 --briefing +See `references/cli-reference.md` for the full command reference, CONTEXT.md template, and agent selection guidance. -# Monitoring -sk tentacle status -sk tentacle show +Quick reference: -# Handoff +```bash +tentacle.py create --scope "" --desc "" --briefing +tentacle.py todo add "" +tentacle.py swarm --agent-type --model --briefing # bundle-first default +tentacle.py swarm --output parallel --briefing # one worker per todo +sk tentacle swarm --output json --briefing # JSON + bundle_path +sk tentacle dispatch --agent-type --briefing # single-agent dispatch +sk tentacle swarm --no-bundle # rare opt-out for tiny prompts sk tentacle handoff "" --status DONE --changed-file --learn -sk tentacle handoff "" --status AMBIGUOUS # request peer input - -# Goal loop -sk tentacle goal init --title "" -sk tentacle goal link -sk tentacle goal criteria check -sk tentacle goal eval --decision continue|complete -sk tentacle goal gate pass --reason "" -sk tentacle verify "" --label "goal-eval" -sk tentacle goal verify-loop [--escalate] -sk tentacle goal resume - -# Close -sk tentacle resume +sk tentacle goal init --title "" [--desc ""] +sk tentacle goal link # stamp goal metadata into meta.json +sk tentacle goal eval --decision continue|pause|complete|abandon # record orchestrator decision +sk tentacle goal status [--format text|json] +sk tentacle resume # resume interrupted tentacle (refreshes briefing) +sk tentacle resume --no-briefing # resume without re-fetching briefing +sk tentacle status sk tentacle complete sk tentacle delete -sk tentacle marker-cleanup [--apply] - # fallback: python3 ~/.copilot/tools/tentacle.py ``` ---- - ## Tips -1. **Opus for all leaders** — never dispatch a leader with haiku or sonnet; quality-over-speed means `claude-opus-4.7` for every leader tentacle -2. **Loop, never block** — when confidence < 1.0, create a research-leader tentacle; never write BLOCKED to a handoff without first trying the loop -3. **Peer leaders as peers** — if dev-leader is stuck on a test strategy, handoff to test-leader for input; leaders collaborate, not silo -4. **Invest in CONTEXT.md** — 2-3 minutes writing good context saves 10 minutes of agent confusion -5. **Keep todos atomic** — each item = one testable deliverable -6. **No scope overlap** — overlapping scopes cause agents to overwrite each other -7. **Complete before delete** — `complete` saves learnings; `delete` alone loses them -8. **Commit after each phase** — uncommitted code is lost if the session crashes -9. **Run the app** — build+test ≠ works. Launch browse-ui + Python backend to verify E2E behavior -10. **Multi-surface = multi-wave** — dev + browse-leader in parallel, then test-leader, then qa-leader; never send qa-leader before tests pass -11. **⚠️ Commit restriction** — Sub-agents must not run `git commit`/`git push`; enforced by git hooks when installed - ---- - -## ⛔ Workflow Integration - -This project's verification workflow (from `copilot-instructions.md` and `AGENTS.md`) maps to tentacle phases as follows: - -| Outer Workflow Phase | Tentacle Phase | -|---------------------|---------------| -| **Preflight**: `sk briefing --auto --compact` | Phase 0: Clarify Spec | -| **Edit**: minimal footprint, no SQL interpolation | Phase 2: Execute | -| **Verification by surface**: run all gates | Phase 3: Verify | -| **Closeout**: `sk learn`, `task_complete` | Phase 4: Close | - -**Key rule**: The tentacle's internal lifecycle (Clarify→Plan→Execute→Verify→Close) is NOT the entire workflow. The outer workflow gates (briefing, verification, learn) must run AROUND the tentacle lifecycle. - -``` -sk briefing --auto --compact ← BEFORE first tentacle - │ - ▼ -Tentacle Lifecycle (Clarify→Plan→Execute→Verify→Close) - │ - ▼ -python3 test_security.py ← AFTER all tentacles complete (Python surface) -python3 test_fixes.py - │ - ▼ -sk learn --pattern/--mistake ← BEFORE task_complete - │ - ▼ -task_complete / git commit -``` - ---- - -## Reference docs - -- `~/.copilot/tools/skills/tentacle-orchestration/references/` — canonical reference docs - - `cli-reference.md` — full command reference and CONTEXT.md template - - `decomposition-review.md` — step file review checklist - - `verification-gates.md` — gate descriptions - - `spec-clarification.md` — Phase 0 full process -- `docs/AGENT-RULES.md` — all 11 agent rules including confidence gate and tentacle obligations -- `docs/ARCHITECTURE.md` — Python/Rust boundary, script inventory -- `docs/HOOKS.md` — hook enforcement table -- `.github/agents/*.agent.md` — available specialist agent profiles +1. **Invest in CONTEXT.md** — 2-3 minutes writing good context saves 10 minutes of agent confusion +2. **Keep todos atomic** — each item = one testable deliverable +3. **No scope overlap** — overlapping scopes cause agents to overwrite each other +4. **Complete before delete** — `complete` saves learnings; `delete` alone loses them +5. **Commit after each phase** — uncommitted code is lost if the session crashes or compacts +6. **Run the app** — build+test ≠ works. Launch the app to verify DI resolution and runtime behavior +7. **⚠️ Commit restriction** — Sub-agents must not run `git commit`/`git push`. When git hooks are installed (`sk install --install-git-hooks`), both are blocked at the filesystem level for the repo where the tentacle was dispatched, while the `dispatched-subagent-active` marker is fresh. Commits in other repos are not affected. Even without hooks, a sub-agent commit mid-run corrupts the orchestrator's merge flow. Enforcement is local-only; cloud-delegated runs are not covered. diff --git a/mcp-server.py b/mcp-server.py index 58df9eda..e3e23a4f 100644 --- a/mcp-server.py +++ b/mcp-server.py @@ -233,11 +233,14 @@ def _load_script_module(module_name: str, filename: str): ] -def _require_string(arguments: dict[str, Any], key: str) -> str: +def _require_string(arguments: dict[str, Any], key: str, max_length: int | None = None) -> str: value = arguments.get(key) if not isinstance(value, str) or not value.strip(): raise JsonRpcError(JSONRPC_INVALID_PARAMS, f"'{key}' must be a non-empty string") - return value.strip() + stripped = value.strip() + if max_length is not None: + stripped = stripped[:max_length] + return stripped def _optional_string(arguments: dict[str, Any], key: str, max_length: int = 200) -> str: @@ -472,14 +475,16 @@ def _run_query_memory(arguments: dict[str, Any]) -> dict[str, Any]: def _run_learn(arguments: dict[str, Any]) -> dict[str, Any]: """Write a knowledge entry by calling learn.py as a subprocess.""" + _check_auth(arguments) + category = _require_string(arguments, "category") if category not in VALID_LEARN_CATEGORIES: raise JsonRpcError( JSONRPC_INVALID_PARAMS, f"'category' must be one of: {', '.join(sorted(VALID_LEARN_CATEGORIES))}", ) - title = _require_string(arguments, "title") - description = _require_string(arguments, "description") + title = _require_string(arguments, "title", max_length=500) + description = _require_string(arguments, "description", max_length=10_000) tags = _optional_string(arguments, "tags", max_length=500) learn_py = TOOLS_DIR / "learn.py" @@ -487,7 +492,7 @@ def _run_learn(arguments: dict[str, Any]) -> dict[str, Any]: raise JsonRpcError(JSONRPC_INTERNAL_ERROR, "learn.py not found") flag = f"--{category}" - cmd = [sys.executable, str(learn_py), flag, title, description] + cmd = [sys.executable, str(learn_py), flag, title, description, "--json"] if tags: cmd += ["--tags", tags] @@ -502,13 +507,23 @@ def _run_learn(arguments: dict[str, Any]) -> dict[str, Any]: msg = result.stderr.strip() or result.stdout.strip() or "learn.py failed" raise JsonRpcError(JSONRPC_INTERNAL_ERROR, msg) - combined = result.stdout + result.stderr entry_id = None - m = re.search(r"#(\d+)", combined) - if m: - entry_id = int(m.group(1)) + try: + parsed = json.loads(result.stdout.strip()) + raw_id = parsed.get("id") + if isinstance(raw_id, int) and raw_id > 0: + entry_id = raw_id + except (json.JSONDecodeError, AttributeError): + pass - body = {"status": "ok", "message": "Entry recorded", "id": entry_id} + status = "ok" + if entry_id is None: + # Fallback: scan for "Added new #N" pattern in combined output + m = re.search(r"Added new \w+ #(\d+)", result.stdout + result.stderr) + if m: + entry_id = int(m.group(1)) + + body = {"status": status, "message": "Entry recorded", "id": entry_id} return { "content": [{"type": "text", "text": json.dumps(body, ensure_ascii=False)}], "structuredContent": body, @@ -562,16 +577,18 @@ def _run_status(_arguments: dict[str, Any]) -> dict[str, Any]: db_uri = _DB_PATH.as_uri() + "?mode=ro" db = sqlite3.connect(db_uri, uri=True) try: - row = db.execute("SELECT COUNT(*) FROM sessions").fetchone() - session_count = row[0] if row else 0 - except sqlite3.OperationalError: - session_count = 0 - try: - row = db.execute("SELECT COUNT(*) FROM knowledge_entries").fetchone() - entry_count = row[0] if row else 0 - except sqlite3.OperationalError: - entry_count = 0 - db.close() + try: + row = db.execute("SELECT COUNT(*) FROM sessions").fetchone() + session_count = row[0] if row else 0 + except sqlite3.OperationalError: + session_count = 0 + try: + row = db.execute("SELECT COUNT(*) FROM knowledge_entries").fetchone() + entry_count = row[0] if row else 0 + except sqlite3.OperationalError: + entry_count = 0 + finally: + db.close() except Exception: pass @@ -613,9 +630,9 @@ def _run_session_list(arguments: dict[str, Any]) -> dict[str, Any]: (limit,), ).fetchall() except sqlite3.OperationalError as exc: - db.close() raise JsonRpcError(JSONRPC_INTERNAL_ERROR, f"Query error: {exc}") from exc - db.close() + finally: + db.close() except JsonRpcError: raise except Exception as exc: