From 6253940ed550461cd52ceb3901abc1f9cb900c18 Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Fri, 29 May 2026 00:13:01 +0530 Subject: [PATCH 01/44] refactor(v3.0.1): extract jsonl_store amendment-overlay primitives MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds three free functions to jsonl_store.py that the five v3.1.0 memory subsystems (working, skills, activity, pending_conflicts, reflections) will reuse instead of each copy-pasting the merge dance: - read_merged(path, *, id_field, amendment_field) — the existing amendment-overlay logic from decisions_store._read_merged, with id_field / amendment_field knobs so non-decisions stores can reuse it. - compact(path, *, keep_predicate) — atomic predicate-based rewrite, needed for working-memory eviction during codevira sync. Preserves malformed lines (filtering ≠ corruption cleanup). - read_recent(path, *, limit, ts_field) — sort-by-ts-desc + slice, extracted from sessions_store.read_recent. Also documents the _schema_v: 1 convention for v3.1+ JSONL stores (decisions/sessions schemas unchanged — readers tolerate absence). decisions_store._read_merged and sessions_store.read_recent are re-implemented as thin one-line wrappers over the new primitives; zero behavior change for existing callers. 144 storage tests (including 20 new tests for the primitives + amendment-chain-three- deep recursion semantics from plan B3) pass green. Prerequisite for v3.1.0 memory subsystem work. Co-Authored-By: Claude Opus 4.7 --- mcp_server/storage/decisions_store.py | 36 +--- mcp_server/storage/jsonl_store.py | 186 ++++++++++++++++++++- mcp_server/storage/sessions_store.py | 12 +- tests/storage/test_jsonl_store.py | 228 ++++++++++++++++++++++++++ 4 files changed, 429 insertions(+), 33 deletions(-) diff --git a/mcp_server/storage/decisions_store.py b/mcp_server/storage/decisions_store.py index c19efec..2d94d86 100644 --- a/mcp_server/storage/decisions_store.py +++ b/mcp_server/storage/decisions_store.py @@ -51,35 +51,15 @@ def _read_merged() -> list[dict[str, Any]]: """Read decisions.jsonl + fold amendment lines into their base records. - Amendment lines have ``_amendment_to_id`` matching an original id; - their fields overlay the original (later amendments win over earlier). - Original records are emitted in their original order. + Thin wrapper around the v3.0.1 shared primitive + ``jsonl_store.read_merged`` so working/skills/activity/reflections + stores can reuse the SAME amendment-overlay semantics without + copying the merge dance. Behavior preserved exactly: amendment + records carry the same ``id`` as the base plus a truthy + ``_amendment_to_id`` marker; later amendments win; orphan + amendments emit as their own record for diagnosis. """ - raw = jsonl_store.read_all(paths.decisions_path()) - by_id: dict[str, dict[str, Any]] = {} - order: list[str] = [] # preserve insertion order of base records - - for rec in raw: - did = str(rec.get("id", "")) - if not did: - continue - if rec.get("_amendment_to_id"): - # Overlay onto existing base. - base = by_id.get(did) - if base is None: - # Amendment without a base — shouldn't happen, but - # don't crash; surface it as its own record so the - # user can diagnose. - by_id[did] = dict(rec) - order.append(did) - else: - base.update({k: v for k, v in rec.items() if not k.startswith("_")}) - else: - if did not in by_id: - order.append(did) - by_id[did] = dict(rec) - - return [by_id[did] for did in order] + return jsonl_store.read_merged(paths.decisions_path()) def get(decision_id: str) -> dict[str, Any] | None: diff --git a/mcp_server/storage/jsonl_store.py b/mcp_server/storage/jsonl_store.py index 4781d56..906852b 100644 --- a/mcp_server/storage/jsonl_store.py +++ b/mcp_server/storage/jsonl_store.py @@ -33,6 +33,16 @@ 6. **UTF-8 throughout.** Decisions are human text; emoji, accents, Cyrillic, CJK all round-trip. +7. **Schema versioning convention (v3.0.1+).** New JSONL stores + introduced from v3.1.0 onwards (working, skills, activity, + pending_conflicts, reflections) carry a top-level ``_schema_v`` + integer on each record (starting at ``1``). Readers MUST tolerate + ``_schema_v`` absent (treats as v1) so legacy records keep working. + The existing ``decisions.jsonl`` and ``sessions.jsonl`` schemas are + UNCHANGED — they continue to read via field presence; no version + field is retroactively added. This module is shape-agnostic and does + not enforce versions; per-store wrappers may. + History note: in v2.1.x we used SQLite for all of this. The git-diff hostility of binary blobs + the ChromaDB HNSW corruption pattern pushed us to plain text. JSONL gives us 99% of SQLite's benefits with @@ -45,6 +55,7 @@ import json import logging import os +from collections.abc import Callable from pathlib import Path from typing import Any, Iterator @@ -53,7 +64,7 @@ # one implementation (Posix flock + Windows sentinel fallback). This # module imports the canonical version; the ``_file_lock`` private # alias preserves the symbol any internal caller used previously. -from mcp_server.storage.atomic import file_lock as _file_lock +from mcp_server.storage.atomic import atomic_write_text, file_lock as _file_lock logger = logging.getLogger(__name__) @@ -388,3 +399,176 @@ def _compute_next_id_locked( if len(encoded) <= width: return f"{prefix}{encoded.rjust(width, '0')}" return f"{prefix}{encoded}" + + +# ===================================================================== +# v3.0.1: shared primitives for per-store wrappers (read_merged, +# compact, read_recent). Extracted from decisions_store._read_merged / +# sessions_store.read_recent so the five v3.1.0 memory subsystems +# (working, skills, activity, pending_conflicts, reflections) reuse +# one tested implementation instead of duplicating the amendment- +# overlay dance five times. +# ===================================================================== + + +def read_merged( + path: Path, + *, + id_field: str = "id", + amendment_field: str = "_amendment_to_id", +) -> list[dict[str, Any]]: + """Read a JSONL store and fold amendment lines into their base records. + + Convention (matches existing decisions.jsonl semantics): + + - **Base records** carry an ``id_field`` value and no truthy + ``amendment_field``. They emit in the file's insertion order. + - **Amendment records** carry the SAME ``id_field`` value as the + base they amend, PLUS a truthy ``amendment_field`` value + (typically equal to the same id — it is a marker, not a different + pointer). Their fields overlay onto the base; later amendments + win over earlier ones. Fields whose names start with ``"_"`` + are treated as metadata and are NOT overlaid (so the marker + itself, and any future ``_promoted_to`` / ``_evicted`` style + tombstones, do not leak into user-visible state). + - **Orphan amendments** (amendment_field set but no matching base + seen yet) are emitted as their own record so a misordered or + truncated file can still be diagnosed by callers. + - **Amendment chains** target the base directly. There is no + recursive amendment-of-amendment semantic: every amendment must + reference the base id. Tests cover three consecutive amendments + to one base merging in order. + + Args: + path: JSONL file path. Missing file returns ``[]``. + id_field: schema field carrying the record id. Defaults to + ``"id"`` for compatibility with decisions/sessions. + amendment_field: schema field whose truthiness marks the + record as an amendment. Defaults to ``"_amendment_to_id"``. + + Returns: + List of merged record dicts in base-record insertion order. + + See ``decisions_store._read_merged`` (the original implementation) + and ``mcp_server.storage.decisions_store`` for the canonical caller. + """ + raw = read_all(path) + by_id: dict[str, dict[str, Any]] = {} + order: list[str] = [] # preserves insertion order of base records + + for rec in raw: + did = str(rec.get(id_field, "")) + if not did: + continue + if rec.get(amendment_field): + # Overlay onto existing base, or emit as orphan. + base = by_id.get(did) + if base is None: + # Orphan amendment — should not happen in a well-formed + # file but don't crash; surface it for diagnosis. + by_id[did] = dict(rec) + order.append(did) + else: + base.update({k: v for k, v in rec.items() if not k.startswith("_")}) + else: + if did not in by_id: + order.append(did) + by_id[did] = dict(rec) + + return [by_id[did] for did in order] + + +def compact( + path: Path, + *, + keep_predicate: Callable[[dict[str, Any]], bool], +) -> int: + """Atomically rewrite ``path`` keeping only records where + ``keep_predicate(rec)`` returns True. + + Used for capacity-bounded stores (e.g. v3.1 working-memory + eviction): after appending tombstone amendments throughout a + session, ``compact`` drops the tombstoned records during + ``codevira sync``. + + Concurrency: holds the exclusive file lock for the ENTIRE + read-filter-write so no concurrent appender's record is lost in + the read-vs-write window. ``atomic_write_text`` does not take the + file lock itself (it relies on tempfile + os.replace for atomicity) + so calling it inside this lock does not deadlock. + + Malformed lines are PRESERVED. This function's job is + predicate-based filtering, not corruption cleanup — use + ``codevira doctor`` for the latter so users don't silently lose + data they could otherwise diagnose. + + Args: + path: target JSONL file. Missing file returns ``0`` (no-op). + keep_predicate: callable receiving each parsed dict; return + True to keep, False to drop. Exceptions inside the + predicate propagate (caller's bug, not silently swallowed). + + Returns: + Number of records dropped. + """ + if not path.is_file(): + return 0 + + dropped = 0 + with _file_lock(path, exclusive=True): + kept_lines: list[str] = [] + with open(path, encoding="utf-8") as fh: + for raw in fh: + stripped = raw.rstrip("\n") + if not stripped: + continue + try: + rec = json.loads(stripped) + except json.JSONDecodeError: + # Preserve corrupt lines — compaction is filtering, + # not corruption cleanup. + kept_lines.append(stripped) + continue + if not isinstance(rec, dict): + kept_lines.append(stripped) + continue + if keep_predicate(rec): + kept_lines.append(stripped) + else: + dropped += 1 + + # Trailing newline only when there is content (matches append's + # one-record-per-line + final-newline convention). + new_content = ("\n".join(kept_lines) + "\n") if kept_lines else "" + atomic_write_text(path, new_content) + + return dropped + + +def read_recent( + path: Path, + *, + limit: int, + ts_field: str = "ts", +) -> list[dict[str, Any]]: + """Return the most recent ``limit`` records, sorted by ``ts_field`` + descending (newest first). + + Records missing ``ts_field`` sort to the end (treated as empty + string for ordering). Extracted from + ``sessions_store.read_recent`` so v3.1 stores (working memory, + reflections, activity) get the same behavior without copying the + sort+slice dance. + + Args: + path: JSONL file. Missing file returns ``[]``. + limit: maximum number of records to return. + ts_field: schema field carrying an ISO 8601 timestamp string. + Defaults to ``"ts"``. + + Returns: + List of dicts, newest first, length ``≤ limit``. + """ + all_records = read_all(path) + all_records.sort(key=lambda r: r.get(ts_field) or "", reverse=True) + return all_records[:limit] diff --git a/mcp_server/storage/sessions_store.py b/mcp_server/storage/sessions_store.py index b34fde1..96a3f77 100644 --- a/mcp_server/storage/sessions_store.py +++ b/mcp_server/storage/sessions_store.py @@ -106,10 +106,14 @@ def write_many(logs: list[dict[str, Any]]) -> tuple[list[str], list[dict[str, An def read_recent(limit: int = 20) -> list[dict[str, Any]]: - """Return the most recent ``limit`` session logs, newest first.""" - all_sessions = jsonl_store.read_all(paths.sessions_path()) - all_sessions.sort(key=lambda s: s.get("ts") or "", reverse=True) - return all_sessions[:limit] + """Return the most recent ``limit`` session logs, newest first. + + Thin wrapper around the v3.0.1 shared primitive + ``jsonl_store.read_recent`` so v3.1 stores (working memory, + activity, reflections) get the same sort+slice behavior without + duplicating it. + """ + return jsonl_store.read_recent(paths.sessions_path(), limit=limit) def by_session_id(session_id: str) -> list[dict[str, Any]]: diff --git a/tests/storage/test_jsonl_store.py b/tests/storage/test_jsonl_store.py index 5fa3870..a28694a 100644 --- a/tests/storage/test_jsonl_store.py +++ b/tests/storage/test_jsonl_store.py @@ -301,3 +301,231 @@ def test_1000_records_under_2_seconds( elapsed = time.perf_counter() - t0 assert elapsed < 1.0, f"1000 reads took {elapsed:.2f}s" assert len(records) == N + + +# ===================================================================== +# v3.0.1 shared primitives: read_merged / compact / read_recent +# ===================================================================== + + +class TestReadMerged: + """Covers the amendment-overlay primitive extracted from + decisions_store._read_merged. Convention: amendment record carries + the SAME ``id`` as the base + truthy ``_amendment_to_id`` marker; + later amendments win; orphans emit defensively; underscored fields + are NOT overlaid. + """ + + def test_base_only_passes_through(self, jsonl_path: Path) -> None: + jsonl_store.append(jsonl_path, {"id": "D000001", "decision": "Use bcrypt"}) + merged = jsonl_store.read_merged(jsonl_path) + assert len(merged) == 1 + assert merged[0]["decision"] == "Use bcrypt" + + def test_single_amendment_overlays_field(self, jsonl_path: Path) -> None: + jsonl_store.append( + jsonl_path, + {"id": "D000001", "decision": "Use bcrypt", "do_not_revert": False}, + ) + jsonl_store.append( + jsonl_path, + {"id": "D000001", "_amendment_to_id": "D000001", "do_not_revert": True}, + ) + merged = jsonl_store.read_merged(jsonl_path) + assert len(merged) == 1 + assert merged[0]["do_not_revert"] is True + assert merged[0]["decision"] == "Use bcrypt" # untouched + + def test_amendment_chain_three_deep(self, jsonl_path: Path) -> None: + """Plan B3: three amendments to the same base merge in order. + + Each amendment targets the BASE id (not a prior amendment) — + amendment-of-amendment is explicitly not supported by this + contract. Later amendments win on overlapping fields. + """ + jsonl_store.append(jsonl_path, {"id": "D000001", "tags": ["a"]}) + jsonl_store.append( + jsonl_path, + {"id": "D000001", "_amendment_to_id": "D000001", "tags": ["a", "b"]}, + ) + jsonl_store.append( + jsonl_path, + { + "id": "D000001", + "_amendment_to_id": "D000001", + "do_not_revert": True, + }, + ) + jsonl_store.append( + jsonl_path, + {"id": "D000001", "_amendment_to_id": "D000001", "tags": ["final"]}, + ) + merged = jsonl_store.read_merged(jsonl_path) + assert len(merged) == 1 + assert merged[0]["tags"] == ["final"] # 3rd amendment wins + assert merged[0]["do_not_revert"] is True # 2nd amendment preserved + + def test_underscore_fields_not_overlaid(self, jsonl_path: Path) -> None: + """``_amendment_to_id`` marker + future ``_evicted`` / ``_promoted_to`` + tombstones must NOT leak into user-visible state. + """ + jsonl_store.append(jsonl_path, {"id": "D000001", "decision": "Use bcrypt"}) + jsonl_store.append( + jsonl_path, + { + "id": "D000001", + "_amendment_to_id": "D000001", + "_evicted": True, + "do_not_revert": True, + }, + ) + merged = jsonl_store.read_merged(jsonl_path) + assert merged[0]["do_not_revert"] is True + # Underscored fields from the amendment must not pollute the base. + assert "_evicted" not in merged[0] + assert "_amendment_to_id" not in merged[0] + + def test_insertion_order_preserved_across_bases(self, jsonl_path: Path) -> None: + for i in (3, 1, 2): # intentionally out-of-numeric-order + jsonl_store.append(jsonl_path, {"id": f"D{i:06d}", "n": i}) + merged = jsonl_store.read_merged(jsonl_path) + assert [r["id"] for r in merged] == ["D000003", "D000001", "D000002"] + + def test_orphan_amendment_emits_for_diagnosis(self, jsonl_path: Path) -> None: + # Amendment without a preceding base — should NOT be silently + # dropped (defensive surfacing so users can diagnose). + jsonl_store.append( + jsonl_path, + {"id": "D000099", "_amendment_to_id": "D000099", "do_not_revert": True}, + ) + merged = jsonl_store.read_merged(jsonl_path) + assert len(merged) == 1 + assert merged[0]["id"] == "D000099" + + def test_missing_id_skipped(self, jsonl_path: Path) -> None: + jsonl_store.append(jsonl_path, {"id": "D000001", "ok": True}) + jsonl_store.append(jsonl_path, {"decision": "no id here"}) + merged = jsonl_store.read_merged(jsonl_path) + assert len(merged) == 1 + assert merged[0]["id"] == "D000001" + + def test_custom_field_names(self, jsonl_path: Path) -> None: + """Future v3.1 stores (e.g. working memory with W-prefixed ids) + reuse the same primitive via id_field / amendment_field overrides. + """ + jsonl_store.append(jsonl_path, {"wid": "W000001", "content": "hello"}) + jsonl_store.append( + jsonl_path, + {"wid": "W000001", "_promoted_to": "D000007", "content": "hello-v2"}, + ) + merged = jsonl_store.read_merged( + jsonl_path, id_field="wid", amendment_field="_promoted_to" + ) + assert len(merged) == 1 + assert merged[0]["content"] == "hello-v2" + + def test_missing_file_returns_empty(self, tmp_path: Path) -> None: + assert jsonl_store.read_merged(tmp_path / "nope.jsonl") == [] + + +class TestCompact: + """Predicate-based atomic rewrite. Future v3.1 stores use this for + working-memory eviction (drop tombstoned entries during sync). + """ + + def test_drops_records_failing_predicate(self, jsonl_path: Path) -> None: + for i in range(5): + jsonl_store.append(jsonl_path, {"id": f"D{i:06d}", "drop_me": i % 2 == 0}) + dropped = jsonl_store.compact( + jsonl_path, keep_predicate=lambda r: not r.get("drop_me") + ) + assert dropped == 3 # i=0,2,4 + remaining = jsonl_store.read_all(jsonl_path) + assert len(remaining) == 2 + assert [r["id"] for r in remaining] == ["D000001", "D000003"] + + def test_keep_all_is_noop_on_count(self, jsonl_path: Path) -> None: + for i in range(3): + jsonl_store.append(jsonl_path, {"id": f"D{i:06d}"}) + dropped = jsonl_store.compact(jsonl_path, keep_predicate=lambda r: True) + assert dropped == 0 + assert len(jsonl_store.read_all(jsonl_path)) == 3 + + def test_drop_all_yields_empty_file(self, jsonl_path: Path) -> None: + jsonl_store.append(jsonl_path, {"id": "D000001"}) + jsonl_store.append(jsonl_path, {"id": "D000002"}) + dropped = jsonl_store.compact(jsonl_path, keep_predicate=lambda r: False) + assert dropped == 2 + assert jsonl_path.read_text() == "" + + def test_missing_file_returns_zero(self, tmp_path: Path) -> None: + path = tmp_path / "nope.jsonl" + assert jsonl_store.compact(path, keep_predicate=lambda r: True) == 0 + + def test_preserves_malformed_lines(self, jsonl_path: Path) -> None: + """compact is filtering, not corruption cleanup — malformed + lines must survive so users can run ``codevira doctor``. + """ + jsonl_store.append(jsonl_path, {"id": "D000001", "drop": False}) + # Append a malformed line directly. + with open(jsonl_path, "ab") as fh: + fh.write(b"{this is not valid json\n") + jsonl_store.append(jsonl_path, {"id": "D000002", "drop": True}) + + dropped = jsonl_store.compact( + jsonl_path, keep_predicate=lambda r: not r.get("drop") + ) + assert dropped == 1 # only D000002 dropped + content = jsonl_path.read_text() + assert "{this is not valid json" in content # corrupt line preserved + assert "D000001" in content + assert "D000002" not in content + + def test_trailing_newline_only_when_nonempty(self, jsonl_path: Path) -> None: + jsonl_store.append(jsonl_path, {"id": "D000001"}) + jsonl_store.compact(jsonl_path, keep_predicate=lambda r: True) + # Non-empty file ends in exactly one newline (matches append style). + assert jsonl_path.read_text().endswith("\n") + assert not jsonl_path.read_text().endswith("\n\n") + + +class TestReadRecent: + """Sort-by-ts-desc + slice. Extracted from sessions_store.read_recent.""" + + def test_newest_first(self, jsonl_path: Path) -> None: + jsonl_store.append(jsonl_path, {"id": "S000001", "ts": "2026-01-01T00:00:00Z"}) + jsonl_store.append(jsonl_path, {"id": "S000002", "ts": "2026-03-01T00:00:00Z"}) + jsonl_store.append(jsonl_path, {"id": "S000003", "ts": "2026-02-01T00:00:00Z"}) + recent = jsonl_store.read_recent(jsonl_path, limit=10) + assert [r["id"] for r in recent] == ["S000002", "S000003", "S000001"] + + def test_limit_slices(self, jsonl_path: Path) -> None: + for i in range(5): + jsonl_store.append( + jsonl_path, + {"id": f"S{i:06d}", "ts": f"2026-0{i + 1}-01T00:00:00Z"}, + ) + recent = jsonl_store.read_recent(jsonl_path, limit=2) + assert len(recent) == 2 + assert recent[0]["id"] == "S000004" # 2026-05 + assert recent[1]["id"] == "S000003" # 2026-04 + + def test_missing_ts_sorts_to_end(self, jsonl_path: Path) -> None: + jsonl_store.append(jsonl_path, {"id": "S000001", "ts": "2026-01-01T00:00:00Z"}) + jsonl_store.append(jsonl_path, {"id": "S000002"}) # no ts + recent = jsonl_store.read_recent(jsonl_path, limit=10) + assert recent[0]["id"] == "S000001" + assert recent[-1]["id"] == "S000002" + + def test_custom_ts_field(self, jsonl_path: Path) -> None: + jsonl_store.append( + jsonl_path, {"id": "X1", "created_at": "2026-01-01T00:00:00Z"} + ) + jsonl_store.append( + jsonl_path, {"id": "X2", "created_at": "2026-02-01T00:00:00Z"} + ) + recent = jsonl_store.read_recent(jsonl_path, limit=10, ts_field="created_at") + assert [r["id"] for r in recent] == ["X2", "X1"] + + def test_missing_file_returns_empty(self, tmp_path: Path) -> None: + assert jsonl_store.read_recent(tmp_path / "nope.jsonl", limit=10) == [] From c1352d7d109f88473dea6678957079f03f1e77e3 Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Fri, 29 May 2026 00:14:27 +0530 Subject: [PATCH 02/44] fix(v3.0.1): session_id default is unique per call, not literal 'ad-hoc' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before this fix, decisions_store.record() and record_many() defaulted session_id to the literal string 'ad-hoc' for any caller that didn't supply one. Every concurrent IDE (Claude Code, Cursor, Windsurf, Antigravity) and every unattributed agent collided into the same session_id bucket — masking real session boundaries in decisions.jsonl and breaking the v3.1.0 working-memory design which keys observations and conflict materialization by session_id. Adds decisions_store.default_session_id() returning f'ad-hoc-{secrets.token_hex(3)}' (e.g., ad-hoc-a1b2c3). Both record() and record_many() use it as the per-call default; explicit session_id from the caller still wins (no silent overwrite — agents that DO group their work keep their grouping). learning.record_decision() resolves the effective session_id up front and passes it explicitly to decisions_store.record() so the response echoed to the agent matches what's persisted on disk. Pre- fix, the response said 'ad-hoc' while the JSONL line carried the generated slug — caller-visible and persisted state were divergent. Test: tests/storage/test_decisions_store.py covers the helper, record(), and record_many() paths including the explicit-slug-wins guarantee and mixed batch case. Plan B1; v3.0.x prereq #2 of 3. Co-Authored-By: Claude Opus 4.7 --- mcp_server/storage/decisions_store.py | 20 ++++- mcp_server/tools/learning.py | 11 ++- tests/storage/test_decisions_store.py | 118 ++++++++++++++++++++++++++ 3 files changed, 145 insertions(+), 4 deletions(-) create mode 100644 tests/storage/test_decisions_store.py diff --git a/mcp_server/storage/decisions_store.py b/mcp_server/storage/decisions_store.py index 2d94d86..3858d7f 100644 --- a/mcp_server/storage/decisions_store.py +++ b/mcp_server/storage/decisions_store.py @@ -37,6 +37,7 @@ import fnmatch import logging +import secrets from datetime import datetime, timezone from typing import Any @@ -45,6 +46,21 @@ logger = logging.getLogger(__name__) +def default_session_id() -> str: + """Generate a unique ad-hoc session id when the caller didn't supply one. + + v3.0.1 fix: prior to this, an unattributed ``record_decision`` / + ``write_session_log`` defaulted to the LITERAL string ``"ad-hoc"``. + Every concurrent IDE (Claude Code, Cursor, Windsurf, Antigravity) + that didn't pass a slug collided into the same bucket — masking + session boundaries and breaking the v3.1.0 working-memory design + (which keys observations by session_id). Generating a unique + suffix per call disambiguates without forcing every caller to + invent a name. + """ + return f"ad-hoc-{secrets.token_hex(3)}" + + # ─── Internal: merge amendments into base records ───────────────────── @@ -95,7 +111,7 @@ def record( base_record = { "ts": datetime.now(timezone.utc).isoformat(), - "session_id": session_id or "ad-hoc", + "session_id": session_id or default_session_id(), "file_path": file_path, "decision": decision.strip(), "context": context, @@ -165,7 +181,7 @@ def record_many( valid_records.append( { "ts": datetime.now(timezone.utc).isoformat(), - "session_id": r.get("session_id") or "ad-hoc", + "session_id": r.get("session_id") or default_session_id(), "file_path": r.get("file_path"), "decision": text.strip(), "context": r.get("context"), diff --git a/mcp_server/tools/learning.py b/mcp_server/tools/learning.py index c8ad9cb..37a3f22 100644 --- a/mcp_server/tools/learning.py +++ b/mcp_server/tools/learning.py @@ -210,19 +210,26 @@ def record_decision( from mcp_server.storage import decisions_store + # v3.0.1: resolve the effective session_id ONCE up front so the + # response (echoed to the agent) matches the on-disk record. Prior + # to this, learning.py echoed the literal "ad-hoc" while + # decisions_store.record() wrote a unique "ad-hoc-XXXXXX" slug, + # leaving caller-visible and persisted state divergent. + effective_session_id = session_id or decisions_store.default_session_id() + decision_id = decisions_store.record( decision=decision, file_path=file_path, context=context, do_not_revert=bool(do_not_revert), - session_id=session_id, + session_id=effective_session_id, tags=tags, ) response: dict = { "recorded": True, "decision_id": decision_id, - "session_id": session_id or "ad-hoc", + "session_id": effective_session_id, "do_not_revert": bool(do_not_revert), "hint": ( "Decision recorded as protected. Future search_decisions() " diff --git a/tests/storage/test_decisions_store.py b/tests/storage/test_decisions_store.py new file mode 100644 index 0000000..d492797 --- /dev/null +++ b/tests/storage/test_decisions_store.py @@ -0,0 +1,118 @@ +""" +Tests for mcp_server.storage.decisions_store. + +Scope: behaviors and contracts owned directly by decisions_store +(amendment overlay delegation, session_id default generation). +Higher-level surfaces — record_decision MCP tool, conflict checks, +session-context aggregation — are exercised in +``tests/test_tools_learning.py``. +""" + +from __future__ import annotations + +import re +from pathlib import Path + +import pytest + +import mcp_server.paths as paths_module +from mcp_server.storage import decisions_store + + +@pytest.fixture +def project(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: + """Fresh temp project rooted at ``tmp_path`` so decisions land in an + isolated ``.codevira/decisions.jsonl``.""" + root = tmp_path / "proj" + (root / ".codevira").mkdir(parents=True) + (root / ".codevira" / "config.yaml").write_text("project:\n name: test\n") + monkeypatch.setattr(paths_module, "_project_dir_override", None) + monkeypatch.chdir(root.resolve()) + return root + + +class TestDefaultSessionId: + """v3.0.1 fix: prior to this, the unattributed default was the + literal string ``"ad-hoc"``. Every concurrent IDE that didn't pass + a slug collided into one bucket — masking session boundaries and + breaking the v3.1.0 working-memory design which keys by session_id. + """ + + _PATTERN = re.compile(r"^ad-hoc-[0-9a-f]{6}$") + + def test_helper_returns_unique_slug_each_call(self) -> None: + """Each call generates a fresh random suffix (per-call + uniqueness — chosen so that two unattributed writes can be + distinguished post-hoc even within one process). + """ + slug1 = decisions_store.default_session_id() + slug2 = decisions_store.default_session_id() + assert slug1 != slug2 + assert self._PATTERN.match(slug1), slug1 + assert self._PATTERN.match(slug2), slug2 + + def test_helper_never_returns_literal_ad_hoc(self) -> None: + """Catches a regression where someone short-circuits the helper + back to the old literal. + """ + for _ in range(20): + assert decisions_store.default_session_id() != "ad-hoc" + + def test_record_without_session_id_uses_new_default(self, project: Path) -> None: + """End-to-end: two record() calls with no session_id MUST yield + distinct on-disk session_id values. This is the + cross-IDE-collision fix in its simplest form. + """ + from mcp_server.storage import jsonl_store, paths + + decisions_store.record(decision="First decision under no slug") + decisions_store.record(decision="Second decision under no slug") + + raw = jsonl_store.read_all(paths.decisions_path()) + # Only count base records (amendments share a session_id with the + # base they amend; v3.0.0 records have no amendments at this + # point in the test). + sessions = [r.get("session_id") for r in raw if not r.get("_amendment_to_id")] + assert len(sessions) == 2 + assert sessions[0] != sessions[1], ( + f"two unattributed record() calls produced the same " + f"session_id ({sessions[0]!r}); v3.0.1 regression" + ) + assert all(self._PATTERN.match(s) for s in sessions), sessions + + def test_record_with_explicit_session_id_preserved(self, project: Path) -> None: + """Explicit session_id from the caller wins over the default + generator. (No silent overwrite — agents that DO group their + work keep their grouping.) + """ + from mcp_server.storage import jsonl_store, paths + + decisions_store.record(decision="Grouped decision A", session_id="morning-auth") + decisions_store.record(decision="Grouped decision B", session_id="morning-auth") + + raw = jsonl_store.read_all(paths.decisions_path()) + sessions = [r.get("session_id") for r in raw if not r.get("_amendment_to_id")] + assert sessions == ["morning-auth", "morning-auth"] + + def test_record_many_unique_slug_per_record(self, project: Path) -> None: + """``record_many`` with mixed explicit + missing session_ids: + each missing gets its own unique slug; explicit ones preserved. + """ + from mcp_server.storage import jsonl_store, paths + + decisions_store.record_many( + [ + {"decision": "Explicit slug A", "session_id": "explicit-1"}, + {"decision": "No slug 1"}, + {"decision": "No slug 2"}, + {"decision": "Explicit slug B", "session_id": "explicit-2"}, + ] + ) + + raw = jsonl_store.read_all(paths.decisions_path()) + sessions = [r.get("session_id") for r in raw if not r.get("_amendment_to_id")] + assert sessions[0] == "explicit-1" + assert self._PATTERN.match(sessions[1]) + assert self._PATTERN.match(sessions[2]) + assert sessions[1] != sessions[2], "two unattributed siblings collided" + assert sessions[3] == "explicit-2" From 618710aec1547c2bebae7d77811e766fa893422e Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Fri, 29 May 2026 00:43:51 +0530 Subject: [PATCH 03/44] =?UTF-8?q?feat(v3.1.0):=20M1=20Phase=20A=20origin?= =?UTF-8?q?=20tagging=20=E2=80=94=20storage=20layer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every decision and session write now carries an origin field: { "ide": "claude_code" | "claude_desktop" | "cursor" | "windsurf" | "antigravity" | "unknown", "agent_model": "" | None, "host_hash": "<12 hex chars>", "ts": "2026-05-28T10:00:00+00:00", } This is Phase A of the v3.1.0 Consensus subsystem: real provenance that check_conflict and get_session_context (later in M6+) can surface so agents can answer "this decision contradicts a do_not_revert one written by Cursor 3 days ago — what would you like to do?" instead of just opaque decision_ids. What's added: - mcp_server/storage/origin.py — the current_origin() helper. ide from $CODEVIRA_IDE env (defaults "unknown"). agent_model from $CODEVIRA_AGENT_MODEL (optional). host_hash = sha1(uuid.getnode() bytes + username)[:12] — MAC + username, SHA1, truncated. Privacy-preserving (no plaintext hostname/username leaks). Cached via lru_cache. - decisions_store.record(), record_many(), search() carry origin. - sessions_store.write(), write_many() carry origin. - check_conflict surfaces origin per conflict/duplicate entry. Backward compatibility: all v3.0.x records (no origin field) read cleanly through every existing path. Absence treated as ide="unknown". No data migration required. Tests: 17 new tests across test_origin.py, test_decisions_store.py (TestOriginTagging), test_check_conflict.py (TestM1OriginSurface). 602 tests across storage + ide_inject + check_conflict + learning + engine pass green. Zero regressions from baseline. Non-goals (deliberate, per plan): - Cross-machine consistency (v3.2+). - Tamper resistance (origin is informational, not security). - Retroactive origin backfill (would falsely attest authorship). Plan M1. Co-Authored-By: Claude Opus 4.7 --- mcp_server/storage/decisions_store.py | 11 ++- mcp_server/storage/origin.py | 114 ++++++++++++++++++++++++++ mcp_server/storage/sessions_store.py | 8 +- mcp_server/tools/check_conflict.py | 4 + tests/storage/test_decisions_store.py | 105 ++++++++++++++++++++++++ tests/storage/test_origin.py | 104 +++++++++++++++++++++++ tests/test_check_conflict.py | 53 ++++++++++++ 7 files changed, 397 insertions(+), 2 deletions(-) create mode 100644 mcp_server/storage/origin.py create mode 100644 tests/storage/test_origin.py diff --git a/mcp_server/storage/decisions_store.py b/mcp_server/storage/decisions_store.py index 3858d7f..671a289 100644 --- a/mcp_server/storage/decisions_store.py +++ b/mcp_server/storage/decisions_store.py @@ -41,7 +41,7 @@ from datetime import datetime, timezone from typing import Any -from mcp_server.storage import digest, fts5_index, jsonl_store, manifest, paths +from mcp_server.storage import digest, fts5_index, jsonl_store, manifest, origin, paths logger = logging.getLogger(__name__) @@ -120,6 +120,9 @@ def record( "supersedes": None, "superseded_by": None, "outcome": None, + # v3.1.0 M1: provenance tagging. Optional in reads (v3.0.x + # records have no origin; readers treat as ide="unknown"). + "origin": origin.current_origin(), } decision_id = jsonl_store.append_with_generated_id( @@ -190,6 +193,8 @@ def record_many( "supersedes": None, "superseded_by": None, "outcome": None, + # v3.1.0 M1: provenance tagging (see record() above). + "origin": origin.current_origin(), } ) @@ -358,6 +363,10 @@ def search( "created_at": d.get("ts"), "score": hit["score"], "snippet": hit.get("snippet"), + # v3.1.0 M1: pass origin through so check_conflict can + # surface provenance per candidate. None for v3.0.x records + # (callers treat absent → ide="unknown"). + "origin": d.get("origin"), } results.append(result) if len(results) >= limit: diff --git a/mcp_server/storage/origin.py b/mcp_server/storage/origin.py new file mode 100644 index 0000000..fcddb2e --- /dev/null +++ b/mcp_server/storage/origin.py @@ -0,0 +1,114 @@ +""" +origin.py — v3.1.0 M1: provenance metadata for cross-IDE memory. + +Every write Codevira makes (decisions, sessions, working memory, skills, +activity, reflections) carries an ``origin`` dict that records *who* +made the write — which IDE, which agent model, which machine, when. +This is Phase A of the v3.1.0 Consensus subsystem: real provenance +that ``check_conflict`` and ``get_session_context`` can surface so +agents can answer "this decision contradicts a do_not_revert one +written by Cursor 3 days ago — what would you like to do?" + +# Schema + +``current_origin()`` returns:: + + { + "ide": "claude_code" | "claude_desktop" | "cursor" | + "windsurf" | "antigravity" | "unknown", + "agent_model": "" | None, + "host_hash": "<12 hex chars>", + "ts": "2026-05-28T10:00:00+00:00", + } + +# Field sources + +- ``ide``: read from the ``CODEVIRA_IDE`` env var, which + ``ide_inject.py`` writes into each detected IDE's MCP server config. + Defaults to ``"unknown"`` when unset (e.g., bare ``codevira`` CLI + invocations or pre-v3.1 IDE configs). +- ``agent_model``: ``CODEVIRA_AGENT_MODEL`` env var (optional; most + IDEs don't expose model id to MCP servers in v3.1, so this is + commonly ``None``). +- ``host_hash``: ``sha1(uuid.getnode() bytes + username)[:12]``. The + MAC + username combination is stable per machine across reboots + (assuming the NIC is real, not a randomized fallback). The SHA1 + truncation is privacy-preserving — no plaintext hostname or + username leaks if a team commits a ``decisions.jsonl`` to a public + repo. +- ``ts``: ISO 8601 UTC timestamp of the call. + +# Backward compatibility + +v3.0.x records have no ``origin`` field. All readers MUST treat the +absence as ``ide="unknown"`` — never raise, never migrate. This file +deliberately does not provide a "fill missing origin" helper because +the value of provenance is in NEW records; back-filling fake origins +on old records would falsely attest authorship. + +# Non-goals (v3.1.0) + +- Cross-machine consistency. v3.1.0 assumes one machine across many + IDEs. Two developers on two machines will have different + ``host_hash`` values; the conflict-materialization layer treats + them as foreign origins. Cross-machine sync is v3.2+. +- Tamper resistance. ``host_hash`` is not a security primitive — a + malicious actor can set ``CODEVIRA_IDE`` to whatever they want. The + field is for informational provenance only. +""" + +from __future__ import annotations + +import getpass +import hashlib +import os +import uuid +from datetime import datetime, timezone +from functools import lru_cache + + +# Sentinel returned when the IDE env var is unset. +_IDE_UNKNOWN = "unknown" + + +def current_origin() -> dict[str, str | None]: + """Build the origin dict for *this* call. + + ``ts`` is freshly computed each call so per-record timestamps + are honest. ``host_hash`` is cached (machine identity doesn't + change between calls in the same process). ``ide`` and + ``agent_model`` are read each call so a test that monkeypatches + ``CODEVIRA_IDE`` mid-process sees the override. + """ + return { + "ide": os.environ.get("CODEVIRA_IDE", _IDE_UNKNOWN), + "agent_model": os.environ.get("CODEVIRA_AGENT_MODEL") or None, + "host_hash": _host_hash(), + "ts": datetime.now(timezone.utc).isoformat(), + } + + +@lru_cache(maxsize=1) +def _host_hash() -> str: + """sha1(uuid.getnode() bytes + username)[:12]. + + Cached because both inputs are process-stable. Falls back to + ``"unknown"`` if neither source is readable (extremely unusual — + a container without /etc/passwd and without a usable network + interface — but documented for completeness). + """ + try: + node = uuid.getnode() + mac_bytes = node.to_bytes(6, "big") + except Exception: # pragma: no cover — uuid.getnode() shouldn't raise + mac_bytes = b"" + + try: + user = getpass.getuser() + except Exception: # pragma: no cover + user = "" + + raw = mac_bytes + user.encode("utf-8", errors="replace") + if not raw: + return "unknown" + return hashlib.sha1(raw).hexdigest()[:12] diff --git a/mcp_server/storage/sessions_store.py b/mcp_server/storage/sessions_store.py index 96a3f77..077d7db 100644 --- a/mcp_server/storage/sessions_store.py +++ b/mcp_server/storage/sessions_store.py @@ -30,7 +30,7 @@ from datetime import datetime, timezone from typing import Any -from mcp_server.storage import jsonl_store, paths +from mcp_server.storage import jsonl_store, origin, paths logger = logging.getLogger(__name__) @@ -54,6 +54,10 @@ def write( "summary": summary, "decision_ids": list(decision_ids or []), "outcome": outcome, + # v3.1.0 M1: provenance tagging — which IDE/agent/machine + # wrote this session log. Reads tolerate absence on legacy + # records (v3.0.x sessions have no origin). + "origin": origin.current_origin(), } return jsonl_store.append_with_generated_id( paths.sessions_path(), record, prefix="S", width=6 @@ -83,6 +87,8 @@ def write_many(logs: list[dict[str, Any]]) -> tuple[list[str], list[dict[str, An "summary": log.get("summary"), "decision_ids": list(log.get("decisions") or log.get("decision_ids") or []), "outcome": log.get("outcome"), + # v3.1.0 M1: provenance tagging (see write() above). + "origin": origin.current_origin(), } # If decisions are passed as full dicts (legacy contract from # v2.1.x write_session_log), extract just their ids when present. diff --git a/mcp_server/tools/check_conflict.py b/mcp_server/tools/check_conflict.py index abb0ba0..987a487 100644 --- a/mcp_server/tools/check_conflict.py +++ b/mcp_server/tools/check_conflict.py @@ -254,6 +254,10 @@ def check_conflict( "summary": (cand_text[:80] + "…") if len(cand_text) > 80 else cand_text, "file_path": cand.get("file_path"), "decision": cand_text, + # v3.1.0 M1: surface provenance so agents (and the user) + # can see "this contradicts a decision written by Cursor 3 + # days ago" rather than just an opaque decision_id. + "origin": cand.get("origin"), } if is_protected: conflicts.append(entry) diff --git a/tests/storage/test_decisions_store.py b/tests/storage/test_decisions_store.py index d492797..ea9e672 100644 --- a/tests/storage/test_decisions_store.py +++ b/tests/storage/test_decisions_store.py @@ -116,3 +116,108 @@ def test_record_many_unique_slug_per_record(self, project: Path) -> None: assert self._PATTERN.match(sessions[2]) assert sessions[1] != sessions[2], "two unattributed siblings collided" assert sessions[3] == "explicit-2" + + +class TestOriginTagging: + """v3.1.0 M1: every decision write carries origin: {ide, + agent_model, host_hash, ts}. Reads tolerate absence on legacy + v3.0.x records (treated as ide="unknown"). + """ + + def test_record_stamps_origin( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.setenv("CODEVIRA_IDE", "claude_code") + decisions_store.record(decision="Use rate limiting") + + from mcp_server.storage import jsonl_store, paths + + rows = jsonl_store.read_all(paths.decisions_path()) + bases = [r for r in rows if not r.get("_amendment_to_id")] + assert len(bases) == 1 + origin_field = bases[0].get("origin") + assert origin_field is not None + assert origin_field["ide"] == "claude_code" + assert "host_hash" in origin_field and len(origin_field["host_hash"]) == 12 + assert "ts" in origin_field + + def test_record_many_stamps_origin( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.setenv("CODEVIRA_IDE", "cursor") + decisions_store.record_many( + [{"decision": "A"}, {"decision": "B"}, {"decision": "C"}] + ) + + from mcp_server.storage import jsonl_store, paths + + rows = jsonl_store.read_all(paths.decisions_path()) + for r in rows: + if r.get("_amendment_to_id"): + continue + assert r["origin"]["ide"] == "cursor" + + def test_ide_unknown_when_env_unset( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.delenv("CODEVIRA_IDE", raising=False) + decisions_store.record(decision="Anonymous write") + + from mcp_server.storage import jsonl_store, paths + + rows = jsonl_store.read_all(paths.decisions_path()) + bases = [r for r in rows if not r.get("_amendment_to_id")] + assert bases[0]["origin"]["ide"] == "unknown" + + def test_backcompat_record_without_origin(self, project: Path) -> None: + """Hand-craft a legacy v3.0.x record (no ``origin`` field) and + verify every read path tolerates absence. This is the + regression test for the M1 promise that legacy records read + as ide="unknown" without crashing. + """ + from mcp_server.storage import jsonl_store, paths + + legacy = { + "id": "D000001", + "ts": "2026-05-01T00:00:00Z", + "session_id": "ad-hoc", # the OLD literal default + "file_path": None, + "decision": "Legacy decision pre-3.1", + "context": None, + "do_not_revert": False, + "tags": [], + "supersedes": None, + "superseded_by": None, + "outcome": None, + # NOTE: no "origin" field — legacy 3.0.x shape + } + jsonl_store.append(paths.decisions_path(), legacy) + + # Reads via the merged view: legacy record surfaces, origin missing. + merged = decisions_store._read_merged() + assert len(merged) == 1 + assert "origin" not in merged[0] or merged[0].get("origin") is None + + # Now write a NEW decision via the dev path — the new one carries origin, + # legacy doesn't. Both must coexist in subsequent reads. + decisions_store.record(decision="New decision under 3.1.0") + merged = decisions_store._read_merged() + assert len(merged) == 2 + new_rec = next(r for r in merged if "New decision" in r["decision"]) + assert new_rec["origin"]["host_hash"] + legacy_rec = next(r for r in merged if "Legacy decision" in r["decision"]) + assert legacy_rec.get("origin") is None # untouched + + def test_search_surfaces_origin( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """decisions_store.search() includes origin per candidate so + check_conflict can surface provenance.""" + monkeypatch.setenv("CODEVIRA_IDE", "windsurf") + decisions_store.record( + decision="Migrate database to PostgreSQL", tags=["db", "migration"] + ) + + hits = decisions_store.search("PostgreSQL migration", limit=5) + assert len(hits) >= 1 + assert hits[0]["origin"]["ide"] == "windsurf" diff --git a/tests/storage/test_origin.py b/tests/storage/test_origin.py new file mode 100644 index 0000000..72fdba4 --- /dev/null +++ b/tests/storage/test_origin.py @@ -0,0 +1,104 @@ +""" +Tests for mcp_server.storage.origin — v3.1.0 M1. + +Covers the origin helper's contract: shape of the returned dict, +``CODEVIRA_IDE`` env-var lookup, ``host_hash`` stability + +privacy-preservation, fallback behavior. +""" + +from __future__ import annotations + +import hashlib +import re + +import pytest + +from mcp_server.storage import origin + + +class TestCurrentOrigin: + def test_shape(self) -> None: + o = origin.current_origin() + assert set(o.keys()) == {"ide", "agent_model", "host_hash", "ts"} + + def test_ide_defaults_to_unknown(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("CODEVIRA_IDE", raising=False) + assert origin.current_origin()["ide"] == "unknown" + + def test_ide_reads_env_var(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setenv("CODEVIRA_IDE", "cursor") + assert origin.current_origin()["ide"] == "cursor" + + def test_agent_model_optional(self, monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("CODEVIRA_AGENT_MODEL", raising=False) + assert origin.current_origin()["agent_model"] is None + monkeypatch.setenv("CODEVIRA_AGENT_MODEL", "claude-opus-4-7") + assert origin.current_origin()["agent_model"] == "claude-opus-4-7" + + def test_agent_model_empty_string_treated_as_none( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Some IDEs may set an empty string — treat as None for + consistency with absence.""" + monkeypatch.setenv("CODEVIRA_AGENT_MODEL", "") + assert origin.current_origin()["agent_model"] is None + + def test_ts_is_iso_utc(self) -> None: + ts = origin.current_origin()["ts"] + # ISO 8601 UTC with timezone offset + assert re.match(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}", ts) + assert ts.endswith("+00:00") or ts.endswith("Z") + + def test_two_calls_have_distinct_ts(self) -> None: + """Per-call ts so per-record timestamps are honest.""" + import time + + t1 = origin.current_origin()["ts"] + time.sleep(0.005) # ensure clock advances on coarse-resolution systems + t2 = origin.current_origin()["ts"] + assert t1 != t2 + + +class TestHostHash: + def test_length_12_hex(self) -> None: + h = origin._host_hash() + assert len(h) == 12 + # Either 12 hex chars or the "unknown" fallback (only fires if + # both uuid.getnode AND getpass.getuser fail, which won't on + # any normal test environment). + assert re.match(r"^[0-9a-f]{12}$", h) or h == "unknown" + + def test_stable_within_process(self) -> None: + """LRU cache guarantees stability across calls.""" + assert origin._host_hash() == origin._host_hash() + + def test_does_not_leak_plaintext_user_or_host(self) -> None: + """SHA1 truncation must obscure raw identifying info — the + hash should not contain plaintext bytes from the user or + hostname (privacy-preserving for committed JSONL files). + """ + import getpass + + try: + user = getpass.getuser() + except Exception: # pragma: no cover + pytest.skip("no user available") + h = origin._host_hash() + if user and len(user) >= 3: + assert user not in h, "username leaked into host_hash" + + def test_matches_documented_formula(self) -> None: + """Catches a regression that swaps the hash algorithm or the + input mix (e.g., dropping the username component).""" + import getpass + import uuid + + try: + mac_bytes = uuid.getnode().to_bytes(6, "big") + user = getpass.getuser() + except Exception: # pragma: no cover + pytest.skip("env doesn't expose mac + user") + + raw = mac_bytes + user.encode("utf-8", errors="replace") + expected = hashlib.sha1(raw).hexdigest()[:12] + assert origin._host_hash() == expected diff --git a/tests/test_check_conflict.py b/tests/test_check_conflict.py index 6c24c33..e2b5469 100644 --- a/tests/test_check_conflict.py +++ b/tests/test_check_conflict.py @@ -291,3 +291,56 @@ def test_back_compat_threshold_used_still_present( ) -> None: r = check_conflict("anything novel") assert r["threshold_used"] == _DUP_THRESHOLD # v2.x callers + + +class TestM1OriginSurface: + """v3.1.0 M1: each conflict/duplicate entry carries the candidate's + ``origin`` so agents can answer "this contradicts a decision Cursor + wrote 3 days ago".""" + + def test_duplicate_entry_includes_origin( + self, + isolated_project: Path, + monkeypatch: "pytest.MonkeyPatch", # type: ignore[name-defined] + ) -> None: + from mcp_server.storage import decisions_store + + monkeypatch.setenv("CODEVIRA_IDE", "cursor") + decisions_store.record( + decision="Use bcrypt for password hashing", + file_path="auth.py", + ) + r = check_conflict("Use bcrypt for password hashing") + assert r["status"] in ("duplicate", "conflict") + entries = r.get("duplicates") + r.get("conflicts") + assert entries, r + origin_field = entries[0].get("origin") + assert origin_field is not None, entries[0] + assert origin_field["ide"] == "cursor" + + def test_origin_none_for_legacy_record(self, isolated_project: Path) -> None: + """Legacy v3.0.x decisions written without origin still surface + — the field is None (NOT a crash, NOT a placeholder).""" + from mcp_server.storage import jsonl_store, paths + + # Hand-craft a legacy record without origin. + legacy = { + "id": "D000001", + "ts": "2026-05-01T00:00:00Z", + "session_id": "ad-hoc", + "file_path": "x.py", + "decision": "Use bcrypt for password hashing", + "context": None, + "do_not_revert": False, + "tags": [], + "supersedes": None, + "superseded_by": None, + "outcome": None, + } + jsonl_store.append(paths.decisions_path(), legacy) + + r = check_conflict("Use bcrypt for password hashing") + entries = r.get("duplicates") + r.get("conflicts") + assert entries, r + # Origin field present in dict, value is None (no crash). + assert entries[0]["origin"] is None From ff06b3d014e3bf985f61bda8608953d0772ed4ed Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Fri, 29 May 2026 00:45:51 +0530 Subject: [PATCH 04/44] =?UTF-8?q?feat(v3.1.0):=20M1=20Phase=20A=20origin?= =?UTF-8?q?=20tagging=20=E2=80=94=20IDE=20config=20injection?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every IDE config Codevira writes (per-project + global modes for Claude Code, Claude Desktop, Cursor, Windsurf, Antigravity) now includes `env.CODEVIRA_IDE=` so the spawned MCP server stamps each decision/session with origin.ide. Per-project injectors stamped: - _inject_claude → "claude_code" - _inject_claude_desktop → "claude_desktop" - _inject_cursor → "cursor" - _inject_windsurf → "windsurf" - _inject_antigravity → "antigravity" Global injectors stamped: - inject_global_claude_code, inject_global_claude_desktop, inject_global_cursor, inject_global_windsurf, inject_global_antigravity. The Claude Code CLI install path (`claude mcp add`) also forwards `--env CODEVIRA_IDE=claude_code`. Best-effort: older claude versions without --env will fail the CLI call, and the existing fallback path (direct ~/.claude.json merge) sets env the same way. Implementation note: signature of _build_server_config / _build_global_server_config is unchanged — env stamping is done by mutating the returned dict at each per-IDE call site. This avoids the blast-radius veto on a private signature change and keeps the ide_key→env mapping visible at each injection point rather than hidden in a shared helper. Tests: tests/test_ide_inject.py::TestM1IdeEnvStamp — 8 assertions, one per per-project + global injector, plus an idempotency test. 86 existing ide_inject tests still pass (no regression on the preserve-existing-server-config invariants). Plan M1. Co-Authored-By: Claude Opus 4.7 --- mcp_server/ide_inject.py | 61 +++++++++++++++++++++++++ tests/test_ide_inject.py | 99 ++++++++++++++++++++++++++++++++++++++++ 2 files changed, 160 insertions(+) diff --git a/mcp_server/ide_inject.py b/mcp_server/ide_inject.py index 0278557..ec8122c 100644 --- a/mcp_server/ide_inject.py +++ b/mcp_server/ide_inject.py @@ -529,6 +529,12 @@ def _inject_claude(project_root: Path, cmd_path: str, python_exe: str) -> str | server_config = _build_server_config( cmd_path, python_exe, project_root, use_cwd=True ) + # v3.1.0 M1: stamp CODEVIRA_IDE so the spawned MCP server tags + # every write with origin.ide="claude_code". + server_config["env"] = { + **(server_config.get("env") or {}), + "CODEVIRA_IDE": "claude_code", + } merged = _merge_mcp_config(existing, "codevira", server_config) _write_json_safe(config_path, merged) return str(config_path) @@ -551,6 +557,11 @@ def _inject_claude_desktop( server_config = _build_server_config( cmd_path, python_exe, project_root, use_cwd=False ) + # v3.1.0 M1: origin.ide stamp. + server_config["env"] = { + **(server_config.get("env") or {}), + "CODEVIRA_IDE": "claude_desktop", + } merged = _merge_mcp_config(existing, "codevira", server_config) _write_json_safe(config_path, merged) @@ -564,6 +575,11 @@ def _inject_cursor(project_root: Path, cmd_path: str, python_exe: str) -> str | server_config = _build_server_config( cmd_path, python_exe, project_root, use_cwd=True ) + # v3.1.0 M1: origin.ide stamp. + server_config["env"] = { + **(server_config.get("env") or {}), + "CODEVIRA_IDE": "cursor", + } merged = _merge_mcp_config(existing, "codevira", server_config) _write_json_safe(config_path, merged) return str(config_path) @@ -576,6 +592,11 @@ def _inject_windsurf(project_root: Path, cmd_path: str, python_exe: str) -> str server_config = _build_server_config( cmd_path, python_exe, project_root, use_cwd=True ) + # v3.1.0 M1: origin.ide stamp. + server_config["env"] = { + **(server_config.get("env") or {}), + "CODEVIRA_IDE": "windsurf", + } merged = _merge_mcp_config(existing, "codevira", server_config) _write_json_safe(config_path, merged) return str(config_path) @@ -596,6 +617,11 @@ def _inject_antigravity( base_config = _build_server_config( cmd_path, python_exe, project_root, use_cwd=False ) + # v3.1.0 M1: origin.ide stamp. + base_config["env"] = { + **(base_config.get("env") or {}), + "CODEVIRA_IDE": "antigravity", + } server_config = { "$typeName": "exa.cascade_plugins_pb.CascadePluginCommandTemplate", **base_config, @@ -638,6 +664,11 @@ def inject_global_claude_code(cmd_path: str, python_exe: str) -> str | None: """ config_path = _claude_global_config_path() server_config = _build_global_server_config(cmd_path, python_exe) + # v3.1.0 M1: origin.ide stamp (global Claude Code). + server_config["env"] = { + **(server_config.get("env") or {}), + "CODEVIRA_IDE": "claude_code", + } cli = _claude_cli_path() if cli is not None: @@ -678,12 +709,22 @@ def _claude_cli_add_codevira( # the spawned MCP server, not flags to ``claude mcp add``. extra_args = list(server_config.get("args") or []) + # v3.1.0 M1: forward CODEVIRA_IDE env into the CLI-driven install + # so the `claude mcp add` write carries provenance too. Best-effort: + # older claude versions without --env will fail this call; the + # caller falls back to direct ~/.claude.json merge which sets env + # the same way. + env_pairs: list[str] = [] + for k, v in (server_config.get("env") or {}).items(): + env_pairs.extend(["--env", f"{k}={v}"]) + cmd = [ cli, "mcp", "add", "--scope", "user", + *env_pairs, "codevira", cmd_path, ] @@ -746,15 +787,20 @@ def inject_global_claude_desktop(cmd_path: str, python_exe: str) -> str | None: existing = _read_json_safe(config_path) is_python_fallback = cmd_path == python_exe + # v3.1.0 M1: origin.ide stamp included in the literal so mypy + # infers a wide-enough value type (env is a nested dict, not a + # list[str]). if is_python_fallback: server_config = { "command": cmd_path, "args": ["-m", "mcp_server"], + "env": {"CODEVIRA_IDE": "claude_desktop"}, } else: server_config = { "command": cmd_path, "args": [], + "env": {"CODEVIRA_IDE": "claude_desktop"}, } merged = _merge_mcp_config(existing, "codevira", server_config) @@ -767,6 +813,11 @@ def inject_global_cursor(cmd_path: str, python_exe: str) -> str | None: config_path = _cursor_global_config_path() existing = _read_json_safe(config_path) server_config = _build_global_server_config(cmd_path, python_exe) + # v3.1.0 M1: origin.ide stamp. + server_config["env"] = { + **(server_config.get("env") or {}), + "CODEVIRA_IDE": "cursor", + } merged = _merge_mcp_config(existing, "codevira", server_config) _write_json_safe(config_path, merged) return str(config_path) @@ -777,6 +828,11 @@ def inject_global_windsurf(cmd_path: str, python_exe: str) -> str | None: config_path = _windsurf_global_config_path() existing = _read_json_safe(config_path) server_config = _build_global_server_config(cmd_path, python_exe) + # v3.1.0 M1: origin.ide stamp. + server_config["env"] = { + **(server_config.get("env") or {}), + "CODEVIRA_IDE": "windsurf", + } merged = _merge_mcp_config(existing, "codevira", server_config) _write_json_safe(config_path, merged) return str(config_path) @@ -789,6 +845,11 @@ def inject_global_antigravity(cmd_path: str, python_exe: str) -> str | None: sets the working directory when it starts the MCP server process. """ base_config = _build_global_server_config(cmd_path, python_exe) + # v3.1.0 M1: origin.ide stamp. + base_config["env"] = { + **(base_config.get("env") or {}), + "CODEVIRA_IDE": "antigravity", + } server_config = { "$typeName": "exa.cascade_plugins_pb.CascadePluginCommandTemplate", **base_config, diff --git a/tests/test_ide_inject.py b/tests/test_ide_inject.py index 130918a..e8deff8 100644 --- a/tests/test_ide_inject.py +++ b/tests/test_ide_inject.py @@ -320,6 +320,105 @@ def test_preserves_existing_windsurf_config(self, tmp_path): assert "codevira" in data["mcpServers"] +# =========================================================================== +# v3.1.0 M1: CODEVIRA_IDE env stamping (origin tagging Phase A) +# =========================================================================== + + +class TestM1IdeEnvStamp: + """Every injected MCP config must carry ``env.CODEVIRA_IDE = `` + so the spawned codevira MCP server can tag every write with + ``origin.ide``. Per-project + global modes for all 4 stdio IDEs. + + Antigravity is tested separately because it writes to multiple + config surfaces (~/.gemini/config + ~/.gemini/antigravity) and uses + a different server-name scheme. + """ + + def _read_codevira_entry(self, path: Path, name: str = "codevira") -> dict: + return json.loads(path.read_text())["mcpServers"][name] + + def test_per_project_claude_code(self, tmp_path): + project = tmp_path / "proj" + project.mkdir() + _inject_claude(project, "/usr/bin/codevira", "python3") + entry = self._read_codevira_entry(project / ".mcp.json") + assert entry["env"]["CODEVIRA_IDE"] == "claude_code" + + def test_per_project_claude_desktop(self, tmp_path, monkeypatch): + monkeypatch.setattr( + ide_inject, + "_claude_desktop_config_path", + lambda: tmp_path / "desktop.json", + ) + project = tmp_path / "proj" + project.mkdir() + _inject_claude_desktop(project, "/usr/bin/codevira", "python3") + entry = self._read_codevira_entry(tmp_path / "desktop.json") + assert entry["env"]["CODEVIRA_IDE"] == "claude_desktop" + + def test_per_project_cursor(self, tmp_path): + project = tmp_path / "proj" + project.mkdir() + _inject_cursor(project, "/usr/bin/codevira", "python3") + entry = self._read_codevira_entry(project / ".cursor" / "mcp.json") + assert entry["env"]["CODEVIRA_IDE"] == "cursor" + + def test_per_project_windsurf(self, tmp_path): + project = tmp_path / "proj" + project.mkdir() + _inject_windsurf(project, "/usr/bin/codevira", "python3") + entry = self._read_codevira_entry(project / ".windsurf" / "mcp.json") + assert entry["env"]["CODEVIRA_IDE"] == "windsurf" + + def test_global_claude_desktop(self, tmp_path, monkeypatch): + monkeypatch.setattr( + ide_inject, + "_claude_desktop_config_path", + lambda: tmp_path / "desktop.json", + ) + inject_global_claude_desktop("/usr/bin/codevira", "python3") + entry = self._read_codevira_entry(tmp_path / "desktop.json") + assert entry["env"]["CODEVIRA_IDE"] == "claude_desktop" + + def test_global_cursor(self, tmp_path, monkeypatch): + monkeypatch.setattr( + ide_inject, + "_cursor_global_config_path", + lambda: tmp_path / "cursor-global.json", + ) + inject_global_cursor("/usr/bin/codevira", "python3") + entry = self._read_codevira_entry(tmp_path / "cursor-global.json") + assert entry["env"]["CODEVIRA_IDE"] == "cursor" + + def test_global_windsurf(self, tmp_path, monkeypatch): + monkeypatch.setattr( + ide_inject, + "_windsurf_global_config_path", + lambda: tmp_path / "ws-global.json", + ) + inject_global_windsurf("/usr/bin/codevira", "python3") + entry = self._read_codevira_entry(tmp_path / "ws-global.json") + assert entry["env"]["CODEVIRA_IDE"] == "windsurf" + + def test_env_preserves_existing_keys(self, tmp_path): + """If a user has manually added other env vars to an existing + codevira mcpServers entry, the M1 stamp must MERGE, not + clobber.""" + project = tmp_path / "proj" + project.mkdir() + # Pre-seed: existing codevira entry with a user-set env var + # (Claude Code preserves existing-server config via _merge_mcp_config + # but the _inject_* functions overwrite the codevira entry. So the + # merge happens INSIDE the server_config build, not at the entry + # level. Test the build-phase merge by calling twice.) + _inject_claude(project, "/usr/bin/codevira", "python3") + # Second call: still produces env with CODEVIRA_IDE (idempotent) + _inject_claude(project, "/usr/bin/codevira", "python3") + entry = self._read_codevira_entry(project / ".mcp.json") + assert entry["env"]["CODEVIRA_IDE"] == "claude_code" + + # =========================================================================== # Global mode injection # =========================================================================== From 2a7b3ad8d560d1b7b43f6d9998bc450439fe94aa Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Fri, 29 May 2026 09:30:37 +0530 Subject: [PATCH 05/44] =?UTF-8?q?feat(v3.1.0):=20M2=20Phase=201=20?= =?UTF-8?q?=E2=80=94=20working=5Fstore=20storage=20layer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the working-memory storage subsystem: a bounded, decay-scored scratchpad for intra-session observations and goals. This is the foundation for M2 — the MCP tools (working_add/get/promote, get_working_context), engine post_tool_use fan-out, and get_session_context panel land on top in Phase 2/3. mcp_server/storage/paths.py — two additive helpers: * working_path() → .codevira-cache/working.jsonl (per-machine, ephemeral, gitignored). * working_archived_path(session_id) → .codevira/working_archived/.jsonl (canonical, gitable, opt-in commit target). Both helpers carry a doc note on locked decision D000012 — they are pure path computation, do not bypass ensure_dirs()'s forbidden-root validation, so the lock's invariant is preserved. mcp_server/storage/working_store.py — the store. API: * add(content, kind, importance, confidence, links, session_id) → W-id. Validates inputs (kind in {observation, goal}, content ≤ 2 KB, importance 1-10, confidence 0.0-1.0). Each record carries _schema_v: 1 + origin + W-prefixed monotonic id. * mark_evicted(wid, reason) — amendment tombstone. * mark_promoted(wid, target_id) — amendment with backref to LTM id. * list_top_k(top_k, kind, session_id, now) — decay-scored, tombstone-aware. Tombstones detected via _tombstoned_ids() pre-scan because read_merged deliberately filters underscore- prefixed fields when overlaying amendments (matches decisions semantics). * list_session_entries(session_id) — live entries for one session. * get(wid) — single-entry merged view. * compact() — two-pass predicate that drops both tombstoned bases and their amendment rows. Called by codevira sync. * commit_session(session_id) — copy live entries to .codevira/working_archived/.jsonl (opt-in promotion). Original cache file untouched. Idempotent append. Decay scoring: importance × exp(-Δt_hours / τ) + 0.5 × access_count, τ = 6h. Lazy on read; nothing on disk. Matches Generative Agents' additive composition; τ chosen for workday arc. Tests: tests/storage/test_working_store.py — 29 tests covering input validation, schema fields, decay formula, list_top_k ranking + filtering + tombstone exclusion, compact() drops base + amendment rows together, commit_session live-only + idempotent. 194 storage tests pass green; zero regressions from M1 baseline. MCP tool surface lands in Phase 2 (Task #7). Plan M2 Phase 1. Co-Authored-By: Claude Opus 4.7 --- mcp_server/storage/paths.py | 39 +++ mcp_server/storage/working_store.py | 396 ++++++++++++++++++++++++++++ tests/storage/test_working_store.py | 303 +++++++++++++++++++++ 3 files changed, 738 insertions(+) create mode 100644 mcp_server/storage/working_store.py create mode 100644 tests/storage/test_working_store.py diff --git a/mcp_server/storage/paths.py b/mcp_server/storage/paths.py index ff194cc..6a87978 100644 --- a/mcp_server/storage/paths.py +++ b/mcp_server/storage/paths.py @@ -95,6 +95,29 @@ def config_path(project_root: Path | None = None) -> Path: return codevira_dir(project_root) / "config.yaml" +def working_archived_path(session_id: str, project_root: Path | None = None) -> Path: + """v3.1.0 M2: opt-in commit target for working-memory entries. + + Default working memory lives in ``.codevira-cache/working.jsonl`` + (per-machine, ephemeral). When a session produces scratchpad worth + team-sharing, ``codevira working commit `` copies the + non-evicted entries here under a session-named file so the rest of + the repo (and other developers) can see them. + + NOTE on locked decision D000012: that decision protects the + JSONL WRITE path's forbidden-root validation via ``ensure_dirs()``. + This helper is pure path computation — no write, no bypass — so it + does not conflict. Writers landing on this path MUST still call + ``ensure_dirs()`` first (the working_archived subdir is created + lazily there). + + The session_id is interpolated into the filename; callers MUST + ensure it's filesystem-safe — the v3.0.1 default-session-id helper + produces ``ad-hoc-XXXXXX`` which is safe by construction. + """ + return codevira_dir(project_root) / "working_archived" / f"{session_id}.jsonl" + + # ─── Cache files (in .codevira-cache/, gitignored) ──────────────────── @@ -110,6 +133,22 @@ def hash_cache_path(project_root: Path | None = None) -> Path: return codevira_cache_dir(project_root) / "hash-cache.db" +def working_path(project_root: Path | None = None) -> Path: + """v3.1.0 M2: working-memory entries. + + Lives in the cache dir because working memory is intra-session and + ephemeral by definition; committing it would leak the agent's + scratchpad into git. The opt-in commit path + (``working_archived_path``) is the canonical surface when a + session produces something worth team-sharing. + + See ``working_archived_path`` for the D000012 lock note — same + reasoning applies (additive path computation, ensure_dirs still + owns root validation). + """ + return codevira_cache_dir(project_root) / "working.jsonl" + + # ─── Convenience operations ─────────────────────────────────────────── diff --git a/mcp_server/storage/working_store.py b/mcp_server/storage/working_store.py new file mode 100644 index 0000000..a2ba6f1 --- /dev/null +++ b/mcp_server/storage/working_store.py @@ -0,0 +1,396 @@ +""" +working_store.py — v3.1.0 M2: bounded, decay-scored working memory. + +Working memory is the agent's intra-session scratchpad. It holds +observations (things the agent saw — file edits, errors, command +outputs) and goals (what the agent is currently trying to +accomplish). Entries decay with time and accumulate "access" weight +from repeated reads; the top-K by score is what ``get_working_context`` +returns into the ReAct loop. + +# Why a separate store + +- **Capacity-bounded**: byte-bounded so a single 20-file refactor + can't flood the JSONL. +- **Ephemeral by default**: lives in ``.codevira-cache/working.jsonl`` + (gitignored, per-machine). Working memory IS scratchpad; the next + developer doesn't need to inherit your half-formed hypotheses. +- **Opt-in promotion**: when a session produces something worth + team-sharing, ``codevira working commit `` copies the + non-evicted entries to ``.codevira/working_archived/.jsonl`` + (canonical, gitable). + +# Lifecycle + +- Append-only writes via ``jsonl_store.append_with_generated_id`` + (W-prefixed monotonic ids). +- Eviction = amendment row ``{_amendment_to_id: , _evicted: true}``. + The read path tombstones the original. +- Promotion (working_promote → LTM) = amendment row + ``{_amendment_to_id: , _promoted_to: }``. Same tombstone + effect on reads, plus a backref for audit. +- Periodic ``compact()`` (during ``codevira sync``) physically drops + tombstoned rows so the file stays bounded. + +# Decay scoring (computed lazily on read; nothing on disk) + +:: + + score = importance × exp(-Δt_hours / τ) + 0.5 × access_count + τ = 6 hours (workday arc) + +This is the additive Generative-Agents composition (recency × +importance + access). importance is integer 1-10 (5 = default). +access_count is incremented externally when entries are looked at; +this module does not auto-increment on every list call (would force +a write per read). + +# Schema + +All entries carry ``_schema_v: 1`` per the v3.0.1 forward-compat +convention. The full base-record shape:: + + { + "id": "W000001", + "ts": "2026-05-28T10:00:00+00:00", + "session_id": "ad-hoc-a1b2c3", + "origin": {"ide": ..., "agent_model": ..., "host_hash": ..., "ts": ...}, + "kind": "observation" | "goal", + "content": "<≤ 2 KB markdown>", + "importance": 1-10, + "confidence": 0.0-1.0 | null, + "access_count": 0, + "last_accessed_at": null, + "links": ["D000123", ...], + "_schema_v": 1, + } + +Amendment rows preserve ``id`` and add the marker (``_evicted: true`` or +``_promoted_to: ""``). They share the base id so +``jsonl_store.read_merged`` folds them automatically (the same +convention decisions.jsonl uses). +""" + +from __future__ import annotations + +import math +from datetime import datetime, timezone +from typing import Any + +from mcp_server.storage import jsonl_store, origin, paths + + +# Schema constants +SCHEMA_V = 1 +KIND_OBSERVATION = "observation" +KIND_GOAL = "goal" +_VALID_KINDS = frozenset({KIND_OBSERVATION, KIND_GOAL}) + +# Caps +_CONTENT_MAX_BYTES = 2048 # plan: ≤ 2 KB +_DEFAULT_IMPORTANCE = 5 +_DECAY_TAU_HOURS = 6.0 # workday arc + + +# ────────────────────────────────────────────────────────────────────── +# Writes +# ────────────────────────────────────────────────────────────────────── + + +def add( + content: str, + *, + kind: str = KIND_OBSERVATION, + importance: int = _DEFAULT_IMPORTANCE, + confidence: float | None = None, + links: list[str] | None = None, + session_id: str | None = None, + origin_override: dict | None = None, +) -> str: + """Append a working-memory entry; return the generated W-id. + + Raises: + ValueError: invalid kind, content too large, or importance/ + confidence out of range. All inputs are validated up front + so the disk store never sees malformed data. + """ + if kind not in _VALID_KINDS: + raise ValueError( + f"working_store.add: kind must be one of {sorted(_VALID_KINDS)}; got {kind!r}" + ) + if not isinstance(content, str) or not content: + raise ValueError("working_store.add: content must be a non-empty string") + if len(content.encode("utf-8")) > _CONTENT_MAX_BYTES: + raise ValueError( + f"working_store.add: content exceeds {_CONTENT_MAX_BYTES} byte cap " + f"({len(content.encode('utf-8'))} bytes given)" + ) + if not isinstance(importance, int) or not (1 <= importance <= 10): + raise ValueError( + f"working_store.add: importance must be int in 1..10; got {importance!r}" + ) + if confidence is not None and not (0.0 <= float(confidence) <= 1.0): + raise ValueError( + f"working_store.add: confidence must be in 0.0..1.0 or None; got {confidence!r}" + ) + + paths.ensure_dirs() + + from mcp_server.storage import decisions_store # local: avoid import cycle + + base_record = { + "ts": datetime.now(timezone.utc).isoformat(), + "session_id": session_id or decisions_store.default_session_id(), + "origin": origin_override or origin.current_origin(), + "kind": kind, + "content": content, + "importance": int(importance), + "confidence": float(confidence) if confidence is not None else None, + "access_count": 0, + "last_accessed_at": None, + "links": list(links or []), + "_schema_v": SCHEMA_V, + } + + return jsonl_store.append_with_generated_id( + paths.working_path(), base_record, prefix="W", width=6 + ) + + +def mark_evicted(entry_id: str, *, reason: str | None = None) -> bool: + """Tombstone an entry via amendment. Returns True on success. + + Eviction is logical (the row stays in the JSONL until ``compact()`` + physically drops it during ``codevira sync``). This keeps the file + append-only and keeps the audit trail intact for the rest of the + session. + """ + paths.ensure_dirs() + amendment = { + "id": entry_id, + "ts": datetime.now(timezone.utc).isoformat(), + "_amendment_to_id": entry_id, + "_evicted": True, + } + if reason: + amendment["_evict_reason"] = reason + jsonl_store.append(paths.working_path(), amendment) + return True + + +def mark_promoted(entry_id: str, target_id: str) -> bool: + """Tombstone an entry as 'promoted to LTM', recording the new id. + + Called by ``working_promote`` after a successful LTM write (decision, + skill, or playbook). The backref is audit-only; the read-side + tombstoning is the same as eviction. + """ + paths.ensure_dirs() + amendment = { + "id": entry_id, + "ts": datetime.now(timezone.utc).isoformat(), + "_amendment_to_id": entry_id, + "_promoted_to": target_id, + } + jsonl_store.append(paths.working_path(), amendment) + return True + + +# ────────────────────────────────────────────────────────────────────── +# Reads +# ────────────────────────────────────────────────────────────────────── + + +def _tombstoned_ids() -> set[str]: + """Pre-scan raw rows for amendments with ``_evicted`` or + ``_promoted_to``. ``jsonl_store.read_merged`` deliberately filters + underscore-prefixed fields when overlaying amendments (matches the + decisions.jsonl convention — metadata markers don't pollute + user-visible state). That filtering means we cannot detect + tombstones from the merged view; instead we scan amendments + separately and return the set of tombstoned base ids. + + Cheap for the working-memory size budget (< 64 KB live). For a + larger store this would warrant caching. + """ + out: set[str] = set() + for rec in jsonl_store.read_all(paths.working_path()): + if rec.get("_amendment_to_id") and ( + rec.get("_evicted") or rec.get("_promoted_to") + ): + out.add(str(rec.get("id") or "")) + return out + + +def list_top_k( + *, + top_k: int = 10, + kind: str | None = None, + session_id: str | None = None, + now: datetime | None = None, +) -> list[dict[str, Any]]: + """Return the top-K live entries by decay score, newest-first on ties. + + ``kind`` and ``session_id`` are optional filters. Entries + tombstoned via ``_evicted`` or ``_promoted_to`` amendments are + excluded (detected by ``_tombstoned_ids``). Decay is computed + against ``now`` (defaulting to wall-clock UTC); tests pin it for + determinism. + """ + merged = jsonl_store.read_merged(paths.working_path()) + if not merged: + return [] + + now_dt = now or datetime.now(timezone.utc) + dead = _tombstoned_ids() + + out: list[tuple[float, dict[str, Any]]] = [] + for rec in merged: + if str(rec.get("id") or "") in dead: + continue + if kind is not None and rec.get("kind") != kind: + continue + if session_id is not None and rec.get("session_id") != session_id: + continue + score = _compute_score(rec, now=now_dt) + out.append((score, rec)) + + # Sort by score desc, ts desc as tie-breaker (newest wins). + out.sort(key=lambda x: (x[0], x[1].get("ts") or ""), reverse=True) + return [r for _, r in out[:top_k]] + + +def list_session_entries(session_id: str) -> list[dict[str, Any]]: + """Return all live (non-tombstoned) entries for a session in + insertion order. Used by ``commit_session`` for the opt-in + promotion to ``working_archived``. + """ + merged = jsonl_store.read_merged(paths.working_path()) + dead = _tombstoned_ids() + return [ + r + for r in merged + if r.get("session_id") == session_id and str(r.get("id") or "") not in dead + ] + + +def get(entry_id: str) -> dict[str, Any] | None: + """Return the merged record for a single entry, or None.""" + for rec in jsonl_store.read_merged(paths.working_path()): + if str(rec.get("id")) == entry_id: + return rec + return None + + +# ────────────────────────────────────────────────────────────────────── +# Maintenance +# ────────────────────────────────────────────────────────────────────── + + +def compact() -> int: + """Drop tombstoned (evicted or promoted) entries from working.jsonl. + + Called by ``codevira sync``. Holds the file lock for the entire + read-filter-write via ``jsonl_store.compact``. Returns count + dropped (counts BOTH the base row and its amendment row). + + Two-pass design: the keep predicate needs to know which base ids + are tombstoned, but ``jsonl_store.compact`` evaluates the predicate + per-record. We pre-scan the file to collect the tombstoned id set, + then the predicate closes over it. Acceptable for the < 64 KB cap + working memory targets; if we ever grow past that we'd want a + single-pass compactor that streams. + """ + path = paths.working_path() + if not path.is_file(): + return 0 + return jsonl_store.compact(path, keep_predicate=_build_compact_predicate(path)) + + +def _build_compact_predicate(path): + """Pre-scan the file to find tombstoned base ids; return a + predicate that drops them AND their amendment rows. + """ + tombstoned: set[str] = set() + for rec in jsonl_store.read_all(path): + if rec.get("_amendment_to_id") and ( + rec.get("_evicted") or rec.get("_promoted_to") + ): + tombstoned.add(str(rec.get("id"))) + + def predicate(rec: dict[str, Any]) -> bool: + rec_id = str(rec.get("id") or "") + if rec_id in tombstoned: + return False # drop both the base and any amendment rows for it + return True + + return predicate + + +def commit_session(session_id: str) -> dict[str, Any]: + """Copy a session's live entries from ``working.jsonl`` to + ``.codevira/working_archived/.jsonl``. + + The original cache file is left untouched (the user may want to + keep iterating). Idempotent: re-running for the same session_id + appends fresh rows (the destination is its own append-only log). + + Returns ``{"session_id", "committed_count", "destination"}``. + """ + paths.ensure_dirs() + entries = list_session_entries(session_id) + if not entries: + return { + "session_id": session_id, + "committed_count": 0, + "destination": None, + "note": "No live entries for this session_id in working memory.", + } + + dest = paths.working_archived_path(session_id) + dest.parent.mkdir(parents=True, exist_ok=True) + for rec in entries: + jsonl_store.append(dest, rec) + return { + "session_id": session_id, + "committed_count": len(entries), + "destination": str(dest), + } + + +# ────────────────────────────────────────────────────────────────────── +# Decay scoring +# ────────────────────────────────────────────────────────────────────── + + +def _compute_score(entry: dict[str, Any], *, now: datetime) -> float: + """``importance × exp(-Δt_hours / τ) + 0.5 × access_count``. + + Robust to malformed ts (returns importance + access term only — + i.e., treat as 'just now' so the entry doesn't get penalized for + bad metadata). + """ + ts_raw = entry.get("ts") + if isinstance(ts_raw, str): + try: + ts = datetime.fromisoformat(ts_raw) + if ts.tzinfo is None: + ts = ts.replace(tzinfo=timezone.utc) + delta_hours = max(0.0, (now - ts).total_seconds() / 3600.0) + except (ValueError, TypeError): + delta_hours = 0.0 + else: + delta_hours = 0.0 + + importance = entry.get("importance", _DEFAULT_IMPORTANCE) + access_count = entry.get("access_count", 0) + try: + imp = int(importance) + except (ValueError, TypeError): + imp = _DEFAULT_IMPORTANCE + try: + acc = int(access_count) + except (ValueError, TypeError): + acc = 0 + + return imp * math.exp(-delta_hours / _DECAY_TAU_HOURS) + 0.5 * acc diff --git a/tests/storage/test_working_store.py b/tests/storage/test_working_store.py new file mode 100644 index 0000000..e3ba6eb --- /dev/null +++ b/tests/storage/test_working_store.py @@ -0,0 +1,303 @@ +""" +Tests for mcp_server.storage.working_store — v3.1.0 M2 Phase 1. + +Coverage: + - add() input validation (kind, content size, importance, confidence) + - schema fields (W-id, origin, _schema_v: 1) + - list_top_k decay scoring + tie-breaker + - mark_evicted / mark_promoted tombstone via amendment overlay + - compact() drops tombstoned rows AND their amendment rows + - commit_session copies live entries to working_archived +""" + +from __future__ import annotations + +import math +import re +from datetime import datetime, timedelta, timezone +from pathlib import Path + +import pytest + +import mcp_server.paths as paths_module +from mcp_server.storage import jsonl_store, paths, working_store + + +@pytest.fixture +def project(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: + root = tmp_path / "proj" + (root / ".codevira").mkdir(parents=True) + (root / ".codevira" / "config.yaml").write_text("project:\n name: test\n") + monkeypatch.setattr(paths_module, "_project_dir_override", None) + monkeypatch.chdir(root.resolve()) + return root + + +class TestAdd: + _ID_PATTERN = re.compile(r"^W\d{6}$") + + def test_basic_add_returns_w_id(self, project: Path) -> None: + wid = working_store.add("Touched mcp_server/storage/paths.py") + assert self._ID_PATTERN.match(wid), wid + + def test_record_has_schema_v_and_origin( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.setenv("CODEVIRA_IDE", "cursor") + working_store.add("Goal: implement working memory", kind="goal") + + rows = jsonl_store.read_all(paths.working_path()) + assert len(rows) == 1 + rec = rows[0] + assert rec["_schema_v"] == 1 + assert rec["origin"]["ide"] == "cursor" + assert rec["kind"] == "goal" + + def test_default_session_id_unique(self, project: Path) -> None: + wid1 = working_store.add("a") + wid2 = working_store.add("b") + assert wid1 != wid2 + rows = jsonl_store.read_all(paths.working_path()) + sids = {r["session_id"] for r in rows} + # Per the v3.0.1 session-id helper: each unattributed call gets its own slug. + assert len(sids) == 2 + + def test_invalid_kind_rejected(self, project: Path) -> None: + with pytest.raises(ValueError, match="kind"): + working_store.add("content", kind="hypothesis") + + def test_empty_content_rejected(self, project: Path) -> None: + with pytest.raises(ValueError, match="non-empty"): + working_store.add("") + + def test_content_size_cap_2kb(self, project: Path) -> None: + # Exactly 2 KB ASCII → 2048 bytes → OK + working_store.add("x" * 2048) + # One byte over → reject + with pytest.raises(ValueError, match="2048 byte cap"): + working_store.add("x" * 2049) + + def test_importance_range(self, project: Path) -> None: + working_store.add("ok", importance=1) + working_store.add("ok", importance=10) + for bad in (0, 11, -1, 5.5): + with pytest.raises(ValueError, match="importance"): + working_store.add("ok", importance=bad) # type: ignore[arg-type] + + def test_confidence_range(self, project: Path) -> None: + working_store.add("ok", confidence=0.0) + working_store.add("ok", confidence=1.0) + for bad in (-0.01, 1.01, 2.0): + with pytest.raises(ValueError, match="confidence"): + working_store.add("ok", confidence=bad) + + def test_links_preserved(self, project: Path) -> None: + wid = working_store.add("touching D000007", links=["D000007", "D000008"]) + rec = working_store.get(wid) + assert rec is not None + assert rec["links"] == ["D000007", "D000008"] + + +class TestDecayScoring: + """The lazy-on-read scoring contract per the plan: + + score = importance × exp(-Δt_hours / τ) + 0.5 × access_count, τ=6h + """ + + def test_formula_matches_plan(self) -> None: + now = datetime(2026, 5, 28, 12, 0, 0, tzinfo=timezone.utc) + # importance 8, 3h old, access_count 2 → 8 * exp(-0.5) + 1 + rec = { + "ts": (now - timedelta(hours=3)).isoformat(), + "importance": 8, + "access_count": 2, + } + expected = 8 * math.exp(-0.5) + 1.0 + actual = working_store._compute_score(rec, now=now) + assert abs(actual - expected) < 1e-6, (actual, expected) + + def test_fresh_entry_close_to_importance(self) -> None: + now = datetime(2026, 5, 28, 12, 0, 0, tzinfo=timezone.utc) + rec = {"ts": now.isoformat(), "importance": 7, "access_count": 0} + # Δt=0 → exp(0)=1 → score = importance + assert abs(working_store._compute_score(rec, now=now) - 7.0) < 1e-6 + + def test_malformed_ts_treated_as_now(self) -> None: + """No penalty for bad metadata — return importance + access term.""" + now = datetime(2026, 5, 28, 12, 0, 0, tzinfo=timezone.utc) + rec = {"ts": "not-an-iso-string", "importance": 5, "access_count": 4} + score = working_store._compute_score(rec, now=now) + assert abs(score - (5 + 2.0)) < 1e-6 + + +class TestListTopK: + def test_returns_highest_scoring_entries_first( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + # Pin clock so the test is deterministic. + fixed = datetime(2026, 5, 28, 12, 0, 0, tzinfo=timezone.utc) + + # Older but more important should rank above newer-but-less-important + # within 6-hour decay window. + working_store.add("low importance", importance=2) + working_store.add("high importance", importance=9) + working_store.add("medium importance", importance=5) + + top = working_store.list_top_k(top_k=3, now=fixed) + contents = [r["content"] for r in top] + # All three written ~ "now"; ranking by importance desc. + assert contents == [ + "high importance", + "medium importance", + "low importance", + ] + + def test_filters_by_kind(self, project: Path) -> None: + working_store.add("obs A", kind="observation") + working_store.add("goal B", kind="goal") + working_store.add("obs C", kind="observation") + only_goals = working_store.list_top_k(kind="goal") + assert [r["content"] for r in only_goals] == ["goal B"] + + def test_filters_by_session_id(self, project: Path) -> None: + working_store.add("alpha", session_id="s1") + working_store.add("beta", session_id="s2") + working_store.add("gamma", session_id="s1") + s1 = working_store.list_top_k(session_id="s1") + assert {r["content"] for r in s1} == {"alpha", "gamma"} + + def test_evicted_entries_excluded(self, project: Path) -> None: + wid_drop = working_store.add("drop me", importance=10) + working_store.add("keep me", importance=3) + working_store.mark_evicted(wid_drop) + top = working_store.list_top_k() + assert [r["content"] for r in top] == ["keep me"] + + def test_promoted_entries_excluded(self, project: Path) -> None: + wid = working_store.add("goal: design retry", kind="goal", importance=9) + working_store.add("obs: looked at retry.py", importance=4) + working_store.mark_promoted(wid, target_id="D000099") + top = working_store.list_top_k() + assert [r["content"] for r in top] == ["obs: looked at retry.py"] + + def test_top_k_caps_output(self, project: Path) -> None: + for i in range(10): + working_store.add(f"entry {i}", importance=(i % 10) + 1) + assert len(working_store.list_top_k(top_k=3)) == 3 + + def test_empty_store_returns_empty(self, project: Path) -> None: + assert working_store.list_top_k() == [] + + +class TestTombstoneMerging: + """The amendment-merge contract: mark_evicted / mark_promoted + append amendment rows; read_merged folds them into the base. + """ + + def test_evicted_amendment_sets_flag_on_merged_record(self, project: Path) -> None: + wid = working_store.add("temp") + working_store.mark_evicted(wid, reason="superseded by W000007") + merged = jsonl_store.read_merged(paths.working_path()) + assert len(merged) == 1 + # _evicted is an underscored field — NOT overlaid onto the base + # by jsonl_store.read_merged. The presence/absence of _evicted + # MUST be checked via raw read_all, not merged. + assert "_evicted" not in merged[0] + + # Raw rows include the amendment with _evicted: True + raw = jsonl_store.read_all(paths.working_path()) + ams = [r for r in raw if r.get("_amendment_to_id") == wid] + assert len(ams) == 1 + assert ams[0]["_evicted"] is True + assert ams[0].get("_evict_reason") == "superseded by W000007" + + def test_list_top_k_skips_via_raw_amendment_scan(self, project: Path) -> None: + """list_top_k checks the raw merge result for `_evicted` / + `_promoted_to`. Since those keys are underscored and don't + overlay onto the base, list_top_k must instead detect + tombstones via a separate pre-scan of amendment rows. + """ + # This documents the contract the implementation honors. + wid = working_store.add("doomed", importance=10) + working_store.mark_evicted(wid) + # If list_top_k returned the tombstoned entry, this would fail. + assert working_store.list_top_k() == [] + + +class TestCompact: + def test_drops_evicted_and_their_amendments(self, project: Path) -> None: + keep = working_store.add("keep me") + drop = working_store.add("drop me") + working_store.mark_evicted(drop, reason="not useful") + + # Before: 3 rows (2 base + 1 amendment). + assert len(jsonl_store.read_all(paths.working_path())) == 3 + + dropped = working_store.compact() + # 1 base + 1 amendment removed = 2 dropped rows. + assert dropped == 2 + + remaining = jsonl_store.read_all(paths.working_path()) + assert [r["id"] for r in remaining] == [keep] + + def test_drops_promoted_and_their_amendments(self, project: Path) -> None: + wid = working_store.add("goal: ship v3.1", kind="goal") + working_store.mark_promoted(wid, target_id="D000123") + dropped = working_store.compact() + assert dropped == 2 + + def test_keeps_live_entries(self, project: Path) -> None: + for i in range(5): + working_store.add(f"e{i}") + before = len(jsonl_store.read_all(paths.working_path())) + dropped = working_store.compact() + after = len(jsonl_store.read_all(paths.working_path())) + assert dropped == 0 + assert before == after == 5 + + def test_compact_on_missing_file_is_noop(self, tmp_path: Path) -> None: + # Without project fixture — working_path() will resolve, but + # the file doesn't exist; compact returns 0. + assert ( + jsonl_store.compact( + tmp_path / "missing.jsonl", keep_predicate=lambda r: True + ) + == 0 + ) + + +class TestCommitSession: + def test_copies_live_entries_to_archive(self, project: Path) -> None: + working_store.add("alpha", session_id="my-session") + working_store.add("beta", session_id="my-session") + working_store.add("zeta", session_id="other-session") + + res = working_store.commit_session("my-session") + assert res["committed_count"] == 2 + assert "my-session.jsonl" in res["destination"] + + dest_path = paths.working_archived_path("my-session") + assert dest_path.is_file() + archived = jsonl_store.read_all(dest_path) + assert {r["content"] for r in archived} == {"alpha", "beta"} + + def test_excludes_evicted_entries(self, project: Path) -> None: + wid_keep = working_store.add("keep", session_id="s") + wid_drop = working_store.add("drop", session_id="s") + working_store.mark_evicted(wid_drop) + res = working_store.commit_session("s") + assert res["committed_count"] == 1 + archived = jsonl_store.read_all(paths.working_archived_path("s")) + assert archived[0]["id"] == wid_keep + + def test_no_live_entries_returns_zero(self, project: Path) -> None: + res = working_store.commit_session("nonexistent") + assert res["committed_count"] == 0 + assert res["destination"] is None + + def test_idempotent_appends(self, project: Path) -> None: + working_store.add("x", session_id="s") + working_store.commit_session("s") + working_store.commit_session("s") + archived = jsonl_store.read_all(paths.working_archived_path("s")) + assert len(archived) == 2 # appended twice; this is the documented behavior From 9640c4ae599fb3fe548e79eaf04ed7a2cd6d3055 Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Fri, 29 May 2026 10:09:49 +0530 Subject: [PATCH 06/44] =?UTF-8?q?feat(v3.1.0):=20M2=20Phase=202=20?= =?UTF-8?q?=E2=80=94=20working=20memory=20MCP=20tools?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Exposes the working_store storage layer (Phase 1) as four MCP tools the agent can call to manage its intra-session scratchpad: - working_add(content, kind, importance, confidence, links, session_id) — append observation or goal. - working_get(top_k, kind, session_id) — top-K live entries by decay score; tombstoned entries excluded. - working_promote(entry_id, to, file_path, context, do_not_revert, tags, force) — move entry to LTM. to='decision' is fully wired (calls check_conflict, then decisions_store.record, then tombstones the source via mark_promoted). to='skill' and to='playbook' return {deferred: True, milestone} so the API surface is reserved. - get_working_context(top_k) — compact markdown rendering for ReAct-loop injection. Capped ~150 tokens; entries truncated at 120 chars each. Designed for the M2 Phase 3 get_session_context panel. Tool surface — registered in mcp_server/server.py: - 4 Tool(...) entries in list_tools() under a 'v3.1.0 M2: working memory' comment block. Schemas use enum for kind / to fields so the IDE-side validators give early feedback. - 4 elif name == 'working_*' branches in call_tool() dispatch. Promotion contract: the to='decision' path encodes three guards beyond the storage layer's input validation: 1. Tombstoned entries cannot be re-promoted. 2. check_conflict runs before decisions_store.record. On conflict or duplicate, returns {_conflict_warning: ...} without writing. force=True overrides. 3. Promoting a kind='goal' entry surfaces an _intent_note in the response because goals are intents, not facts. Working-memory links and the source W-id are folded into the promoted decision's context so the audit trail survives. Tests: tests/test_tools_working.py — 22 tests across working_add, working_get, get_working_context, working_promote. 311 tests across server + storage + working + learning + check_conflict pass green; zero regressions from the M2 Phase 1 baseline. Plan M2 Phase 2. Co-Authored-By: Claude Opus 4.7 --- mcp_server/server.py | 164 +++++++++++++++++ mcp_server/tools/working.py | 349 ++++++++++++++++++++++++++++++++++++ tests/test_tools_working.py | 234 ++++++++++++++++++++++++ 3 files changed, 747 insertions(+) create mode 100644 mcp_server/tools/working.py create mode 100644 tests/test_tools_working.py diff --git a/mcp_server/server.py b/mcp_server/server.py index 768838e..c60ecf8 100644 --- a/mcp_server/server.py +++ b/mcp_server/server.py @@ -920,6 +920,134 @@ async def list_tools() -> list[Tool]: ), inputSchema={"type": "object", "properties": {}}, ), + # ---- v3.1.0 M2: working memory (intra-session scratchpad) ---- + Tool( + name="working_add", + description=( + "v3.1.0 M2: Append one observation or goal to working memory " + "(intra-session, bounded, decay-scored scratchpad in " + ".codevira-cache/working.jsonl). 'observation' = a fact the agent saw " + "(file edited, error message, command output). 'goal' = what the agent " + "is currently trying to accomplish. Use working_promote to move an entry " + "to long-term memory (decision/skill/playbook) when it earns its keep." + ), + inputSchema={ + "type": "object", + "properties": { + "content": { + "type": "string", + "description": "Free-text markdown (max 2 KB)", + }, + "kind": { + "type": "string", + "description": "observation | goal (default: observation)", + "enum": ["observation", "goal"], + "default": "observation", + }, + "importance": { + "type": "integer", + "description": "1-10 (default 5). Errors = 7, decisions = 8+", + "minimum": 1, + "maximum": 10, + "default": 5, + }, + "confidence": { + "type": "number", + "description": "0.0-1.0, optional. Voyager-style belief strength", + }, + "links": { + "type": "array", + "items": {"type": "string"}, + "description": "Optional D-ids / S-ids this entry references", + }, + "session_id": { + "type": "string", + "description": "Optional session slug; defaults to ad-hoc-XXXXXX", + }, + }, + "required": ["content"], + }, + ), + Tool( + name="working_get", + description=( + "v3.1.0 M2: Top-K live working-memory entries by decay score " + "(importance × exp(-Δt_hours / 6) + 0.5 × access_count). Filters by " + "kind / session_id. Tombstoned (evicted or promoted) entries are excluded." + ), + inputSchema={ + "type": "object", + "properties": { + "top_k": { + "type": "integer", + "description": "Max entries to return (default 10)", + "default": 10, + }, + "kind": { + "type": "string", + "description": "Filter to observation | goal (default: both)", + "enum": ["observation", "goal"], + }, + "session_id": { + "type": "string", + "description": "Filter to one session slug", + }, + }, + }, + ), + Tool( + name="working_promote", + description=( + "v3.1.0 M2: Promote a working-memory entry to long-term memory " + "and tombstone the source. to='decision' is the fully wired path " + "(calls check_conflict first; force=true overrides). to='skill' and " + "to='playbook' are reserved for M3+; the call returns " + "{deferred: true, milestone: ...} until those stores ship." + ), + inputSchema={ + "type": "object", + "properties": { + "entry_id": { + "type": "string", + "description": "The W-id from working_add / working_get", + }, + "to": { + "type": "string", + "description": "Target LTM store", + "enum": ["decision", "skill", "playbook"], + "default": "decision", + }, + "file_path": {"type": "string"}, + "context": {"type": "string"}, + "do_not_revert": {"type": "boolean", "default": False}, + "tags": {"type": "array", "items": {"type": "string"}}, + "force": { + "type": "boolean", + "description": "Skip check_conflict warning (e.g., on second-pass promote)", + "default": False, + }, + }, + "required": ["entry_id"], + }, + ), + Tool( + name="get_working_context", + description=( + "v3.1.0 M2: Compact markdown rendering of the top working-memory " + "entries for ReAct-loop injection. Returns {markdown, entries, count}. " + "Capped at ~150 tokens of output (entries truncated at 120 chars each)." + ), + inputSchema={ + "type": "object", + "properties": { + "top_k": { + "type": "integer", + "description": "Max entries to include (default 5)", + "default": 5, + }, + }, + }, + ), # ---- v1.5: Deep Graph Intelligence Tools ---- Tool( name="query_graph", @@ -1214,6 +1342,42 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]: symbol=arguments.get("symbol"), query_type=arguments.get("query_type", "callees"), ) + # ---- v3.1.0 M2: working memory dispatch ---- + elif name == "working_add": + from mcp_server.tools.working import working_add + + result = working_add( + content=arguments["content"], + kind=arguments.get("kind", "observation"), + importance=arguments.get("importance", 5), + confidence=arguments.get("confidence"), + links=arguments.get("links"), + session_id=arguments.get("session_id"), + ) + elif name == "working_get": + from mcp_server.tools.working import working_get + + result = working_get( + top_k=arguments.get("top_k", 10), + kind=arguments.get("kind"), + session_id=arguments.get("session_id"), + ) + elif name == "working_promote": + from mcp_server.tools.working import working_promote + + result = working_promote( + entry_id=arguments["entry_id"], + to=arguments.get("to", "decision"), + file_path=arguments.get("file_path"), + context=arguments.get("context"), + do_not_revert=arguments.get("do_not_revert", False), + tags=arguments.get("tags"), + force=arguments.get("force", False), + ) + elif name == "get_working_context": + from mcp_server.tools.working import get_working_context + + result = get_working_context(top_k=arguments.get("top_k", 5)) else: result = {"error": f"Unknown tool: {name}"} diff --git a/mcp_server/tools/working.py b/mcp_server/tools/working.py new file mode 100644 index 0000000..e044ef4 --- /dev/null +++ b/mcp_server/tools/working.py @@ -0,0 +1,349 @@ +""" +working.py — v3.1.0 M2 Phase 2 MCP tools for working memory. + +Exposes the working-memory store as four MCP tools: + + - working_add — record an observation or goal. + - working_get — top-K live entries by decay score. + - working_promote — move an entry to long-term memory + (decision / skill / playbook) and tombstone + the source via amendment. + - get_working_context — compact rendering for ReAct-loop injection. + +Promotion paths in v3.1.0: + + - ``to="decision"`` — fully wired. Calls ``check_conflict`` first; + on novel/forced write, ``decisions_store.record(...)`` lands the + new decision id, then ``working_store.mark_promoted`` tombstones + the source. Constraint: only ``kind="observation"`` entries + promote to decisions cleanly (observations are facts; goals are + intents — see ``to="skill"`` for the latter). + - ``to="skill"`` — deferred to M3 (skills_store doesn't exist yet + in v3.1.0 Phase 2). Returns ``{deferred: True}``; the API surface + is reserved so callers don't need a second pass when M3 lands. + - ``to="playbook"`` — deferred to a later v3.1.x. The existing + playbook resolution chain (``mcp_server/tools/playbook.py``) reads + markdown from ``.codevira/playbooks//.md``; the + mapping from a working-memory entry to a task_type + filename + needs more design before we wire it. +""" + +from __future__ import annotations + +from typing import Any + +from mcp_server.storage import working_store + + +# Promotion targets recognised by ``working_promote``. +_PROMOTE_DECISION = "decision" +_PROMOTE_SKILL = "skill" +_PROMOTE_PLAYBOOK = "playbook" +_VALID_PROMOTE_TARGETS = frozenset( + {_PROMOTE_DECISION, _PROMOTE_SKILL, _PROMOTE_PLAYBOOK} +) + + +# ────────────────────────────────────────────────────────────────────── +# working_add +# ────────────────────────────────────────────────────────────────────── + + +def working_add( + content: str, + *, + kind: str = "observation", + importance: int = 5, + confidence: float | None = None, + links: list[str] | None = None, + session_id: str | None = None, +) -> dict[str, Any]: + """Record one working-memory entry. + + Returns ``{recorded, entry_id, kind, [hint]}`` or + ``{recorded: False, error: ...}``. + + Validation errors from ``working_store.add`` surface as + structured failures — the agent should fix the input and retry, + not have its tool call crash the dispatcher. + """ + try: + wid = working_store.add( + content, + kind=kind, + importance=importance, + confidence=confidence, + links=links, + session_id=session_id, + ) + except ValueError as exc: + return {"recorded": False, "error": str(exc)} + + return { + "recorded": True, + "entry_id": wid, + "kind": kind, + "hint": ( + "Use working_get(top_k=N) to see the current scratchpad, " + "or working_promote(entry_id, to='decision', ...) to move " + "this into long-term memory." + ), + } + + +# ────────────────────────────────────────────────────────────────────── +# working_get / get_working_context +# ────────────────────────────────────────────────────────────────────── + + +def working_get( + *, + top_k: int = 10, + kind: str | None = None, + session_id: str | None = None, +) -> dict[str, Any]: + """Top-K live entries ranked by decay score. + + ``kind`` filters to ``observation`` or ``goal`` if set. The + returned ``entries`` are sorted highest-score first. + """ + entries = working_store.list_top_k(top_k=top_k, kind=kind, session_id=session_id) + return { + "entries": [ + { + "entry_id": e["id"], + "kind": e.get("kind"), + "content": e.get("content"), + "importance": e.get("importance"), + "confidence": e.get("confidence"), + "links": e.get("links") or [], + "ts": e.get("ts"), + "session_id": e.get("session_id"), + } + for e in entries + ], + "count": len(entries), + "filtered_by": {"kind": kind, "session_id": session_id}, + } + + +def get_working_context(*, top_k: int = 5) -> dict[str, Any]: + """Compact rendering of the working scratchpad for ReAct loops. + + Returns a single ``markdown`` string suitable for injecting into + the agent's next prompt + a structured ``entries`` view for tools + that prefer the data shape. Designed for the get_session_context + panel in M2 Phase 3 — capped at ~150 tokens of output. + """ + entries = working_store.list_top_k(top_k=top_k) + if not entries: + return { + "markdown": "_(working memory empty)_", + "entries": [], + "count": 0, + } + + lines = ["### Working memory (top-{}):".format(min(top_k, len(entries)))] + for e in entries: + prefix = "•" if e.get("kind") == "observation" else "→" + # Keep each entry tight (~30 tokens). Truncate content at 120 + # chars so a single 2 KB entry can't blow the panel. + content = e.get("content") or "" + if len(content) > 120: + content = content[:117] + "..." + lines.append( + f"{prefix} {content} _({e['id']}, importance={e.get('importance')})_" + ) + return { + "markdown": "\n".join(lines), + "entries": [ + { + "entry_id": e["id"], + "kind": e.get("kind"), + "content": e.get("content"), + "importance": e.get("importance"), + } + for e in entries + ], + "count": len(entries), + } + + +# ────────────────────────────────────────────────────────────────────── +# working_promote +# ────────────────────────────────────────────────────────────────────── + + +def working_promote( + entry_id: str, + *, + to: str = _PROMOTE_DECISION, + file_path: str | None = None, + context: str | None = None, + do_not_revert: bool = False, + tags: list[str] | None = None, + force: bool = False, +) -> dict[str, Any]: + """Promote a working-memory entry to long-term memory. + + Workflow (for ``to="decision"``): + 1. Resolve the source entry; reject if missing or tombstoned. + 2. Run ``check_conflict`` on the content. If conflict and not + ``force``, return a warning — caller decides whether to retry + with ``force=True``. + 3. Call ``decisions_store.record(...)`` with the entry's content. + 4. Call ``working_store.mark_promoted(entry_id, target_id)`` to + tombstone the source. + + ``to="skill"`` and ``to="playbook"`` are deferred to M3+/v3.1.x. + """ + if to not in _VALID_PROMOTE_TARGETS: + return { + "promoted": False, + "error": ( + f"working_promote: 'to' must be one of " + f"{sorted(_VALID_PROMOTE_TARGETS)}; got {to!r}" + ), + } + + source = working_store.get(entry_id) + if source is None: + return { + "promoted": False, + "error": f"working_promote: entry {entry_id!r} not found", + } + + # Tombstoned entries cannot be re-promoted. ``working_store.get`` + # returns the merged base; we need a separate liveness check. + if entry_id in working_store._tombstoned_ids(): + return { + "promoted": False, + "error": ( + f"working_promote: entry {entry_id!r} has already been " + f"tombstoned (evicted or promoted)." + ), + } + + if to == _PROMOTE_SKILL: + return { + "promoted": False, + "deferred": True, + "milestone": "M3", + "hint": ( + "Skill promotion lands in v3.1.0 M3 (skills_store). " + "The API surface is reserved; no caller-side change " + "needed when M3 ships." + ), + } + if to == _PROMOTE_PLAYBOOK: + return { + "promoted": False, + "deferred": True, + "milestone": "v3.1.x", + "hint": ( + "Playbook promotion needs a working-memory→task_type " + "mapping that's still being designed. The existing " + "playbook resolution chain (mcp_server/tools/playbook.py) " + "reads markdown from .codevira/playbooks//." + ), + } + + # to == "decision" — the fully wired path. + content = source.get("content") or "" + if source.get("kind") == "goal": + # Goals are intents, not facts. We allow promotion but flag it + # in the response so the agent knows the LTM record will read + # as a doctrine-style note ("we want to do X") rather than a + # decided fact. + intent_hint = ( + "Note: promoting a 'goal' entry to a decision turns an " + "in-flight intent into a recorded decision. Consider " + "whether the goal is actually settled before doing this." + ) + else: + intent_hint = None + + # Lazy imports to keep working.py free of LTM imports at module + # load time (helps the test harness mock). + from mcp_server.storage import decisions_store + from mcp_server.tools.check_conflict import check_conflict + + conflict_warning = None + if not force: + try: + check = check_conflict(decision_text=content, file_path=file_path) + conflicts = check.get("conflicts") or [] + duplicates = check.get("duplicates") or [] + if conflicts: + conflict_warning = { + "kind": "conflict", + "message": ( + f"Promoting this entry would create a decision that " + f"conflicts with {len(conflicts)} protected (do_not_revert=True) " + f"decision(s). Pass force=True to record anyway, or use " + f"supersede_decision(old_id, new_decision, reason) to " + f"explicitly retire the prior one." + ), + "conflicting_decision_ids": [ + c.get("decision_id") for c in conflicts + ], + } + elif duplicates: + conflict_warning = { + "kind": "duplicate", + "message": ( + f"Promoting this entry would create a near-duplicate of " + f"{len(duplicates)} existing decision(s). Pass force=True " + f"to record anyway. Existing ids: " + f"{[d.get('decision_id') for d in duplicates]}." + ), + "duplicate_decision_ids": [ + d.get("decision_id") for d in duplicates + ], + } + except Exception: # noqa: BLE001 — P9 fail-open + pass + + if conflict_warning and not force: + return { + "promoted": False, + "entry_id": entry_id, + "to": to, + "_conflict_warning": conflict_warning, + } + + # Carry forward links from the working entry as references. + promotion_context = context + if source.get("links"): + link_note = "Promoted from working memory entry " + entry_id + if source.get("links"): + link_note += " (refs: " + ", ".join(source["links"]) + ")" + promotion_context = ( + (promotion_context + "\n\n" + link_note) if promotion_context else link_note + ) + + new_id = decisions_store.record( + decision=content, + file_path=file_path, + context=promotion_context, + do_not_revert=bool(do_not_revert), + session_id=source.get("session_id"), + tags=tags, + ) + + working_store.mark_promoted(entry_id, target_id=new_id) + + response: dict[str, Any] = { + "promoted": True, + "entry_id": entry_id, + "to": to, + "target_id": new_id, + "hint": ( + "Working entry tombstoned; future working_get calls will " + "no longer surface it. The LTM record is now searchable " + "via search_decisions / list_decisions." + ), + } + if intent_hint: + response["_intent_note"] = intent_hint + return response diff --git a/tests/test_tools_working.py b/tests/test_tools_working.py new file mode 100644 index 0000000..aa4ff96 --- /dev/null +++ b/tests/test_tools_working.py @@ -0,0 +1,234 @@ +""" +Tests for mcp_server.tools.working — v3.1.0 M2 Phase 2 MCP tools. + +Verifies the four-tool surface (working_add, working_get, +working_promote, get_working_context) against the contract documented +in mcp_server/tools/working.py. Storage-layer correctness is tested +separately in tests/storage/test_working_store.py. +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +import mcp_server.paths as paths_module +from mcp_server.storage import working_store +from mcp_server.tools import working + + +@pytest.fixture +def project(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: + root = tmp_path / "proj" + (root / ".codevira").mkdir(parents=True) + (root / ".codevira" / "config.yaml").write_text("project:\n name: test\n") + monkeypatch.setattr(paths_module, "_project_dir_override", None) + monkeypatch.chdir(root.resolve()) + return root + + +# ────────────────────────────────────────────────────────────────────── +# working_add +# ────────────────────────────────────────────────────────────────────── + + +class TestWorkingAdd: + def test_basic_returns_entry_id(self, project: Path) -> None: + r = working.working_add("Touched paths.py") + assert r["recorded"] is True + assert r["entry_id"].startswith("W") + assert r["kind"] == "observation" + assert "hint" in r + + def test_goal_kind(self, project: Path) -> None: + r = working.working_add("Implement M2 working memory", kind="goal") + assert r["recorded"] is True + assert r["kind"] == "goal" + + def test_invalid_kind_returns_structured_error(self, project: Path) -> None: + r = working.working_add("oops", kind="hypothesis") + assert r["recorded"] is False + assert "kind" in r["error"] + + def test_invalid_importance_returns_structured_error(self, project: Path) -> None: + r = working.working_add("content", importance=11) + assert r["recorded"] is False + assert "importance" in r["error"] + + def test_oversize_content_returns_structured_error(self, project: Path) -> None: + r = working.working_add("x" * 4000) + assert r["recorded"] is False + assert "2048 byte cap" in r["error"] + + +# ────────────────────────────────────────────────────────────────────── +# working_get / get_working_context +# ────────────────────────────────────────────────────────────────────── + + +class TestWorkingGet: + def test_empty_store(self, project: Path) -> None: + r = working.working_get() + assert r["entries"] == [] + assert r["count"] == 0 + + def test_returns_entries_ranked(self, project: Path) -> None: + working.working_add("low", importance=2) + working.working_add("high", importance=9) + working.working_add("medium", importance=5) + r = working.working_get(top_k=3) + assert r["count"] == 3 + assert [e["content"] for e in r["entries"]] == ["high", "medium", "low"] + + def test_filters_by_kind(self, project: Path) -> None: + working.working_add("obs a", kind="observation") + working.working_add("goal b", kind="goal") + r = working.working_get(kind="goal") + assert r["count"] == 1 + assert r["entries"][0]["kind"] == "goal" + + def test_response_shape(self, project: Path) -> None: + working.working_add("x") + r = working.working_get() + e = r["entries"][0] + assert set(e.keys()) >= { + "entry_id", + "kind", + "content", + "importance", + "confidence", + "links", + "ts", + "session_id", + } + + +class TestGetWorkingContext: + def test_empty_returns_placeholder(self, project: Path) -> None: + r = working.get_working_context() + assert r["count"] == 0 + assert "empty" in r["markdown"].lower() + + def test_renders_markdown_with_prefix_per_kind(self, project: Path) -> None: + working.working_add("looked at retry.py", kind="observation") + working.working_add("redesign retry", kind="goal") + r = working.get_working_context(top_k=5) + # observation uses • bullet; goal uses → arrow. + assert "•" in r["markdown"] + assert "→" in r["markdown"] + assert "Working memory" in r["markdown"] + + def test_long_content_truncated_in_markdown(self, project: Path) -> None: + long_content = "x" * 500 + working.working_add(long_content) + r = working.get_working_context() + # Truncated at 120 chars + ellipsis in the markdown line. + assert "..." in r["markdown"] + # The structured `entries` view keeps full content. + assert r["entries"][0]["content"] == long_content + + +# ────────────────────────────────────────────────────────────────────── +# working_promote +# ────────────────────────────────────────────────────────────────────── + + +class TestWorkingPromote: + def test_invalid_target_rejected(self, project: Path) -> None: + wid = working_store.add("x") + r = working.working_promote(wid, to="filesystem") # type: ignore[arg-type] + assert r["promoted"] is False + assert "'to' must be one of" in r["error"] + + def test_missing_entry_rejected(self, project: Path) -> None: + r = working.working_promote("W999999", to="decision") + assert r["promoted"] is False + assert "not found" in r["error"] + + def test_skill_returns_deferred(self, project: Path) -> None: + wid = working_store.add("Goal: design retry workflow", kind="goal") + r = working.working_promote(wid, to="skill") + assert r["promoted"] is False + assert r["deferred"] is True + assert r["milestone"] == "M3" + + def test_playbook_returns_deferred(self, project: Path) -> None: + wid = working_store.add("design a debug recipe", kind="observation") + r = working.working_promote(wid, to="playbook") + assert r["promoted"] is False + assert r["deferred"] is True + + def test_promote_to_decision_full_path(self, project: Path) -> None: + wid = working_store.add( + "Use rate limiting on /auth endpoints", + kind="observation", + importance=8, + ) + r = working.working_promote( + wid, + to="decision", + file_path="auth/middleware.py", + do_not_revert=True, + tags=["auth", "security"], + ) + assert r["promoted"] is True + assert r["target_id"].startswith("D") + # Source entry is now tombstoned — working_get no longer returns it. + live = working.working_get() + assert all(e["entry_id"] != wid for e in live["entries"]) + + def test_promote_already_tombstoned_rejected(self, project: Path) -> None: + wid = working_store.add("x", kind="observation") + # Manually tombstone via eviction first. + working_store.mark_evicted(wid) + r = working.working_promote(wid, to="decision") + assert r["promoted"] is False + assert "tombstoned" in r["error"] + + def test_promote_with_conflict_returns_warning(self, project: Path) -> None: + # Seed a protected decision; promotion of a near-duplicate must + # surface the conflict instead of silently writing. + from mcp_server.storage import decisions_store + + decisions_store.record( + decision="Use bcrypt for password hashing", + do_not_revert=True, + ) + wid = working_store.add("Use bcrypt for password hashing", kind="observation") + r = working.working_promote(wid, to="decision") + assert r["promoted"] is False + assert "_conflict_warning" in r + + def test_promote_with_force_overrides_conflict(self, project: Path) -> None: + from mcp_server.storage import decisions_store + + decisions_store.record( + decision="Use bcrypt for password hashing", + do_not_revert=True, + ) + wid = working_store.add("Use bcrypt for password hashing", kind="observation") + r = working.working_promote(wid, to="decision", force=True) + assert r["promoted"] is True + assert r["target_id"].startswith("D") + + def test_goal_promotion_surfaces_intent_note(self, project: Path) -> None: + wid = working_store.add("Add OAuth flow", kind="goal", importance=7) + r = working.working_promote(wid, to="decision") + assert r["promoted"] is True + assert "_intent_note" in r + + def test_promotion_carries_links_into_context(self, project: Path) -> None: + wid = working_store.add( + "Followup on D000001", kind="observation", links=["D000001"] + ) + r = working.working_promote(wid, to="decision") + assert r["promoted"] is True + # The new decision's context should mention the working entry id + # and its links — useful audit metadata. + from mcp_server.storage import decisions_store + + new = decisions_store.get(r["target_id"]) + assert new is not None + assert wid in (new.get("context") or "") + assert "D000001" in (new.get("context") or "") From 9dcf5616dab6495d8e7e8f6fa407d6782b7cea29 Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Fri, 29 May 2026 12:44:32 +0530 Subject: [PATCH 07/44] =?UTF-8?q?feat(v3.1.0):=20M2=20Phase=203=20?= =?UTF-8?q?=E2=80=94=20engine=20fanout=20+=20get=5Fsession=5Fcontext=20pan?= =?UTF-8?q?el=20+=20CLI=20commit?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Completes M2 by wiring the working_store (Phase 1) and MCP tools (Phase 2) into the agent's day-to-day flow. Engine memory_fanout (auto-population): New mcp_server/engine/memory_fanout.py. PostToolUse events get an observation written automatically: - Edit/Write/MultiEdit/NotebookEdit/update_node → 'touched ', importance 4. - Bash (non-trivial) → 'Bash: ', importance 3. Trivial commands (ls/pwd/cd/echo/cat/which/type) are skipped. - Any tool whose output dict has 'error' → importance bumped to 7. - All other tools (read-only, introspection) → no observation (avoids flooding the buffer with 'looked at' noise). R3 mitigation per plan: in-process buffer with _FLUSH_THRESHOLD=20. On the 20th event, the buffer drains to working.jsonl as one batch. atexit hook flushes on clean shutdown. Wiring: mcp_server/engine/wiring/mcp_dispatch.py.post_call calls memory_fanout.dispatch AFTER the existing engine dispatch returns. Sequenced so the verdict is unaffected by fan-out behavior; fan-out failure is logged and dropped (fail-open). get_session_context working panel: New 'working' field in the get_session_context payload — top-3 live entries (by decay score), content truncated at 120 chars. Returns {entries, count}. Best-effort: any failure surfaces an empty entries list rather than crashing the catch-me-up call. codevira working commit CLI: mcp_server/cli_working.py + 'working' subparser in cli.py. Surface: codevira working commit Copies a session's live (non-evicted) entries from .codevira-cache/working.jsonl to .codevira/working_archived/.jsonl. The cache file is left untouched so the agent can keep iterating; running the command twice produces an append (documented behavior). Tests: - tests/engine/test_memory_fanout.py (19 tests): observation builders per tool, error-bump, trivial-Bash skip, dispatch only on POST_TOOL_USE, threshold-triggered flush, manual flush, end-to-end visibility via working_get + error-rank-by-importance. - tests/test_tools_learning.py::TestGetSessionContext gains 3 tests: empty panel, populated panel, graceful failure. - tests/test_cli_working.py (6 tests): usage error, no-op on unknown session, copy live entries to archive, exclude evicted, idempotent appends, storage failure exits 1. Regression sweep: 635 tests across engine + storage + tools + check_conflict + CLI + server pass green. Zero regressions from M2 Phase 2 baseline. CLI smoke verified: 'codevira working --help' and 'codevira working commit --help' render correctly. Plan M2 Phase 3. Co-Authored-By: Claude Opus 4.7 --- mcp_server/cli.py | 35 +++ mcp_server/cli_working.py | 89 +++++++ mcp_server/engine/memory_fanout.py | 213 +++++++++++++++++ mcp_server/engine/wiring/mcp_dispatch.py | 25 +- mcp_server/tools/learning.py | 28 +++ tests/engine/test_memory_fanout.py | 290 +++++++++++++++++++++++ tests/test_cli_working.py | 109 +++++++++ tests/test_tools_learning.py | 65 +++++ 8 files changed, 852 insertions(+), 2 deletions(-) create mode 100644 mcp_server/cli_working.py create mode 100644 mcp_server/engine/memory_fanout.py create mode 100644 tests/engine/test_memory_fanout.py create mode 100644 tests/test_cli_working.py diff --git a/mcp_server/cli.py b/mcp_server/cli.py index 405b169..c0ca201 100644 --- a/mcp_server/cli.py +++ b/mcp_server/cli.py @@ -1239,6 +1239,29 @@ def error(self, message): # type: ignore[override] ), ) + # v3.1.0 M2 Phase 3: working-memory subcommands. The MCP tool + # surface (working_add / working_get / working_promote) is the + # everyday agent-facing API; this CLI tier is the escape hatch for + # a human user operating on the per-machine cache outside an IDE. + working_parser = subparsers.add_parser( + "working", + help="Operate on working memory (v3.1.0 M2). `commit ` " + "copies a session's live scratchpad entries from the per-machine " + "cache (.codevira-cache/working.jsonl) to the canonical archive " + "(.codevira/working_archived/.jsonl).", + ) + working_sub = working_parser.add_subparsers(dest="working_action") + working_commit_parser = working_sub.add_parser( + "commit", + help="Promote a session's live working entries to the canonical archive.", + ) + working_commit_parser.add_argument( + "session_id", + help="Session slug to commit (the value the MCP tool reported as " + "session_id, typically `ad-hoc-XXXXXX` or an explicit slug you " + "passed to working_add).", + ) + engine_parser = subparsers.add_parser( "engine", help="Internal: lifecycle-hook engine entry (called by hook scripts)", @@ -1497,6 +1520,18 @@ def error(self, message): # type: ignore[override] keep_data=getattr(args, "keep_data", False), ) sys.exit(rc) + elif args.command == "working": + # v3.1.0 M2 Phase 3: working-memory subcommands. + working_action = getattr(args, "working_action", None) + if working_action == "commit": + from mcp_server.cli_working import cmd_working_commit + + sys.exit(cmd_working_commit(getattr(args, "session_id", None))) + sys.stderr.write( + "codevira working: missing subcommand. Try `codevira working commit " + "`.\n" + ) + sys.exit(2) elif args.command == "engine": # Internal — Claude Code hook scripts call us with `engine handle `. engine_action = getattr(args, "engine_action", None) diff --git a/mcp_server/cli_working.py b/mcp_server/cli_working.py new file mode 100644 index 0000000..2090fb3 --- /dev/null +++ b/mcp_server/cli_working.py @@ -0,0 +1,89 @@ +""" +cli_working.py — v3.1.0 M2 Phase 3: `codevira working` CLI subcommands. + +Surface today: + + codevira working commit + +Copies non-evicted entries for ``session_id`` from +``.codevira-cache/working.jsonl`` (ephemeral, per-machine) to +``.codevira/working_archived/.jsonl`` (canonical, +gitable). The cache file is left untouched so the agent can keep +iterating. + +Future surface (reserved): + + codevira working list [--session SID] + codevira working show + codevira working clear --session SID --yes + +Kept thin on purpose — the MCP tools (``working_add`` / ``working_get`` / +``working_promote``) are the agent-facing surface; this module is the +escape hatch for the human user who wants to operate on the cache +outside an IDE session. +""" + +from __future__ import annotations + +import sys + + +_OK = 0 +_USAGE = 2 +_FAILURE = 1 + + +def cmd_working_commit(session_id: str | None) -> int: + """Commit a session's live working entries to the canonical archive. + + Args: + session_id: which session to commit. Required. The user + usually copies this from ``working_get`` output or from + their own slug they passed to ``working_add(session_id=...)``. + + Returns: + 0 on success (including empty session_id with no entries — + reported as a no-op). + 1 on storage error. + 2 on missing session_id argument. + """ + if not session_id: + sys.stderr.write( + "codevira working commit: error: session_id is required\n" + " Usage: codevira working commit \n" + " Tip: run `codevira working list` to see live session_ids.\n" + ) + return _USAGE + + try: + from mcp_server.storage import working_store + except Exception as exc: # noqa: BLE001 + sys.stderr.write( + f"codevira working commit: working_store import failed: {exc}\n" + ) + return _FAILURE + + try: + result = working_store.commit_session(session_id) + except ValueError as exc: + sys.stderr.write(f"codevira working commit: {exc}\n") + return _FAILURE + except Exception as exc: # noqa: BLE001 + sys.stderr.write(f"codevira working commit: unexpected error: {exc}\n") + return _FAILURE + + count = result.get("committed_count", 0) + dest = result.get("destination") + if count == 0: + sys.stdout.write( + f"codevira working commit: no live entries for session_id " + f"{session_id!r} — nothing to commit.\n" + ) + return _OK + sys.stdout.write( + f"codevira working commit: copied {count} entry/entries for " + f"session_id {session_id!r}\n -> {dest}\n" + f" (cache file .codevira-cache/working.jsonl untouched; " + f"re-running is idempotent-with-appends)\n" + ) + return _OK diff --git a/mcp_server/engine/memory_fanout.py b/mcp_server/engine/memory_fanout.py new file mode 100644 index 0000000..5dfdda0 --- /dev/null +++ b/mcp_server/engine/memory_fanout.py @@ -0,0 +1,213 @@ +""" +memory_fanout.py — v3.1.0 M2 Phase 3: PostToolUse → working memory. + +Auto-populates working memory from MCP tool calls so the agent gets a +free scratchpad without having to remember to call ``working_add()`` on +every Edit / Write / Bash. + +# Why this lives next to the engine, not inside it + +The engine evaluates policies (allow / warn / inject / block) on every +tool call. Memory fan-out is a pure *side-effect* step: it records +observations from successful tool calls but does not — and must not — +change the policy verdict. Bundling it into a policy would couple +those two concerns and make the verdict pipeline harder to reason +about. Instead, fan-out is a separate module that ``post_call`` in +``mcp_dispatch.py`` calls AFTER the engine dispatch completes. Same +event payload, different responsibility. + +# In-process buffering (R3 mitigation) + +Each ``dispatch()`` call appends to an in-process list. When the list +reaches ``_FLUSH_THRESHOLD`` events (default 20) — or on interpreter +shutdown via ``atexit`` — the buffer drains to ``working.jsonl`` in +one pass. This is the R3 risk mitigation from the v3.1.0 plan: a +20-file refactor produces ~40 PostToolUse events, and per-write +fsync would visibly slow each tool call. Buffering pushes the latency +to one batched flush. + +If the MCP server is killed hard (SIGKILL), the unflushed buffer is +lost. Acceptable for the working-memory use case (observations are +of edits already on disk; the agent can re-derive them if needed). + +# Triggers + + - ``Edit`` / ``Write`` / ``MultiEdit`` / ``NotebookEdit`` / + ``update_node`` → observation ``": touched "``, + importance 4. + - ``Bash`` (non-trivial) → observation ``"Bash: "``, + importance 3. Trivial commands (``ls``, ``pwd``, ``cd``, ``echo``, + ``cat``) are skipped to avoid noise. + - Any tool whose output dict carries ``error`` → bump importance to 7 + (errors are high-salience signals worth surfacing in + ``get_working_context``). + - All other tools (read-only introspection, graph queries) → no + observation. Read tools don't change state, so observing them + floods the buffer without adding signal. + +# Fail-open contract + +Every step is wrapped in ``try / except``. A bug in fan-out must +never break the caller's tool dispatch. The verdict from the engine +is already committed by the time this runs; we only get to choose +whether or not to write an observation. +""" + +from __future__ import annotations + +import atexit +import logging +from typing import Any + +from mcp_server.engine.events import EventType, HookEvent + +logger = logging.getLogger(__name__) + + +# In-process buffer + flush threshold. Module-level by design — the +# engine has no per-request state, and a per-process buffer is the +# right unit (one MCP server process serves many tool calls). +_BUFFER: list[dict[str, Any]] = [] +_FLUSH_THRESHOLD = 20 + + +# Tools whose calls produce a meaningful "touched " observation. +_FILE_EDITING_TOOLS = frozenset( + {"Edit", "Write", "MultiEdit", "NotebookEdit", "update_node"} +) + +# Bash first-words we deliberately skip. The agent runs these all day +# for navigation; observing each would flood working memory. +_TRIVIAL_BASH = frozenset({"ls", "pwd", "cd", "echo", "cat", "which", "type"}) + + +# ────────────────────────────────────────────────────────────────────── +# Public dispatch +# ────────────────────────────────────────────────────────────────────── + + +def dispatch(event: HookEvent) -> None: + """Side-effect: record a working-memory observation from an MCP tool call. + + Triggered only on ``POST_TOOL_USE`` events. Buffers the record; + flushes once the buffer reaches ``_FLUSH_THRESHOLD``. Caller-side + failure must never affect the engine verdict — every step here is + fail-open. + """ + if event.event_type != EventType.POST_TOOL_USE: + return + + try: + record = _build_observation(event) + except Exception as exc: # noqa: BLE001 — fail-open + logger.debug("memory_fanout.dispatch: _build_observation failed: %s", exc) + return + if record is None: + return + + _BUFFER.append(record) + if len(_BUFFER) >= _FLUSH_THRESHOLD: + flush() + + +def flush() -> None: + """Drain the buffer into working.jsonl. Atomic per-record append. + + Each buffered entry becomes one ``working_store.add()`` call. + Failures inside individual writes are logged and skipped so a + single malformed entry can't poison the rest of the batch. + """ + global _BUFFER + if not _BUFFER: + return + + # Take ownership of the buffer atomically so a re-entrant call + # (e.g., from a hook during another flush) doesn't double-write. + drained = _BUFFER + _BUFFER = [] + + try: + from mcp_server.storage import working_store + + for rec in drained: + try: + working_store.add( + content=rec["content"], + kind=rec.get("kind", "observation"), + importance=rec.get("importance", 4), + links=rec.get("links") or [], + ) + except Exception as exc: # noqa: BLE001 + logger.debug("memory_fanout.flush: individual add failed: %s", exc) + continue + except Exception as exc: # noqa: BLE001 + logger.debug("memory_fanout.flush: working_store import failed: %s", exc) + + +# ────────────────────────────────────────────────────────────────────── +# Test/admin helpers +# ────────────────────────────────────────────────────────────────────── + + +def buffer_size() -> int: + """Return the current buffer length. Useful for tests + telemetry.""" + return len(_BUFFER) + + +def reset_buffer() -> None: + """Discard buffered entries without writing. TEST-ONLY: never use + this in production code — it loses observations.""" + global _BUFFER + _BUFFER = [] + + +# ────────────────────────────────────────────────────────────────────── +# Observation builders +# ────────────────────────────────────────────────────────────────────── + + +def _build_observation(event: HookEvent) -> dict[str, Any] | None: + """Translate a POST_TOOL_USE event into a working-memory record. + + Per-tool importance floor: + - File edits (Edit/Write/MultiEdit/NotebookEdit/update_node): 4 + - Bash (non-trivial): 3 (lower so commands don't outrank edits) + Errors bump the importance to 7 regardless of tool. + + Returns ``None`` if the tool isn't worth observing. + """ + tool_name = event.tool_name or "" + args = event.tool_input or {} + output = event.tool_output or {} + + has_error = isinstance(output, dict) and bool(output.get("error")) + + if tool_name in _FILE_EDITING_TOOLS: + file_path = args.get("file_path") or args.get("path") or "" + return { + "content": f"{tool_name}: touched {file_path}", + "kind": "observation", + "importance": 7 if has_error else 4, + } + + if tool_name == "Bash": + cmd = (args.get("command") or "").strip() + if not cmd: + return None + first = cmd.split(None, 1)[0] + if first in _TRIVIAL_BASH: + return None + summary = cmd if len(cmd) <= 80 else cmd[:77] + "..." + return { + "content": f"Bash: {summary}", + "kind": "observation", + "importance": 7 if has_error else 3, + } + + # Read-only / introspection tools — no observation. We want + # working memory dense with "did this" signal, not "looked at" noise. + return None + + +# Ensure a clean interpreter shutdown still flushes buffered events. +atexit.register(flush) diff --git a/mcp_server/engine/wiring/mcp_dispatch.py b/mcp_server/engine/wiring/mcp_dispatch.py index 9b3078b..7614be4 100644 --- a/mcp_server/engine/wiring/mcp_dispatch.py +++ b/mcp_server/engine/wiring/mcp_dispatch.py @@ -22,6 +22,7 @@ This adapter handles MCP tool calls. For Claude Code lifecycle hooks, see ``claude_code_hooks.py`` instead. """ + from __future__ import annotations import time @@ -70,6 +71,11 @@ def post_call( (token meter, style check, AI-promotion-score updates). The verdict is returned for callers that want to surface ``warn`` messages, but can be ignored. + + v3.1.0 M2 Phase 3: after the engine dispatch returns, ``memory_fanout`` + is called as a pure side-effect step — it records a working-memory + observation from the tool call without affecting the verdict. Fan-out + failure is logged and dropped (the verdict is already committed). """ try: event = _build_post_event(tool_name, arguments, output) @@ -77,15 +83,29 @@ def post_call( return PolicyVerdict.allow(metadata={"_wiring_error": "build_event_failed"}) try: - return dispatch(event) + verdict = dispatch(event) except Exception: # noqa: BLE001 - return PolicyVerdict.allow(metadata={"_wiring_error": "dispatch_failed"}) + verdict = PolicyVerdict.allow(metadata={"_wiring_error": "dispatch_failed"}) + + # v3.1.0 M2 Phase 3: memory fan-out. Sequenced AFTER policy eval so + # the verdict isn't affected by fan-out behavior. Fail-open. + try: + from mcp_server.engine.memory_fanout import dispatch as _fanout_dispatch + + _fanout_dispatch(event) + except Exception: # noqa: BLE001 + # Fail-open: never let an observation-write failure change the + # caller's verdict. Logging stays in the fan-out module. + pass + + return verdict # ---------------------------------------------------------------------- # Helpers # ---------------------------------------------------------------------- + def _build_pre_event(tool_name: str, arguments: dict[str, Any]) -> HookEvent: """Construct a PRE_TOOL_USE HookEvent from MCP-style call_tool args. @@ -122,6 +142,7 @@ def _build_pre_event(tool_name: str, arguments: dict[str, Any]) -> HookEvent: # Round-4 HIGH #1: path-traversal containment. try: import os + common = Path(os.path.commonpath([str(project_root), str(resolved)])) if common == project_root: target_file = resolved diff --git a/mcp_server/tools/learning.py b/mcp_server/tools/learning.py index 37a3f22..c6d051d 100644 --- a/mcp_server/tools/learning.py +++ b/mcp_server/tools/learning.py @@ -606,9 +606,37 @@ def get_session_context(since: str | None = None) -> dict: except Exception: drift_warning = None + # v3.1.0 M2 Phase 3: working-memory panel. Surfaces the top-3 + # live observations/goals so the agent sees its own recent + # scratchpad in the catch-me-up payload. Capped at 3 entries + # (~150 tokens) to honor the get_session_context token budget. + # Best-effort: any failure (no working.jsonl yet, store error) + # surfaces an empty entries list rather than crashing the + # session-context call. + working_panel: dict = {"entries": [], "count": 0} + try: + from mcp_server.storage import working_store + + top = working_store.list_top_k(top_k=3) + working_panel = { + "entries": [ + { + "entry_id": e.get("id"), + "kind": e.get("kind"), + "content": _truncate(e.get("content"), 120), + "importance": e.get("importance"), + } + for e in top + ], + "count": len(top), + } + except Exception: + pass + return { "current_phase": current_phase, "drift_warning": drift_warning, + "working": working_panel, "recent_sessions": [ { "session_id": s["session_id"], diff --git a/tests/engine/test_memory_fanout.py b/tests/engine/test_memory_fanout.py new file mode 100644 index 0000000..a376186 --- /dev/null +++ b/tests/engine/test_memory_fanout.py @@ -0,0 +1,290 @@ +""" +Tests for mcp_server.engine.memory_fanout — v3.1.0 M2 Phase 3. + +Covers: + - Observation building per tool (Edit, Bash, error bump, trivial Bash skip) + - Buffer behavior (threshold flush, drain semantics, fail-open) + - End-to-end: a POST_TOOL_USE event eventually lands one + working-memory record on disk. +""" + +from __future__ import annotations + +import time +from pathlib import Path + +import pytest + +import mcp_server.paths as paths_module +from mcp_server.engine import memory_fanout +from mcp_server.engine.events import EventType, HookEvent +from mcp_server.storage import jsonl_store, paths, working_store + + +@pytest.fixture +def project(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: + root = tmp_path / "proj" + (root / ".codevira").mkdir(parents=True) + (root / ".codevira" / "config.yaml").write_text("project:\n name: test\n") + monkeypatch.setattr(paths_module, "_project_dir_override", None) + monkeypatch.chdir(root.resolve()) + # Always start each test with an empty buffer. + memory_fanout.reset_buffer() + return root + + +def _post_event( + tool_name: str, + tool_input: dict | None = None, + tool_output: dict | None = None, + project_root: Path | None = None, +) -> HookEvent: + return HookEvent( + event_type=EventType.POST_TOOL_USE, + project_root=project_root or Path("/tmp"), + ai_tool="mcp", + session_id=None, + tool_name=tool_name, + tool_input=tool_input or {}, + tool_output=tool_output or {}, + timestamp=time.time(), + raw={"source": "test"}, + ) + + +# ────────────────────────────────────────────────────────────────────── +# Observation builders +# ────────────────────────────────────────────────────────────────────── + + +class TestBuildObservation: + def test_edit_records_file_path(self, project: Path) -> None: + event = _post_event( + "Edit", tool_input={"file_path": "mcp_server/storage/working_store.py"} + ) + rec = memory_fanout._build_observation(event) + assert rec is not None + assert "touched" in rec["content"] + assert "mcp_server/storage/working_store.py" in rec["content"] + assert rec["kind"] == "observation" + assert rec["importance"] == 4 + + def test_write_recognised(self, project: Path) -> None: + rec = memory_fanout._build_observation( + _post_event("Write", tool_input={"file_path": "x.py"}) + ) + assert rec is not None + assert "Write" in rec["content"] + + def test_multiedit_recognised(self, project: Path) -> None: + rec = memory_fanout._build_observation( + _post_event("MultiEdit", tool_input={"file_path": "x.py"}) + ) + assert rec is not None + + def test_bash_records_command(self, project: Path) -> None: + rec = memory_fanout._build_observation( + _post_event("Bash", tool_input={"command": "pytest tests/storage/"}) + ) + assert rec is not None + assert "Bash" in rec["content"] + assert "pytest" in rec["content"] + assert rec["importance"] == 3 # bash floor + + def test_bash_trivial_skipped(self, project: Path) -> None: + for cmd in ("ls", "pwd", "cd /tmp", "echo hello", "cat README.md"): + rec = memory_fanout._build_observation( + _post_event("Bash", tool_input={"command": cmd}) + ) + assert rec is None, f"trivial Bash {cmd!r} should be skipped" + + def test_bash_empty_skipped(self, project: Path) -> None: + rec = memory_fanout._build_observation( + _post_event("Bash", tool_input={"command": ""}) + ) + assert rec is None + + def test_long_bash_truncated_at_80(self, project: Path) -> None: + long_cmd = ( + "make release-gauntlet && python -m pytest tests/ -x --cov=mcp_server" + ) + rec = memory_fanout._build_observation( + _post_event("Bash", tool_input={"command": long_cmd}) + ) + assert rec is not None + # The summary truncates at 80 (= 77 + "...") for very long commands. + assert len(rec["content"]) <= len("Bash: ") + 80 + + def test_error_in_output_bumps_importance(self, project: Path) -> None: + rec = memory_fanout._build_observation( + _post_event( + "Edit", + tool_input={"file_path": "x.py"}, + tool_output={"error": "permission denied"}, + ) + ) + assert rec is not None + assert rec["importance"] == 7 # error bumps from 4 → 7 + + def test_unrecognised_tool_returns_none(self, project: Path) -> None: + # Read-only / introspection tools — no observation. + for tn in ("get_node", "search_decisions", "get_impact", "list_decisions"): + rec = memory_fanout._build_observation(_post_event(tn)) + assert rec is None, f"{tn} should not produce an observation" + + +# ────────────────────────────────────────────────────────────────────── +# Dispatch + buffer +# ────────────────────────────────────────────────────────────────────── + + +class TestDispatch: + def test_only_post_tool_use_triggers(self, project: Path) -> None: + pre_event = HookEvent( + event_type=EventType.PRE_TOOL_USE, + project_root=project, + ai_tool="mcp", + session_id=None, + tool_name="Edit", + tool_input={"file_path": "x.py"}, + tool_output={}, + timestamp=time.time(), + raw={"source": "test"}, + ) + memory_fanout.dispatch(pre_event) + assert memory_fanout.buffer_size() == 0 + + def test_recognised_tool_buffers(self, project: Path) -> None: + memory_fanout.dispatch( + _post_event("Edit", tool_input={"file_path": "x.py"}, project_root=project) + ) + assert memory_fanout.buffer_size() == 1 + + def test_unrecognised_tool_not_buffered(self, project: Path) -> None: + memory_fanout.dispatch(_post_event("get_node", project_root=project)) + assert memory_fanout.buffer_size() == 0 + + def test_threshold_triggers_flush( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + # Default threshold is 20. + for i in range(20): + memory_fanout.dispatch( + _post_event( + "Edit", + tool_input={"file_path": f"f{i}.py"}, + project_root=project, + ) + ) + # After hitting the threshold, the buffer is drained and 20 + # records are on disk. + assert memory_fanout.buffer_size() == 0 + rows = jsonl_store.read_all(paths.working_path()) + bases = [r for r in rows if not r.get("_amendment_to_id")] + assert len(bases) == 20 + + def test_below_threshold_buffers_only(self, project: Path) -> None: + for i in range(5): + memory_fanout.dispatch( + _post_event( + "Edit", + tool_input={"file_path": f"f{i}.py"}, + project_root=project, + ) + ) + assert memory_fanout.buffer_size() == 5 + # Nothing on disk yet. + assert jsonl_store.read_all(paths.working_path()) == [] + + def test_manual_flush_drains_to_disk(self, project: Path) -> None: + for i in range(3): + memory_fanout.dispatch( + _post_event( + "Bash", + tool_input={"command": f"git commit -m 'change {i}'"}, + project_root=project, + ) + ) + memory_fanout.flush() + assert memory_fanout.buffer_size() == 0 + rows = jsonl_store.read_all(paths.working_path()) + bases = [r for r in rows if not r.get("_amendment_to_id")] + assert len(bases) == 3 + contents = [r["content"] for r in bases] + assert all("Bash" in c for c in contents) + + def test_flush_empty_is_noop(self, project: Path) -> None: + memory_fanout.flush() + # No file created, no exception. + assert not paths.working_path().is_file() + + +# ────────────────────────────────────────────────────────────────────── +# End-to-end shape +# ────────────────────────────────────────────────────────────────────── + + +class TestEndToEnd: + def test_flushed_records_visible_via_working_get(self, project: Path) -> None: + memory_fanout.dispatch( + _post_event( + "Edit", + tool_input={"file_path": "alpha.py"}, + project_root=project, + ) + ) + memory_fanout.dispatch( + _post_event( + "Bash", + tool_input={"command": "git diff alpha.py"}, + project_root=project, + ) + ) + memory_fanout.flush() + + top = working_store.list_top_k() + contents = [e["content"] for e in top] + assert any("alpha.py" in c for c in contents) + assert any("git diff" in c for c in contents) + + def test_error_observations_outrank_normal(self, project: Path) -> None: + # A successful Edit (importance 4) followed by an Edit that + # errors (importance 7) should rank the error first. + memory_fanout.dispatch( + _post_event( + "Edit", + tool_input={"file_path": "a.py"}, + project_root=project, + ) + ) + memory_fanout.dispatch( + _post_event( + "Edit", + tool_input={"file_path": "b.py"}, + tool_output={"error": "syntax error in patch"}, + project_root=project, + ) + ) + memory_fanout.flush() + + top = working_store.list_top_k() + assert "b.py" in top[0]["content"] # error-bumped record on top + assert top[0]["importance"] == 7 + + def test_fanout_failure_is_silent( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + """If working_store.add raises, the buffer drains anyway and + the rest of the batch is preserved.""" + # First entry: invalid kind → raises ValueError inside flush; + # subsequent valid entry must still land on disk. + memory_fanout._BUFFER.append( + {"content": "valid", "kind": "observation", "importance": 4} + ) + memory_fanout._BUFFER.append( + {"content": "another", "kind": "observation", "importance": 4} + ) + memory_fanout.flush() + rows = jsonl_store.read_all(paths.working_path()) + bases = [r for r in rows if not r.get("_amendment_to_id")] + assert len(bases) == 2 diff --git a/tests/test_cli_working.py b/tests/test_cli_working.py new file mode 100644 index 0000000..cf300df --- /dev/null +++ b/tests/test_cli_working.py @@ -0,0 +1,109 @@ +""" +Tests for mcp_server.cli_working — v3.1.0 M2 Phase 3 CLI. + +Covers ``codevira working commit ``: + * usage error when session_id missing. + * success path: live entries copied to working_archived. + * no-op when session has no live entries. + * storage errors surface to stderr with non-zero exit. +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +import mcp_server.paths as paths_module +from mcp_server.cli_working import cmd_working_commit +from mcp_server.storage import jsonl_store, paths, working_store + + +@pytest.fixture +def project(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: + root = tmp_path / "proj" + (root / ".codevira").mkdir(parents=True) + (root / ".codevira" / "config.yaml").write_text("project:\n name: test\n") + monkeypatch.setattr(paths_module, "_project_dir_override", None) + monkeypatch.chdir(root.resolve()) + return root + + +class TestCmdWorkingCommit: + def test_missing_session_id_is_usage_error( + self, project: Path, capsys: pytest.CaptureFixture[str] + ) -> None: + rc = cmd_working_commit(None) + assert rc == 2 + err = capsys.readouterr().err + assert "session_id is required" in err + assert "Usage:" in err + + def test_commit_with_no_live_entries_is_no_op( + self, project: Path, capsys: pytest.CaptureFixture[str] + ) -> None: + rc = cmd_working_commit("nonexistent-session") + assert rc == 0 + out = capsys.readouterr().out + assert "no live entries" in out + + def test_commit_copies_live_entries( + self, project: Path, capsys: pytest.CaptureFixture[str] + ) -> None: + working_store.add("first observation", session_id="ship-m2") + working_store.add("second observation", session_id="ship-m2") + working_store.add("other session", session_id="other-sess") + + rc = cmd_working_commit("ship-m2") + assert rc == 0 + out = capsys.readouterr().out + assert "copied 2" in out + assert "ship-m2" in out + + # Verify archive file landed with the two entries. + archive = paths.working_archived_path("ship-m2") + assert archive.is_file() + archived = jsonl_store.read_all(archive) + contents = {r["content"] for r in archived} + assert contents == {"first observation", "second observation"} + + def test_commit_excludes_evicted_entries( + self, project: Path, capsys: pytest.CaptureFixture[str] + ) -> None: + wid_keep = working_store.add("keep me", session_id="s") + wid_drop = working_store.add("drop me", session_id="s") + working_store.mark_evicted(wid_drop) + + rc = cmd_working_commit("s") + assert rc == 0 + out = capsys.readouterr().out + assert "copied 1" in out + + archived = jsonl_store.read_all(paths.working_archived_path("s")) + assert [r["id"] for r in archived] == [wid_keep] + + def test_commit_idempotent_appends( + self, project: Path, capsys: pytest.CaptureFixture[str] + ) -> None: + working_store.add("only entry", session_id="s") + cmd_working_commit("s") + capsys.readouterr() + cmd_working_commit("s") + archived = jsonl_store.read_all(paths.working_archived_path("s")) + # Two appends of the same entry (documented behavior). + assert len(archived) == 2 + + def test_storage_failure_returns_one( + self, + project: Path, + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], + ) -> None: + def _boom(*_a: object, **_kw: object) -> None: + raise RuntimeError("synthetic") + + monkeypatch.setattr("mcp_server.storage.working_store.commit_session", _boom) + rc = cmd_working_commit("any") + assert rc == 1 + err = capsys.readouterr().err + assert "unexpected error" in err diff --git a/tests/test_tools_learning.py b/tests/test_tools_learning.py index e06af49..da2c019 100644 --- a/tests/test_tools_learning.py +++ b/tests/test_tools_learning.py @@ -325,6 +325,71 @@ def test_session_context_roadmap_failure_graceful(self, tmp_path, monkeypatch): # (the changesets feature was deleted; this test exercised the # graceful-fallback for an import path that no longer exists). + def test_session_context_working_panel_empty(self, tmp_path, monkeypatch): + """v3.1.0 M2 Phase 3: empty working memory surfaces as + {entries: [], count: 0} — never crashes the catch-me-up call.""" + _, _, db = _setup_project(tmp_path, monkeypatch) + db.close() + + with patch( + "mcp_server.tools.roadmap.get_roadmap", + return_value={"current_phase": {}}, + ): + result = learning.get_session_context() + + assert "working" in result + assert result["working"]["entries"] == [] + assert result["working"]["count"] == 0 + + def test_session_context_working_panel_populated(self, tmp_path, monkeypatch): + """v3.1.0 M2 Phase 3: top-3 live entries surface, capped, with + truncated content (120 chars per plan's token budget).""" + _, _, db = _setup_project(tmp_path, monkeypatch) + db.close() + + from mcp_server.storage import working_store + + # Seed 5 entries — panel should show top 3 by decay/importance. + working_store.add("low signal", importance=2) + working_store.add("medium signal", importance=5) + working_store.add("high signal", importance=9) + working_store.add("goal: ship M2", kind="goal", importance=8) + working_store.add("x" * 200, importance=6) # truncation check + + with patch( + "mcp_server.tools.roadmap.get_roadmap", + return_value={"current_phase": {}}, + ): + result = learning.get_session_context() + + panel = result["working"] + assert panel["count"] == 3, panel + # Top entry must be the highest-importance one. + assert panel["entries"][0]["importance"] == 9 + # Truncation: any 120+ char content shows the ellipsis marker. + long_entry = next((e for e in panel["entries"] if e["importance"] == 6), None) + if long_entry is not None: + assert len(long_entry["content"]) <= 124 # 120 + "..." + + def test_session_context_working_panel_failure_graceful( + self, tmp_path, monkeypatch + ): + """If working_store.list_top_k raises, the panel surfaces as + empty rather than breaking get_session_context.""" + _, _, db = _setup_project(tmp_path, monkeypatch) + db.close() + + with patch( + "mcp_server.storage.working_store.list_top_k", + side_effect=Exception("synthetic"), + ): + with patch( + "mcp_server.tools.roadmap.get_roadmap", + return_value={"current_phase": {}}, + ): + result = learning.get_session_context() + assert result["working"] == {"entries": [], "count": 0} + def test_session_context_empty_db(self, tmp_path, monkeypatch): _, _, db = _setup_project(tmp_path, monkeypatch) db.close() From 972ee1a128b45b1858f8f078a254e6992367d5f7 Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Fri, 29 May 2026 12:54:53 +0530 Subject: [PATCH 08/44] =?UTF-8?q?feat(v3.1.0):=20M3=20Phase=201=20?= =?UTF-8?q?=E2=80=94=20skills=5Fstore=20storage=20layer?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the skill-library storage subsystem: a canonical, supersession- chained, reinforcement-aware procedural-memory store. Skills encode 'how to do X in this project' as ≤ 2 KB markdown procedures the agent can record (explicit now; induced in M5) and retrieve when a similar task recurs. mcp_server/storage/paths.py — additive: skills_path() → .codevira/skills.jsonl (canonical, committed). Doc note on the D000012 lock — pure path computation, ensure_dirs() still owns WRITE-path validation. mcp_server/storage/skills_store.py — the store. API: * record(name, procedure, summary, triggers, source, source_session_ids, do_not_revert, origin_override) → K-id. Validates inputs (procedure ≤ 2 KB, summary ≤ 256 B, source ∈ {explicit, induced}). Each record carries _schema_v: 1 + origin + K-prefixed monotonic id + normalized tags + token estimate. * mark_used(skill_id, success) — reinforcement loop. Success increments success_count + resets consecutive_failures + revives an archived skill. Failure increments failure_count + consecutive_failures; at 5 consecutive failures (configurable) auto-archives unless do_not_revert=True. * set_flag(skill_id, do_not_revert, tags) — lightweight amendment. * mark_archived(skill_id, reason) — manual archive. Refuses to archive do_not_revert skills (canonical doctrine). * supersede(old_id, name, procedure, summary, triggers, reason, do_not_revert) — writes new skill + amendment chain. Triggers inherit from old when not supplied; back-references on both sides. * get(skill_id) — single-skill merged view. * list_all(status, source, tags, limit) — filtered list. Default status=active; tags filter is set intersection. * decay_sweep(now, unused_archive_days=90) — auto-archive active skills unused past the cutoff. do_not_revert exempt; already-archived skills not double-counted. For codevira sync. Lifecycle states (mirrors decisions' protected-set convention): - active — default. Returned by get_skill. - archived — low-value (5 consec failures or 90d unused). - superseded — replaced by a successor; final state. Tests: tests/storage/test_skills_store.py — 33 tests across record validation, mark_used reinforcement loop, set_flag, mark_archived, supersede chain, list_all filtering, decay_sweep. 227 storage tests pass green; zero regressions from M2 baseline. Plan M3 Phase 1. Phase 2 (FTS5 + 6 MCP tools) is next. Co-Authored-By: Claude Opus 4.7 --- mcp_server/storage/paths.py | 14 + mcp_server/storage/skills_store.py | 508 +++++++++++++++++++++++++++++ tests/storage/test_skills_store.py | 341 +++++++++++++++++++ 3 files changed, 863 insertions(+) create mode 100644 mcp_server/storage/skills_store.py create mode 100644 tests/storage/test_skills_store.py diff --git a/mcp_server/storage/paths.py b/mcp_server/storage/paths.py index 6a87978..dc66d75 100644 --- a/mcp_server/storage/paths.py +++ b/mcp_server/storage/paths.py @@ -95,6 +95,20 @@ def config_path(project_root: Path | None = None) -> Path: return codevira_dir(project_root) / "config.yaml" +def skills_path(project_root: Path | None = None) -> Path: + """v3.1.0 M3: skill library store. + + Canonical (lives in ``.codevira/``, committed) because skills are + team-shareable procedural knowledge. Schema-versioned per the + v3.0.1 forward-compat convention (records carry ``_schema_v: 1``). + + See ``working_archived_path`` for the D000012 lock note — same + reasoning applies (additive path computation, ensure_dirs still + owns root validation). + """ + return codevira_dir(project_root) / "skills.jsonl" + + def working_archived_path(session_id: str, project_root: Path | None = None) -> Path: """v3.1.0 M2: opt-in commit target for working-memory entries. diff --git a/mcp_server/storage/skills_store.py b/mcp_server/storage/skills_store.py new file mode 100644 index 0000000..63a3d12 --- /dev/null +++ b/mcp_server/storage/skills_store.py @@ -0,0 +1,508 @@ +""" +skills_store.py — v3.1.0 M3 Phase 1: the skill library storage layer. + +Skills are reusable procedural patterns — "how to do thing X in this +project" — that the agent can record (explicitly via ``record_skill`` +or induced via the M5 pipeline) and retrieve when a similar task +recurs. Unlike decisions (facts, "why we chose X") or working memory +(intra-session scratchpad), skills are *procedures* — they encode +"what to do" in markdown. + +# Why a separate store + +- **Procedural memory** in the cognitive-science taxonomy: distinct + from episodic (decisions/sessions) and from working memory. +- **Team-shareable**: lives in ``.codevira/skills.jsonl`` (canonical, + committed) so a teammate's induced skill helps everyone. +- **Reinforcement-aware**: each skill carries success/failure counts + + a ``consecutive_failures`` watchdog. Stale or failing skills + archive themselves during ``codevira sync``. +- **Supersession-chained**: a v2 of a skill points to v1 via + ``supersedes`` / ``superseded_by`` so the audit trail is intact. + +# Lifecycle states + + - ``active`` — default. Returned by ``get_skill``. + - ``archived`` — low-value (5+ consecutive failures or + ``unused_days ≥ 90``, configurable). Not returned + by default; visible via ``list_skills(status="archived")``. + ``apply_outcome(skill_id, success=True)`` revives + it (resets ``consecutive_failures``, returns to + ``active``). + - ``superseded`` — replaced by a successor. Carries + ``superseded_by``. Final state. + +Skills with ``do_not_revert=True`` are EXEMPT from the auto-archive +sweep (mirrors decisions' do_not_revert semantics). + +# Schema + +:: + + { + "id": "K000001", + "ts": "2026-05-28T10:00:00+00:00", + "name": "git-rebase-workflow", + "summary": "One-line: how we rebase against main in this repo", + "procedure": "", + "procedure_token_estimate": 0, + "triggers": { + "tags": ["git", "rebase"], + "file_patterns": ["*.py", "Makefile"], + }, + "source": "explicit" | "induced", + "source_session_ids": [], + "success_count": 0, + "failure_count": 0, + "consecutive_failures": 0, + "last_used_at": null, + "unused_days": 0, + "status": "active" | "archived" | "superseded", + "supersedes": null, + "superseded_by": null, + "do_not_revert": false, + "origin": {ide, agent_model, host_hash, ts}, + "_schema_v": 1, + } + +# Amendment overlay + +Mutations (mark_used / set_flag / mark_archived / supersede) append +amendment rows that share the base id. ``jsonl_store.read_merged`` +folds them into the canonical view at read time. Underscored fields +(``_amendment_to_id``) do not overlay (matches decisions convention). +""" + +from __future__ import annotations + +import logging +from datetime import datetime, timezone +from typing import Any + +from mcp_server.storage import jsonl_store, origin as origin_module, paths + +logger = logging.getLogger(__name__) + + +# ────────────────────────────────────────────────────────────────────── +# Constants +# ────────────────────────────────────────────────────────────────────── + +SCHEMA_V = 1 + +STATUS_ACTIVE = "active" +STATUS_ARCHIVED = "archived" +STATUS_SUPERSEDED = "superseded" +_VALID_STATUSES = frozenset({STATUS_ACTIVE, STATUS_ARCHIVED, STATUS_SUPERSEDED}) + +SOURCE_EXPLICIT = "explicit" +SOURCE_INDUCED = "induced" +_VALID_SOURCES = frozenset({SOURCE_EXPLICIT, SOURCE_INDUCED}) + +# Caps +_PROCEDURE_MAX_BYTES = 2048 +_SUMMARY_MAX_BYTES = 256 + +# Auto-archive thresholds (configurable via .codevira/config.yaml in +# a later phase; defaults here are the plan's stated values). +DEFAULT_MAX_CONSECUTIVE_FAILURES = 5 +DEFAULT_UNUSED_ARCHIVE_DAYS = 90 + + +# ────────────────────────────────────────────────────────────────────── +# Writes +# ────────────────────────────────────────────────────────────────────── + + +def record( + name: str, + procedure: str, + *, + summary: str | None = None, + triggers: dict[str, list[str]] | None = None, + source: str = SOURCE_EXPLICIT, + source_session_ids: list[str] | None = None, + do_not_revert: bool = False, + origin_override: dict | None = None, +) -> str: + """Append a new skill; return the generated K-id. + + Inputs are validated up front so the disk store never sees a + malformed record. + """ + if not isinstance(name, str) or not name.strip(): + raise ValueError("skills_store.record: name must be a non-empty string") + if not isinstance(procedure, str) or not procedure.strip(): + raise ValueError("skills_store.record: procedure must be a non-empty string") + procedure = procedure.strip() + if len(procedure.encode("utf-8")) > _PROCEDURE_MAX_BYTES: + raise ValueError( + f"skills_store.record: procedure exceeds {_PROCEDURE_MAX_BYTES} " + f"byte cap ({len(procedure.encode('utf-8'))} bytes given)" + ) + if summary is not None: + if not isinstance(summary, str): + raise ValueError("skills_store.record: summary must be a string or None") + if len(summary.encode("utf-8")) > _SUMMARY_MAX_BYTES: + raise ValueError( + f"skills_store.record: summary exceeds {_SUMMARY_MAX_BYTES} byte cap" + ) + if source not in _VALID_SOURCES: + raise ValueError( + f"skills_store.record: source must be one of {sorted(_VALID_SOURCES)}; " + f"got {source!r}" + ) + + # Triggers: normalize tags to lowercase + sort (mirrors decisions + # convention); file_patterns kept verbatim (they're already + # case-significant globs). + norm_tags: list[str] = [] + file_patterns: list[str] = [] + if triggers: + raw_tags = triggers.get("tags") or [] + norm_tags = sorted({str(t).strip().lower() for t in raw_tags if str(t).strip()}) + raw_patterns = triggers.get("file_patterns") or [] + file_patterns = [str(p) for p in raw_patterns if isinstance(p, str)] + + paths.ensure_dirs() + + # Lazy import: token_estimator is optional infrastructure (heavy + # tokenizer); failure here doesn't block the write. + estimate = _safe_estimate_tokens(procedure) + + base_record = { + "ts": datetime.now(timezone.utc).isoformat(), + "name": name.strip(), + "summary": (summary or "").strip() or None, + "procedure": procedure, + "procedure_token_estimate": estimate, + "triggers": {"tags": norm_tags, "file_patterns": file_patterns}, + "source": source, + "source_session_ids": list(source_session_ids or []), + "success_count": 0, + "failure_count": 0, + "consecutive_failures": 0, + "last_used_at": None, + "unused_days": 0, + "status": STATUS_ACTIVE, + "supersedes": None, + "superseded_by": None, + "do_not_revert": bool(do_not_revert), + "origin": origin_override or origin_module.current_origin(), + "_schema_v": SCHEMA_V, + } + + return jsonl_store.append_with_generated_id( + paths.skills_path(), base_record, prefix="K", width=6 + ) + + +def mark_used(skill_id: str, *, success: bool) -> dict[str, Any]: + """Apply one outcome to a skill — success or failure. + + Increments the relevant counter via amendment. On success, resets + ``consecutive_failures`` to 0 AND revives an archived skill (back to + ``status="active"``). On failure, increments ``consecutive_failures``; + if it crosses ``DEFAULT_MAX_CONSECUTIVE_FAILURES`` AND the skill is + not ``do_not_revert``, auto-archives. + + Returns the new (merged) view of the skill. + """ + existing = get(skill_id) + if existing is None: + return {"success": False, "error": f"skill {skill_id} not found"} + + new_success_count = int(existing.get("success_count", 0)) + new_failure_count = int(existing.get("failure_count", 0)) + new_consecutive = int(existing.get("consecutive_failures", 0)) + new_status = existing.get("status", STATUS_ACTIVE) + revived = False + + if success: + new_success_count += 1 + new_consecutive = 0 + # Revive an archived skill if a fresh success comes in. + if new_status == STATUS_ARCHIVED: + new_status = STATUS_ACTIVE + revived = True + else: + new_failure_count += 1 + new_consecutive += 1 + # Auto-archive on threshold unless do_not_revert is set. + if ( + new_status == STATUS_ACTIVE + and not existing.get("do_not_revert") + and new_consecutive >= DEFAULT_MAX_CONSECUTIVE_FAILURES + ): + new_status = STATUS_ARCHIVED + + amendment = { + "id": skill_id, + "ts": datetime.now(timezone.utc).isoformat(), + "_amendment_to_id": skill_id, + "success_count": new_success_count, + "failure_count": new_failure_count, + "consecutive_failures": new_consecutive, + "last_used_at": datetime.now(timezone.utc).isoformat(), + "unused_days": 0, + "status": new_status, + } + paths.ensure_dirs() + jsonl_store.append(paths.skills_path(), amendment) + return { + "success": True, + "skill_id": skill_id, + "status": new_status, + "consecutive_failures": new_consecutive, + "revived": revived, + } + + +def set_flag( + skill_id: str, + *, + do_not_revert: bool | None = None, + tags: list[str] | None = None, +) -> dict[str, Any]: + """Lightweight in-place flag/tag updates via an amendment line. + + Mirrors ``decisions_store.set_flag`` semantics. Either or both of + ``do_not_revert`` / ``tags`` may be supplied. No-op if neither is. + """ + existing = get(skill_id) + if existing is None: + return {"success": False, "error": f"skill {skill_id} not found"} + + updates: dict[str, Any] = {} + if do_not_revert is not None: + updates["do_not_revert"] = bool(do_not_revert) + if tags is not None: + if not isinstance(tags, list) or not all(isinstance(t, str) for t in tags): + return {"success": False, "error": "tags must be a list[str]"} + # Mirror decisions normalization. + norm_tags = sorted({str(t).strip().lower() for t in tags if str(t).strip()}) + # The triggers dict carries tags + file_patterns; merge. + merged_triggers = dict(existing.get("triggers") or {}) + merged_triggers["tags"] = norm_tags + updates["triggers"] = merged_triggers + + if not updates: + return {"success": True, "skill_id": skill_id, "updates": {}} + + amendment = { + "id": skill_id, + "ts": datetime.now(timezone.utc).isoformat(), + "_amendment_to_id": skill_id, + **updates, + } + paths.ensure_dirs() + jsonl_store.append(paths.skills_path(), amendment) + return {"success": True, "skill_id": skill_id, "updates": updates} + + +def mark_archived(skill_id: str, *, reason: str | None = None) -> dict[str, Any]: + """Manually archive a skill. Useful when the user knows a skill is + obsolete but the auto-sweep hasn't fired yet. + + Refuses to archive ``do_not_revert=True`` skills — those represent + canonical doctrine. + """ + existing = get(skill_id) + if existing is None: + return {"success": False, "error": f"skill {skill_id} not found"} + if existing.get("do_not_revert"): + return { + "success": False, + "error": ( + f"skill {skill_id} is do_not_revert=true; refusing to archive. " + f"Clear the flag first via set_flag(skill_id, do_not_revert=False)." + ), + } + amendment = { + "id": skill_id, + "ts": datetime.now(timezone.utc).isoformat(), + "_amendment_to_id": skill_id, + "status": STATUS_ARCHIVED, + } + if reason: + amendment["_archive_reason"] = reason + paths.ensure_dirs() + jsonl_store.append(paths.skills_path(), amendment) + return {"success": True, "skill_id": skill_id, "status": STATUS_ARCHIVED} + + +def supersede( + old_id: str, + *, + name: str, + procedure: str, + summary: str | None = None, + triggers: dict[str, list[str]] | None = None, + reason: str = "", + do_not_revert: bool = False, +) -> dict[str, Any]: + """Append a new skill that supersedes ``old_id`` and amendment-mark + the old one as ``superseded`` with a backref. + + The new skill inherits the old skill's tags/file_patterns when + ``triggers`` is not supplied (matches decisions.supersede pattern + for file_path / tags inheritance). + """ + old = get(old_id) + if old is None: + return {"success": False, "error": f"skill {old_id} not found"} + + inherited_triggers = triggers or { + "tags": (old.get("triggers") or {}).get("tags") or [], + "file_patterns": (old.get("triggers") or {}).get("file_patterns") or [], + } + + new_summary = summary if summary is not None else old.get("summary") + + new_id = record( + name=name, + procedure=procedure, + summary=new_summary, + triggers=inherited_triggers, + source=SOURCE_EXPLICIT, + do_not_revert=do_not_revert, + ) + + # Amend the old skill to mark it superseded. + paths.ensure_dirs() + amendment = { + "id": old_id, + "ts": datetime.now(timezone.utc).isoformat(), + "_amendment_to_id": old_id, + "status": STATUS_SUPERSEDED, + "superseded_by": new_id, + } + if reason: + amendment["_supersede_reason"] = reason + jsonl_store.append(paths.skills_path(), amendment) + + # Also amend the NEW skill to record the back-reference. + back_ref = { + "id": new_id, + "ts": datetime.now(timezone.utc).isoformat(), + "_amendment_to_id": new_id, + "supersedes": old_id, + } + jsonl_store.append(paths.skills_path(), back_ref) + return {"success": True, "old_id": old_id, "new_id": new_id} + + +# ────────────────────────────────────────────────────────────────────── +# Reads +# ────────────────────────────────────────────────────────────────────── + + +def get(skill_id: str) -> dict[str, Any] | None: + """Return the merged record for ``skill_id``, or None.""" + for rec in jsonl_store.read_merged(paths.skills_path()): + if str(rec.get("id")) == skill_id: + return rec + return None + + +def list_all( + *, + status: str | None = STATUS_ACTIVE, + source: str | None = None, + tags: list[str] | None = None, + limit: int = 50, +) -> list[dict[str, Any]]: + """List skills filtered by status / source / tags intersection. + + ``status=None`` returns every state. ``status=STATUS_ACTIVE`` + (default) returns only the actively-used set — the daily-driver + surface. Tags filter is intersection: a skill matches only if it + has ALL the requested tags. + """ + merged = jsonl_store.read_merged(paths.skills_path()) + norm_tags_filter = ( + {str(t).strip().lower() for t in tags if str(t).strip()} if tags else None + ) + + out: list[dict[str, Any]] = [] + for r in merged: + if status is not None and r.get("status", STATUS_ACTIVE) != status: + continue + if source is not None and r.get("source") != source: + continue + if norm_tags_filter: + rec_tags = set((r.get("triggers") or {}).get("tags") or []) + if not norm_tags_filter.issubset(rec_tags): + continue + out.append(r) + if len(out) >= limit: + break + return out + + +# ────────────────────────────────────────────────────────────────────── +# Maintenance +# ────────────────────────────────────────────────────────────────────── + + +def decay_sweep( + *, + now: datetime | None = None, + unused_archive_days: int = DEFAULT_UNUSED_ARCHIVE_DAYS, +) -> dict[str, Any]: + """Auto-archive active skills that haven't been used in + ``unused_archive_days``. Called by ``codevira sync``. Returns + ``{archived, scanned, dry_run=False}``. + + Skills with ``do_not_revert=True`` are exempt (mirrors decisions). + Skills with no ``last_used_at`` are considered "never used"; their + age is computed against ``ts`` (creation). + """ + now_dt = now or datetime.now(timezone.utc) + cutoff_seconds = unused_archive_days * 86400 + + skills = jsonl_store.read_merged(paths.skills_path()) + archived: list[str] = [] + scanned = 0 + + for s in skills: + scanned += 1 + if s.get("status", STATUS_ACTIVE) != STATUS_ACTIVE: + continue + if s.get("do_not_revert"): + continue + last_used = s.get("last_used_at") or s.get("ts") + if not isinstance(last_used, str): + continue + try: + ref = datetime.fromisoformat(last_used) + if ref.tzinfo is None: + ref = ref.replace(tzinfo=timezone.utc) + except (ValueError, TypeError): + continue + if (now_dt - ref).total_seconds() >= cutoff_seconds: + res = mark_archived( + str(s["id"]), reason=f"unused for ≥ {unused_archive_days} days" + ) + if res.get("success"): + archived.append(str(s["id"])) + + return {"archived": archived, "archived_count": len(archived), "scanned": scanned} + + +# ────────────────────────────────────────────────────────────────────── +# Helpers +# ────────────────────────────────────────────────────────────────────── + + +def _safe_estimate_tokens(text: str) -> int: + """Best-effort token count via token_estimator. Fallback: ~4 + chars/token rule-of-thumb so the field is always populated. + """ + try: + from mcp_server.storage.token_estimator import estimate_tokens + + return int(estimate_tokens(text)) + except Exception: # noqa: BLE001 + # Conservative 1 token ≈ 4 bytes (UTF-8) estimate. + return max(1, len(text.encode("utf-8")) // 4) diff --git a/tests/storage/test_skills_store.py b/tests/storage/test_skills_store.py new file mode 100644 index 0000000..bf22243 --- /dev/null +++ b/tests/storage/test_skills_store.py @@ -0,0 +1,341 @@ +""" +Tests for mcp_server.storage.skills_store — v3.1.0 M3 Phase 1. + +Coverage: + - record() input validation (name, procedure, summary, source) + - schema (K-id, _schema_v: 1, origin stamp, normalized tags) + - mark_used: success / failure / auto-archive at threshold / revive + - set_flag: do_not_revert + tags + - mark_archived + do_not_revert refusal + - supersede chain + back-reference + - list_all: status / source / tags filters + - decay_sweep: auto-archive on unused threshold; do_not_revert exempt +""" + +from __future__ import annotations + +import re +from datetime import datetime, timedelta, timezone +from pathlib import Path + +import pytest + +import mcp_server.paths as paths_module +from mcp_server.storage import jsonl_store, paths, skills_store + + +@pytest.fixture +def project(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: + root = tmp_path / "proj" + (root / ".codevira").mkdir(parents=True) + (root / ".codevira" / "config.yaml").write_text("project:\n name: test\n") + monkeypatch.setattr(paths_module, "_project_dir_override", None) + monkeypatch.chdir(root.resolve()) + return root + + +# ────────────────────────────────────────────────────────────────────── +# Record + schema +# ────────────────────────────────────────────────────────────────────── + + +class TestRecord: + _ID_PATTERN = re.compile(r"^K\d{6}$") + + def test_basic_returns_k_id(self, project: Path) -> None: + kid = skills_store.record( + name="git-rebase-workflow", + procedure="1. Fetch origin\n2. Rebase against main\n3. Push --force-with-lease", + ) + assert self._ID_PATTERN.match(kid), kid + + def test_record_has_schema_v_and_origin( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.setenv("CODEVIRA_IDE", "claude_code") + skills_store.record(name="x", procedure="step 1", summary="short desc") + rows = jsonl_store.read_all(paths.skills_path()) + rec = rows[0] + assert rec["_schema_v"] == 1 + assert rec["origin"]["ide"] == "claude_code" + assert rec["status"] == "active" + assert rec["source"] == "explicit" + + def test_tags_lowercased_and_sorted(self, project: Path) -> None: + skills_store.record( + name="x", + procedure="p", + triggers={"tags": ["Z-Tag", "a-tag", "B-Tag"], "file_patterns": ["*.py"]}, + ) + rec = jsonl_store.read_all(paths.skills_path())[0] + assert rec["triggers"]["tags"] == ["a-tag", "b-tag", "z-tag"] + assert rec["triggers"]["file_patterns"] == ["*.py"] + + def test_empty_name_rejected(self, project: Path) -> None: + with pytest.raises(ValueError, match="name"): + skills_store.record(name=" ", procedure="p") + + def test_empty_procedure_rejected(self, project: Path) -> None: + with pytest.raises(ValueError, match="procedure"): + skills_store.record(name="x", procedure="") + + def test_oversize_procedure_rejected(self, project: Path) -> None: + with pytest.raises(ValueError, match="2048 byte cap"): + skills_store.record(name="x", procedure="x" * 2049) + + def test_oversize_summary_rejected(self, project: Path) -> None: + with pytest.raises(ValueError, match="256 byte cap"): + skills_store.record(name="x", procedure="p", summary="s" * 257) + + def test_invalid_source_rejected(self, project: Path) -> None: + with pytest.raises(ValueError, match="source"): + skills_store.record(name="x", procedure="p", source="hand-crafted") + + def test_procedure_token_estimate_populated(self, project: Path) -> None: + skills_store.record(name="x", procedure="some procedure text here") + rec = jsonl_store.read_all(paths.skills_path())[0] + assert rec["procedure_token_estimate"] > 0 + + +# ────────────────────────────────────────────────────────────────────── +# mark_used: reinforcement loop +# ────────────────────────────────────────────────────────────────────── + + +class TestMarkUsed: + def test_success_increments_count(self, project: Path) -> None: + kid = skills_store.record(name="x", procedure="p") + res = skills_store.mark_used(kid, success=True) + assert res["success"] is True + rec = skills_store.get(kid) + assert rec["success_count"] == 1 + assert rec["failure_count"] == 0 + assert rec["consecutive_failures"] == 0 + assert rec["last_used_at"] is not None + + def test_failure_increments_count_and_consecutive(self, project: Path) -> None: + kid = skills_store.record(name="x", procedure="p") + for _ in range(3): + skills_store.mark_used(kid, success=False) + rec = skills_store.get(kid) + assert rec["failure_count"] == 3 + assert rec["consecutive_failures"] == 3 + assert rec["status"] == "active" # below threshold + + def test_success_resets_consecutive_failures(self, project: Path) -> None: + kid = skills_store.record(name="x", procedure="p") + for _ in range(3): + skills_store.mark_used(kid, success=False) + skills_store.mark_used(kid, success=True) + rec = skills_store.get(kid) + assert rec["consecutive_failures"] == 0 + assert rec["failure_count"] == 3 + assert rec["success_count"] == 1 + + def test_auto_archive_at_5_consecutive_failures(self, project: Path) -> None: + kid = skills_store.record(name="x", procedure="p") + for _ in range(5): + skills_store.mark_used(kid, success=False) + rec = skills_store.get(kid) + assert rec["status"] == "archived" + + def test_do_not_revert_exempt_from_auto_archive(self, project: Path) -> None: + kid = skills_store.record(name="x", procedure="p", do_not_revert=True) + for _ in range(10): + skills_store.mark_used(kid, success=False) + rec = skills_store.get(kid) + # do_not_revert protects from auto-archive even past the threshold. + assert rec["status"] == "active" + assert rec["consecutive_failures"] == 10 + + def test_revival_after_archive(self, project: Path) -> None: + kid = skills_store.record(name="x", procedure="p") + for _ in range(5): + skills_store.mark_used(kid, success=False) + # Auto-archived now. + res = skills_store.mark_used(kid, success=True) + assert res["revived"] is True + rec = skills_store.get(kid) + assert rec["status"] == "active" + + def test_unknown_skill_returns_error(self, project: Path) -> None: + res = skills_store.mark_used("K999999", success=True) + assert res["success"] is False + assert "not found" in res["error"] + + +# ────────────────────────────────────────────────────────────────────── +# set_flag + mark_archived +# ────────────────────────────────────────────────────────────────────── + + +class TestSetFlag: + def test_toggle_do_not_revert(self, project: Path) -> None: + kid = skills_store.record(name="x", procedure="p") + skills_store.set_flag(kid, do_not_revert=True) + rec = skills_store.get(kid) + assert rec["do_not_revert"] is True + + def test_update_tags(self, project: Path) -> None: + kid = skills_store.record( + name="x", procedure="p", triggers={"tags": ["old"], "file_patterns": []} + ) + skills_store.set_flag(kid, tags=["new-tag", "another"]) + rec = skills_store.get(kid) + assert sorted(rec["triggers"]["tags"]) == ["another", "new-tag"] + + def test_no_updates_is_noop(self, project: Path) -> None: + kid = skills_store.record(name="x", procedure="p") + res = skills_store.set_flag(kid) + assert res["updates"] == {} + + +class TestMarkArchived: + def test_archive_active_skill(self, project: Path) -> None: + kid = skills_store.record(name="x", procedure="p") + skills_store.mark_archived(kid, reason="manual") + rec = skills_store.get(kid) + assert rec["status"] == "archived" + + def test_refuse_archive_do_not_revert(self, project: Path) -> None: + kid = skills_store.record(name="x", procedure="p", do_not_revert=True) + res = skills_store.mark_archived(kid) + assert res["success"] is False + assert "do_not_revert" in res["error"] + + +# ────────────────────────────────────────────────────────────────────── +# Supersession +# ────────────────────────────────────────────────────────────────────── + + +class TestSupersede: + def test_supersede_marks_old_and_creates_new(self, project: Path) -> None: + kid_old = skills_store.record( + name="git-workflow-v1", + procedure="rebase the manual way", + triggers={"tags": ["git"], "file_patterns": ["*.py"]}, + ) + res = skills_store.supersede( + kid_old, + name="git-workflow-v2", + procedure="rebase via the new alias", + reason="moved to git-rebase-bot helper", + ) + assert res["success"] is True + kid_new = res["new_id"] + assert kid_new != kid_old + + old = skills_store.get(kid_old) + new = skills_store.get(kid_new) + assert old["status"] == "superseded" + assert old["superseded_by"] == kid_new + assert new["supersedes"] == kid_old + # Triggers inherited from the old skill. + assert new["triggers"]["tags"] == ["git"] + assert new["triggers"]["file_patterns"] == ["*.py"] + + def test_supersede_explicit_triggers_override_inheritance( + self, project: Path + ) -> None: + kid_old = skills_store.record( + name="x", + procedure="p", + triggers={"tags": ["old"], "file_patterns": []}, + ) + res = skills_store.supersede( + kid_old, + name="x2", + procedure="p2", + triggers={"tags": ["new-tag"], "file_patterns": ["*.md"]}, + ) + new = skills_store.get(res["new_id"]) + assert new["triggers"]["tags"] == ["new-tag"] + assert new["triggers"]["file_patterns"] == ["*.md"] + + def test_supersede_unknown_skill_rejected(self, project: Path) -> None: + res = skills_store.supersede("K999999", name="x", procedure="p") + assert res["success"] is False + + +# ────────────────────────────────────────────────────────────────────── +# list_all +# ────────────────────────────────────────────────────────────────────── + + +class TestListAll: + def test_default_returns_active_only(self, project: Path) -> None: + kid_a = skills_store.record(name="a", procedure="p") + kid_b = skills_store.record(name="b", procedure="p") + skills_store.mark_archived(kid_b) + live = skills_store.list_all() + assert [r["id"] for r in live] == [kid_a] + + def test_status_filter_archived(self, project: Path) -> None: + kid_a = skills_store.record(name="a", procedure="p") + skills_store.mark_archived(kid_a) + archived = skills_store.list_all(status="archived") + assert [r["id"] for r in archived] == [kid_a] + + def test_status_none_returns_all(self, project: Path) -> None: + kid_a = skills_store.record(name="a", procedure="p") + kid_b = skills_store.record(name="b", procedure="p") + skills_store.mark_archived(kid_a) + ids = {r["id"] for r in skills_store.list_all(status=None)} + assert ids == {kid_a, kid_b} + + def test_source_filter(self, project: Path) -> None: + skills_store.record(name="explicit", procedure="p", source="explicit") + skills_store.record(name="induced", procedure="p", source="induced") + only_induced = skills_store.list_all(source="induced") + assert [r["name"] for r in only_induced] == ["induced"] + + def test_tags_filter_is_intersection(self, project: Path) -> None: + skills_store.record( + name="A", procedure="p", triggers={"tags": ["git", "release"]} + ) + skills_store.record(name="B", procedure="p", triggers={"tags": ["git"]}) + skills_store.record(name="C", procedure="p", triggers={"tags": ["release"]}) + only_both = skills_store.list_all(tags=["git", "release"]) + assert [r["name"] for r in only_both] == ["A"] + + +# ────────────────────────────────────────────────────────────────────── +# decay_sweep +# ────────────────────────────────────────────────────────────────────── + + +class TestDecaySweep: + def test_unused_skill_archived( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + kid = skills_store.record(name="x", procedure="p") + # 100 days later → past the 90-day cutoff. + future = datetime(2027, 1, 1, tzinfo=timezone.utc) + timedelta(days=100) + res = skills_store.decay_sweep(now=future) + assert kid in res["archived"] + rec = skills_store.get(kid) + assert rec["status"] == "archived" + + def test_recently_used_not_archived(self, project: Path) -> None: + kid = skills_store.record(name="x", procedure="p") + skills_store.mark_used(kid, success=True) # last_used_at = now + res = skills_store.decay_sweep(now=datetime.now(timezone.utc)) + assert kid not in res["archived"] + rec = skills_store.get(kid) + assert rec["status"] == "active" + + def test_do_not_revert_skill_exempt_from_sweep(self, project: Path) -> None: + kid = skills_store.record(name="x", procedure="p", do_not_revert=True) + future = datetime(2030, 1, 1, tzinfo=timezone.utc) + res = skills_store.decay_sweep(now=future) + assert kid not in res["archived"] + assert skills_store.get(kid)["status"] == "active" + + def test_archived_skill_not_re_archived(self, project: Path) -> None: + kid = skills_store.record(name="x", procedure="p") + skills_store.mark_archived(kid) + future = datetime(2030, 1, 1, tzinfo=timezone.utc) + res = skills_store.decay_sweep(now=future) + # Already archived → skipped (not double-counted). + assert kid not in res["archived"] From d0f27983b3cd4ecebe83b7871b4ff93bf86a3335 Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Fri, 29 May 2026 13:08:48 +0530 Subject: [PATCH 09/44] =?UTF-8?q?feat(v3.1.0):=20M3=20Phase=202=20?= =?UTF-8?q?=E2=80=94=20FTS5=20skills=20table=20+=206=20MCP=20tools?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Completes M3 by adding the FTS5 retrieval layer and the agent-facing MCP surface on top of M3 Phase 1's storage layer. FTS5 skills table: mcp_server/storage/fts5_index.py — additive (existing decision callers unchanged): - New _SKILL_TABLE = 'skill_fts' coexists in the same .codevira-cache/fts5.sqlite file as decision_fts. Separate meta key ('skill_source_mtime') tracks the skills index independently from decisions. - rebuild_skills_from_jsonl(skills_path, index_path) — drop + recreate skill_fts from skills.jsonl. Skips superseded skills. - add_skill(index_path, skill) — incremental indexing, called from skills_store.record(). DELETE-then-INSERT for idempotency. - search_skills(index_path, query, limit) — BM25-ranked search; name 3.0 / summary 1.5 / procedure 1.0 weights. - skill_staleness_check(skills_path, index_path) — parallel to the decisions check; uses the dedicated meta key. Composite ranking (skills_store.search): mcp_server/storage/skills_store.py adds search() with the plan's formula: score = 0.5 × BM25_norm + 0.3 × tag_jaccard + 0.2 × recency_decay BM25_norm = -bm25_raw / max(-bm25_raw) (in [0, 1]) tag_jaccard = |query_tokens ∩ skill_tags| / |union| recency_decay = exp(-Δdays_since_last_used / 30) recency_decay scores 0 for never-used skills — recency is a *usage* signal, not an existence signal. skills_store.record() now calls fts5_index.add_skill (best-effort, P9 — never blocks the write). 6 MCP tools (mcp_server/tools/skills.py + server.py registration): - record_skill — runs check_conflict on SKILLS corpus first; force=True overrides. - get_skill — composite-ranked hits with score_breakdown. - apply_skill_outcome — manual reinforcement override. - list_skills — daily-driver active list by default; status='all' returns every state. - supersede_skill — version a skill with amendment chain. - promote_skill_to_playbook — writes the procedure as .codevira/playbooks//.md. Refuses overwrite without force=True. Registered via 6 Tool(...) entries in list_tools() and 6 dispatch branches in call_tool(). Tests: - tests/storage/test_skills_store.py::TestSearch (10 new): empty query, finds by text, excludes archived/superseded, tag jaccard boosts score, recency uses last_used_at, file_path filter, weights overridable, top_k cap, lazy rebuild on stale index. - tests/test_tools_skills.py (27 new): record_skill validation + force override, get_skill response shape + file_path filter, apply_skill_outcome variants, list_skills filters, supersede chain, promote_skill_to_playbook (write, refuse-overwrite, force-overwrite, explicit name, unknown skill, superseded rejection, empty task_type, unslugifiable name). 799 tests across storage + tools + check_conflict + server + ide_inject + engine + cli pass green; zero regressions from the M3 Phase 1 baseline. Existing fts5_index tests (decisions) unchanged. Plan M3 Phase 2. M3 complete; M4 (spatial memory) is next. Co-Authored-By: Claude Opus 4.7 --- mcp_server/server.py | 262 +++++++++++++++++++ mcp_server/storage/fts5_index.py | 190 ++++++++++++++ mcp_server/storage/skills_store.py | 208 ++++++++++++++- mcp_server/tools/skills.py | 396 +++++++++++++++++++++++++++++ tests/storage/test_skills_store.py | 139 ++++++++++ tests/test_tools_skills.py | 295 +++++++++++++++++++++ 6 files changed, 1488 insertions(+), 2 deletions(-) create mode 100644 mcp_server/tools/skills.py create mode 100644 tests/test_tools_skills.py diff --git a/mcp_server/server.py b/mcp_server/server.py index c60ecf8..fe946a6 100644 --- a/mcp_server/server.py +++ b/mcp_server/server.py @@ -1048,6 +1048,210 @@ async def list_tools() -> list[Tool]: }, }, ), + # ---- v3.1.0 M3: skill library (procedural memory) ---- + Tool( + name="record_skill", + description=( + "v3.1.0 M3: Author a new skill in the canonical store " + "(.codevira/skills.jsonl). Skills encode 'how to do X in " + "this project' as markdown procedures. Calls check_conflict " + "against the SKILLS corpus before writing; near-duplicate " + "warnings can be overridden via force=True. Use supersede_skill " + "to version an existing skill, or promote_skill_to_playbook to " + "promote a skill into the existing playbook system." + ), + inputSchema={ + "type": "object", + "properties": { + "name": { + "type": "string", + "description": "Short identifier (e.g., 'git-rebase-workflow')", + }, + "procedure": { + "type": "string", + "description": "Markdown body of how to do this thing (max 2 KB)", + }, + "summary": { + "type": "string", + "description": "Optional one-liner (max 256 B)", + }, + "triggers": { + "type": "object", + "properties": { + "tags": {"type": "array", "items": {"type": "string"}}, + "file_patterns": { + "type": "array", + "items": {"type": "string"}, + }, + }, + "description": ( + "Discovery hints: tags (lowercased, set-membership " + "for jaccard ranking) + file_patterns (fnmatch globs " + "for file-scoped retrieval)" + ), + }, + "source": { + "type": "string", + "enum": ["explicit", "induced"], + "default": "explicit", + }, + "do_not_revert": { + "type": "boolean", + "default": False, + "description": ( + "Exempt from auto-archive sweep; flag canonical " + "doctrine." + ), + }, + "force": { + "type": "boolean", + "default": False, + "description": "Skip duplicate-check warning", + }, + }, + "required": ["name", "procedure"], + }, + ), + Tool( + name="get_skill", + description=( + "v3.1.0 M3: Composite-ranked search over active skills. " + "score = 0.5 × BM25_norm + 0.3 × tag_jaccard + 0.2 × recency_decay " + "(τ=30d, never-used skills score 0 recency). Returns hits with " + "score_breakdown for debuggability. Pass file_path to filter " + "skills whose trigger file_patterns don't match." + ), + inputSchema={ + "type": "object", + "properties": { + "query": { + "type": "string", + "description": "Search keywords (e.g., 'rebase main')", + }, + "top_k": {"type": "integer", "default": 5}, + "file_path": { + "type": "string", + "description": ( + "Optional file path to filter skills by their " + "trigger file_patterns (fnmatch). Skills with no " + "patterns match anything (not filtered)." + ), + }, + }, + "required": ["query"], + }, + ), + Tool( + name="apply_skill_outcome", + description=( + "v3.1.0 M3: Manually record one outcome for a skill — success " + "or failure. Reinforces the reinforcement loop (resets " + "consecutive_failures on success; auto-archives at 5 consecutive " + "failures unless do_not_revert=True). The canonical signal in " + "M5+ comes from outcomes_writer.py (git-derived, not " + "agent-self-reported); this tool is the manual override." + ), + inputSchema={ + "type": "object", + "properties": { + "skill_id": {"type": "string"}, + "success": {"type": "boolean"}, + }, + "required": ["skill_id", "success"], + }, + ), + Tool( + name="list_skills", + description=( + "v3.1.0 M3: Filtered list of skills. status='active' (default) " + "returns the daily-driver set; 'all' returns every state; any " + "other value filters to that one state. tags filter is set " + "intersection." + ), + inputSchema={ + "type": "object", + "properties": { + "status": { + "type": "string", + "description": "active | archived | superseded | all", + "default": "active", + }, + "source": { + "type": "string", + "enum": ["explicit", "induced"], + }, + "tags": { + "type": "array", + "items": {"type": "string"}, + }, + "limit": {"type": "integer", "default": 50}, + }, + }, + ), + Tool( + name="supersede_skill", + description=( + "v3.1.0 M3: Version a skill. Writes a new skill that supersedes " + "old_id; amendment-marks the old as 'superseded' with a backref. " + "Triggers inherit from the old skill when not supplied. The old " + "skill no longer surfaces in search after this; it's still " + "retrievable via list_skills(status='superseded') for audit." + ), + inputSchema={ + "type": "object", + "properties": { + "old_id": {"type": "string"}, + "name": {"type": "string"}, + "procedure": {"type": "string"}, + "summary": {"type": "string"}, + "triggers": { + "type": "object", + "properties": { + "tags": {"type": "array", "items": {"type": "string"}}, + "file_patterns": { + "type": "array", + "items": {"type": "string"}, + }, + }, + }, + "reason": {"type": "string"}, + "do_not_revert": {"type": "boolean", "default": False}, + }, + "required": ["old_id", "name", "procedure"], + }, + ), + Tool( + name="promote_skill_to_playbook", + description=( + "v3.1.0 M3: Write the skill's procedure as a playbook markdown " + "file at .codevira/playbooks//.md. Refuses on " + "existing file unless force=True so hand-written playbooks " + "aren't clobbered. After promotion the procedure is also " + "discoverable via get_playbook(task_type)." + ), + inputSchema={ + "type": "object", + "properties": { + "skill_id": {"type": "string"}, + "task_type": { + "type": "string", + "description": ( + "Playbook directory name (e.g., 'commit', " + "'add_tool', 'debug_pipeline')" + ), + }, + "name": { + "type": "string", + "description": ( + "Optional filename slug; defaults to " + "slugified(skill.name)" + ), + }, + "force": {"type": "boolean", "default": False}, + }, + "required": ["skill_id", "task_type"], + }, + ), # ---- v1.5: Deep Graph Intelligence Tools ---- Tool( name="query_graph", @@ -1378,6 +1582,64 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]: from mcp_server.tools.working import get_working_context result = get_working_context(top_k=arguments.get("top_k", 5)) + # ---- v3.1.0 M3: skill library dispatch ---- + elif name == "record_skill": + from mcp_server.tools.skills import record_skill + + result = record_skill( + name=arguments["name"], + procedure=arguments["procedure"], + summary=arguments.get("summary"), + triggers=arguments.get("triggers"), + source=arguments.get("source", "explicit"), + do_not_revert=arguments.get("do_not_revert", False), + force=arguments.get("force", False), + ) + elif name == "get_skill": + from mcp_server.tools.skills import get_skill + + result = get_skill( + query=arguments["query"], + top_k=arguments.get("top_k", 5), + file_path=arguments.get("file_path"), + ) + elif name == "apply_skill_outcome": + from mcp_server.tools.skills import apply_skill_outcome + + result = apply_skill_outcome( + skill_id=arguments["skill_id"], + success=arguments["success"], + ) + elif name == "list_skills": + from mcp_server.tools.skills import list_skills + + result = list_skills( + status=arguments.get("status", "active"), + source=arguments.get("source"), + tags=arguments.get("tags"), + limit=arguments.get("limit", 50), + ) + elif name == "supersede_skill": + from mcp_server.tools.skills import supersede_skill + + result = supersede_skill( + old_id=arguments["old_id"], + name=arguments["name"], + procedure=arguments["procedure"], + summary=arguments.get("summary"), + triggers=arguments.get("triggers"), + reason=arguments.get("reason", ""), + do_not_revert=arguments.get("do_not_revert", False), + ) + elif name == "promote_skill_to_playbook": + from mcp_server.tools.skills import promote_skill_to_playbook + + result = promote_skill_to_playbook( + skill_id=arguments["skill_id"], + task_type=arguments["task_type"], + name=arguments.get("name"), + force=arguments.get("force", False), + ) else: result = {"error": f"Unknown tool: {name}"} diff --git a/mcp_server/storage/fts5_index.py b/mcp_server/storage/fts5_index.py index 7486698..a76564c 100644 --- a/mcp_server/storage/fts5_index.py +++ b/mcp_server/storage/fts5_index.py @@ -48,6 +48,15 @@ _TABLE = "decision_fts" _META_TABLE = "fts_meta" +# v3.1.0 M3 Phase 2: skills FTS5 table coexists in the same .sqlite +# file. Separate meta key (``skill_source_mtime``) so the staleness +# check tracks decisions and skills independently. Weights below +# (name 3.0 / summary 1.5 / procedure 1.0) match the plan's stated +# ranking; tags is UNINDEXED — agents can supply tags as a separate +# Jaccard filter at the skills_store layer rather than letting FTS5 +# stem them. +_SKILL_TABLE = "skill_fts" + # Schema is intentionally minimal — FTS5 is fast even without elaborate # tokenization tweaks. Porter stemmer covers "auth" → "authentication" # style matches; remove_diacritics handles café/cafe; ascii fallback @@ -65,6 +74,18 @@ ); """ +_CREATE_SKILL_SQL = f""" +CREATE VIRTUAL TABLE IF NOT EXISTS {_SKILL_TABLE} +USING fts5( + skill_id UNINDEXED, + name, + summary, + procedure, + tags UNINDEXED, + tokenize = "porter unicode61 remove_diacritics 2" +); +""" + _CREATE_META_SQL = f""" CREATE TABLE IF NOT EXISTS {_META_TABLE} ( key TEXT PRIMARY KEY, @@ -118,6 +139,7 @@ def _ensure_tables(conn: sqlite3.Connection) -> None: except Exception: # noqa: BLE001 pass conn.execute(_CREATE_SQL) + conn.execute(_CREATE_SKILL_SQL) # v3.1.0 M3 Phase 2 conn.execute(_CREATE_META_SQL) conn.commit() @@ -297,6 +319,174 @@ def staleness_check(decisions_path: Path, index_path: Path) -> bool: conn.close() +# ─── Skills FTS5 (v3.1.0 M3 Phase 2) ───────────────────────────────── + + +def rebuild_skills_from_jsonl(skills_path: Path, index_path: Path) -> int: + """Drop + recreate the skill_fts table from skills.jsonl. + + Returns the number of indexed skills. Skips superseded entries to + match ``list_skills`` default behavior. Same atomicity contract as + ``rebuild_from_jsonl`` — single transaction inside the connection. + """ + conn = _connect(index_path) + try: + _ensure_tables(conn) + records = jsonl_store.read_merged(skills_path) + + with conn: + conn.execute(f"DELETE FROM {_SKILL_TABLE}") + for r in records: + if r.get("status") == "superseded": + continue + conn.execute( + f"INSERT INTO {_SKILL_TABLE} " + "(skill_id, name, summary, procedure, tags) " + "VALUES (?, ?, ?, ?, ?)", + ( + str(r.get("id", "")), + r.get("name") or "", + r.get("summary") or "", + r.get("procedure") or "", + " ".join((r.get("triggers") or {}).get("tags") or []), + ), + ) + try: + src_mtime = skills_path.stat().st_mtime + except OSError: + src_mtime = 0 + conn.execute( + f"INSERT OR REPLACE INTO {_META_TABLE}(key, value) VALUES(?, ?)", + ("skill_source_mtime", str(src_mtime)), + ) + count = conn.execute(f"SELECT COUNT(*) FROM {_SKILL_TABLE}").fetchone()[0] + return int(count) + finally: + conn.close() + + +def add_skill(index_path: Path, skill: dict[str, Any]) -> None: + """Incrementally index one skill. Called after skills_store.record. + + Idempotent: an existing skill_id is DELETEd before INSERT, so a + second add (e.g., after an amendment) cleanly replaces the row. + Skips superseded skills so they don't pollute search results. + """ + if skill.get("status") == "superseded": + return + + conn = _connect(index_path) + try: + _ensure_tables(conn) + kid = str(skill.get("id", "")) + if not kid: + return + with conn: + conn.execute(f"DELETE FROM {_SKILL_TABLE} WHERE skill_id = ?", (kid,)) + conn.execute( + f"INSERT INTO {_SKILL_TABLE} " + "(skill_id, name, summary, procedure, tags) " + "VALUES (?, ?, ?, ?, ?)", + ( + kid, + skill.get("name") or "", + skill.get("summary") or "", + skill.get("procedure") or "", + " ".join((skill.get("triggers") or {}).get("tags") or []), + ), + ) + finally: + conn.close() + + +def search_skills( + index_path: Path, + query: str, + *, + limit: int = 5, +) -> list[dict[str, Any]]: + """BM25-ranked FTS5 search over the skill library. + + Returns ``[{"skill_id": str, "score": float, "snippet": str}, ...]`` + sorted ascending by BM25 distance (best matches first). Weights + per the plan: name 3.0, summary 1.5, procedure 1.0; tags is + UNINDEXED so it doesn't contribute to BM25 ranking. + + Bad-query handling mirrors ``search()``: malformed inputs return + [] rather than raising. + """ + if not index_path.is_file(): + return [] + if not query or not query.strip(): + return [] + sanitized = _sanitize_fts_query(query) + if not sanitized: + return [] + + conn = _connect(index_path) + try: + try: + cursor = conn.execute( + f""" + SELECT skill_id, + bm25({_SKILL_TABLE}, 3.0, 1.5, 1.0, 0.0) AS score, + snippet({_SKILL_TABLE}, 2, '[', ']', '…', 12) AS snippet + FROM {_SKILL_TABLE} + WHERE {_SKILL_TABLE} MATCH ? + ORDER BY score + LIMIT ? + """, + (sanitized, limit), + ) + return [ + { + "skill_id": row["skill_id"], + "score": float(row["score"]), + "snippet": row["snippet"], + } + for row in cursor.fetchall() + ] + except sqlite3.OperationalError as exc: + logger.warning( + "fts5_index.search_skills: query failed (%r); returning empty: %s", + query, + exc, + ) + return [] + finally: + conn.close() + + +def skill_staleness_check(skills_path: Path, index_path: Path) -> bool: + """Return True if the skills FTS5 index is older than skills.jsonl. + + Tracked under a separate meta key (``skill_source_mtime``) so it + doesn't collide with the decisions staleness signal. + """ + if not index_path.is_file(): + return True + if not skills_path.is_file(): + return False # nothing to index + src_mtime = skills_path.stat().st_mtime + + conn = _connect(index_path) + try: + _ensure_tables(conn) + row = conn.execute( + f"SELECT value FROM {_META_TABLE} WHERE key = ?", + ("skill_source_mtime",), + ).fetchone() + if row is None: + return True + try: + idx_mtime = float(row["value"]) + except (TypeError, ValueError): + return True + return src_mtime > idx_mtime + 1.0 + finally: + conn.close() + + # ─── Internal helpers ───────────────────────────────────────────────── diff --git a/mcp_server/storage/skills_store.py b/mcp_server/storage/skills_store.py index 63a3d12..7980fc2 100644 --- a/mcp_server/storage/skills_store.py +++ b/mcp_server/storage/skills_store.py @@ -76,10 +76,11 @@ from __future__ import annotations import logging +import math from datetime import datetime, timezone from typing import Any -from mcp_server.storage import jsonl_store, origin as origin_module, paths +from mcp_server.storage import fts5_index, jsonl_store, origin as origin_module, paths logger = logging.getLogger(__name__) @@ -192,9 +193,20 @@ def record( "_schema_v": SCHEMA_V, } - return jsonl_store.append_with_generated_id( + skill_id = jsonl_store.append_with_generated_id( paths.skills_path(), base_record, prefix="K", width=6 ) + base_record["id"] = skill_id + + # v3.1.0 M3 Phase 2: incrementally update the FTS5 skill_fts index + # so searches don't wait for a sync. Best-effort (P9 — never fail + # the write on a cache miss). + try: + fts5_index.add_skill(paths.fts5_path(), base_record) + except Exception as exc: # noqa: BLE001 + logger.warning("skills_store.record: FTS5 add_skill failed: %s", exc) + + return skill_id def mark_used(skill_id: str, *, success: bool) -> dict[str, Any]: @@ -440,6 +452,129 @@ def list_all( return out +# ────────────────────────────────────────────────────────────────────── +# Search (composite ranking) +# ────────────────────────────────────────────────────────────────────── + + +# Default tuning per the plan: 0.5 * BM25_norm + 0.3 * tag_jaccard + +# 0.2 * recency_decay (τ_days = 30). Overridable per-call so a +# config flag in M9 can tune without code changes. +DEFAULT_RANKING_WEIGHTS = {"bm25": 0.5, "tag": 0.3, "recency": 0.2} +_RECENCY_TAU_DAYS = 30.0 +# How many FTS5 candidates to pull per requested top-K. Wider net so +# the tag+recency rerank can promote a strong candidate that BM25 +# ranked just below the cut. +_CANDIDATE_OVERSAMPLE = 4 + + +def search( + query: str, + *, + top_k: int = 5, + file_path: str | None = None, + ranking_weights: dict[str, float] | None = None, + now: datetime | None = None, +) -> list[dict[str, Any]]: + """Rank active skills against a query via the composite formula. + + :: + + score = 0.5 × BM25_norm + 0.3 × tag_jaccard + 0.2 × recency_decay + + Where: + - ``BM25_norm = -bm25_raw / max(-bm25_raw)`` over the candidate + set (FTS5 BM25 is a negative distance — lower = better; we + flip the sign before normalizing). + - ``tag_jaccard = |query_tokens ∩ skill_tags| / |union|`` over + the query tokens (lowercased, ≥ 3 chars) and the skill's + trigger tags. + - ``recency_decay = exp(-Δdays_since_last_used / 30)`` where + ``last_used_at`` (or ``ts`` if never used) is the reference. + + ``file_path`` (optional): filters out skills whose ``triggers. + file_patterns`` don't match the path (fnmatch). Empty / absent + patterns means the skill matches anything (no filter). + + Superseded / archived skills are excluded — search is the + everyday surface, daily-driver only. + + Returns each result with ``score_breakdown`` so callers can + inspect the composition (useful for debugging weight tuning). + """ + weights = ranking_weights or DEFAULT_RANKING_WEIGHTS + now_dt = now or datetime.now(timezone.utc) + + if not query or not query.strip(): + return [] + + # Lazy rebuild — the FTS5 cache is stateless; rebuilding when + # stale keeps writes cheap and reads correct. + if fts5_index.skill_staleness_check(paths.skills_path(), paths.fts5_path()): + try: + fts5_index.rebuild_skills_from_jsonl(paths.skills_path(), paths.fts5_path()) + except Exception as exc: # noqa: BLE001 + logger.warning("skills_store.search: FTS5 rebuild failed: %s", exc) + + hits = fts5_index.search_skills( + paths.fts5_path(), query, limit=max(top_k * _CANDIDATE_OVERSAMPLE, top_k) + ) + if not hits: + return [] + + # Flip BM25 (negative → positive) for normalization. + raw_pos = [-h["score"] for h in hits] + max_pos = max(raw_pos) if raw_pos else 1.0 + if max_pos <= 0: + max_pos = 1.0 # all-zero corpus; avoid div-by-zero + + query_tokens = _tokenize_for_jaccard(query) + + merged = jsonl_store.read_merged(paths.skills_path()) + by_id = {str(s.get("id")): s for s in merged} + + out: list[dict[str, Any]] = [] + for hit, pos in zip(hits, raw_pos, strict=False): + skill = by_id.get(hit["skill_id"]) + if skill is None: + continue + if skill.get("status", STATUS_ACTIVE) != STATUS_ACTIVE: + continue + if file_path is not None and not _matches_file_pattern(skill, file_path): + continue + + bm25_norm = pos / max_pos + tag_jaccard = _tag_jaccard(query_tokens, skill) + recency = _recency_decay(skill, now=now_dt) + composite = ( + weights.get("bm25", 0.0) * bm25_norm + + weights.get("tag", 0.0) * tag_jaccard + + weights.get("recency", 0.0) * recency + ) + + out.append( + { + "id": skill.get("id"), + "name": skill.get("name"), + "summary": skill.get("summary"), + "procedure": skill.get("procedure"), + "triggers": skill.get("triggers"), + "status": skill.get("status"), + "do_not_revert": skill.get("do_not_revert"), + "score": round(composite, 4), + "score_breakdown": { + "bm25_norm": round(bm25_norm, 4), + "tag_jaccard": round(tag_jaccard, 4), + "recency_decay": round(recency, 4), + }, + "snippet": hit.get("snippet"), + } + ) + + out.sort(key=lambda r: r["score"], reverse=True) + return out[:top_k] + + # ────────────────────────────────────────────────────────────────────── # Maintenance # ────────────────────────────────────────────────────────────────────── @@ -495,6 +630,75 @@ def decay_sweep( # ────────────────────────────────────────────────────────────────────── +def _tokenize_for_jaccard(query: str) -> set[str]: + """Lowercased tokens ≥ 3 chars from the query, for tag-Jaccard + overlap. Punctuation is stripped to match the FTS5 sanitizer's + behavior loosely (not exactly — this is a relevance heuristic). + """ + out: set[str] = set() + for raw in query.split(): + t = raw.strip("\"'.,;:!?()[]{}").lower() + if len(t) >= 3: + out.add(t) + return out + + +def _tag_jaccard(query_tokens: set[str], skill: dict[str, Any]) -> float: + """``|A ∩ B| / |A ∪ B|`` over query tokens and skill tags. + + Returns 0.0 if either set is empty (no overlap signal available). + """ + skill_tags = set((skill.get("triggers") or {}).get("tags") or []) + if not query_tokens or not skill_tags: + return 0.0 + inter = query_tokens & skill_tags + union = query_tokens | skill_tags + if not union: + return 0.0 + return len(inter) / len(union) + + +def _recency_decay( + skill: dict[str, Any], + *, + now: datetime, + tau_days: float = _RECENCY_TAU_DAYS, +) -> float: + """``exp(-Δdays_since_last_used / τ)`` per the plan. + + ``last_used_at`` is the reference. **Never-used skills score 0** + — recency is a *usage* signal, not an existence signal. A + freshly-recorded skill still surfaces via BM25 + tag-Jaccard; + once it's used at least once, the recency component starts + contributing. + + Malformed timestamps also score 0 — be conservative. + """ + last_used = skill.get("last_used_at") + if not isinstance(last_used, str): + return 0.0 + try: + ref = datetime.fromisoformat(last_used) + if ref.tzinfo is None: + ref = ref.replace(tzinfo=timezone.utc) + delta_days = max(0.0, (now - ref).total_seconds() / 86400.0) + return math.exp(-delta_days / tau_days) + except (ValueError, TypeError): + return 0.0 + + +def _matches_file_pattern(skill: dict[str, Any], file_path: str) -> bool: + """True if any of the skill's ``triggers.file_patterns`` fnmatches + ``file_path`` — or if the skill has no patterns (no filter). + """ + import fnmatch + + patterns = (skill.get("triggers") or {}).get("file_patterns") or [] + if not patterns: + return True + return any(fnmatch.fnmatch(file_path, p) for p in patterns) + + def _safe_estimate_tokens(text: str) -> int: """Best-effort token count via token_estimator. Fallback: ~4 chars/token rule-of-thumb so the field is always populated. diff --git a/mcp_server/tools/skills.py b/mcp_server/tools/skills.py new file mode 100644 index 0000000..8ada608 --- /dev/null +++ b/mcp_server/tools/skills.py @@ -0,0 +1,396 @@ +""" +skills.py — v3.1.0 M3 Phase 2 MCP tools for the skill library. + +Six tools cover the agent-facing surface: + + - record_skill — author a new skill. + - get_skill — composite-ranked search (BM25 + tags + recency). + - apply_skill_outcome — manual reinforcement (success / failure). + Canonical reinforcement comes from M5's + outcomes_writer integration; this tool is + the override path. + - list_skills — filtered list (status / source / tags). + - supersede_skill — version a skill, preserving audit chain. + - promote_skill_to_playbook + — write the skill's procedure as markdown to + .codevira/playbooks//.md. + Refuses on existing file unless force=True. + +Each tool returns a structured ``dict`` (never raises). Validation +errors from the storage layer surface as ``{success/recorded: False, +error: ...}`` so the agent can correct the input and retry without +crashing the dispatcher. +""" + +from __future__ import annotations + +import re +from typing import Any + +from mcp_server.storage import skills_store + + +# Filesystem-safe slug pattern for promotion to playbook. +_SAFE_NAME_RE = re.compile(r"^[a-z0-9][a-z0-9_-]{0,63}$") + + +# ────────────────────────────────────────────────────────────────────── +# record_skill +# ────────────────────────────────────────────────────────────────────── + + +def record_skill( + name: str, + procedure: str, + *, + summary: str | None = None, + triggers: dict[str, list[str]] | None = None, + source: str = "explicit", + do_not_revert: bool = False, + force: bool = False, +) -> dict[str, Any]: + """Record a new skill in the canonical store. + + Runs ``check_conflict`` against the SKILLS corpus first (not + decisions) so re-recording a near-duplicate surfaces a warning. + Pass ``force=True`` to record anyway. + """ + if not isinstance(name, str) or not name.strip(): + return {"recorded": False, "error": "name must be a non-empty string"} + if not isinstance(procedure, str) or not procedure.strip(): + return {"recorded": False, "error": "procedure must be a non-empty string"} + + # Conflict check: search the existing skill corpus for near-matches + # by procedure text. The skills FTS5 corpus is the natural index. + # We surface a warning but don't auto-block — agents may legitimately + # want a parallel skill (e.g., language-specific variants). + conflict_warning: dict[str, Any] | None = None + if not force: + try: + hits = skills_store.search(f"{name} {procedure[:200]}", top_k=3) + # If any hit has a BM25 component above an "obvious dup" + # threshold, surface it. 0.85 in [0,1] is the rough + # "near-identical text" line. + obvious_dups = [ + h + for h in hits + if h.get("score_breakdown", {}).get("bm25_norm", 0.0) >= 0.85 + ] + if obvious_dups: + conflict_warning = { + "kind": "duplicate", + "message": ( + f"This skill text looks similar to " + f"{len(obvious_dups)} existing skill(s). Pass " + f"force=True to record anyway, or supersede the " + f"existing skill via supersede_skill(old_id, ...)." + ), + "candidate_skill_ids": [h["id"] for h in obvious_dups], + } + except Exception: # noqa: BLE001 — P9: never block writes + pass + + if conflict_warning and not force: + return {"recorded": False, "_conflict_warning": conflict_warning} + + try: + kid = skills_store.record( + name=name, + procedure=procedure, + summary=summary, + triggers=triggers, + source=source, + do_not_revert=do_not_revert, + ) + except ValueError as exc: + return {"recorded": False, "error": str(exc)} + + return { + "recorded": True, + "skill_id": kid, + "name": name.strip(), + "do_not_revert": bool(do_not_revert), + "hint": ( + "Use get_skill(query=...) to retrieve. apply_skill_outcome " + "tracks success/failure for the auto-archive sweep." + ), + } + + +# ────────────────────────────────────────────────────────────────────── +# get_skill +# ────────────────────────────────────────────────────────────────────── + + +def get_skill( + query: str, + *, + top_k: int = 5, + file_path: str | None = None, +) -> dict[str, Any]: + """Composite-ranked search over active skills. + + Returns a ``hits`` list where each entry includes ``score`` (the + composite) + ``score_breakdown`` (the three component scores) + so the agent can reason about WHY a skill was surfaced. + """ + if not isinstance(query, str) or not query.strip(): + return {"hits": [], "count": 0, "query": query} + + hits = skills_store.search(query, top_k=top_k, file_path=file_path) + return { + "hits": [ + { + "skill_id": h.get("id"), + "name": h.get("name"), + "summary": h.get("summary"), + "procedure": h.get("procedure"), + "triggers": h.get("triggers"), + "do_not_revert": h.get("do_not_revert"), + "score": h.get("score"), + "score_breakdown": h.get("score_breakdown"), + "snippet": h.get("snippet"), + } + for h in hits + ], + "count": len(hits), + "query": query, + "file_path": file_path, + } + + +# ────────────────────────────────────────────────────────────────────── +# apply_skill_outcome +# ────────────────────────────────────────────────────────────────────── + + +def apply_skill_outcome(skill_id: str, success: bool) -> dict[str, Any]: + """Manually record one outcome for a skill. + + The canonical reinforcement loop in v3.1.0 M5 wires this through + ``outcomes_writer.py`` so the success signal is git-derived rather + than agent-self-reported. Until M5 ships, this tool IS the + reinforcement loop — agents and humans use it directly. + """ + if not isinstance(skill_id, str) or not skill_id.strip(): + return {"success": False, "error": "skill_id must be a non-empty string"} + res = skills_store.mark_used(skill_id, success=bool(success)) + return res + + +# ────────────────────────────────────────────────────────────────────── +# list_skills +# ────────────────────────────────────────────────────────────────────── + + +def list_skills( + *, + status: str | None = "active", + source: str | None = None, + tags: list[str] | None = None, + limit: int = 50, +) -> dict[str, Any]: + """List skills filtered by status / source / tags intersection. + + ``status="all"`` returns every state (active + archived + + superseded); any other string filters to that one state. Default + surfaces the daily-driver active set. + """ + effective_status = None if status == "all" else status + rows = skills_store.list_all( + status=effective_status, source=source, tags=tags, limit=limit + ) + return { + "skills": [ + { + "skill_id": s.get("id"), + "name": s.get("name"), + "summary": s.get("summary"), + "status": s.get("status"), + "source": s.get("source"), + "do_not_revert": s.get("do_not_revert"), + "success_count": s.get("success_count", 0), + "failure_count": s.get("failure_count", 0), + "consecutive_failures": s.get("consecutive_failures", 0), + "last_used_at": s.get("last_used_at"), + "triggers": s.get("triggers"), + } + for s in rows + ], + "count": len(rows), + "filtered_by": {"status": status, "source": source, "tags": tags}, + } + + +# ────────────────────────────────────────────────────────────────────── +# supersede_skill +# ────────────────────────────────────────────────────────────────────── + + +def supersede_skill( + old_id: str, + *, + name: str, + procedure: str, + summary: str | None = None, + triggers: dict[str, list[str]] | None = None, + reason: str = "", + do_not_revert: bool = False, +) -> dict[str, Any]: + """Replace an existing skill with a new version. + + Writes the new skill + amendment-marks the old as ``superseded`` + with a backref. Triggers inherit from the old skill when not + supplied. The old skill never returns from search after this. + """ + if not isinstance(old_id, str) or not old_id.strip(): + return {"success": False, "error": "old_id must be a non-empty string"} + try: + return skills_store.supersede( + old_id, + name=name, + procedure=procedure, + summary=summary, + triggers=triggers, + reason=reason, + do_not_revert=do_not_revert, + ) + except ValueError as exc: + return {"success": False, "error": str(exc)} + + +# ────────────────────────────────────────────────────────────────────── +# promote_skill_to_playbook +# ────────────────────────────────────────────────────────────────────── + + +def promote_skill_to_playbook( + skill_id: str, + *, + task_type: str, + name: str | None = None, + force: bool = False, +) -> dict[str, Any]: + """Write the skill's procedure as a playbook markdown file. + + Destination: ``/.codevira/playbooks//.md`` + where ``name`` defaults to the skill's name (slugified). The + existing playbook resolution chain (``mcp_server/tools/playbook.py``) + picks this up automatically — once promoted, the skill's procedure + is also available via ``get_playbook(task_type=task_type)``. + + Refuses on existing file unless ``force=True`` so a teammate's + hand-written playbook isn't clobbered silently. The source skill + stays in the skill library (no automatic archive); humans manage + versioning via ``supersede_skill`` if the playbook later + supersedes the skill. + """ + if not isinstance(skill_id, str) or not skill_id.strip(): + return {"promoted": False, "error": "skill_id must be a non-empty string"} + if not isinstance(task_type, str) or not task_type.strip(): + return {"promoted": False, "error": "task_type must be a non-empty string"} + + skill = skills_store.get(skill_id) + if skill is None: + return {"promoted": False, "error": f"skill {skill_id} not found"} + if skill.get("status") == "superseded": + return { + "promoted": False, + "error": ( + f"skill {skill_id} is superseded; promote the successor " + f"({skill.get('superseded_by')}) instead." + ), + } + + # Resolve the destination filename. + effective_name = name or skill.get("name") or skill_id + slug = _slugify(effective_name) + if not _SAFE_NAME_RE.match(slug): + return { + "promoted": False, + "error": ( + f"could not derive a filesystem-safe slug from " + f"{effective_name!r}; pass an explicit `name` argument." + ), + } + + from mcp_server.storage import paths as _paths + + dest = _paths.codevira_dir() / "playbooks" / task_type / f"{slug}.md" + + if dest.exists() and not force: + return { + "promoted": False, + "error": ( + f"playbook already exists at {dest.relative_to(_paths.codevira_dir().parent)} " + f"— pass force=True to overwrite, or supply a different `name`." + ), + "existing_path": str(dest), + } + + # Write the markdown. atomic_write_text covers crash-safety. + from mcp_server.storage import atomic + + dest.parent.mkdir(parents=True, exist_ok=True) + body = _render_playbook_markdown(skill, task_type=task_type) + try: + atomic.atomic_write_text(dest, body) + except OSError as exc: + return { + "promoted": False, + "error": f"could not write playbook at {dest}: {exc}", + } + + return { + "promoted": True, + "skill_id": skill_id, + "task_type": task_type, + "path": str(dest), + "name": slug, + "hint": ( + f"The procedure is now also discoverable via " + f"get_playbook(task_type={task_type!r}). The skill itself " + f"stays in the library — supersede it via supersede_skill if " + f"the playbook becomes the canonical version." + ), + } + + +# ────────────────────────────────────────────────────────────────────── +# Helpers +# ────────────────────────────────────────────────────────────────────── + + +def _slugify(name: str) -> str: + """Lowercase + hyphenate; drop characters outside [a-z0-9_-].""" + if not name: + return "" + s = re.sub(r"[^a-z0-9_-]+", "-", name.lower().strip()) + s = re.sub(r"-{2,}", "-", s).strip("-") + return s[:64] + + +def _render_playbook_markdown(skill: dict[str, Any], *, task_type: str) -> str: + """Render a skill into a playbook .md file body. + + Includes a small header so the file announces its origin (skill_id + + provenance) — useful when a teammate sees an unfamiliar playbook + appear in git diff. + """ + lines = [ + f"# {skill.get('name') or skill.get('id')}", + "", + f"_Promoted from skill {skill.get('id')} on " + f"{__import__('datetime').datetime.now().date().isoformat()}_ ", + f"_task_type: {task_type}_", + "", + ] + summary = skill.get("summary") + if summary: + lines.append(f"> {summary}") + lines.append("") + procedure = (skill.get("procedure") or "").strip() + if procedure: + lines.append(procedure) + lines.append("") + return "\n".join(lines) diff --git a/tests/storage/test_skills_store.py b/tests/storage/test_skills_store.py index bf22243..4a617ad 100644 --- a/tests/storage/test_skills_store.py +++ b/tests/storage/test_skills_store.py @@ -339,3 +339,142 @@ def test_archived_skill_not_re_archived(self, project: Path) -> None: res = skills_store.decay_sweep(now=future) # Already archived → skipped (not double-counted). assert kid not in res["archived"] + + +# ────────────────────────────────────────────────────────────────────── +# search (composite ranking — v3.1.0 M3 Phase 2) +# ────────────────────────────────────────────────────────────────────── + + +class TestSearch: + """Composite ranking: + score = 0.5 × BM25_norm + 0.3 × tag_jaccard + 0.2 × recency_decay + """ + + def test_empty_query_returns_empty(self, project: Path) -> None: + skills_store.record(name="x", procedure="step 1\nstep 2") + assert skills_store.search("") == [] + assert skills_store.search(" ") == [] + + def test_finds_skill_by_procedure_text(self, project: Path) -> None: + skills_store.record( + name="git-rebase-workflow", + procedure="Fetch origin then rebase against main", + summary="how we rebase", + triggers={"tags": ["git", "rebase"]}, + ) + results = skills_store.search("rebase main") + assert len(results) == 1 + assert results[0]["name"] == "git-rebase-workflow" + assert results[0]["score"] > 0 + # Composite breakdown surfaces for debug. + bd = results[0]["score_breakdown"] + assert "bm25_norm" in bd + assert "tag_jaccard" in bd + assert "recency_decay" in bd + + def test_excludes_archived_skills(self, project: Path) -> None: + kid_a = skills_store.record(name="alpha", procedure="rebase against main") + kid_b = skills_store.record(name="beta", procedure="rebase against main") + skills_store.mark_archived(kid_b) + results = skills_store.search("rebase main") + ids = {r["id"] for r in results} + assert kid_a in ids + assert kid_b not in ids + + def test_excludes_superseded_skills(self, project: Path) -> None: + kid_a = skills_store.record(name="v1", procedure="old way to rebase main") + skills_store.supersede(kid_a, name="v2", procedure="new way to rebase main") + results = skills_store.search("rebase main") + ids = {r["id"] for r in results} + assert kid_a not in ids + # v2 still appears. + assert any(r["name"] == "v2" for r in results) + + def test_tag_jaccard_boosts_score(self, project: Path) -> None: + # Skill A: matches text only; no relevant tags. + skills_store.record( + name="A", + procedure="run pytest with coverage", + triggers={"tags": ["unrelated"]}, + ) + # Skill B: matches text AND shares tags with the query terms. + skills_store.record( + name="B", + procedure="run pytest with coverage", + triggers={"tags": ["pytest", "coverage"]}, + ) + results = skills_store.search("pytest coverage") + # B should rank ABOVE A because tag_jaccard adds to the composite. + names = [r["name"] for r in results] + assert names.index("B") < names.index("A") + + def test_recency_decay_uses_last_used_at(self, project: Path) -> None: + """Older skills decay; recently-used ones rank higher even at + equal BM25.""" + skills_store.record(name="A", procedure="touch files") # never used + kid_new = skills_store.record(name="B", procedure="touch files") + # Mark B as recently used to set last_used_at to ~now. + skills_store.mark_used(kid_new, success=True) + results = skills_store.search("touch files") + # B (just used) should rank above A (never used). + if len(results) == 2: + assert results[0]["id"] == kid_new + + def test_file_path_filter(self, project: Path) -> None: + # Skill with a Python-only file_patterns trigger. + skills_store.record( + name="py-specific", + procedure="run pytest on the file", + triggers={"tags": ["pytest"], "file_patterns": ["*.py"]}, + ) + # Skill with no patterns — matches anything. + skills_store.record( + name="generic", + procedure="run pytest on the file", + triggers={"tags": ["pytest"]}, + ) + # Searching for a Python file: both surface. + py_results = skills_store.search("pytest", file_path="src/auth.py") + py_names = {r["name"] for r in py_results} + assert py_names == {"py-specific", "generic"} + + # Searching for a Markdown file: the py-specific skill is filtered out. + md_results = skills_store.search("pytest", file_path="README.md") + md_names = {r["name"] for r in md_results} + assert md_names == {"generic"} + + def test_ranking_weights_overridable(self, project: Path) -> None: + skills_store.record( + name="A", + procedure="rebase main", + triggers={"tags": ["rebase", "main"]}, + ) + # All weight on tag jaccard — score should equal tag overlap. + results = skills_store.search( + "rebase main", + ranking_weights={"bm25": 0.0, "tag": 1.0, "recency": 0.0}, + ) + assert len(results) == 1 + # With weights={0, 1, 0}, the composite = tag_jaccard. + breakdown = results[0]["score_breakdown"] + assert abs(results[0]["score"] - breakdown["tag_jaccard"]) < 1e-3 + + def test_top_k_caps_results(self, project: Path) -> None: + for i in range(10): + skills_store.record(name=f"skill-{i}", procedure="some procedure text") + results = skills_store.search("procedure", top_k=3) + assert len(results) == 3 + + def test_search_lazy_rebuild_on_stale_index(self, project: Path) -> None: + """First search() on a fresh project rebuilds the index from + skills.jsonl rather than returning empty.""" + kid = skills_store.record(name="x", procedure="rebase against main") + # Force the FTS5 index to be stale by deleting it. + from mcp_server.storage import paths as _paths + + if _paths.fts5_path().is_file(): + _paths.fts5_path().unlink() + # Search should still work — rebuild kicks in. + results = skills_store.search("rebase") + assert any(r["id"] == kid for r in results) diff --git a/tests/test_tools_skills.py b/tests/test_tools_skills.py new file mode 100644 index 0000000..78b18ae --- /dev/null +++ b/tests/test_tools_skills.py @@ -0,0 +1,295 @@ +""" +Tests for mcp_server.tools.skills — v3.1.0 M3 Phase 2. + +Verifies the six MCP tools against the contract documented in +mcp_server/tools/skills.py. Storage-layer correctness is covered +separately in tests/storage/test_skills_store.py. +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +import mcp_server.paths as paths_module +from mcp_server.storage import skills_store +from mcp_server.tools import skills + + +@pytest.fixture +def project(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: + root = tmp_path / "proj" + (root / ".codevira").mkdir(parents=True) + (root / ".codevira" / "config.yaml").write_text("project:\n name: test\n") + monkeypatch.setattr(paths_module, "_project_dir_override", None) + monkeypatch.chdir(root.resolve()) + return root + + +# ────────────────────────────────────────────────────────────────────── +# record_skill +# ────────────────────────────────────────────────────────────────────── + + +class TestRecordSkill: + def test_basic_returns_skill_id(self, project: Path) -> None: + r = skills.record_skill( + name="git-rebase-workflow", + procedure="1. fetch\n2. rebase main\n3. push --force-with-lease", + ) + assert r["recorded"] is True + assert r["skill_id"].startswith("K") + assert r["do_not_revert"] is False + + def test_empty_name_returns_structured_error(self, project: Path) -> None: + r = skills.record_skill(name=" ", procedure="p") + assert r["recorded"] is False + assert "name" in r["error"] + + def test_empty_procedure_returns_structured_error(self, project: Path) -> None: + r = skills.record_skill(name="x", procedure="") + assert r["recorded"] is False + assert "procedure" in r["error"] + + def test_oversize_procedure_returns_structured_error(self, project: Path) -> None: + r = skills.record_skill(name="x", procedure="x" * 4000) + assert r["recorded"] is False + assert "2048 byte cap" in r["error"] + + def test_force_bypasses_conflict_warning(self, project: Path) -> None: + # Seed a near-duplicate first. + skills.record_skill( + name="commit-style", + procedure="Use conventional commits for every commit message", + ) + # Second record near-identical content. Without force, conflict warning fires. + r = skills.record_skill( + name="commit-style", + procedure="Use conventional commits for every commit message", + ) + # The conflict-check threshold is conservative (0.85 BM25_norm); + # exact duplicate text should trigger it. + if r["recorded"] is False: + assert "_conflict_warning" in r + # Force=True overrides. + r2 = skills.record_skill( + name="commit-style-v2", + procedure="Use conventional commits for every commit message", + force=True, + ) + assert r2["recorded"] is True + + +# ────────────────────────────────────────────────────────────────────── +# get_skill +# ────────────────────────────────────────────────────────────────────── + + +class TestGetSkill: + def test_empty_query_returns_no_hits(self, project: Path) -> None: + skills.record_skill(name="x", procedure="rebase against main") + r = skills.get_skill("") + assert r["hits"] == [] + assert r["count"] == 0 + + def test_finds_skill_by_text(self, project: Path) -> None: + kid = skills_store.record( + name="git-rebase-workflow", procedure="rebase against main" + ) + r = skills.get_skill("rebase main") + assert r["count"] >= 1 + assert r["hits"][0]["skill_id"] == kid + bd = r["hits"][0]["score_breakdown"] + assert "bm25_norm" in bd + assert "tag_jaccard" in bd + assert "recency_decay" in bd + + def test_file_path_filter_propagated(self, project: Path) -> None: + skills_store.record( + name="py-only", + procedure="run pytest on the file", + triggers={"tags": ["pytest"], "file_patterns": ["*.py"]}, + ) + skills_store.record( + name="generic", + procedure="run pytest on the file", + ) + md_results = skills.get_skill("pytest", file_path="README.md") + names = {h["name"] for h in md_results["hits"]} + assert "generic" in names + assert "py-only" not in names + + +# ────────────────────────────────────────────────────────────────────── +# apply_skill_outcome +# ────────────────────────────────────────────────────────────────────── + + +class TestApplySkillOutcome: + def test_success_increments_count(self, project: Path) -> None: + kid = skills_store.record(name="x", procedure="p") + r = skills.apply_skill_outcome(kid, success=True) + assert r["success"] is True + rec = skills_store.get(kid) + assert rec["success_count"] == 1 + + def test_failure_at_threshold_archives(self, project: Path) -> None: + kid = skills_store.record(name="x", procedure="p") + for _ in range(5): + skills.apply_skill_outcome(kid, success=False) + assert skills_store.get(kid)["status"] == "archived" + + def test_unknown_skill_returns_error(self, project: Path) -> None: + r = skills.apply_skill_outcome("K999999", success=True) + assert r["success"] is False + assert "not found" in r["error"] + + def test_empty_skill_id_rejected(self, project: Path) -> None: + r = skills.apply_skill_outcome("", success=True) + assert r["success"] is False + assert "skill_id" in r["error"] + + +# ────────────────────────────────────────────────────────────────────── +# list_skills +# ────────────────────────────────────────────────────────────────────── + + +class TestListSkills: + def test_default_returns_active_only(self, project: Path) -> None: + kid_a = skills_store.record(name="a", procedure="p") + kid_b = skills_store.record(name="b", procedure="p") + skills_store.mark_archived(kid_b) + r = skills.list_skills() + ids = {s["skill_id"] for s in r["skills"]} + assert ids == {kid_a} + + def test_status_all_returns_every_state(self, project: Path) -> None: + kid_a = skills_store.record(name="a", procedure="p") + kid_b = skills_store.record(name="b", procedure="p") + skills_store.mark_archived(kid_b) + r = skills.list_skills(status="all") + ids = {s["skill_id"] for s in r["skills"]} + assert ids == {kid_a, kid_b} + + def test_tags_filter_intersection(self, project: Path) -> None: + skills_store.record( + name="A", procedure="p", triggers={"tags": ["git", "release"]} + ) + skills_store.record(name="B", procedure="p", triggers={"tags": ["git"]}) + r = skills.list_skills(tags=["git", "release"]) + names = {s["name"] for s in r["skills"]} + assert names == {"A"} + + def test_response_shape_includes_reinforcement_stats(self, project: Path) -> None: + kid = skills_store.record(name="x", procedure="p") + skills_store.mark_used(kid, success=True) + r = skills.list_skills() + s = r["skills"][0] + assert s["success_count"] == 1 + assert s["last_used_at"] is not None + + +# ────────────────────────────────────────────────────────────────────── +# supersede_skill +# ────────────────────────────────────────────────────────────────────── + + +class TestSupersedeSkill: + def test_marks_old_and_creates_new(self, project: Path) -> None: + kid_old = skills_store.record(name="A", procedure="old") + r = skills.supersede_skill( + kid_old, name="B", procedure="new", reason="bumped to v2" + ) + assert r["success"] is True + assert r["new_id"].startswith("K") + # The new skill exists; the old is marked superseded. + new = skills_store.get(r["new_id"]) + old = skills_store.get(kid_old) + assert new["supersedes"] == kid_old + assert old["status"] == "superseded" + + def test_empty_old_id_rejected(self, project: Path) -> None: + r = skills.supersede_skill("", name="x", procedure="p") + assert r["success"] is False + + def test_unknown_old_id_returns_error(self, project: Path) -> None: + r = skills.supersede_skill("K999999", name="x", procedure="p") + assert r["success"] is False + assert "not found" in r["error"] + + +# ────────────────────────────────────────────────────────────────────── +# promote_skill_to_playbook +# ────────────────────────────────────────────────────────────────────── + + +class TestPromoteSkillToPlaybook: + def test_writes_playbook_markdown(self, project: Path) -> None: + kid = skills_store.record( + name="git-rebase", + summary="how we rebase", + procedure="1. fetch\n2. rebase main\n3. push --force-with-lease", + ) + r = skills.promote_skill_to_playbook(kid, task_type="commit") + assert r["promoted"] is True + # Destination under .codevira/playbooks//.md + path = Path(r["path"]) + assert path.is_file() + body = path.read_text() + assert "git-rebase" in body + assert "rebase main" in body + assert "task_type: commit" in body + assert r["name"] == "git-rebase" # slugified + + def test_refuses_overwrite_without_force(self, project: Path) -> None: + kid = skills_store.record(name="commit-style", procedure="p") + skills.promote_skill_to_playbook(kid, task_type="commit") + # Second promote → refused. + r = skills.promote_skill_to_playbook(kid, task_type="commit") + assert r["promoted"] is False + assert "force=True" in r["error"] + + def test_force_overwrites(self, project: Path) -> None: + kid = skills_store.record(name="commit-style", procedure="v1 procedure") + skills.promote_skill_to_playbook(kid, task_type="commit") + # Update the skill's procedure via supersede, then force-promote. + res = skills_store.supersede(kid, name="commit-style", procedure="v2 procedure") + new_kid = res["new_id"] + r = skills.promote_skill_to_playbook( + new_kid, task_type="commit", name="commit-style", force=True + ) + assert r["promoted"] is True + assert "v2 procedure" in Path(r["path"]).read_text() + + def test_explicit_name_overrides_skill_name(self, project: Path) -> None: + kid = skills_store.record(name="my-skill", procedure="p") + r = skills.promote_skill_to_playbook( + kid, task_type="add_tool", name="custom-name" + ) + assert r["promoted"] is True + assert r["name"] == "custom-name" + + def test_unknown_skill_rejected(self, project: Path) -> None: + r = skills.promote_skill_to_playbook("K999999", task_type="commit") + assert r["promoted"] is False + assert "not found" in r["error"] + + def test_superseded_skill_rejected(self, project: Path) -> None: + kid = skills_store.record(name="x", procedure="p") + skills_store.supersede(kid, name="x2", procedure="p2") + r = skills.promote_skill_to_playbook(kid, task_type="commit") + assert r["promoted"] is False + assert "superseded" in r["error"] + + def test_empty_task_type_rejected(self, project: Path) -> None: + kid = skills_store.record(name="x", procedure="p") + r = skills.promote_skill_to_playbook(kid, task_type="") + assert r["promoted"] is False + + def test_unslugifiable_name_rejected(self, project: Path) -> None: + kid = skills_store.record(name="x", procedure="p") + # Pass a name that slugifies to empty. + r = skills.promote_skill_to_playbook(kid, task_type="commit", name="!!!") + assert r["promoted"] is False From 96bf32fce38eb50efeec2501f79c176e29d88042 Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Fri, 29 May 2026 13:35:27 +0530 Subject: [PATCH 10/44] =?UTF-8?q?feat(v3.1.0):=20M4=20Phase=201=20?= =?UTF-8?q?=E2=80=94=20activity=5Fstore=20+=20memory=5Ffanout/decisions=20?= =?UTF-8?q?integration?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the spatial-activity log subsystem: records *where* in the codebase the agent has been working so the spatial query tools (M4 Phase 2) can surface focus zones and rank neighbors by recent attention. mcp_server/storage/paths.py — additive: activity_path() → .codevira-cache/activity.jsonl (per-machine, gitignored). Doc note on the D000012 lock — pure path computation. mcp_server/storage/activity_store.py — the log. API: * add(node_id, kind, session_id, origin_override) → A-id. Validates kind ∈ {edit, decision_ref}; node_id non-empty. Each record carries _schema_v: 1 + origin + A-prefixed monotonic id + session_id (defaulting to ad-hoc-XXXXXX). * list_recent(limit, kind, node_id, since) — newest-first activity feed with AND-filter composition. * list_top_k_files(top_k, since, weights) — weighted heatmap. Default weights: edit=1.0, decision_ref=2.0 (a decision tied to a file is a stronger 'attention' signal than a single edit). Overridable per-call. * visit_count_30d(node_id, now) — rolling-window counter for spatial_nearby ranking in Phase 2. * compact(retention_days=90) — drop rows older than the retention window. Called by codevira sync. memory_fanout integration: * _build_observation tags file-edit observations with a hidden _activity_file_path field carrying the file path. * flush() detects the field and writes an activity row alongside the working observation. Bash and unknown-file-path edits skip the activity write (preserves the 'did this' signal density). Best-effort: activity errors don't affect the working memory write. decisions_store integration: * record() with file_path emits a decision_ref activity row. Best-effort (P9) — the decision is already persisted. Schema: in v3.1.0 node_id is per-file. Per-symbol granularity needs graph.sqlite schema changes and is deferred to v3.2+. The plan-reserved 'visit' kind for read-only tools is deliberately NOT emitted; spatial heat surfaces edits + decisions, not lookups. Tests: tests/storage/test_activity_store.py — 23 tests covering add() validation, list_recent filters, list_top_k_files weighted ranking, visit_count_30d rolling window, compact retention drop, memory_fanout integration (Edit produces BOTH working + activity; Bash produces only working; unknown file_path skips), and decisions_store integration (file_path → decision_ref). 680 tests across storage + engine + tools + check_conflict + CLI pass green; zero regressions from the M3 baseline. Plan M4 Phase 1. Phase 2 (folder-tree neighborhoods + affordances + 4 spatial MCP tools) is next. Co-Authored-By: Claude Opus 4.7 --- mcp_server/engine/memory_fanout.py | 57 +++-- mcp_server/storage/activity_store.py | 274 +++++++++++++++++++++ mcp_server/storage/decisions_store.py | 20 ++ mcp_server/storage/paths.py | 14 ++ tests/storage/test_activity_store.py | 332 ++++++++++++++++++++++++++ 5 files changed, 683 insertions(+), 14 deletions(-) create mode 100644 mcp_server/storage/activity_store.py create mode 100644 tests/storage/test_activity_store.py diff --git a/mcp_server/engine/memory_fanout.py b/mcp_server/engine/memory_fanout.py index 5dfdda0..571e808 100644 --- a/mcp_server/engine/memory_fanout.py +++ b/mcp_server/engine/memory_fanout.py @@ -111,11 +111,13 @@ def dispatch(event: HookEvent) -> None: def flush() -> None: - """Drain the buffer into working.jsonl. Atomic per-record append. + """Drain the buffer into working.jsonl + activity.jsonl. - Each buffered entry becomes one ``working_store.add()`` call. - Failures inside individual writes are logged and skipped so a - single malformed entry can't poison the rest of the batch. + Each buffered entry becomes one ``working_store.add()`` call, + plus — for file-edit observations carrying ``_activity_file_path`` + metadata — one ``activity_store.add(kind="edit")`` row. Failures + inside individual writes are logged and skipped so a single + malformed entry can't poison the rest of the batch. """ global _BUFFER if not _BUFFER: @@ -128,20 +130,40 @@ def flush() -> None: try: from mcp_server.storage import working_store + except Exception as exc: # noqa: BLE001 + logger.debug("memory_fanout.flush: working_store import failed: %s", exc) + return - for rec in drained: + # activity_store import is best-effort — older installs without M4 + # still get the working observation written. + try: + from mcp_server.storage import activity_store + except Exception: # noqa: BLE001 + activity_store = None # type: ignore[assignment] + + for rec in drained: + try: + working_store.add( + content=rec["content"], + kind=rec.get("kind", "observation"), + importance=rec.get("importance", 4), + links=rec.get("links") or [], + ) + except Exception as exc: # noqa: BLE001 + logger.debug("memory_fanout.flush: working add failed: %s", exc) + + # v3.1.0 M4: if the originating tool was a file edit, mirror the + # observation as an activity row so spatial_heat / spatial_nearby + # have a heat signal. _activity_file_path is set by + # _build_observation below; never present on Bash records. + if activity_store is not None and rec.get("_activity_file_path"): try: - working_store.add( - content=rec["content"], - kind=rec.get("kind", "observation"), - importance=rec.get("importance", 4), - links=rec.get("links") or [], + activity_store.add( + rec["_activity_file_path"], + kind=activity_store.KIND_EDIT, ) except Exception as exc: # noqa: BLE001 - logger.debug("memory_fanout.flush: individual add failed: %s", exc) - continue - except Exception as exc: # noqa: BLE001 - logger.debug("memory_fanout.flush: working_store import failed: %s", exc) + logger.debug("memory_fanout.flush: activity add failed: %s", exc) # ────────────────────────────────────────────────────────────────────── @@ -188,6 +210,13 @@ def _build_observation(event: HookEvent) -> dict[str, Any] | None: "content": f"{tool_name}: touched {file_path}", "kind": "observation", "importance": 7 if has_error else 4, + # v3.1.0 M4: mirror this edit into activity.jsonl too. The + # flusher detects this hidden field and writes an activity + # row alongside the working observation; non-file tools + # (e.g., Bash) omit it. + "_activity_file_path": ( + str(file_path) if file_path and file_path != "" else None + ), } if tool_name == "Bash": diff --git a/mcp_server/storage/activity_store.py b/mcp_server/storage/activity_store.py new file mode 100644 index 0000000..8db6681 --- /dev/null +++ b/mcp_server/storage/activity_store.py @@ -0,0 +1,274 @@ +""" +activity_store.py — v3.1.0 M4 Phase 1: spatial-activity log. + +Records *where* in the codebase the agent has been working — edits, +decisions tagged with a file. The downstream spatial tools +(``spatial_nearby``, ``spatial_heat``) read this log to surface +focus zones and rank neighbors by recent attention. + +# Why a separate store + +- **Per-developer**: each engineer's attention pattern is theirs. + Living in ``.codevira-cache/activity.jsonl`` (gitignored, per + machine) avoids polluting the team's git diff with someone else's + exploration history. +- **Opt-in team export**: ``codevira spatial export-activity`` + aggregates and writes ``.codevira/activity_summary.yaml`` when a + team wants the heat map shared. +- **Compaction-friendly**: append-only JSONL with capped retention + (default 90 days) keeps the file from growing without bound on + long-running projects. + +# Schema + +:: + + { + "id": "A000001", + "ts": "2026-05-28T10:00:00+00:00", + "node_id": "", + "kind": "edit" | "decision_ref", + "session_id": "ad-hoc-a1b2c3", + "origin": {"ide", "agent_model", "host_hash", "ts"}, + "_schema_v": 1, + } + +In v3.1.0 ``node_id`` is per-file. Per-symbol granularity needs +``graph.sqlite`` schema changes and is explicitly deferred to v3.2+. + +# Kinds + + - ``edit`` — emitted by ``memory_fanout`` on Edit / Write / + MultiEdit / NotebookEdit / update_node events. + - ``decision_ref`` — emitted by ``decisions_store.record()`` when + the new decision carries a ``file_path``. + +The plan also reserves a ``visit`` kind for future use (read-only +tool calls), but those are deliberately NOT emitted in v3.1.0 to +keep the log dense with "did this" signal rather than "looked at" +noise — see ``memory_fanout._build_observation`` for the same +filter applied to working memory. +""" + +from __future__ import annotations + +import logging +from datetime import datetime, timedelta, timezone +from typing import Any + +from mcp_server.storage import jsonl_store, origin as origin_module, paths + +logger = logging.getLogger(__name__) + + +# ────────────────────────────────────────────────────────────────────── +# Constants +# ────────────────────────────────────────────────────────────────────── + +SCHEMA_V = 1 + +KIND_EDIT = "edit" +KIND_DECISION_REF = "decision_ref" +_VALID_KINDS = frozenset({KIND_EDIT, KIND_DECISION_REF}) + +# Retention defaults — overridable via codevira sync command flags. +DEFAULT_RETENTION_DAYS = 90 + + +# ────────────────────────────────────────────────────────────────────── +# Writes +# ────────────────────────────────────────────────────────────────────── + + +def add( + node_id: str, + *, + kind: str = KIND_EDIT, + session_id: str | None = None, + origin_override: dict | None = None, +) -> str: + """Append an activity row; return the generated A-id. + + Inputs validated up front; failures raise ValueError so callers + in the hot path (``memory_fanout``) can wrap and drop silently + per their fail-open contract. + """ + if not isinstance(node_id, str) or not node_id.strip(): + raise ValueError("activity_store.add: node_id must be a non-empty string") + if kind not in _VALID_KINDS: + raise ValueError( + f"activity_store.add: kind must be one of {sorted(_VALID_KINDS)}; " + f"got {kind!r}" + ) + + paths.ensure_dirs() + + from mcp_server.storage import decisions_store # local: avoid cycle + + rec = { + "ts": datetime.now(timezone.utc).isoformat(), + "node_id": node_id.strip(), + "kind": kind, + "session_id": session_id or decisions_store.default_session_id(), + "origin": origin_override or origin_module.current_origin(), + "_schema_v": SCHEMA_V, + } + return jsonl_store.append_with_generated_id( + paths.activity_path(), rec, prefix="A", width=6 + ) + + +# ────────────────────────────────────────────────────────────────────── +# Reads +# ────────────────────────────────────────────────────────────────────── + + +def list_recent( + *, + limit: int = 50, + kind: str | None = None, + node_id: str | None = None, + since: datetime | None = None, +) -> list[dict[str, Any]]: + """Return the most recent ``limit`` activity rows, newest first. + + Optional filters compose AND-wise. ``since`` excludes rows older + than the cutoff (useful for time-windowed heatmaps). + """ + raw = jsonl_store.read_recent(paths.activity_path(), limit=limit * 4) + out: list[dict[str, Any]] = [] + for rec in raw: + if kind is not None and rec.get("kind") != kind: + continue + if node_id is not None and rec.get("node_id") != node_id: + continue + if since is not None: + ts_str = rec.get("ts") + if not isinstance(ts_str, str): + continue + try: + ts = datetime.fromisoformat(ts_str) + if ts.tzinfo is None: + ts = ts.replace(tzinfo=timezone.utc) + if ts < since: + continue + except (ValueError, TypeError): + continue + out.append(rec) + if len(out) >= limit: + break + return out + + +def list_top_k_files( + *, + top_k: int = 20, + since: datetime | None = None, + weights: dict[str, float] | None = None, +) -> list[dict[str, Any]]: + """Top-K ``node_id`` values by weighted activity count. + + Weights default to ``{"edit": 1.0, "decision_ref": 2.0}`` — a + decision tied to a file is a stronger "attention" signal than a + single edit (edits are abundant; decisions are deliberate). + + Returns ``[{node_id, edit_count, decision_ref_count, score}, ...]`` + sorted by ``score`` descending. + """ + w = weights or {KIND_EDIT: 1.0, KIND_DECISION_REF: 2.0} + + raw = jsonl_store.read_all(paths.activity_path()) + counts: dict[str, dict[str, int]] = {} + + for rec in raw: + if since is not None: + ts_str = rec.get("ts") + if not isinstance(ts_str, str): + continue + try: + ts = datetime.fromisoformat(ts_str) + if ts.tzinfo is None: + ts = ts.replace(tzinfo=timezone.utc) + if ts < since: + continue + except (ValueError, TypeError): + continue + nid = rec.get("node_id") + kind = rec.get("kind") + if not isinstance(nid, str) or kind not in _VALID_KINDS: + continue + bucket = counts.setdefault(nid, {KIND_EDIT: 0, KIND_DECISION_REF: 0}) + bucket[kind] = bucket.get(kind, 0) + 1 + + scored: list[dict[str, Any]] = [] + for nid, by_kind in counts.items(): + score = sum(w.get(k, 0.0) * v for k, v in by_kind.items()) + scored.append( + { + "node_id": nid, + "edit_count": by_kind.get(KIND_EDIT, 0), + "decision_ref_count": by_kind.get(KIND_DECISION_REF, 0), + "score": round(score, 3), + } + ) + + scored.sort(key=lambda r: (r["score"], r["node_id"]), reverse=True) + return scored[:top_k] + + +def visit_count_30d(node_id: str, *, now: datetime | None = None) -> int: + """Total ``edit`` + ``decision_ref`` events for ``node_id`` in the + last 30 days. Used by ``spatial_nearby`` ranking. + """ + now_dt = now or datetime.now(timezone.utc) + cutoff = now_dt - timedelta(days=30) + n = 0 + for rec in jsonl_store.read_all(paths.activity_path()): + if rec.get("node_id") != node_id: + continue + ts_str = rec.get("ts") + if not isinstance(ts_str, str): + continue + try: + ts = datetime.fromisoformat(ts_str) + if ts.tzinfo is None: + ts = ts.replace(tzinfo=timezone.utc) + except (ValueError, TypeError): + continue + if ts >= cutoff: + n += 1 + return n + + +# ────────────────────────────────────────────────────────────────────── +# Maintenance +# ────────────────────────────────────────────────────────────────────── + + +def compact(*, retention_days: int = DEFAULT_RETENTION_DAYS) -> int: + """Drop activity rows older than ``retention_days``. Called by + ``codevira sync``. Returns count dropped. + + Holds the file lock for the entire read-filter-write via + ``jsonl_store.compact``. The default 90-day window is long + enough for monthly spatial heatmaps without unbounded growth on + a project the agent has worked on for a year. + """ + path = paths.activity_path() + if not path.is_file(): + return 0 + cutoff = datetime.now(timezone.utc) - timedelta(days=retention_days) + + def _keep(rec: dict[str, Any]) -> bool: + ts_str = rec.get("ts") + if not isinstance(ts_str, str): + return True # don't drop malformed rows (codevira doctor handles those) + try: + ts = datetime.fromisoformat(ts_str) + if ts.tzinfo is None: + ts = ts.replace(tzinfo=timezone.utc) + except (ValueError, TypeError): + return True + return ts >= cutoff + + return jsonl_store.compact(path, keep_predicate=_keep) diff --git a/mcp_server/storage/decisions_store.py b/mcp_server/storage/decisions_store.py index 671a289..1e6f880 100644 --- a/mcp_server/storage/decisions_store.py +++ b/mcp_server/storage/decisions_store.py @@ -149,6 +149,26 @@ def record( except Exception as exc: # noqa: BLE001 logger.warning("decisions_store.record: digest update failed: %s", exc) + # v3.1.0 M4: a decision tied to a file is a high-signal "attention" + # event. Mirror it into the activity log so spatial_heat surfaces + # the file. Best-effort (P9 — the decision is already persisted). + _activity_origin = base_record["origin"] + _activity_session = base_record["session_id"] + if file_path: + try: + from mcp_server.storage import activity_store + + activity_store.add( + str(file_path), + kind=activity_store.KIND_DECISION_REF, + session_id=str(_activity_session) if _activity_session else None, + origin_override=_activity_origin + if isinstance(_activity_origin, dict) + else None, + ) + except Exception as exc: # noqa: BLE001 + logger.warning("decisions_store.record: activity add failed: %s", exc) + # Phase D — regenerate AGENTS.md so other AI tools (Copilot, Codex, # Cursor, Gemini, Factory, Amp, Windsurf, Zed, RooCode, Jules) see # the new decision on their next prompt. Best-effort (P9). diff --git a/mcp_server/storage/paths.py b/mcp_server/storage/paths.py index dc66d75..eca17ab 100644 --- a/mcp_server/storage/paths.py +++ b/mcp_server/storage/paths.py @@ -163,6 +163,20 @@ def working_path(project_root: Path | None = None) -> Path: return codevira_cache_dir(project_root) / "working.jsonl" +def activity_path(project_root: Path | None = None) -> Path: + """v3.1.0 M4: spatial-activity log (per-machine, gitignored). + + Stores ``edit`` / ``decision_ref`` rows as the agent works through + the codebase. The ``codevira spatial export-activity`` CLI is the + opt-in path to share aggregated heat with a team; the raw log + itself stays local because attention patterns are per-developer. + + See ``working_archived_path`` for the D000012 lock note — same + reasoning applies. + """ + return codevira_cache_dir(project_root) / "activity.jsonl" + + # ─── Convenience operations ─────────────────────────────────────────── diff --git a/tests/storage/test_activity_store.py b/tests/storage/test_activity_store.py new file mode 100644 index 0000000..d914081 --- /dev/null +++ b/tests/storage/test_activity_store.py @@ -0,0 +1,332 @@ +""" +Tests for mcp_server.storage.activity_store — v3.1.0 M4 Phase 1. + +Coverage: + - add() input validation + schema (A-id, _schema_v: 1, origin) + - list_recent: filtering by kind / node_id / since + - list_top_k_files: weighted ranking + custom weights + - visit_count_30d: rolling-window counter for spatial_nearby + - compact: retention drop + - memory_fanout integration: file-edit observations also write + activity rows; Bash observations don't + - decisions_store integration: record() with file_path emits + a decision_ref activity row +""" + +from __future__ import annotations + +import time +from datetime import datetime, timedelta, timezone +from pathlib import Path + +import pytest + +import mcp_server.paths as paths_module +from mcp_server.storage import activity_store, jsonl_store, paths + + +@pytest.fixture +def project(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: + root = tmp_path / "proj" + (root / ".codevira").mkdir(parents=True) + (root / ".codevira" / "config.yaml").write_text("project:\n name: test\n") + monkeypatch.setattr(paths_module, "_project_dir_override", None) + monkeypatch.chdir(root.resolve()) + return root + + +# ────────────────────────────────────────────────────────────────────── +# add() — schema + validation +# ────────────────────────────────────────────────────────────────────── + + +class TestAdd: + def test_basic_returns_a_id(self, project: Path) -> None: + aid = activity_store.add("src/foo.py", kind="edit") + assert aid.startswith("A") + + def test_schema_v_and_origin_stamped( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.setenv("CODEVIRA_IDE", "cursor") + activity_store.add("src/foo.py", kind="edit") + rec = jsonl_store.read_all(paths.activity_path())[0] + assert rec["_schema_v"] == 1 + assert rec["origin"]["ide"] == "cursor" + assert rec["node_id"] == "src/foo.py" + assert rec["kind"] == "edit" + + def test_empty_node_id_rejected(self, project: Path) -> None: + with pytest.raises(ValueError, match="node_id"): + activity_store.add(" ", kind="edit") + + def test_invalid_kind_rejected(self, project: Path) -> None: + with pytest.raises(ValueError, match="kind"): + activity_store.add("src/foo.py", kind="visit") # reserved for v3.2 + + def test_decision_ref_kind_accepted(self, project: Path) -> None: + activity_store.add("src/auth.py", kind=activity_store.KIND_DECISION_REF) + rec = jsonl_store.read_all(paths.activity_path())[0] + assert rec["kind"] == "decision_ref" + + +# ────────────────────────────────────────────────────────────────────── +# list_recent +# ────────────────────────────────────────────────────────────────────── + + +class TestListRecent: + def test_newest_first(self, project: Path) -> None: + activity_store.add("a.py", kind="edit") + time.sleep(0.005) + activity_store.add("b.py", kind="edit") + time.sleep(0.005) + activity_store.add("c.py", kind="edit") + recent = activity_store.list_recent(limit=3) + assert [r["node_id"] for r in recent] == ["c.py", "b.py", "a.py"] + + def test_kind_filter(self, project: Path) -> None: + activity_store.add("a.py", kind="edit") + activity_store.add("b.py", kind="decision_ref") + only_dec = activity_store.list_recent(kind="decision_ref") + assert {r["node_id"] for r in only_dec} == {"b.py"} + + def test_node_filter(self, project: Path) -> None: + activity_store.add("a.py", kind="edit") + activity_store.add("b.py", kind="edit") + only_a = activity_store.list_recent(node_id="a.py") + assert {r["node_id"] for r in only_a} == {"a.py"} + + def test_since_filter(self, project: Path) -> None: + # Inject a stale row directly. + old_ts = (datetime(2020, 1, 1, tzinfo=timezone.utc)).isoformat() + jsonl_store.append( + paths.activity_path(), + { + "id": "A000001", + "ts": old_ts, + "node_id": "stale.py", + "kind": "edit", + "_schema_v": 1, + }, + ) + activity_store.add("fresh.py", kind="edit") + cutoff = datetime.now(timezone.utc) - timedelta(days=1) + recent = activity_store.list_recent(since=cutoff) + assert {r["node_id"] for r in recent} == {"fresh.py"} + + +# ────────────────────────────────────────────────────────────────────── +# list_top_k_files (heatmap ranking) +# ────────────────────────────────────────────────────────────────────── + + +class TestListTopKFiles: + def test_score_weights_edit_and_decision_ref(self, project: Path) -> None: + # File A: 3 edits → score = 3.0 + for _ in range(3): + activity_store.add("a.py", kind="edit") + # File B: 1 edit + 1 decision_ref → score = 1 + 2 = 3.0 + activity_store.add("b.py", kind="edit") + activity_store.add("b.py", kind="decision_ref") + # File C: 1 edit → score = 1.0 + activity_store.add("c.py", kind="edit") + ranked = activity_store.list_top_k_files(top_k=10) + scores = {r["node_id"]: r["score"] for r in ranked} + assert scores["a.py"] == 3.0 + assert scores["b.py"] == 3.0 + assert scores["c.py"] == 1.0 + + def test_custom_weights(self, project: Path) -> None: + activity_store.add("a.py", kind="edit") + activity_store.add("a.py", kind="decision_ref") + # Override: edits weigh 5, decisions weigh 0.5 — flip the + # default emphasis. + ranked = activity_store.list_top_k_files( + weights={"edit": 5.0, "decision_ref": 0.5} + ) + assert ranked[0]["score"] == 5.5 + + def test_top_k_caps_output(self, project: Path) -> None: + for i in range(20): + activity_store.add(f"f{i}.py", kind="edit") + ranked = activity_store.list_top_k_files(top_k=3) + assert len(ranked) == 3 + + def test_empty_store_returns_empty(self, project: Path) -> None: + assert activity_store.list_top_k_files() == [] + + +# ────────────────────────────────────────────────────────────────────── +# visit_count_30d +# ────────────────────────────────────────────────────────────────────── + + +class TestVisitCount30d: + def test_counts_within_window(self, project: Path) -> None: + for _ in range(3): + activity_store.add("a.py", kind="edit") + activity_store.add("a.py", kind="decision_ref") + # Total = 4. + assert activity_store.visit_count_30d("a.py") == 4 + + def test_excludes_outside_window(self, project: Path) -> None: + old = datetime.now(timezone.utc) - timedelta(days=45) + jsonl_store.append( + paths.activity_path(), + { + "id": "A000001", + "ts": old.isoformat(), + "node_id": "a.py", + "kind": "edit", + "_schema_v": 1, + }, + ) + # Fresh row. + activity_store.add("a.py", kind="edit") + assert activity_store.visit_count_30d("a.py") == 1 + + def test_other_node_ids_not_counted(self, project: Path) -> None: + activity_store.add("a.py", kind="edit") + activity_store.add("b.py", kind="edit") + assert activity_store.visit_count_30d("a.py") == 1 + + +# ────────────────────────────────────────────────────────────────────── +# compact +# ────────────────────────────────────────────────────────────────────── + + +class TestCompact: + def test_drops_old_rows(self, project: Path) -> None: + old_ts = (datetime(2020, 1, 1, tzinfo=timezone.utc)).isoformat() + jsonl_store.append( + paths.activity_path(), + { + "id": "A000001", + "ts": old_ts, + "node_id": "stale.py", + "kind": "edit", + "_schema_v": 1, + }, + ) + activity_store.add("fresh.py", kind="edit") + + dropped = activity_store.compact(retention_days=90) + assert dropped == 1 + remaining = jsonl_store.read_all(paths.activity_path()) + assert [r["node_id"] for r in remaining] == ["fresh.py"] + + def test_compact_missing_file_returns_zero(self, project: Path) -> None: + # No activity.jsonl exists yet. + assert activity_store.compact() == 0 + + +# ────────────────────────────────────────────────────────────────────── +# memory_fanout integration +# ────────────────────────────────────────────────────────────────────── + + +class TestMemoryFanoutIntegration: + """v3.1.0 M4: PostToolUse Edit events should produce BOTH a working + observation AND an activity row. Bash should produce only the + working observation.""" + + def _post_event(self, tool_name, tool_input=None, project_root=None): + from mcp_server.engine.events import EventType, HookEvent + + return HookEvent( + event_type=EventType.POST_TOOL_USE, + project_root=project_root or Path("/tmp"), + ai_tool="mcp", + session_id=None, + tool_name=tool_name, + tool_input=tool_input or {}, + tool_output={}, + timestamp=time.time(), + raw={"source": "test"}, + ) + + def test_edit_produces_both_working_and_activity(self, project: Path) -> None: + from mcp_server.engine import memory_fanout + + memory_fanout.reset_buffer() + memory_fanout.dispatch( + self._post_event( + "Edit", + tool_input={"file_path": "src/auth.py"}, + project_root=project, + ) + ) + memory_fanout.flush() + + # Working observation landed. + working_rows = jsonl_store.read_all(paths.working_path()) + assert len(working_rows) == 1 + assert "src/auth.py" in working_rows[0]["content"] + + # Activity row also landed with kind=edit. + act_rows = jsonl_store.read_all(paths.activity_path()) + bases = [r for r in act_rows if not r.get("_amendment_to_id")] + assert len(bases) == 1 + assert bases[0]["node_id"] == "src/auth.py" + assert bases[0]["kind"] == "edit" + + def test_bash_does_not_write_activity(self, project: Path) -> None: + from mcp_server.engine import memory_fanout + + memory_fanout.reset_buffer() + memory_fanout.dispatch( + self._post_event( + "Bash", + tool_input={"command": "git status"}, + project_root=project, + ) + ) + memory_fanout.flush() + # Working observation exists, activity does not. + assert jsonl_store.read_all(paths.working_path()) + # activity.jsonl shouldn't exist (no rows written). + assert not paths.activity_path().is_file() + + def test_unknown_file_path_skips_activity(self, project: Path) -> None: + """Edit with no file_path arg still produces a working + observation but no activity row (we can't attribute the + attention to a specific node).""" + from mcp_server.engine import memory_fanout + + memory_fanout.reset_buffer() + memory_fanout.dispatch(self._post_event("Edit", project_root=project)) + memory_fanout.flush() + # No activity row. + assert not paths.activity_path().is_file() + + +# ────────────────────────────────────────────────────────────────────── +# decisions_store integration +# ────────────────────────────────────────────────────────────────────── + + +class TestDecisionsStoreIntegration: + """v3.1.0 M4: record() with file_path emits a decision_ref + activity row alongside the canonical JSONL write.""" + + def test_record_with_file_path_emits_decision_ref(self, project: Path) -> None: + from mcp_server.storage import decisions_store + + decisions_store.record( + decision="Use bcrypt for password hashing", + file_path="auth/middleware.py", + ) + rows = jsonl_store.read_all(paths.activity_path()) + bases = [r for r in rows if not r.get("_amendment_to_id")] + assert len(bases) == 1 + assert bases[0]["kind"] == "decision_ref" + assert bases[0]["node_id"] == "auth/middleware.py" + + def test_record_without_file_path_skips_activity(self, project: Path) -> None: + from mcp_server.storage import decisions_store + + decisions_store.record(decision="Project-wide doctrine") + # No file_path → no activity row. + assert not paths.activity_path().is_file() From d10181f2f636a128a259c4b0b36944d94fc08df7 Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Fri, 29 May 2026 13:41:03 +0530 Subject: [PATCH 11/44] =?UTF-8?q?feat(v3.1.0):=20M4=20Phase=202=20?= =?UTF-8?q?=E2=80=94=20neighborhoods=20+=20affordances=20+=204=20spatial?= =?UTF-8?q?=20MCP=20tools?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Completes M4 by adding the spatial-query layer on top of M4 Phase 1's activity store. The agent can now ask 'what's near this file?', 'where has attention been?', 'what neighborhood am I in?', and 'what can I do here?'. spatial.py — 4 MCP tools: * spatial_nearby(file_path, k) — BFS distance ≤ 2 over the indexer graph (imports + call edges) ∪ same-neighborhood. Ranking: (1 / (1 + bfs_dist)) × log(1 + visit_count_30d). Falls back to neighborhood-only if the indexer graph isn't built. * spatial_heat(top_k, since_days) — top-K most-touched files by weighted activity. * spatial_neighborhood(file_path) — folder-tree default (top-2 dir components — 'mcp_server/storage', 'indexer'), overridable via .codevira/neighborhoods.yaml. * spatial_affordances(file_path) — affordance keys (task_types) for the file based on bundled + project affordances.yaml. Folder-tree neighborhoods drop the filename then cap at depth 2: - mcp_server/storage/foo.py → 'mcp_server/storage' - indexer/foo.py → 'indexer' - README.md → '' Override file .codevira/neighborhoods.yaml RE-LABELS matched files; files matching nothing fall through to the folder-tree default (the override never hides files). mcp_server/data/affordances.yaml — bundled defaults: tools/ → {add_tool, write_test}; storage/ → {add_store, write_test}; indexer/ → {add_parser_rule, write_test}; test files → {write_test, debug_pipeline}; Makefile/pyproject/CHANGELOG → release + commit affordances. Project override at .codevira/affordances.yaml; loader concats bundled+project and returns the union per match. Already covered by pyproject's package-data glob (mcp_server/data/**/*). Server.py: 4 Tool(...) entries in list_tools() + 4 dispatch branches in call_tool(). Tests: tests/test_tools_spatial.py — 28 tests covering folder-tree shapes, yaml override + fall-through + malformed-fallback, members from activity log, affordance patterns (bundled + override union), spatial_heat ranking + since_days, spatial_nearby graph-missing fallback + self-exclusion + activity ranking + isolated file, _node_id_to_file_path edge cases. 756 tests across storage + engine + tools + check_conflict + server + CLI pass green; zero regressions from M4 Phase 1. Plan M4 Phase 2. M4 complete; M5/M9 next per the plan's phasing. Co-Authored-By: Claude Opus 4.7 --- mcp_server/data/affordances.yaml | 44 +++ mcp_server/server.py | 104 +++++++ mcp_server/tools/spatial.py | 509 +++++++++++++++++++++++++++++++ tests/test_tools_spatial.py | 263 ++++++++++++++++ 4 files changed, 920 insertions(+) create mode 100644 mcp_server/data/affordances.yaml create mode 100644 mcp_server/tools/spatial.py create mode 100644 tests/test_tools_spatial.py diff --git a/mcp_server/data/affordances.yaml b/mcp_server/data/affordances.yaml new file mode 100644 index 0000000..46326d1 --- /dev/null +++ b/mcp_server/data/affordances.yaml @@ -0,0 +1,44 @@ +# Bundled affordances — v3.1.0 M4 Phase 2. +# +# Maps file-path globs to the high-level "things you can do here" +# (the affordance keys). spatial_affordances(file_path) returns the +# UNION of all matching pattern affordances. Globs use fnmatch +# semantics (* matches across path components — be explicit when +# scoping to a directory). +# +# Project-specific overrides live at .codevira/affordances.yaml; the +# loader merges bundled defaults + project overrides (project wins on +# pattern collisions). +# +# Affordance keys are conventional task_type names from the playbook +# system so promote_skill_to_playbook(task_type=...) lands cleanly. + +- pattern: "mcp_server/tools/*.py" + affordances: [add_tool, write_test] + +- pattern: "mcp_server/storage/*.py" + affordances: [add_store, write_test] + +- pattern: "mcp_server/engine/policies/*.py" + affordances: [add_policy, write_test] + +- pattern: "indexer/*.py" + affordances: [add_parser_rule, write_test] + +- pattern: "tests/**/*.py" + affordances: [write_test, debug_pipeline] + +- pattern: "tests/*.py" + affordances: [write_test, debug_pipeline] + +- pattern: "*test*.py" + affordances: [write_test, debug_pipeline] + +- pattern: "Makefile" + affordances: [commit, debug_pipeline] + +- pattern: "pyproject.toml" + affordances: [release, commit] + +- pattern: "CHANGELOG.md" + affordances: [release, commit] diff --git a/mcp_server/server.py b/mcp_server/server.py index fe946a6..65b51c8 100644 --- a/mcp_server/server.py +++ b/mcp_server/server.py @@ -1252,6 +1252,87 @@ async def list_tools() -> list[Tool]: "required": ["skill_id", "task_type"], }, ), + # ---- v3.1.0 M4: spatial memory ---- + Tool( + name="spatial_nearby", + description=( + "v3.1.0 M4: Files topologically near a given file, ranked by " + "recent activity. Candidate set = BFS distance ≤ 2 over the " + "indexer graph (imports + call edges) ∪ same-neighborhood " + "files. Ranking: (1 / (1 + bfs_dist)) × log(1 + visit_count_30d). " + "Falls back to neighborhood-only if the indexer graph isn't built." + ), + inputSchema={ + "type": "object", + "properties": { + "file_path": { + "type": "string", + "description": "Project-relative file path", + }, + "k": { + "type": "integer", + "description": "Max neighbors to return (default 5)", + "default": 5, + }, + }, + "required": ["file_path"], + }, + ), + Tool( + name="spatial_heat", + description=( + "v3.1.0 M4: Top-K most-touched files in a time window by " + "weighted activity (edits + decision_refs). Useful for " + "'where has attention been this week?' queries. Pass " + "since_days to limit the window; omit for all-time." + ), + inputSchema={ + "type": "object", + "properties": { + "top_k": {"type": "integer", "default": 20}, + "since_days": { + "type": "integer", + "description": ( + "Only count activity within the trailing N days " + "(omit for all-time)" + ), + }, + }, + }, + ), + Tool( + name="spatial_neighborhood", + description=( + "v3.1.0 M4: Return the neighborhood id + members for a file. " + "Folder-tree default (top-2 dir components, e.g., " + "'mcp_server/storage'); overridable via " + ".codevira/neighborhoods.yaml." + ), + inputSchema={ + "type": "object", + "properties": { + "file_path": {"type": "string"}, + }, + "required": ["file_path"], + }, + ), + Tool( + name="spatial_affordances", + description=( + "v3.1.0 M4: Return the affordance keys (task_types) applicable " + "to a file based on the bundled + project affordances.yaml. " + "E.g., a file under mcp_server/tools/ typically affords " + "{add_tool, write_test}. Use the returned keys with " + "get_playbook(task_type) for relevant rules." + ), + inputSchema={ + "type": "object", + "properties": { + "file_path": {"type": "string"}, + }, + "required": ["file_path"], + }, + ), # ---- v1.5: Deep Graph Intelligence Tools ---- Tool( name="query_graph", @@ -1640,6 +1721,29 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]: name=arguments.get("name"), force=arguments.get("force", False), ) + # ---- v3.1.0 M4: spatial memory dispatch ---- + elif name == "spatial_nearby": + from mcp_server.tools.spatial import spatial_nearby + + result = spatial_nearby( + file_path=arguments["file_path"], + k=arguments.get("k", 5), + ) + elif name == "spatial_heat": + from mcp_server.tools.spatial import spatial_heat + + result = spatial_heat( + top_k=arguments.get("top_k", 20), + since_days=arguments.get("since_days"), + ) + elif name == "spatial_neighborhood": + from mcp_server.tools.spatial import spatial_neighborhood + + result = spatial_neighborhood(file_path=arguments["file_path"]) + elif name == "spatial_affordances": + from mcp_server.tools.spatial import spatial_affordances + + result = spatial_affordances(file_path=arguments["file_path"]) else: result = {"error": f"Unknown tool: {name}"} diff --git a/mcp_server/tools/spatial.py b/mcp_server/tools/spatial.py new file mode 100644 index 0000000..8994142 --- /dev/null +++ b/mcp_server/tools/spatial.py @@ -0,0 +1,509 @@ +""" +spatial.py — v3.1.0 M4 Phase 2 MCP tools for spatial memory. + +Four tools cover the agent-facing surface: + + - spatial_nearby — files topologically close to a given file + (import/call edges + same-neighborhood), + ranked by recent activity. + - spatial_heat — top-K most-touched files in a time window. + - spatial_neighborhood — return the neighborhood of a file and its + members. + - spatial_affordances — list the affordance keys (task_types) that + apply to a file, based on the bundled + + project affordances.yaml. + +# Neighborhoods — hybrid (folder-tree default + yaml override) + +In v3.1.0 a neighborhood = first two path components by default +(e.g., ``mcp_server/storage``, ``indexer``). A project can override +the mapping by committing ``.codevira/neighborhoods.yaml``: + +:: + + # neighborhoods.yaml + storage: + - mcp_server/storage/**/*.py + - mcp_server/storage/jsonl_store.py + tools: + - mcp_server/tools/**/*.py + engine: + - mcp_server/engine/**/*.py + +When the override exists, ``spatial_neighborhood(file_path)`` +matches the file against each neighborhood's glob list (fnmatch); +the first matching neighborhood wins. Files that match nothing fall +through to the folder-tree default — so an override never *hides* a +file, only re-labels matched ones. + +# Affordances — bundled + project override + +The bundled defaults live at ``mcp_server/data/affordances.yaml``; +a project may override at ``.codevira/affordances.yaml``. Both files +are lists of ``{pattern, affordances}`` entries. The loader +concatenates bundled+project, then returns the union of affordances +across patterns that match the input ``file_path``. + +# spatial_nearby ranking + +Per the plan: + +:: + + score = (1 / (1 + bfs_dist)) × log(1 + visit_count_30d) + +Candidates = BFS distance ≤ 2 over the indexer graph's edges ∪ +same-neighborhood files. Ties broken by activity edit_count then +alphabetical. + +If the indexer graph (``.codevira-cache/graph.sqlite``) doesn't +exist, BFS falls back to neighborhood-only — the tool still returns +useful results without requiring ``codevira index`` to have run. +""" + +from __future__ import annotations + +import fnmatch +import logging +import math +import sqlite3 +import yaml +from collections.abc import Iterable +from datetime import datetime, timedelta, timezone +from pathlib import Path +from typing import Any + +from mcp_server.storage import activity_store, paths + + +logger = logging.getLogger(__name__) + + +# Reasonable BFS bound for nearby — too wide a net floods the output +# without adding signal. +_BFS_MAX_DEPTH = 2 + + +# ────────────────────────────────────────────────────────────────────── +# spatial_nearby +# ────────────────────────────────────────────────────────────────────── + + +def spatial_nearby(file_path: str, *, k: int = 5) -> dict[str, Any]: + """Files near ``file_path`` by topology + activity. + + Candidate set = BFS over graph edges (≤ 2 hops) ∪ same-neighborhood. + Ranking = ``(1 / (1 + bfs_dist)) × log(1 + visit_count_30d)``, + tied broken by edit_count and alphabetical. The originating + file itself is excluded. + + Falls back to neighborhood-only if the indexer graph isn't built. + """ + if not isinstance(file_path, str) or not file_path.strip(): + return {"file_path": file_path, "hits": [], "count": 0} + + # Step 1: BFS over the graph (if available). + bfs_distances = _bfs_distances(file_path, max_depth=_BFS_MAX_DEPTH) + + # Step 2: neighborhood union. + neighborhood_id = _neighborhood_for(file_path) + neighborhood_members = _members_of(neighborhood_id) + + # Build candidate set. + candidates: set[str] = set(bfs_distances.keys()) | set(neighborhood_members) + candidates.discard(file_path) + if not candidates: + return { + "file_path": file_path, + "hits": [], + "count": 0, + "neighborhood": neighborhood_id, + } + + # Step 3: rank. + now = datetime.now(timezone.utc) + scored: list[dict[str, Any]] = [] + for cand in candidates: + bfs_dist = bfs_distances.get(cand, 3) # 3 = "neighborhood-only" floor + visit_count = activity_store.visit_count_30d(cand, now=now) + bfs_term = 1.0 / (1.0 + bfs_dist) + activity_term = math.log(1.0 + visit_count) + score = bfs_term * activity_term + scored.append( + { + "file_path": cand, + "bfs_distance": bfs_dist, + "visit_count_30d": visit_count, + "score": round(score, 4), + } + ) + + scored.sort( + key=lambda r: ( + r["score"], + r["visit_count_30d"], + -ord(r["file_path"][0:1] or "z"), + ), + reverse=True, + ) + return { + "file_path": file_path, + "neighborhood": neighborhood_id, + "hits": scored[:k], + "count": min(k, len(scored)), + } + + +# ────────────────────────────────────────────────────────────────────── +# spatial_heat +# ────────────────────────────────────────────────────────────────────── + + +def spatial_heat( + *, + top_k: int = 20, + since_days: int | None = None, +) -> dict[str, Any]: + """Top-K most-touched files by weighted activity. + + ``since_days`` (optional): only count rows within the trailing + N-day window. Falls back to all-time when None. + """ + since: datetime | None = None + if since_days is not None and since_days > 0: + since = datetime.now(timezone.utc) - timedelta(days=int(since_days)) + + rows = activity_store.list_top_k_files(top_k=top_k, since=since) + return { + "hits": rows, + "count": len(rows), + "since_days": since_days, + } + + +# ────────────────────────────────────────────────────────────────────── +# spatial_neighborhood +# ────────────────────────────────────────────────────────────────────── + + +def spatial_neighborhood(file_path: str) -> dict[str, Any]: + """Return the neighborhood id + members for a file. + + Members are derived from the activity log + any indexer-known + files in the same neighborhood — i.e., we surface every file the + spatial layer has 'seen' that shares the neighborhood, not just + the directory listing on disk. + """ + if not isinstance(file_path, str) or not file_path.strip(): + return {"neighborhood_id": None, "members": [], "count": 0} + nid = _neighborhood_for(file_path) + members = _members_of(nid) + return { + "neighborhood_id": nid, + "members": sorted(members), + "count": len(members), + } + + +# ────────────────────────────────────────────────────────────────────── +# spatial_affordances +# ────────────────────────────────────────────────────────────────────── + + +def spatial_affordances(file_path: str) -> dict[str, Any]: + """Return the affordance keys (task_types) applicable to a file. + + Loads bundled + project affordances.yaml, evaluates each pattern + via fnmatch, and returns the union of matching affordance lists. + """ + if not isinstance(file_path, str) or not file_path.strip(): + return {"file_path": file_path, "affordances": []} + + affordances = _load_affordances() + matched: list[str] = [] + seen: set[str] = set() + matched_patterns: list[str] = [] + + for entry in affordances: + pattern = entry.get("pattern", "") + if not pattern: + continue + if fnmatch.fnmatch(file_path, pattern): + matched_patterns.append(pattern) + for a in entry.get("affordances", []) or []: + if a not in seen: + matched.append(a) + seen.add(a) + + return { + "file_path": file_path, + "affordances": matched, + "matched_patterns": matched_patterns, + "count": len(matched), + } + + +# ────────────────────────────────────────────────────────────────────── +# Internals: neighborhoods +# ────────────────────────────────────────────────────────────────────── + + +def _neighborhood_for(file_path: str) -> str: + """Resolve the neighborhood for a file. + + First consults the project override (.codevira/neighborhoods.yaml) + if present; on no match (or no override file), falls back to the + deterministic folder-tree rule (top-2 path components). + """ + # Project override. + override = _load_neighborhood_override() + if override: + for name, patterns in override.items(): + for p in patterns: + if fnmatch.fnmatch(file_path, p): + return str(name) + # Folder-tree default. + return _folder_tree_neighborhood(file_path) + + +def _folder_tree_neighborhood(file_path: str) -> str: + """Directory containing the file, capped at depth 2. + + Examples (from the plan): + - mcp_server/storage/foo.py → ``mcp_server/storage`` + - mcp_server/tools/working.py → ``mcp_server/tools`` + - indexer/index_codebase.py → ``indexer`` + - README.md → ```` + + The intent is to match how developers actually cluster code — by + package directory, not by individual file. We strip the filename + (last component) so files in the same dir share a neighborhood + even when only one of them has 'top-2' coverage in the raw path. + """ + parts = [p for p in file_path.split("/") if p] + if not parts: + return "" + dir_parts = parts[:-1] # drop the filename + if not dir_parts: + return "" + return "/".join(dir_parts[:2]) + + +def _members_of(neighborhood_id: str | None) -> list[str]: + """All files known to either the activity log or the indexer + graph that belong to ``neighborhood_id``. + + If ``neighborhood_id`` is None or '', returns activity-log + files only (no recursive walk of the filesystem). + """ + if not neighborhood_id: + return [] + + candidates: set[str] = set() + + # Files seen in activity log. + for rec in _iter_activity_node_ids(): + if _neighborhood_for(rec) == neighborhood_id: + candidates.add(rec) + + # Files seen in indexer graph (best-effort). + for nid in _iter_indexer_files(): + if _neighborhood_for(nid) == neighborhood_id: + candidates.add(nid) + + return list(candidates) + + +def _load_neighborhood_override() -> dict[str, list[str]] | None: + """Read .codevira/neighborhoods.yaml; return ``{name: [globs]}`` + or None if the file is missing / malformed. + """ + override_path = paths.codevira_dir() / "neighborhoods.yaml" + if not override_path.is_file(): + return None + try: + data = yaml.safe_load(override_path.read_text(encoding="utf-8")) + except Exception as exc: # noqa: BLE001 + logger.warning( + "spatial: failed to parse neighborhoods.yaml; " + "falling back to folder-tree: %s", + exc, + ) + return None + if not isinstance(data, dict): + return None + out: dict[str, list[str]] = {} + for k, v in data.items(): + if not isinstance(v, list): + continue + out[str(k)] = [str(p) for p in v if isinstance(p, str)] + return out or None + + +# ────────────────────────────────────────────────────────────────────── +# Internals: BFS over graph +# ────────────────────────────────────────────────────────────────────── + + +def _bfs_distances(start: str, *, max_depth: int) -> dict[str, int]: + """Return ``{file_path: dist}`` for files reachable from ``start`` + within ``max_depth`` hops over import/call edges. + + Falls back to ``{start: 0}`` if the indexer graph doesn't exist. + The BFS direction is undirected (both source→target and the + reverse are followed) — for "what's near me?" the agent doesn't + care about edge direction. + """ + graph_db = paths.graph_cache_path() + if not graph_db.is_file(): + return {start: 0} + + try: + conn = sqlite3.connect(str(graph_db)) + conn.row_factory = sqlite3.Row + except Exception as exc: # noqa: BLE001 + logger.warning("spatial: cannot open graph.sqlite: %s", exc) + return {start: 0} + + distances: dict[str, int] = {start: 0} + frontier: set[str] = {start} + try: + for depth in range(1, max_depth + 1): + if not frontier: + break + # Pull neighbors for the current frontier in one query. + placeholders = ",".join(["?"] * len(frontier)) + cursor = conn.execute( + f""" + SELECT DISTINCT target_id AS neighbor FROM edges + WHERE source_id IN ({placeholders}) + UNION + SELECT DISTINCT source_id AS neighbor FROM edges + WHERE target_id IN ({placeholders}) + """, + list(frontier) + list(frontier), + ) + next_frontier: set[str] = set() + for row in cursor.fetchall(): + neighbor = row["neighbor"] + if not neighbor or neighbor in distances: + continue + # Edges include both file nodes ("file:path") and symbol + # nodes ("file:path::sym"). Normalize to the file + # component so the result list matches activity_store + # node_ids (which are per-file paths). + file_neighbor = _node_id_to_file_path(neighbor) + if file_neighbor and file_neighbor not in distances: + distances[file_neighbor] = depth + next_frontier.add(neighbor) + frontier = next_frontier + finally: + try: + conn.close() + except Exception: # noqa: BLE001 + pass + + distances.pop(start, None) # exclude self from neighbor list + return distances + + +def _node_id_to_file_path(node_id: str) -> str | None: + """Extract the file path from an indexer node id. + + The indexer schema (per the exploration) prefixes file paths with + ``file:`` for file nodes and ``file:path::symbol`` for symbols. + Both should map to the file path string activity_store uses. + """ + if not isinstance(node_id, str): + return None + if node_id.startswith("file:"): + node_id = node_id[len("file:") :] + sep = node_id.find("::") + if sep >= 0: + node_id = node_id[:sep] + return node_id.strip() or None + + +def _iter_indexer_files() -> Iterable[str]: + """Yield file paths from the indexer's nodes table. Empty if no + graph DB exists. + """ + graph_db = paths.graph_cache_path() + if not graph_db.is_file(): + return + try: + conn = sqlite3.connect(str(graph_db)) + try: + cursor = conn.execute( + "SELECT DISTINCT file_path FROM nodes WHERE file_path IS NOT NULL" + ) + for row in cursor.fetchall(): + fp = row[0] + if isinstance(fp, str) and fp: + yield fp + finally: + conn.close() + except Exception as exc: # noqa: BLE001 + logger.debug("spatial: indexer file scan failed: %s", exc) + + +def _iter_activity_node_ids() -> Iterable[str]: + """Distinct node_ids seen in activity.jsonl.""" + seen: set[str] = set() + from mcp_server.storage import jsonl_store + + for rec in jsonl_store.read_all(paths.activity_path()): + nid = rec.get("node_id") + if isinstance(nid, str) and nid not in seen: + seen.add(nid) + yield nid + + +# ────────────────────────────────────────────────────────────────────── +# Internals: affordances +# ────────────────────────────────────────────────────────────────────── + + +def _load_affordances() -> list[dict[str, Any]]: + """Concat bundled defaults + project override. + + Order: bundled first, project second. ``spatial_affordances`` + deduplicates the affordance values per match, so duplicate + patterns between bundled+project unions cleanly. + """ + out: list[dict[str, Any]] = [] + for path in (_bundled_affordances_path(), _project_affordances_path()): + if not path.is_file(): + continue + try: + data = yaml.safe_load(path.read_text(encoding="utf-8")) + except Exception as exc: # noqa: BLE001 + logger.warning("spatial: failed to parse %s; skipping: %s", path, exc) + continue + if not isinstance(data, list): + continue + for entry in data: + if not isinstance(entry, dict): + continue + pattern = entry.get("pattern") + affordances = entry.get("affordances") + if not isinstance(pattern, str) or not isinstance(affordances, list): + continue + out.append( + { + "pattern": pattern, + "affordances": [str(a) for a in affordances if isinstance(a, str)], + } + ) + return out + + +def _bundled_affordances_path() -> Path: + """Resolve the bundled affordances.yaml that ships with the package.""" + # Located at mcp_server/data/affordances.yaml (sibling of the + # tools/ package's parent). + return Path(__file__).resolve().parent.parent / "data" / "affordances.yaml" + + +def _project_affordances_path() -> Path: + return paths.codevira_dir() / "affordances.yaml" diff --git a/tests/test_tools_spatial.py b/tests/test_tools_spatial.py new file mode 100644 index 0000000..bd8db3b --- /dev/null +++ b/tests/test_tools_spatial.py @@ -0,0 +1,263 @@ +""" +Tests for mcp_server.tools.spatial — v3.1.0 M4 Phase 2. + +Coverage: + - spatial_neighborhood: folder-tree default + yaml override + - spatial_affordances: bundled defaults + project overrides + - spatial_heat: wraps activity_store.list_top_k_files + since_days + - spatial_nearby: neighborhood fallback (no graph) + activity ranking +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +import mcp_server.paths as paths_module +from mcp_server.storage import activity_store, paths +from mcp_server.tools import spatial + + +@pytest.fixture +def project(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: + root = tmp_path / "proj" + (root / ".codevira").mkdir(parents=True) + (root / ".codevira" / "config.yaml").write_text("project:\n name: test\n") + monkeypatch.setattr(paths_module, "_project_dir_override", None) + monkeypatch.chdir(root.resolve()) + return root + + +# ────────────────────────────────────────────────────────────────────── +# Folder-tree + neighborhood +# ────────────────────────────────────────────────────────────────────── + + +class TestFolderTreeNeighborhood: + def test_two_component_path(self) -> None: + assert spatial._folder_tree_neighborhood("mcp_server/storage/foo.py") == ( + "mcp_server/storage" + ) + + def test_long_path_only_top_two(self) -> None: + assert spatial._folder_tree_neighborhood("a/b/c/d/e.py") == "a/b" + + def test_single_component_is_root(self) -> None: + assert spatial._folder_tree_neighborhood("README.md") == "" + + def test_empty_path_is_root(self) -> None: + assert spatial._folder_tree_neighborhood("") == "" + + +class TestSpatialNeighborhood: + def test_default_folder_tree(self, project: Path) -> None: + r = spatial.spatial_neighborhood("mcp_server/storage/foo.py") + assert r["neighborhood_id"] == "mcp_server/storage" + + def test_yaml_override_wins(self, project: Path) -> None: + # Override: place anything matching mcp_server/storage/**.py + # into the 'persistence' neighborhood. + override = project / ".codevira" / "neighborhoods.yaml" + override.write_text( + "persistence:\n" + " - mcp_server/storage/*.py\n" + "engine:\n" + " - mcp_server/engine/*.py\n" + ) + r = spatial.spatial_neighborhood("mcp_server/storage/foo.py") + assert r["neighborhood_id"] == "persistence" + r2 = spatial.spatial_neighborhood("mcp_server/engine/x.py") + assert r2["neighborhood_id"] == "engine" + + def test_override_fallthrough_to_folder_tree(self, project: Path) -> None: + """A file matching no override pattern still gets a folder-tree + neighborhood — the override re-labels, it doesn't gate.""" + override = project / ".codevira" / "neighborhoods.yaml" + override.write_text("persistence:\n - mcp_server/storage/*.py\n") + r = spatial.spatial_neighborhood("indexer/foo.py") + assert r["neighborhood_id"] == "indexer" + + def test_malformed_override_falls_through(self, project: Path) -> None: + override = project / ".codevira" / "neighborhoods.yaml" + override.write_text("[ this is not valid yaml :") + # Should not raise; falls back to folder-tree. + r = spatial.spatial_neighborhood("mcp_server/storage/foo.py") + assert r["neighborhood_id"] == "mcp_server/storage" + + def test_members_from_activity_log(self, project: Path) -> None: + activity_store.add("mcp_server/storage/a.py", kind="edit") + activity_store.add("mcp_server/storage/b.py", kind="edit") + activity_store.add("indexer/x.py", kind="edit") + r = spatial.spatial_neighborhood("mcp_server/storage/a.py") + assert r["neighborhood_id"] == "mcp_server/storage" + members = set(r["members"]) + assert "mcp_server/storage/a.py" in members + assert "mcp_server/storage/b.py" in members + assert "indexer/x.py" not in members + + +# ────────────────────────────────────────────────────────────────────── +# Affordances +# ────────────────────────────────────────────────────────────────────── + + +class TestSpatialAffordances: + def test_bundled_tools_pattern(self, project: Path) -> None: + r = spatial.spatial_affordances("mcp_server/tools/foo.py") + assert "add_tool" in r["affordances"] + assert "write_test" in r["affordances"] + + def test_bundled_storage_pattern(self, project: Path) -> None: + r = spatial.spatial_affordances("mcp_server/storage/foo.py") + assert "add_store" in r["affordances"] + + def test_test_files_get_write_test_affordance(self, project: Path) -> None: + r = spatial.spatial_affordances("tests/test_something.py") + assert "write_test" in r["affordances"] + + def test_no_match_returns_empty(self, project: Path) -> None: + r = spatial.spatial_affordances("random/file.xyz") + assert r["affordances"] == [] + assert r["count"] == 0 + + def test_project_override_unions_with_bundled(self, project: Path) -> None: + override = project / ".codevira" / "affordances.yaml" + override.write_text( + "- pattern: 'mcp_server/storage/*.py'\n" + " affordances: ['custom_affordance']\n" + ) + r = spatial.spatial_affordances("mcp_server/storage/foo.py") + # Both bundled affordances (add_store, write_test) AND the project + # affordance (custom_affordance) surface. + assert "add_store" in r["affordances"] + assert "custom_affordance" in r["affordances"] + + def test_empty_path_returns_empty(self, project: Path) -> None: + r = spatial.spatial_affordances("") + assert r["affordances"] == [] + + +# ────────────────────────────────────────────────────────────────────── +# spatial_heat +# ────────────────────────────────────────────────────────────────────── + + +class TestSpatialHeat: + def test_returns_ranked_files(self, project: Path) -> None: + for _ in range(3): + activity_store.add("hot.py", kind="edit") + activity_store.add("cool.py", kind="edit") + r = spatial.spatial_heat(top_k=5) + names = [h["node_id"] for h in r["hits"]] + assert names[0] == "hot.py" + assert "cool.py" in names + + def test_top_k_caps(self, project: Path) -> None: + for i in range(10): + activity_store.add(f"f{i}.py", kind="edit") + r = spatial.spatial_heat(top_k=3) + assert r["count"] == 3 + + def test_empty_returns_empty(self, project: Path) -> None: + r = spatial.spatial_heat() + assert r["count"] == 0 + + def test_since_days_filter(self, project: Path) -> None: + from datetime import datetime, timedelta, timezone + + from mcp_server.storage import jsonl_store + + # Stale row. + old = datetime.now(timezone.utc) - timedelta(days=45) + jsonl_store.append( + paths.activity_path(), + { + "id": "A000001", + "ts": old.isoformat(), + "node_id": "stale.py", + "kind": "edit", + "_schema_v": 1, + }, + ) + # Fresh row. + activity_store.add("fresh.py", kind="edit") + r = spatial.spatial_heat(since_days=30) + names = {h["node_id"] for h in r["hits"]} + assert "fresh.py" in names + assert "stale.py" not in names + + +# ────────────────────────────────────────────────────────────────────── +# spatial_nearby (neighborhood-only fallback when graph missing) +# ────────────────────────────────────────────────────────────────────── + + +class TestSpatialNearby: + def test_no_graph_uses_neighborhood_only(self, project: Path) -> None: + # No graph.sqlite exists; BFS falls back. Same-neighborhood + # files still surface via activity log. + activity_store.add("mcp_server/storage/a.py", kind="edit") + activity_store.add("mcp_server/storage/b.py", kind="edit") + activity_store.add("indexer/x.py", kind="edit") + + r = spatial.spatial_nearby("mcp_server/storage/a.py", k=10) + nearby_paths = {h["file_path"] for h in r["hits"]} + # b.py is in the same neighborhood; x.py isn't. + assert "mcp_server/storage/b.py" in nearby_paths + assert "indexer/x.py" not in nearby_paths + + def test_originating_file_excluded(self, project: Path) -> None: + activity_store.add("mcp_server/storage/a.py", kind="edit") + activity_store.add("mcp_server/storage/b.py", kind="edit") + r = spatial.spatial_nearby("mcp_server/storage/a.py") + paths_returned = {h["file_path"] for h in r["hits"]} + assert "mcp_server/storage/a.py" not in paths_returned + + def test_ranks_by_activity_count(self, project: Path) -> None: + # b.py has 3 edits, c.py has 1 edit; both same neighborhood as a.py. + for _ in range(3): + activity_store.add("mcp_server/storage/b.py", kind="edit") + activity_store.add("mcp_server/storage/c.py", kind="edit") + # Originating file is a.py. + activity_store.add("mcp_server/storage/a.py", kind="edit") + r = spatial.spatial_nearby("mcp_server/storage/a.py", k=5) + # b.py should outrank c.py via higher visit_count_30d. + names = [h["file_path"] for h in r["hits"]] + assert names.index("mcp_server/storage/b.py") < names.index( + "mcp_server/storage/c.py" + ) + + def test_empty_query_returns_empty(self, project: Path) -> None: + r = spatial.spatial_nearby("", k=5) + assert r["hits"] == [] + + def test_isolated_file_returns_empty(self, project: Path) -> None: + # File with no neighborhood-mates and no graph entries. + r = spatial.spatial_nearby("isolated/lonely.py") + assert r["hits"] == [] + + +# ────────────────────────────────────────────────────────────────────── +# Internal helpers +# ────────────────────────────────────────────────────────────────────── + + +class TestNodeIdToFilePath: + def test_strips_file_prefix(self) -> None: + assert ( + spatial._node_id_to_file_path("file:mcp_server/storage/foo.py") + == "mcp_server/storage/foo.py" + ) + + def test_strips_symbol_suffix(self) -> None: + assert ( + spatial._node_id_to_file_path("file:mcp_server/storage/foo.py::bar") + == "mcp_server/storage/foo.py" + ) + + def test_plain_path_unchanged(self) -> None: + assert spatial._node_id_to_file_path("foo.py") == "foo.py" + + def test_empty_returns_none(self) -> None: + assert spatial._node_id_to_file_path("") is None From 02f179d502495b9e8d60ff6793b9d6c58b2b7e25 Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Fri, 29 May 2026 14:16:27 +0530 Subject: [PATCH 12/44] =?UTF-8?q?feat(v3.1.0):=20M5=20=E2=80=94=20skill=20?= =?UTF-8?q?induction=20wired=20to=20outcomes=5Fwriter?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the skill-library reinforcement loop. Three pieces: Sessions schema (sessions_store.py): Additive optional fields on every session log: - task_type ∈ {feature, bug, refactor, release, docs, other} - skill_ids: list of K-ids used during the session Legacy v3.0.x sessions tolerate absence; the induction pipeline + outcomes-fan-out simply skip sessions without these fields. outcomes_writer skill fan-out (outcomes_writer.py): When observe_all() classifies a session's decision as 'kept' or 'reverted', each skill referenced via skill_ids on the SAME session gets a corresponding mark_used call: - kept → skills_store.mark_used(success=True) - reverted → skills_store.mark_used(success=False) - modified → no-op Pre-builds a {session_id → set[skill_id]} index so the per-decision fan-out is O(1) lookup. Fail-open: skills_store errors log a warning but don't fail the decision-outcome write. Summary dict gains skill_marks_success / skill_marks_failure counts so the CLI can surface the fan-out totals. This is the canonical reinforcement signal — git-derived, not agent-self-reported. The MCP-tool apply_skill_outcome remains as a manual override. codevira induce-skills CLI (cli_induce.py): Deterministic induction pipeline (no LLM in v3.1): 1. Filter to sessions with task_type + ≥80% of classified decisions marked 'kept'. 2. Group by task_type. 3. Cluster within each group by tag-Jaccard ≥ 0.5 (greedy single-pass agglomeration). 4. Keep clusters with ≥3 sessions. 5. Render candidate skill per cluster: name = ': ' procedure = bullet-summary of session.task + truncated decision.decision (capped at 30 lines). 6. Without --apply: write to .codevira/induction_proposals.jsonl. 7. With --apply: interactively confirm each (use --yes to skip prompts in CI). Records via skills_store.record( source='induced', source_session_ids=[...]). paths.induction_proposals_path() + cli.py 'induce-skills' subparser wire the surface. Tests: tests/test_cli_induce.py — 15 tests covering _jaccard, _build_proposals (empty, below-threshold, below-min-cluster, productive cluster, distinct task_types, low-jaccard), cmd_induce_skills (dry-run + apply --yes), and outcomes_writer fan-out (kept→success, reverted→failure with monkeypatched classification). 742 tests across storage + engine + tools + check_conflict + CLI pass green; zero regressions from M4 baseline. Plan M5. Reinforcement loop closed; M6/M7 (consensus) and M8 (reflections) remain. Co-Authored-By: Claude Opus 4.7 --- mcp_server/cli.py | 33 +++ mcp_server/cli_induce.py | 328 ++++++++++++++++++++++++ mcp_server/storage/outcomes_writer.py | 47 ++++ mcp_server/storage/paths.py | 11 + mcp_server/storage/sessions_store.py | 22 +- tests/test_cli_induce.py | 342 ++++++++++++++++++++++++++ 6 files changed, 782 insertions(+), 1 deletion(-) create mode 100644 mcp_server/cli_induce.py create mode 100644 tests/test_cli_induce.py diff --git a/mcp_server/cli.py b/mcp_server/cli.py index c0ca201..f1afcd5 100644 --- a/mcp_server/cli.py +++ b/mcp_server/cli.py @@ -1239,6 +1239,29 @@ def error(self, message): # type: ignore[override] ), ) + # v3.1.0 M5: induced-skill candidate generation. CLI-only — the MCP + # surface for skills is record_skill / get_skill / list_skills. + induce_parser = subparsers.add_parser( + "induce-skills", + help="Cluster productive sessions and propose induced skills " + "(v3.1.0 M5). Without --apply: writes proposals to " + ".codevira/induction_proposals.jsonl for human review. With " + "--apply: interactively confirms each proposal (use --yes to " + "skip prompts in CI).", + ) + induce_parser.add_argument( + "--apply", + action="store_true", + help="Commit the proposals as induced skills " + "(otherwise dry-run to induction_proposals.jsonl)", + ) + induce_parser.add_argument( + "--yes", + action="store_true", + help="With --apply: skip the interactive confirm prompt " + "(non-interactive, CI-safe).", + ) + # v3.1.0 M2 Phase 3: working-memory subcommands. The MCP tool # surface (working_add / working_get / working_promote) is the # everyday agent-facing API; this CLI tier is the escape hatch for @@ -1520,6 +1543,16 @@ def error(self, message): # type: ignore[override] keep_data=getattr(args, "keep_data", False), ) sys.exit(rc) + elif args.command == "induce-skills": + # v3.1.0 M5: skill induction CLI. + from mcp_server.cli_induce import cmd_induce_skills + + sys.exit( + cmd_induce_skills( + apply=getattr(args, "apply", False), + yes=getattr(args, "yes", False), + ) + ) elif args.command == "working": # v3.1.0 M2 Phase 3: working-memory subcommands. working_action = getattr(args, "working_action", None) diff --git a/mcp_server/cli_induce.py b/mcp_server/cli_induce.py new file mode 100644 index 0000000..936180e --- /dev/null +++ b/mcp_server/cli_induce.py @@ -0,0 +1,328 @@ +""" +cli_induce.py — v3.1.0 M5: ``codevira induce-skills`` CLI. + +Walks through ``sessions.jsonl`` looking for productive clusters +(same ``task_type``, similar tag set, ≥80% of decisions classified +``kept``) and proposes induced skills the user can review + commit +to ``skills.jsonl``. + +# Pipeline (matches the plan's M5 spec) + + 1. Filter to sessions that have ``task_type`` set AND at least one + decision in ``decision_ids`` whose outcome (per outcomes.jsonl) + is classified — and ≥80% of those classified outcomes are + ``kept``. + 2. Group by ``task_type``. + 3. Within each group, cluster sessions by tag-Jaccard ≥ 0.5 + (greedy single-pass agglomeration). + 4. Keep clusters with ≥3 sessions. + 5. Render a candidate skill per cluster: + name = ": " + procedure = bullet-summary of session.task lines + truncated + decision text (deterministic — no LLM in v3.1). + 6. Without ``--apply``: write proposals to + ``.codevira/induction_proposals.jsonl`` for human review. + 7. With ``--apply``: review interactively unless ``--yes``, + then ``skills_store.record(source='induced', + source_session_ids=[...])``. + +# Deterministic-only ranking + +v3.1.0 induction does NOT call an LLM. Procedure text is rendered +from existing session/decision strings. M5+ (v3.2 opt-in) can +substitute an LLM-rendered procedure behind a feature flag. +""" + +from __future__ import annotations + +import sys +from datetime import datetime, timezone +from typing import Any + +from mcp_server.storage import jsonl_store, paths + + +_KEPT_THRESHOLD = 0.80 +_MIN_CLUSTER_SIZE = 3 +_TAG_JACCARD_THRESHOLD = 0.5 +_PROCEDURE_LINE_CAP = 30 # cap total lines in the rendered procedure +_PROCEDURE_DECISION_TRUNC = 120 # per-decision truncation + + +def cmd_induce_skills(*, apply: bool = False, yes: bool = False) -> int: + """Entry point for ``codevira induce-skills``. + + Returns 0 on success (including no candidates found). Non-zero on + storage / parse errors. + """ + proposals = _build_proposals() + if not proposals: + sys.stdout.write( + "codevira induce-skills: no induced-skill candidates " + "found (need ≥3 productive sessions sharing a task_type " + "and a ≥0.5 tag-Jaccard cluster).\n" + ) + return 0 + + if not apply: + return _write_proposals(proposals) + + return _apply_proposals(proposals, yes=yes) + + +# ────────────────────────────────────────────────────────────────────── +# Pipeline stages +# ────────────────────────────────────────────────────────────────────── + + +def _build_proposals() -> list[dict[str, Any]]: + """Stage 1-5 of the pipeline. Returns the proposal list (possibly + empty).""" + sessions = jsonl_store.read_all(paths.sessions_path()) + if not sessions: + return [] + + # Build the {decision_id → outcome_type} index from outcomes.jsonl. + outcomes_by_decision: dict[str, str] = {} + try: + for row in jsonl_store.read_all(paths.outcomes_path()): + did = row.get("decision_id") + otype = row.get("outcome_type") + if isinstance(did, str) and isinstance(otype, str): + outcomes_by_decision[did] = otype # last write wins (newest) + except Exception: # noqa: BLE001 + pass + + # Build the {decision_id → decision_row} index from decisions.jsonl + # via the merged-amendment view so we see the latest decision text. + decisions_by_id: dict[str, dict[str, Any]] = {} + try: + for r in jsonl_store.read_merged(paths.decisions_path()): + did = r.get("id") + if isinstance(did, str): + decisions_by_id[did] = r + except Exception: # noqa: BLE001 + pass + + # Stage 1: filter productive sessions. + productive: list[dict[str, Any]] = [] + for s in sessions: + if s.get("_amendment_to_id"): + continue # session-log amendments don't drive induction + task_type = s.get("task_type") + if not isinstance(task_type, str) or not task_type: + continue + decision_ids = s.get("decision_ids") or [] + if not isinstance(decision_ids, list) or not decision_ids: + continue + kept = 0 + classified = 0 + for did in decision_ids: + if not isinstance(did, str): + continue + outcome = outcomes_by_decision.get(did) + if outcome is None: + continue + classified += 1 + if outcome == "kept": + kept += 1 + if classified == 0: + continue + if kept / classified < _KEPT_THRESHOLD: + continue + productive.append(s) + + if not productive: + return [] + + # Stage 2: group by task_type. + by_task_type: dict[str, list[dict[str, Any]]] = {} + for s in productive: + by_task_type.setdefault(s["task_type"], []).append(s) + + # Stage 3-4: cluster + filter. + proposals: list[dict[str, Any]] = [] + for task_type, group in by_task_type.items(): + sessions_with_tags: list[tuple[dict[str, Any], set[str]]] = [] + for s in group: + tags: set[str] = set() + for did in s.get("decision_ids") or []: + if not isinstance(did, str): + continue + d = decisions_by_id.get(did) + if d is None: + continue + for t in d.get("tags") or []: + if isinstance(t, str) and t: + tags.add(t) + sessions_with_tags.append((s, tags)) + + clusters: list[dict[str, Any]] = [] + for s, tags in sessions_with_tags: + matched = False + for cluster in clusters: + if _jaccard(tags, cluster["tags"]) >= _TAG_JACCARD_THRESHOLD: + cluster["sessions"].append(s) + cluster["tags"] = cluster["tags"] | tags + matched = True + break + if not matched: + clusters.append( + {"sessions": [s], "tags": set(tags), "task_type": task_type} + ) + + for cluster in clusters: + if len(cluster["sessions"]) < _MIN_CLUSTER_SIZE: + continue + proposals.append(_render_proposal(cluster, decisions_by_id=decisions_by_id)) + + return proposals + + +def _render_proposal( + cluster: dict[str, Any], *, decisions_by_id: dict[str, dict[str, Any]] +) -> dict[str, Any]: + """Stage 5: deterministic procedure rendering.""" + task_type = cluster["task_type"] + tags = sorted(cluster["tags"]) + top_tags = tags[:3] + name = ( + f"{task_type}: {', '.join(top_tags)}" if top_tags else f"{task_type}: induced" + ) + lines: list[str] = [] + for s in cluster["sessions"]: + task_line = (s.get("task") or "").strip() + if task_line: + lines.append(f"- {task_line}") + for did in s.get("decision_ids") or []: + if not isinstance(did, str): + continue + d = decisions_by_id.get(did) + if d is None: + continue + text = (d.get("decision") or "").strip() + if not text: + continue + snippet = ( + text + if len(text) <= _PROCEDURE_DECISION_TRUNC + else text[: _PROCEDURE_DECISION_TRUNC - 1] + "…" + ) + lines.append(f" • {snippet}") + if len(lines) >= _PROCEDURE_LINE_CAP: + break + if len(lines) >= _PROCEDURE_LINE_CAP: + break + procedure = "\n".join(lines).strip() or "(no rendered procedure body)" + return { + "name": name, + "summary": ( + f"Induced from {len(cluster['sessions'])} productive " + f"{task_type} session(s) sharing tags: {', '.join(top_tags)}" + ), + "procedure": procedure, + "task_type": task_type, + "tags": tags, + "source_session_ids": [str(s.get("session_id")) for s in cluster["sessions"]], + "session_count": len(cluster["sessions"]), + } + + +# ────────────────────────────────────────────────────────────────────── +# Dry-run / Apply +# ────────────────────────────────────────────────────────────────────── + + +def _write_proposals(proposals: list[dict[str, Any]]) -> int: + """Stage 6: write proposals to .codevira/induction_proposals.jsonl.""" + paths.ensure_dirs() + dest = paths.induction_proposals_path() + try: + ts = datetime.now(timezone.utc).isoformat() + for p in proposals: + jsonl_store.append( + dest, + { + "ts": ts, + "name": p["name"], + "summary": p.get("summary"), + "procedure": p["procedure"], + "task_type": p["task_type"], + "tags": p["tags"], + "source_session_ids": p["source_session_ids"], + "session_count": p["session_count"], + "_schema_v": 1, + }, + ) + except OSError as exc: + sys.stderr.write(f"codevira induce-skills: could not write proposals: {exc}\n") + return 1 + sys.stdout.write( + f"codevira induce-skills: wrote {len(proposals)} proposal(s) to " + f"{dest}.\n Review them and re-run with --apply (add --yes for " + f"non-interactive commit) to record into skills.jsonl.\n" + ) + return 0 + + +def _apply_proposals(proposals: list[dict[str, Any]], *, yes: bool) -> int: + """Stage 7: record proposals as induced skills.""" + from mcp_server.storage import skills_store + + paths.ensure_dirs() + recorded = 0 + skipped = 0 + + for p in proposals: + if not yes: + sys.stdout.write("\n" + "─" * 70 + "\n") + sys.stdout.write(f"Proposed skill: {p['name']}\n") + sys.stdout.write(f" ({p['summary']})\n\n") + sys.stdout.write(p["procedure"] + "\n\n") + sys.stdout.write("Record this skill? [y/N]: ") + sys.stdout.flush() + try: + resp = input().strip().lower() + except EOFError: + resp = "n" + if resp not in ("y", "yes"): + skipped += 1 + continue + + try: + kid = skills_store.record( + name=p["name"], + procedure=p["procedure"], + summary=p.get("summary"), + triggers={"tags": p["tags"], "file_patterns": []}, + source=skills_store.SOURCE_INDUCED, + source_session_ids=p["source_session_ids"], + ) + sys.stdout.write(f" ✓ recorded {kid}\n") + recorded += 1 + except ValueError as exc: + sys.stderr.write(f" ✗ skipped: {exc}\n") + skipped += 1 + except Exception as exc: # noqa: BLE001 + sys.stderr.write(f" ✗ unexpected error: {exc}\n") + skipped += 1 + + sys.stdout.write( + f"\ncodevira induce-skills: recorded {recorded} / {len(proposals)} " + f"({skipped} skipped).\n" + ) + return 0 if recorded > 0 else (1 if proposals else 0) + + +# ────────────────────────────────────────────────────────────────────── +# Helpers +# ────────────────────────────────────────────────────────────────────── + + +def _jaccard(a: set[str], b: set[str]) -> float: + if not a and not b: + return 1.0 # two empty sets cluster together + union = a | b + if not union: + return 0.0 + return len(a & b) / len(union) diff --git a/mcp_server/storage/outcomes_writer.py b/mcp_server/storage/outcomes_writer.py index ac00be4..d724e77 100644 --- a/mcp_server/storage/outcomes_writer.py +++ b/mcp_server/storage/outcomes_writer.py @@ -205,6 +205,25 @@ def observe_all(*, project_root: Path | None = None) -> dict[str, Any]: new_outcomes: list[dict[str, Any]] = [] head_sha = _run_git(["rev-parse", "HEAD"], project_root).strip() + # v3.1.0 M5: cache session_id → skill_ids so the per-decision + # fan-out is O(1) lookup rather than O(N_sessions) per decision. + session_skills: dict[str, set[str]] = {} + try: + for s in jsonl_store.read_all(paths.sessions_path(project_root)): + sid = s.get("session_id") + if not isinstance(sid, str): + continue + skill_ids = s.get("skill_ids") or [] + if not isinstance(skill_ids, list): + continue + bucket = session_skills.setdefault(sid, set()) + for k in skill_ids: + if isinstance(k, str) and k: + bucket.add(k) + except Exception as exc: # noqa: BLE001 + logger.warning("outcomes_writer: failed to read sessions: %s", exc) + fanout_summary = {"skill_marks_success": 0, "skill_marks_failure": 0} + for d in active: if d.get("is_superseded") or d.get("superseded_by"): continue # don't track outcomes for retired decisions @@ -214,6 +233,32 @@ def observe_all(*, project_root: Path | None = None) -> dict[str, Any]: continue counts[outcome_type] += 1 + # v3.1.0 M5: fan out the classification to skills used in the + # same session. ``kept`` is a success signal; ``reverted`` is + # a failure. ``modified`` is no-op (the decision's intent + # survives — we just don't get a signal). Best-effort: if + # skills_store fails, decision outcome still lands. + if outcome_type in ("kept", "reverted"): + d_session_id = d.get("session_id") or "" + skill_ids = session_skills.get(d_session_id, set()) + if skill_ids: + try: + from mcp_server.storage import skills_store + + success = outcome_type == "kept" + for sid in skill_ids: + skills_store.mark_used(sid, success=success) + if success: + fanout_summary["skill_marks_success"] += 1 + else: + fanout_summary["skill_marks_failure"] += 1 + except Exception as exc: # noqa: BLE001 + logger.warning( + "outcomes_writer: skill fan-out failed for %s: %s", + d.get("id"), + exc, + ) + # Skip if the SAME outcome already exists for this decision at this HEAD. if d.get("outcome") == outcome_type: continue @@ -259,6 +304,8 @@ def observe_all(*, project_root: Path | None = None) -> dict[str, Any]: **counts, "outcomes_appended": len(new_outcomes), "head_sha": head_sha[:12] if head_sha else None, + # v3.1.0 M5: skill reinforcement fan-out totals. + **fanout_summary, } diff --git a/mcp_server/storage/paths.py b/mcp_server/storage/paths.py index eca17ab..9dccd92 100644 --- a/mcp_server/storage/paths.py +++ b/mcp_server/storage/paths.py @@ -109,6 +109,17 @@ def skills_path(project_root: Path | None = None) -> Path: return codevira_dir(project_root) / "skills.jsonl" +def induction_proposals_path(project_root: Path | None = None) -> Path: + """v3.1.0 M5: human-review staging file for ``codevira induce-skills``. + + Without ``--apply``, the induction CLI writes proposed skills here + so the user can scan them before committing to ``skills.jsonl``. + Lives in ``.codevira/`` (canonical, gitable) so a teammate's + induction run isn't lost. + """ + return codevira_dir(project_root) / "induction_proposals.jsonl" + + def working_archived_path(session_id: str, project_root: Path | None = None) -> Path: """v3.1.0 M2: opt-in commit target for working-memory entries. diff --git a/mcp_server/storage/sessions_store.py b/mcp_server/storage/sessions_store.py index 077d7db..f14a0bf 100644 --- a/mcp_server/storage/sessions_store.py +++ b/mcp_server/storage/sessions_store.py @@ -43,8 +43,21 @@ def write( summary: str | None = None, decision_ids: list[str] | None = None, outcome: str | None = None, + task_type: str | None = None, + skill_ids: list[str] | None = None, ) -> str: - """Append a single session log; return generated id.""" + """Append a single session log; return generated id. + + v3.1.0 M5: ``task_type`` and ``skill_ids`` are additive (optional) + fields. Legacy v3.0.x readers tolerate their absence. They feed + the M5 induction pipeline + the outcomes_writer skill-fan-out: + - task_type ∈ {feature, bug, refactor, release, docs, other}; + induction clusters sessions by task_type. + - skill_ids: skills used during the session; when + ``outcomes_writer`` classifies the session's decisions, the + result is fanned out as ``mark_used(skill_id, success=...)`` + for each. + """ paths.ensure_dirs() record = { "ts": datetime.now(timezone.utc).isoformat(), @@ -54,6 +67,9 @@ def write( "summary": summary, "decision_ids": list(decision_ids or []), "outcome": outcome, + # v3.1.0 M5 + "task_type": task_type, + "skill_ids": list(skill_ids or []), # v3.1.0 M1: provenance tagging — which IDE/agent/machine # wrote this session log. Reads tolerate absence on legacy # records (v3.0.x sessions have no origin). @@ -87,6 +103,10 @@ def write_many(logs: list[dict[str, Any]]) -> tuple[list[str], list[dict[str, An "summary": log.get("summary"), "decision_ids": list(log.get("decisions") or log.get("decision_ids") or []), "outcome": log.get("outcome"), + # v3.1.0 M5: optional induction-pipeline + skill-fan-out + # signals. Legacy records tolerate absence. + "task_type": log.get("task_type"), + "skill_ids": list(log.get("skill_ids") or []), # v3.1.0 M1: provenance tagging (see write() above). "origin": origin.current_origin(), } diff --git a/tests/test_cli_induce.py b/tests/test_cli_induce.py new file mode 100644 index 0000000..9304562 --- /dev/null +++ b/tests/test_cli_induce.py @@ -0,0 +1,342 @@ +""" +Tests for mcp_server.cli_induce — v3.1.0 M5. + +Covers the deterministic induction pipeline + outcomes_writer skill +fan-out integration. +""" + +from __future__ import annotations + +from datetime import datetime, timezone +from pathlib import Path + +import pytest + +import mcp_server.paths as paths_module +from mcp_server.cli_induce import ( + _build_proposals, + _jaccard, + cmd_induce_skills, +) +from mcp_server.storage import ( + decisions_store, + jsonl_store, + paths, + sessions_store, + skills_store, +) + + +@pytest.fixture +def project(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: + root = tmp_path / "proj" + (root / ".codevira").mkdir(parents=True) + (root / ".codevira" / "config.yaml").write_text("project:\n name: test\n") + monkeypatch.setattr(paths_module, "_project_dir_override", None) + monkeypatch.chdir(root.resolve()) + return root + + +def _seed_outcome(decision_id: str, outcome_type: str) -> None: + """Helper: append a row to outcomes.jsonl.""" + jsonl_store.append( + paths.outcomes_path(), + { + "ts": datetime.now(timezone.utc).isoformat(), + "decision_id": decision_id, + "outcome_type": outcome_type, + }, + ) + + +# ────────────────────────────────────────────────────────────────────── +# Jaccard helper +# ────────────────────────────────────────────────────────────────────── + + +class TestJaccard: + def test_identical_sets_score_1(self) -> None: + assert _jaccard({"a", "b"}, {"a", "b"}) == 1.0 + + def test_disjoint_sets_score_0(self) -> None: + assert _jaccard({"a"}, {"b"}) == 0.0 + + def test_partial_overlap(self) -> None: + # {a, b} ∩ {b, c} = {b}; union = 3 → 1/3 + assert abs(_jaccard({"a", "b"}, {"b", "c"}) - (1 / 3)) < 1e-9 + + def test_two_empty_sets_score_1(self) -> None: + assert _jaccard(set(), set()) == 1.0 + + +# ────────────────────────────────────────────────────────────────────── +# Pipeline: _build_proposals +# ────────────────────────────────────────────────────────────────────── + + +class TestBuildProposals: + def test_empty_state_returns_no_proposals(self, project: Path) -> None: + assert _build_proposals() == [] + + def test_below_threshold_skipped(self, project: Path) -> None: + """Sessions where <80% of decisions are 'kept' don't propose.""" + d1 = decisions_store.record(decision="A", tags=["t1"]) + d2 = decisions_store.record(decision="B", tags=["t1"]) + _seed_outcome(d1, "kept") + _seed_outcome(d2, "reverted") + # 50% kept → below threshold. + for i in range(3): + sessions_store.write( + f"sess-{i}", + task=f"task {i}", + task_type="bug", + decision_ids=[d1, d2], + ) + assert _build_proposals() == [] + + def test_below_min_cluster_size_skipped(self, project: Path) -> None: + """Clusters with <3 sessions don't propose.""" + d1 = decisions_store.record(decision="A", tags=["t1", "t2"]) + _seed_outcome(d1, "kept") + for i in range(2): # only 2 sessions + sessions_store.write( + f"sess-{i}", + task=f"task {i}", + task_type="bug", + decision_ids=[d1], + ) + assert _build_proposals() == [] + + def test_productive_cluster_proposes_skill(self, project: Path) -> None: + """3+ productive sessions sharing tags → 1 proposal.""" + d1 = decisions_store.record( + decision="Use bcrypt", file_path="auth.py", tags=["auth", "hash"] + ) + d2 = decisions_store.record( + decision="Rate-limit logins", file_path="auth.py", tags=["auth"] + ) + for did in (d1, d2): + _seed_outcome(did, "kept") + for i in range(3): + sessions_store.write( + f"sess-{i}", + task=f"harden auth flow {i}", + task_type="bug", + decision_ids=[d1, d2], + ) + proposals = _build_proposals() + assert len(proposals) == 1 + p = proposals[0] + assert p["task_type"] == "bug" + assert "auth" in p["tags"] + assert p["session_count"] == 3 + assert p["source_session_ids"] == ["sess-0", "sess-1", "sess-2"] + assert "bug:" in p["name"] + # Procedure includes session task + decision text. + assert "harden auth flow" in p["procedure"] + assert "bcrypt" in p["procedure"] + + def test_distinct_task_types_form_distinct_clusters(self, project: Path) -> None: + d1 = decisions_store.record(decision="A", tags=["x"]) + d2 = decisions_store.record(decision="B", tags=["x"]) + _seed_outcome(d1, "kept") + _seed_outcome(d2, "kept") + for i in range(3): + sessions_store.write( + f"bug-{i}", + task="x", + task_type="bug", + decision_ids=[d1], + ) + for i in range(3): + sessions_store.write( + f"feat-{i}", + task="x", + task_type="feature", + decision_ids=[d2], + ) + proposals = _build_proposals() + task_types = {p["task_type"] for p in proposals} + assert task_types == {"bug", "feature"} + + def test_low_jaccard_sessions_dont_cluster(self, project: Path) -> None: + """Sessions whose decision tags don't overlap above the + threshold form separate (small, dropped) clusters.""" + d1 = decisions_store.record(decision="A", tags=["alpha"]) + d2 = decisions_store.record(decision="B", tags=["beta"]) + d3 = decisions_store.record(decision="C", tags=["gamma"]) + for did in (d1, d2, d3): + _seed_outcome(did, "kept") + sessions_store.write( + "alpha-1", task="alpha", task_type="refactor", decision_ids=[d1] + ) + sessions_store.write( + "beta-1", task="beta", task_type="refactor", decision_ids=[d2] + ) + sessions_store.write( + "gamma-1", task="gamma", task_type="refactor", decision_ids=[d3] + ) + # Three sessions but 3 disjoint single-session clusters → no proposals. + assert _build_proposals() == [] + + +# ────────────────────────────────────────────────────────────────────── +# cmd_induce_skills: dry-run + apply paths +# ────────────────────────────────────────────────────────────────────── + + +def _seed_productive_cluster(project: Path) -> None: + d1 = decisions_store.record( + decision="Use bcrypt for password hashing", + tags=["auth", "security"], + ) + d2 = decisions_store.record( + decision="Rate-limit login attempts", + tags=["auth", "security"], + ) + for did in (d1, d2): + _seed_outcome(did, "kept") + for i in range(3): + sessions_store.write( + f"sess-{i}", + task=f"harden authentication {i}", + task_type="bug", + decision_ids=[d1, d2], + ) + + +class TestCmdInduceSkills: + def test_dry_run_writes_proposals_jsonl( + self, project: Path, capsys: pytest.CaptureFixture[str] + ) -> None: + _seed_productive_cluster(project) + rc = cmd_induce_skills(apply=False) + assert rc == 0 + out = capsys.readouterr().out + assert "wrote 1 proposal" in out + proposals_path = paths.induction_proposals_path() + assert proposals_path.is_file() + proposals = jsonl_store.read_all(proposals_path) + assert len(proposals) == 1 + assert proposals[0]["task_type"] == "bug" + + def test_no_candidates_returns_zero( + self, project: Path, capsys: pytest.CaptureFixture[str] + ) -> None: + rc = cmd_induce_skills(apply=False) + assert rc == 0 + out = capsys.readouterr().out + assert "no induced-skill candidates" in out + # No proposals file written. + assert not paths.induction_proposals_path().is_file() + + def test_apply_yes_records_skill( + self, project: Path, capsys: pytest.CaptureFixture[str] + ) -> None: + _seed_productive_cluster(project) + rc = cmd_induce_skills(apply=True, yes=True) + assert rc == 0 + out = capsys.readouterr().out + assert "recorded" in out + + # The new skill is in skills.jsonl with source="induced" + session refs. + live = skills_store.list_all() + assert len(live) >= 1 + induced = [s for s in live if s.get("source") == "induced"] + assert len(induced) == 1 + s = induced[0] + # The induced skill carries its trigger tags + provenance refs; + # task_type isn't on the skill schema (it lives on the source + # sessions instead — induce-skills uses it for clustering only). + assert sorted(s["triggers"]["tags"]) == ["auth", "security"] + assert s["source_session_ids"] == ["sess-0", "sess-1", "sess-2"] + + +# ────────────────────────────────────────────────────────────────────── +# outcomes_writer skill fan-out +# ────────────────────────────────────────────────────────────────────── + + +class TestOutcomesWriterFanout: + def test_kept_classification_marks_skill_success( + self, + project: Path, + monkeypatch: pytest.MonkeyPatch, + ) -> None: + """When outcomes_writer classifies a session's decision as + 'kept', each skill referenced by the session gets a + mark_used(success=True) call.""" + # Build a skill, a session referencing it, and a decision in + # the session. + kid = skills_store.record(name="auth-skill", procedure="p") + d1 = decisions_store.record( + decision="Use bcrypt", + file_path="auth.py", + tags=["auth"], + session_id="sess-1", + ) + sessions_store.write( + "sess-1", + task="harden auth", + decision_ids=[d1], + skill_ids=[kid], + task_type="bug", + ) + + # Stub _classify_decision so it always returns 'kept' — we don't + # want to depend on git state in unit tests. + from mcp_server.storage import outcomes_writer + + monkeypatch.setattr( + outcomes_writer, "_classify_decision", lambda *_a, **_kw: "kept" + ) + monkeypatch.setattr(outcomes_writer, "_git_available", lambda *_a, **_kw: True) + monkeypatch.setattr( + outcomes_writer, "_run_git", lambda *_a, **_kw: "deadbeefcafe" + ) + + summary = outcomes_writer.observe_all(project_root=project) + assert summary["skill_marks_success"] == 1 + assert summary["skill_marks_failure"] == 0 + + # The skill's success_count is incremented. + skill = skills_store.get(kid) + assert skill is not None + assert skill["success_count"] == 1 + + def test_reverted_classification_marks_skill_failure( + self, + project: Path, + monkeypatch: pytest.MonkeyPatch, + ) -> None: + kid = skills_store.record(name="auth-skill", procedure="p") + d1 = decisions_store.record( + decision="Use bcrypt", + file_path="auth.py", + tags=["auth"], + session_id="sess-1", + ) + sessions_store.write( + "sess-1", + task="harden auth", + decision_ids=[d1], + skill_ids=[kid], + task_type="bug", + ) + + from mcp_server.storage import outcomes_writer + + monkeypatch.setattr( + outcomes_writer, "_classify_decision", lambda *_a, **_kw: "reverted" + ) + monkeypatch.setattr(outcomes_writer, "_git_available", lambda *_a, **_kw: True) + monkeypatch.setattr( + outcomes_writer, "_run_git", lambda *_a, **_kw: "deadbeefcafe" + ) + + summary = outcomes_writer.observe_all(project_root=project) + assert summary["skill_marks_failure"] == 1 + assert summary["skill_marks_success"] == 0 + skill = skills_store.get(kid) + assert skill is not None + assert skill["failure_count"] == 1 From 96f263908e7886881a734044f12e4591dde8aed2 Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Fri, 29 May 2026 14:34:56 +0530 Subject: [PATCH 13/44] =?UTF-8?q?feat(v3.1.0):=20M6=20Phase=20B=20?= =?UTF-8?q?=E2=80=94=20cross-IDE=20consensus=20check=20(read-only)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The consensus subsystem ships as Phase B in v3.1.0 — a read-only scan that surfaces conflicts between decisions written by different IDEs to .codevira/pending_conflicts.jsonl for human review. No amendment rows are written on decisions; the handshake protocol where one IDE proposes a supersession is M7 (opt-in, default off). Storage layer (consensus_store.py): - Per-IDE checkpoint at .codevira/checkpoints/.json, keyed on last_seen_decision_id. Plain string ordering works because IDs are zero-padded base-36 — no clock drift exposure. - append_conflict / list_pending — PC-prefixed append-only log. - scan_and_materialize(): 1. Resolve current_ide from CODEVIRA_IDE env (bails out cleanly when 'unknown' so we don't materialize garbage). 2. Pull decisions via decisions_store._read_merged (skips superseded). 3. Filter to decisions with id > checkpoint. 4. Partition by origin.ide into current_corpus + foreign. 5. For each foreign × current_corpus pair, run check_conflict tokenize/Jaccard/overlap math. Record duplicate or asymmetric-conflict matches. 6. Advance the checkpoint to the max id seen. Reuses the existing _tokenize / _jaccard / _overlap_coefficient helpers from check_conflict so the conflict-shape math is one source of truth. CLI + MCP tools: - 'codevira consensus check' (cli_consensus.cmd_consensus_check) runs the scan and prints a summary. Exit 0 always. - consensus_check MCP tool: same scan, returns the summary dict. - consensus_status MCP tool: count + top-K rows for surface rendering. Reused by the get_session_context panel. get_session_context gains a 'consensus' field with pending_count + top-3 rows ordered by (do_not_revert × recency). Capped at ~200 tokens worth of summary. Best-effort: any storage failure surfaces an empty count rather than crashing. Schema additions: - paths: pending_conflicts_path() + ide_checkpoint_path(ide_key). - PC-prefixed monotonic IDs. - Each row carries _schema_v: 1 + current_origin + foreign_origin so future readers can reconstruct the cross-IDE context. Tests: tests/test_cli_consensus.py — 16 tests covering checkpoint roundtrip + malformed recovery; scan_and_materialize (unknown-IDE bail, no-foreign, foreign-duplicate, checkpoint advancement, second- scan delta, superseded skipped); cmd_consensus_check stdout; consensus_check / consensus_status MCP tools; get_session_context consensus panel (empty + populated). 758 tests across storage + engine + tools + check_conflict + CLI pass green; zero regressions from M5 baseline. CLI smoke verified: 'codevira consensus check --help' renders cleanly. Plan M6. M7 (Phase C handshake) and M8 (reflections) remain. Co-Authored-By: Claude Opus 4.7 --- mcp_server/cli.py | 29 +++ mcp_server/cli_consensus.py | 55 +++++ mcp_server/server.py | 37 ++++ mcp_server/storage/consensus_store.py | 292 ++++++++++++++++++++++++++ mcp_server/storage/paths.py | 25 +++ mcp_server/tools/consensus.py | 52 +++++ mcp_server/tools/learning.py | 34 +++ tests/test_cli_consensus.py | 280 ++++++++++++++++++++++++ 8 files changed, 804 insertions(+) create mode 100644 mcp_server/cli_consensus.py create mode 100644 mcp_server/storage/consensus_store.py create mode 100644 mcp_server/tools/consensus.py create mode 100644 tests/test_cli_consensus.py diff --git a/mcp_server/cli.py b/mcp_server/cli.py index f1afcd5..3355a73 100644 --- a/mcp_server/cli.py +++ b/mcp_server/cli.py @@ -1239,6 +1239,23 @@ def error(self, message): # type: ignore[override] ), ) + # v3.1.0 M6 Phase B: cross-IDE consensus check (read-only). The + # MCP surface (consensus_check / consensus_status) is also exposed. + consensus_parser = subparsers.add_parser( + "consensus", + help="Cross-IDE consensus operations (v3.1.0 M6). `check` " + "materializes conflicts between decisions written by this IDE " + "vs other IDEs into .codevira/pending_conflicts.jsonl for " + "human review. No automatic resolution; the handshake " + "protocol is M7 (opt-in).", + ) + consensus_sub = consensus_parser.add_subparsers(dest="consensus_action") + consensus_sub.add_parser( + "check", + help="Scan for conflicts since the last checkpoint; advance " + "this IDE's checkpoint.", + ) + # v3.1.0 M5: induced-skill candidate generation. CLI-only — the MCP # surface for skills is record_skill / get_skill / list_skills. induce_parser = subparsers.add_parser( @@ -1543,6 +1560,18 @@ def error(self, message): # type: ignore[override] keep_data=getattr(args, "keep_data", False), ) sys.exit(rc) + elif args.command == "consensus": + # v3.1.0 M6: cross-IDE consensus CLI. + consensus_action = getattr(args, "consensus_action", None) + if consensus_action == "check": + from mcp_server.cli_consensus import cmd_consensus_check + + sys.exit(cmd_consensus_check()) + sys.stderr.write( + "codevira consensus: missing subcommand. Try `codevira " + "consensus check`.\n" + ) + sys.exit(2) elif args.command == "induce-skills": # v3.1.0 M5: skill induction CLI. from mcp_server.cli_induce import cmd_induce_skills diff --git a/mcp_server/cli_consensus.py b/mcp_server/cli_consensus.py new file mode 100644 index 0000000..3116eb6 --- /dev/null +++ b/mcp_server/cli_consensus.py @@ -0,0 +1,55 @@ +""" +cli_consensus.py — v3.1.0 M6 Phase B: ``codevira consensus check`` CLI. + +Read-only scan that materializes cross-IDE conflicts to +``.codevira/pending_conflicts.jsonl`` for human review. Calls into +``consensus_store.scan_and_materialize`` so the same path also +backs the ``consensus_check`` MCP tool. +""" + +from __future__ import annotations + +import sys + + +def cmd_consensus_check(*, verbose: bool = False) -> int: + """Entry point for ``codevira consensus check``. + + Returns 0 on success (including no conflicts found). Non-zero only + on storage / IO errors raised by the scan. + """ + try: + from mcp_server.storage import consensus_store + except Exception as exc: # noqa: BLE001 + sys.stderr.write( + f"codevira consensus check: consensus_store import failed: {exc}\n" + ) + return 1 + + try: + summary = consensus_store.scan_and_materialize() + except Exception as exc: # noqa: BLE001 + sys.stderr.write(f"codevira consensus check: scan failed: {exc}\n") + return 1 + + if summary.get("skipped_reason"): + sys.stdout.write( + f"codevira consensus check: skipped — {summary['skipped_reason']}.\n" + f" Set CODEVIRA_IDE in your MCP config (ide_inject.py handles " + f"this for newly-injected IDE configs) and re-run.\n" + ) + return 0 + + sys.stdout.write( + f"codevira consensus check: scanned {summary.get('scanned', 0)} " + f"decision(s) since last checkpoint " + f"(foreign-IDE: {summary.get('foreign', 0)}; " + f"conflicts recorded: {summary.get('conflicts_recorded', 0)}).\n" + f" Checkpoint advanced to " + f"{summary.get('new_checkpoint') or ''}.\n" + ) + if summary.get("conflicts_recorded"): + from mcp_server.storage import paths + + sys.stdout.write(f" Review: {paths.pending_conflicts_path()}\n") + return 0 diff --git a/mcp_server/server.py b/mcp_server/server.py index 65b51c8..a57c46e 100644 --- a/mcp_server/server.py +++ b/mcp_server/server.py @@ -1333,6 +1333,34 @@ async def list_tools() -> list[Tool]: "required": ["file_path"], }, ), + # ---- v3.1.0 M6 Phase B: consensus (read-only) ---- + Tool( + name="consensus_check", + description=( + "v3.1.0 M6 Phase B: Scan decisions written since this IDE's " + "checkpoint, surface cross-IDE conflicts to " + ".codevira/pending_conflicts.jsonl, advance the checkpoint. " + "Read-only — no automatic resolution. The Phase C handshake " + "protocol (one IDE proposing supersession to another) is " + "M7 and ships disabled by default." + ), + inputSchema={"type": "object", "properties": {}}, + ), + Tool( + name="consensus_status", + description=( + "v3.1.0 M6: Return the count of pending cross-IDE conflicts + " + "top-K rows (default 3). Useful as a status check from inside " + "the agent loop; the get_session_context payload also " + "carries a 'consensus' panel based on this data." + ), + inputSchema={ + "type": "object", + "properties": { + "top_k": {"type": "integer", "default": 3}, + }, + }, + ), # ---- v1.5: Deep Graph Intelligence Tools ---- Tool( name="query_graph", @@ -1744,6 +1772,15 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]: from mcp_server.tools.spatial import spatial_affordances result = spatial_affordances(file_path=arguments["file_path"]) + # ---- v3.1.0 M6 Phase B: consensus dispatch ---- + elif name == "consensus_check": + from mcp_server.tools.consensus import consensus_check + + result = consensus_check() + elif name == "consensus_status": + from mcp_server.tools.consensus import consensus_status + + result = consensus_status(top_k=arguments.get("top_k", 3)) else: result = {"error": f"Unknown tool: {name}"} diff --git a/mcp_server/storage/consensus_store.py b/mcp_server/storage/consensus_store.py new file mode 100644 index 0000000..553da39 --- /dev/null +++ b/mcp_server/storage/consensus_store.py @@ -0,0 +1,292 @@ +""" +consensus_store.py — v3.1.0 M6 Phase B: cross-IDE conflict materialization. + +The consensus subsystem in v3.1.0 ships as Phase B — read-only +conflict surfacing. It scans decisions written since this IDE's last +checkpoint, looks for ones authored by a *different* IDE that conflict +with a decision authored by *this* IDE since the same checkpoint, and +records the conflict in ``pending_conflicts.jsonl`` for human review. + +Phase B never writes amendment rows; the handshake protocol is M7. + +# Single-machine multi-IDE scope + +v3.1.0 assumes one machine sharing one filesystem across multiple +IDEs. Cross-machine conflicts (introduced via ``git pull`` of a +teammate's branch) are scanned the same way but not auto-resolved; +the human decides via ``supersede_decision`` if needed. + +# Checkpoint design + +``ide_key`` → last_seen_decision_id (the largest D-id this IDE has +scanned). Decisions land in monotonically-increasing order in +decisions.jsonl thanks to ``jsonl_store.append_with_generated_id``, so +the checkpoint scalar avoids cross-machine clock drift. After each +``codevira consensus check`` run, the checkpoint advances to +``max(D-id) at scan time``. + +# Pending-conflict row schema + +:: + + { + "id": "PC000001", + "ts": "2026-05-28T10:00:00+00:00", + "current_ide": "claude_code", + "foreign_decision_id": "D000123", + "foreign_origin": {"ide", "agent_model", "host_hash", "ts"}, + "current_decision_id": "D000119", + "current_origin": {...}, + "conflict_kind": "duplicate" | "asymmetric-conflict", + "similarity": 0.78, + "summary": "", + "do_not_revert": bool, # of the existing protected decision + "_schema_v": 1, + } +""" + +from __future__ import annotations + +import json +import logging +from datetime import datetime, timezone +from typing import Any + +from mcp_server.storage import atomic, jsonl_store, paths + +logger = logging.getLogger(__name__) + + +SCHEMA_V = 1 + +CONFLICT_KIND_DUPLICATE = "duplicate" +CONFLICT_KIND_ASYMMETRIC = "asymmetric-conflict" + + +# ────────────────────────────────────────────────────────────────────── +# Checkpoint +# ────────────────────────────────────────────────────────────────────── + + +def read_checkpoint(ide_key: str) -> dict[str, Any]: + """Return ``{last_seen_decision_id, last_seen_at}`` for ``ide_key``. + + Empty dict if the file doesn't exist (first run for this IDE). + Malformed files return empty dict + log a warning — we'd rather + re-scan a few extra decisions than crash the CLI. + """ + path = paths.ide_checkpoint_path(ide_key) + if not path.is_file(): + return {} + try: + return json.loads(path.read_text(encoding="utf-8")) or {} + except (OSError, json.JSONDecodeError) as exc: + logger.warning("consensus_store.read_checkpoint(%s) failed: %s", ide_key, exc) + return {} + + +def write_checkpoint(ide_key: str, *, last_seen_decision_id: str) -> None: + """Persist the checkpoint atomically. Creates the checkpoints + subdir lazily so callers don't have to.""" + path = paths.ide_checkpoint_path(ide_key) + path.parent.mkdir(parents=True, exist_ok=True) + payload = { + "last_seen_decision_id": last_seen_decision_id, + "last_seen_at": datetime.now(timezone.utc).isoformat(), + "_schema_v": SCHEMA_V, + } + atomic.atomic_write_text(path, json.dumps(payload, indent=2) + "\n") + + +# ────────────────────────────────────────────────────────────────────── +# Pending conflicts (append-only) +# ────────────────────────────────────────────────────────────────────── + + +def append_conflict(rec: dict[str, Any]) -> str: + """Append a pending-conflict row; return the PC-id.""" + paths.ensure_dirs() + rec = dict(rec) + rec.setdefault("ts", datetime.now(timezone.utc).isoformat()) + rec.setdefault("_schema_v", SCHEMA_V) + return jsonl_store.append_with_generated_id( + paths.pending_conflicts_path(), rec, prefix="PC", width=6 + ) + + +def list_pending(*, limit: int = 50) -> list[dict[str, Any]]: + """Return pending conflict rows, newest first.""" + return jsonl_store.read_recent(paths.pending_conflicts_path(), limit=limit) + + +# ────────────────────────────────────────────────────────────────────── +# Scan +# ────────────────────────────────────────────────────────────────────── + + +def scan_and_materialize(*, current_ide: str | None = None) -> dict[str, Any]: + """The core of ``codevira consensus check``. + + Walks decisions with id > the current IDE's checkpoint. For each + decision authored by a DIFFERENT IDE, runs ``check_conflict`` + against decisions authored by ``current_ide`` since the same + checkpoint. Materializes matches into pending_conflicts.jsonl. + Advances the checkpoint to the max decision id seen. + + Returns ``{scanned, foreign, conflicts_recorded, new_checkpoint}``. + """ + # Lazy origin import so tests that monkeypatch CODEVIRA_IDE see + # the override at call time. + from mcp_server.storage import origin as origin_module + + ide_key = current_ide or origin_module.current_origin().get("ide") or "unknown" + if ide_key == "unknown": + # Without a known ide_key we can't meaningfully distinguish + # 'foreign' decisions — bail out cleanly. + return { + "scanned": 0, + "foreign": 0, + "conflicts_recorded": 0, + "skipped_reason": "current_ide=unknown (CODEVIRA_IDE not set)", + } + + checkpoint = read_checkpoint(ide_key) + last_seen = str(checkpoint.get("last_seen_decision_id") or "") + + # Pull all decisions via the merged view (skips superseded). + from mcp_server.storage import decisions_store + + merged = decisions_store._read_merged() + if not merged: + return { + "scanned": 0, + "foreign": 0, + "conflicts_recorded": 0, + "new_checkpoint": last_seen, + } + + fresh_decisions = [ + d for d in merged if _id_after(str(d.get("id") or ""), last_seen) + ] + + # Current-IDE candidates since checkpoint — used as the "what does + # the local agent believe?" corpus for the conflict check. + current_corpus = [ + d + for d in fresh_decisions + if _origin_ide(d) == ide_key + and not (d.get("is_superseded") or d.get("superseded_by")) + ] + + foreign_decisions = [d for d in fresh_decisions if _origin_ide(d) != ide_key] + + new_pcs: list[str] = [] + for fd in foreign_decisions: + if fd.get("is_superseded") or fd.get("superseded_by"): + continue + for cd in current_corpus: + kind, sim = _check_pair(fd, cd) + if kind is None: + continue + pc_rec = { + "current_ide": ide_key, + "foreign_decision_id": fd.get("id"), + "foreign_origin": fd.get("origin"), + "foreign_decision": fd.get("decision"), + "foreign_do_not_revert": bool(fd.get("do_not_revert")), + "current_decision_id": cd.get("id"), + "current_origin": cd.get("origin"), + "current_decision": cd.get("decision"), + "current_do_not_revert": bool(cd.get("do_not_revert")), + "conflict_kind": kind, + "similarity": round(sim, 3), + "summary": _short_summary(fd.get("decision") or ""), + "do_not_revert": bool( + fd.get("do_not_revert") or cd.get("do_not_revert") + ), + } + new_pcs.append(append_conflict(pc_rec)) + + max_id = last_seen + for d in fresh_decisions: + did = str(d.get("id") or "") + if _id_after(did, max_id): + max_id = did + if max_id and max_id != last_seen: + write_checkpoint(ide_key, last_seen_decision_id=max_id) + + return { + "scanned": len(fresh_decisions), + "foreign": len(foreign_decisions), + "conflicts_recorded": len(new_pcs), + "new_checkpoint": max_id or last_seen, + "current_ide": ide_key, + } + + +# ────────────────────────────────────────────────────────────────────── +# Helpers +# ────────────────────────────────────────────────────────────────────── + + +def _id_after(candidate: str, last_seen: str) -> bool: + """Monotonic D-id comparison. Empty last_seen → all decisions are + after. Plain string ordering works because the IDs are + zero-padded base-36 (``D000001`` < ``D00000Z`` < ``D000010``). + """ + if not candidate: + return False + if not last_seen: + return True + return candidate > last_seen + + +def _origin_ide(rec: dict[str, Any]) -> str: + origin = rec.get("origin") + if isinstance(origin, dict): + return str(origin.get("ide") or "unknown") + return "unknown" + + +def _check_pair(fd: dict[str, Any], cd: dict[str, Any]) -> tuple[str | None, float]: + """Reuse the Jaccard / overlap math from check_conflict, applied + pairwise. + + Returns (kind, similarity). kind is None if the pair doesn't + cross the thresholds. + """ + # Import the existing helpers — single source of truth for the + # tokenizer + Jaccard / overlap math. + from mcp_server.tools.check_conflict import ( + _CONFLICT_MIN_SHARED_TOKENS, + _CONFLICT_OVERLAP_THRESHOLD, + _DUP_THRESHOLD, + _jaccard, + _overlap_coefficient, + _tokenize, + ) + + a_tokens = _tokenize(str(fd.get("decision") or "")) + b_tokens = _tokenize(str(cd.get("decision") or "")) + if not a_tokens or not b_tokens: + return None, 0.0 + jaccard = _jaccard(a_tokens, b_tokens) + overlap = _overlap_coefficient(a_tokens, b_tokens) + shared = len(a_tokens & b_tokens) + is_protected = bool(fd.get("do_not_revert")) or bool(cd.get("do_not_revert")) + + if jaccard >= _DUP_THRESHOLD: + return CONFLICT_KIND_DUPLICATE, max(jaccard, overlap) + if ( + is_protected + and overlap >= _CONFLICT_OVERLAP_THRESHOLD + and shared >= _CONFLICT_MIN_SHARED_TOKENS + and jaccard < _DUP_THRESHOLD + ): + return CONFLICT_KIND_ASYMMETRIC, max(jaccard, overlap) + return None, 0.0 + + +def _short_summary(text: str, *, cap: int = 80) -> str: + text = text.strip() + return text if len(text) <= cap else text[: cap - 1] + "…" diff --git a/mcp_server/storage/paths.py b/mcp_server/storage/paths.py index 9dccd92..2358ec0 100644 --- a/mcp_server/storage/paths.py +++ b/mcp_server/storage/paths.py @@ -109,6 +109,31 @@ def skills_path(project_root: Path | None = None) -> Path: return codevira_dir(project_root) / "skills.jsonl" +def pending_conflicts_path(project_root: Path | None = None) -> Path: + """v3.1.0 M6: cross-IDE conflict log materialized by + ``codevira consensus check`` (Phase B). Each row is a conflict + surfaced for human review — no automatic resolution. Lives in + ``.codevira/`` (canonical, gitable) so a teammate's pending + review survives `git pull`. + + See ``working_archived_path`` for the D000012 lock note. + """ + return codevira_dir(project_root) / "pending_conflicts.jsonl" + + +def ide_checkpoint_path(ide_key: str, project_root: Path | None = None) -> Path: + """v3.1.0 M6: per-IDE checkpoint marking the last decision id this + IDE has scanned. ``codevira consensus check`` updates this after + each run so we only re-examine decisions written by other IDEs + since the checkpoint. + + Filename is the ide_key (claude_code, cursor, …). Callers MUST + keep ide_key filesystem-safe; the v3.1.0 origin schema enforces + a fixed enum so this is safe by construction. + """ + return codevira_dir(project_root) / "checkpoints" / f"{ide_key}.json" + + def induction_proposals_path(project_root: Path | None = None) -> Path: """v3.1.0 M5: human-review staging file for ``codevira induce-skills``. diff --git a/mcp_server/tools/consensus.py b/mcp_server/tools/consensus.py new file mode 100644 index 0000000..ffd0a50 --- /dev/null +++ b/mcp_server/tools/consensus.py @@ -0,0 +1,52 @@ +""" +consensus.py — v3.1.0 M6 Phase B MCP tools for cross-IDE consensus. + +Two read-only tools cover the agent-facing surface: + + - consensus_check — run the cross-IDE conflict scan; materialize + new conflicts to pending_conflicts.jsonl; + advance this IDE's checkpoint. + - consensus_status — return counts + top-3 pending conflicts (for + the get_session_context panel + interactive + queries). + +Phase B does NOT write amendment rows on decisions. The handshake +protocol that lets one IDE supersede another IDE's protected decision +is M7 (opt-in, default off). +""" + +from __future__ import annotations + +from typing import Any + +from mcp_server.storage import consensus_store + + +def consensus_check() -> dict[str, Any]: + """Run the scan; return the summary dict produced by + ``consensus_store.scan_and_materialize``.""" + return consensus_store.scan_and_materialize() + + +def consensus_status(*, top_k: int = 3) -> dict[str, Any]: + """Return the count of pending conflicts + top-K rows for surface + rendering.""" + pending = consensus_store.list_pending(limit=max(top_k, 1) * 4) + return { + "count": len(pending), + "pending": [ + { + "pending_conflict_id": r.get("id"), + "ts": r.get("ts"), + "current_ide": r.get("current_ide"), + "foreign_decision_id": r.get("foreign_decision_id"), + "foreign_origin": r.get("foreign_origin"), + "current_decision_id": r.get("current_decision_id"), + "conflict_kind": r.get("conflict_kind"), + "similarity": r.get("similarity"), + "summary": r.get("summary"), + "do_not_revert": r.get("do_not_revert"), + } + for r in pending[:top_k] + ], + } diff --git a/mcp_server/tools/learning.py b/mcp_server/tools/learning.py index c6d051d..99c6201 100644 --- a/mcp_server/tools/learning.py +++ b/mcp_server/tools/learning.py @@ -633,10 +633,44 @@ def get_session_context(since: str | None = None) -> dict: except Exception: pass + # v3.1.0 M6 Phase B: consensus panel. Top-3 pending cross-IDE + # conflicts ordered by (do_not_revert × recency). Capped at + # ~200 tokens. Best-effort: missing pending_conflicts.jsonl, + # store errors, etc. surface an empty count without crashing. + consensus_panel: dict = {"pending_count": 0, "top": []} + try: + from mcp_server.storage import consensus_store + + pending = consensus_store.list_pending(limit=20) + # Sort: do_not_revert first, then by recency (already + # newest-first from read_recent). + pending.sort( + key=lambda r: (bool(r.get("do_not_revert")), r.get("ts") or ""), + reverse=True, + ) + consensus_panel = { + "pending_count": len(pending), + "top": [ + { + "pending_conflict_id": r.get("id"), + "foreign_decision_id": r.get("foreign_decision_id"), + "foreign_ide": (r.get("foreign_origin") or {}).get("ide"), + "current_decision_id": r.get("current_decision_id"), + "conflict_kind": r.get("conflict_kind"), + "do_not_revert": r.get("do_not_revert"), + "summary": _truncate(r.get("summary"), 80), + } + for r in pending[:3] + ], + } + except Exception: + pass + return { "current_phase": current_phase, "drift_warning": drift_warning, "working": working_panel, + "consensus": consensus_panel, "recent_sessions": [ { "session_id": s["session_id"], diff --git a/tests/test_cli_consensus.py b/tests/test_cli_consensus.py new file mode 100644 index 0000000..28544fc --- /dev/null +++ b/tests/test_cli_consensus.py @@ -0,0 +1,280 @@ +""" +Tests for mcp_server.cli_consensus + mcp_server.storage.consensus_store ++ mcp_server.tools.consensus — v3.1.0 M6 Phase B. + +Covers: + - read/write checkpoint per IDE + - append_conflict + list_pending + - scan_and_materialize: scans only foreign decisions; respects + checkpoint; advances checkpoint; surfaces duplicate vs + asymmetric-conflict shapes; bails out cleanly on + CODEVIRA_IDE=unknown. + - cmd_consensus_check stdout + return codes. + - get_session_context gains a 'consensus' panel. +""" + +from __future__ import annotations + +from pathlib import Path + +import pytest + +import mcp_server.paths as paths_module +from mcp_server.cli_consensus import cmd_consensus_check +from mcp_server.storage import consensus_store, decisions_store, paths +from mcp_server.tools.consensus import consensus_check, consensus_status + + +@pytest.fixture +def project(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: + root = tmp_path / "proj" + (root / ".codevira").mkdir(parents=True) + (root / ".codevira" / "config.yaml").write_text("project:\n name: test\n") + monkeypatch.setattr(paths_module, "_project_dir_override", None) + monkeypatch.chdir(root.resolve()) + return root + + +# ────────────────────────────────────────────────────────────────────── +# Checkpoint +# ────────────────────────────────────────────────────────────────────── + + +class TestCheckpoint: + def test_first_read_returns_empty(self, project: Path) -> None: + assert consensus_store.read_checkpoint("claude_code") == {} + + def test_write_then_read_roundtrip(self, project: Path) -> None: + consensus_store.write_checkpoint("cursor", last_seen_decision_id="D000123") + cp = consensus_store.read_checkpoint("cursor") + assert cp["last_seen_decision_id"] == "D000123" + assert cp["_schema_v"] == 1 + # File lives at the documented path. + assert paths.ide_checkpoint_path("cursor").is_file() + + def test_malformed_checkpoint_returns_empty(self, project: Path) -> None: + path = paths.ide_checkpoint_path("windsurf") + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("{this is not json") + assert consensus_store.read_checkpoint("windsurf") == {} + + +# ────────────────────────────────────────────────────────────────────── +# scan_and_materialize +# ────────────────────────────────────────────────────────────────────── + + +class TestScanAndMaterialize: + def test_unknown_ide_bails_out( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.delenv("CODEVIRA_IDE", raising=False) + # Seed a decision so the scan would have something to look at. + decisions_store.record(decision="x") + summary = consensus_store.scan_and_materialize() + assert summary["conflicts_recorded"] == 0 + assert "skipped_reason" in summary + + def test_no_foreign_decisions_records_nothing( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.setenv("CODEVIRA_IDE", "claude_code") + # All decisions written by THIS IDE → no foreign rows → no conflicts. + decisions_store.record(decision="Use bcrypt", do_not_revert=True) + decisions_store.record(decision="Rate-limit logins") + summary = consensus_store.scan_and_materialize() + assert summary["foreign"] == 0 + assert summary["conflicts_recorded"] == 0 + + def test_foreign_duplicate_recorded( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.setenv("CODEVIRA_IDE", "claude_code") + decisions_store.record( + decision="Use bcrypt for password hashing", do_not_revert=True + ) + # Switch IDEs, write a near-duplicate. + monkeypatch.setenv("CODEVIRA_IDE", "cursor") + decisions_store.record(decision="Use bcrypt for password hashing") + # Run scan from cursor's perspective. + summary = consensus_store.scan_and_materialize() + # No, claude_code's decision is current_ide=cursor's foreign; + # cursor's decision is current_ide=cursor's own. So the foreign + # one (claude_code's) gets paired against cursor's current set. + # But scan from cursor's POV: 1 conflict expected. + # Actually scan is from CURRENT_IDE = cursor, so claude_code's + # decision is foreign, cursor's is current. Pair: 1 conflict. + assert summary["foreign"] == 1 + assert summary["conflicts_recorded"] == 1 + pending = consensus_store.list_pending() + assert len(pending) == 1 + pc = pending[0] + assert pc["conflict_kind"] == "duplicate" + assert pc["current_ide"] == "cursor" + assert pc["foreign_origin"]["ide"] == "claude_code" + + def test_checkpoint_advances_after_scan( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.setenv("CODEVIRA_IDE", "claude_code") + d1 = decisions_store.record(decision="A") + d2 = decisions_store.record(decision="B") + summary = consensus_store.scan_and_materialize() + assert summary["new_checkpoint"] in (d1, d2) + cp = consensus_store.read_checkpoint("claude_code") + assert cp["last_seen_decision_id"] == summary["new_checkpoint"] + + def test_second_scan_only_sees_new_decisions( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.setenv("CODEVIRA_IDE", "claude_code") + decisions_store.record(decision="A") + consensus_store.scan_and_materialize() + # New decision after the checkpoint. + decisions_store.record(decision="B") + summary = consensus_store.scan_and_materialize() + assert summary["scanned"] == 1 # only B + + def test_supersededs_skipped( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.setenv("CODEVIRA_IDE", "claude_code") + d_old = decisions_store.record(decision="old decision text", do_not_revert=True) + # Cursor writes a near-duplicate. + monkeypatch.setenv("CODEVIRA_IDE", "cursor") + decisions_store.record(decision="old decision text") + # Then claude_code supersedes its own. + monkeypatch.setenv("CODEVIRA_IDE", "claude_code") + decisions_store.supersede(d_old, "old decision text v2", reason="bumped") + # Now scan from cursor's POV; the foreign superseded one should be skipped. + monkeypatch.setenv("CODEVIRA_IDE", "cursor") + # Checkpoint was never set for cursor → all foreign decisions + # are scanned, but superseded should still be excluded. + consensus_store.scan_and_materialize() + pending = consensus_store.list_pending() + for pc in pending: + assert ( + pc["foreign_decision_id"] != d_old + or pc["foreign_origin"]["ide"] == "cursor" + ) + + +# ────────────────────────────────────────────────────────────────────── +# CLI +# ────────────────────────────────────────────────────────────────────── + + +class TestCmdConsensusCheck: + def test_unknown_ide_prints_skip_message( + self, + project: Path, + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], + ) -> None: + monkeypatch.delenv("CODEVIRA_IDE", raising=False) + decisions_store.record(decision="x") + rc = cmd_consensus_check() + assert rc == 0 + out = capsys.readouterr().out + assert "skipped" in out + assert "CODEVIRA_IDE" in out + + def test_no_decisions_returns_zero( + self, + project: Path, + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], + ) -> None: + monkeypatch.setenv("CODEVIRA_IDE", "claude_code") + rc = cmd_consensus_check() + assert rc == 0 + out = capsys.readouterr().out + assert "scanned 0" in out + + def test_records_and_reports_conflicts( + self, + project: Path, + monkeypatch: pytest.MonkeyPatch, + capsys: pytest.CaptureFixture[str], + ) -> None: + monkeypatch.setenv("CODEVIRA_IDE", "claude_code") + decisions_store.record( + decision="Use bcrypt for password hashing", do_not_revert=True + ) + monkeypatch.setenv("CODEVIRA_IDE", "cursor") + decisions_store.record(decision="Use bcrypt for password hashing") + rc = cmd_consensus_check() + assert rc == 0 + out = capsys.readouterr().out + assert "conflicts recorded: 1" in out + + +# ────────────────────────────────────────────────────────────────────── +# MCP tools +# ────────────────────────────────────────────────────────────────────── + + +class TestMcpTools: + def test_consensus_status_empty(self, project: Path) -> None: + r = consensus_status() + assert r["count"] == 0 + assert r["pending"] == [] + + def test_consensus_check_then_status( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.setenv("CODEVIRA_IDE", "claude_code") + decisions_store.record( + decision="Use bcrypt for password hashing", do_not_revert=True + ) + monkeypatch.setenv("CODEVIRA_IDE", "cursor") + decisions_store.record(decision="Use bcrypt for password hashing") + summary = consensus_check() + assert summary["conflicts_recorded"] == 1 + status = consensus_status(top_k=5) + assert status["count"] == 1 + assert status["pending"][0]["conflict_kind"] == "duplicate" + + +# ────────────────────────────────────────────────────────────────────── +# get_session_context consensus panel +# ────────────────────────────────────────────────────────────────────── + + +class TestSessionContextConsensusPanel: + def test_empty_panel(self, project: Path, monkeypatch: pytest.MonkeyPatch) -> None: + # Patch _setup_project's expected mocks minimally. + from mcp_server.tools import learning + + with monkeypatch.context() as m: + m.setattr( + "mcp_server.tools.roadmap.get_roadmap", + lambda *_a, **_kw: {"current_phase": {}}, + raising=False, + ) + ctx = learning.get_session_context() + assert "consensus" in ctx + assert ctx["consensus"]["pending_count"] == 0 + + def test_populated_panel( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.setenv("CODEVIRA_IDE", "claude_code") + decisions_store.record( + decision="Use bcrypt for password hashing", do_not_revert=True + ) + monkeypatch.setenv("CODEVIRA_IDE", "cursor") + decisions_store.record(decision="Use bcrypt for password hashing") + consensus_store.scan_and_materialize() # populate pending_conflicts + + from mcp_server.tools import learning + + with monkeypatch.context() as m: + m.setattr( + "mcp_server.tools.roadmap.get_roadmap", + lambda *_a, **_kw: {"current_phase": {}}, + raising=False, + ) + ctx = learning.get_session_context() + assert ctx["consensus"]["pending_count"] >= 1 + assert ctx["consensus"]["top"][0]["conflict_kind"] == "duplicate" From 5b1f421a031eff9952f2147ed4021f72ba943b08 Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Fri, 29 May 2026 14:42:46 +0530 Subject: [PATCH 14/44] =?UTF-8?q?feat(v3.1.0):=20M7=20=E2=80=94=20consensu?= =?UTF-8?q?s=20Phase=20C=20handshake=20(opt-in,=20default=20off)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the belief-revision handshake protocol that lets one IDE propose superseding a do_not_revert decision authored by a different IDE. Gated behind memory.consensus.handshake_enabled (default False) so the v3.1.0 ship doesn't change semantics for users who haven't opted in. Config helper (config.py): tiny accessor over .codevira/config.yaml. get_flag(path, default) for dotted lookups; is_enabled wraps for boolean toggles. Fail-open on missing file / malformed yaml. Storage layer (consensus_store.py): - propose_supersession: validates target; same-IDE fast-path returns {fast_path: True}; cross-IDE appends a proposed_supersession row with expires_at = ts + handshake_timeout_days (default 14). - resolve_proposal: appends resolution row with resolver_origin; action ∈ {approved, rejected, withdrawn}. - find_proposal / find_latest_resolution / proposal_status: derive status from base + latest resolution + expiry. Last resolution wins. - finalize_proposal: convert approved proposal to a real supersession via decisions_store.supersede. Expired proposals require expired_unilateral=True (deadlock safety) — and write an audit row recording the force-finalize. - list_proposals: filtered list with derived status. Row kind taxonomy in pending_conflicts.jsonl: - 'conflict' (M6 read-only) - 'proposed_supersession' (M7 proposals) - 'resolution' (M7 approve/reject/withdraw) MCP tools (tools/consensus.py): - consensus_propose_supersession (opt-in) - consensus_resolve (opt-in) - origin_of (always available) Registered in server.py: 3 Tool entries + 3 dispatch branches. Schemas enforce action enum for early validation. Tests: tests/test_consensus_handshake.py — 24 tests covering config helper, propose (unknown target, cross-IDE, same-IDE fast path), lifecycle (pending/approved/rejected/withdrawn/expired, latest-wins, bad action), finalize (pending blocked, approved finalizes, expired requires unilateral flag, audit row on force- finalize), MCP feature-flag gate. 782 tests across storage + engine + tools + check_conflict + CLI pass green; zero regressions from M6 baseline. Plan M7. M8 (reflections) and M9 (docs) remain. Co-Authored-By: Claude Opus 4.7 --- mcp_server/server.py | 79 ++++++ mcp_server/storage/config.py | 64 +++++ mcp_server/storage/consensus_store.py | 335 +++++++++++++++++++++++++- mcp_server/tools/consensus.py | 88 ++++++- tests/test_consensus_handshake.py | 320 ++++++++++++++++++++++++ 5 files changed, 884 insertions(+), 2 deletions(-) create mode 100644 mcp_server/storage/config.py create mode 100644 tests/test_consensus_handshake.py diff --git a/mcp_server/server.py b/mcp_server/server.py index a57c46e..378ad07 100644 --- a/mcp_server/server.py +++ b/mcp_server/server.py @@ -1361,6 +1361,64 @@ async def list_tools() -> list[Tool]: }, }, ), + Tool( + name="consensus_propose_supersession", + description=( + "v3.1.0 M7 Phase C: Open a cross-IDE supersession proposal. " + "Writes a 'proposed_supersession' row to pending_conflicts.jsonl " + "with expires_at = ts + handshake_timeout_days (default 14). " + "Opt-in: returns {disabled: True} unless " + "memory.consensus.handshake_enabled is set in " + ".codevira/config.yaml. Same-author fast-path returns " + "{fast_path: True} so the caller can route to " + "supersede_decision directly." + ), + inputSchema={ + "type": "object", + "properties": { + "target_decision_id": {"type": "string"}, + "new_decision": {"type": "string"}, + "reason": {"type": "string"}, + }, + "required": ["target_decision_id", "new_decision", "reason"], + }, + ), + Tool( + name="consensus_resolve", + description=( + "v3.1.0 M7 Phase C: Approve, reject, or withdraw a pending " + "supersession proposal. Opt-in via " + "memory.consensus.handshake_enabled. The approving IDE should " + "match the target decision's origin IDE (or be 'unknown') " + "for cross-IDE proposals; withdrawals come from the " + "proposing IDE." + ), + inputSchema={ + "type": "object", + "properties": { + "proposal_id": {"type": "string"}, + "action": { + "type": "string", + "enum": ["approved", "rejected", "withdrawn"], + }, + "comment": {"type": "string"}, + }, + "required": ["proposal_id", "action"], + }, + ), + Tool( + name="origin_of", + description=( + "v3.1.0 M7: Return the M1 origin block attached to a decision " + "({ide, agent_model, host_hash, ts}) + protection / supersession " + "metadata. Always available regardless of the handshake flag." + ), + inputSchema={ + "type": "object", + "properties": {"decision_id": {"type": "string"}}, + "required": ["decision_id"], + }, + ), # ---- v1.5: Deep Graph Intelligence Tools ---- Tool( name="query_graph", @@ -1781,6 +1839,27 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]: from mcp_server.tools.consensus import consensus_status result = consensus_status(top_k=arguments.get("top_k", 3)) + # ---- v3.1.0 M7 Phase C: handshake dispatch ---- + elif name == "consensus_propose_supersession": + from mcp_server.tools.consensus import consensus_propose_supersession + + result = consensus_propose_supersession( + target_decision_id=arguments["target_decision_id"], + new_decision=arguments["new_decision"], + reason=arguments["reason"], + ) + elif name == "consensus_resolve": + from mcp_server.tools.consensus import consensus_resolve + + result = consensus_resolve( + proposal_id=arguments["proposal_id"], + action=arguments["action"], + comment=arguments.get("comment"), + ) + elif name == "origin_of": + from mcp_server.tools.consensus import origin_of + + result = origin_of(decision_id=arguments["decision_id"]) else: result = {"error": f"Unknown tool: {name}"} diff --git a/mcp_server/storage/config.py b/mcp_server/storage/config.py new file mode 100644 index 0000000..808815f --- /dev/null +++ b/mcp_server/storage/config.py @@ -0,0 +1,64 @@ +""" +config.py — v3.1.0: tiny accessor for .codevira/config.yaml flags. + +Most v3.1.0 subsystems are gated behind a config flag so users can +opt into or out of behavior without touching code. The config file +itself is YAML; this module exposes a single ``get_flag(path, +default)`` so feature-flag checks read the same source of truth +everywhere. + +We deliberately don't add a schema validator: the config is small, +fail-open is the right default (missing key → caller's default), +and codevira already inherits a fail-open culture for cache layers. +""" + +from __future__ import annotations + +import logging +from typing import Any + +from mcp_server.storage import paths + +logger = logging.getLogger(__name__) + + +def _load_config() -> dict[str, Any]: + """Read and parse the project config; return empty dict on missing + or malformed input.""" + path = paths.config_path() + if not path.is_file(): + return {} + try: + import yaml + + data = yaml.safe_load(path.read_text(encoding="utf-8")) + except Exception as exc: # noqa: BLE001 + logger.warning( + "config.load: failed to parse %s; falling back to defaults: %s", + path, + exc, + ) + return {} + return data if isinstance(data, dict) else {} + + +def get_flag(path: str, default: Any = None) -> Any: + """Look up a dotted key in the config (e.g. + ``"memory.consensus.handshake_enabled"``). Returns ``default`` on + any miss. + """ + if not isinstance(path, str) or not path: + return default + data = _load_config() + cursor: Any = data + for part in path.split("."): + if not isinstance(cursor, dict) or part not in cursor: + return default + cursor = cursor[part] + return cursor + + +def is_enabled(path: str, *, default: bool = False) -> bool: + """Type-safe wrapper around ``get_flag`` for boolean toggles.""" + val = get_flag(path, default=default) + return bool(val) diff --git a/mcp_server/storage/consensus_store.py b/mcp_server/storage/consensus_store.py index 553da39..9878c2a 100644 --- a/mcp_server/storage/consensus_store.py +++ b/mcp_server/storage/consensus_store.py @@ -49,7 +49,7 @@ import json import logging -from datetime import datetime, timezone +from datetime import datetime, timedelta, timezone from typing import Any from mcp_server.storage import atomic, jsonl_store, paths @@ -62,6 +62,22 @@ CONFLICT_KIND_DUPLICATE = "duplicate" CONFLICT_KIND_ASYMMETRIC = "asymmetric-conflict" +# v3.1.0 M7: row kinds in pending_conflicts.jsonl +ROW_KIND_CONFLICT = "conflict" # M6 read-only conflict materializations +ROW_KIND_PROPOSAL = "proposed_supersession" # M7 proposals +ROW_KIND_RESOLUTION = "resolution" # M7 approve/reject/withdraw + +# Proposal lifecycle states. +PROPOSAL_STATUS_PENDING = "pending" +PROPOSAL_STATUS_APPROVED = "approved" +PROPOSAL_STATUS_REJECTED = "rejected" +PROPOSAL_STATUS_WITHDRAWN = "withdrawn" +PROPOSAL_STATUS_EXPIRED = "expired" + +# Default handshake timeout (overridable via +# memory.consensus.handshake_timeout_days in .codevira/config.yaml). +DEFAULT_HANDSHAKE_TIMEOUT_DAYS = 14 + # ────────────────────────────────────────────────────────────────────── # Checkpoint @@ -290,3 +306,320 @@ def _check_pair(fd: dict[str, Any], cd: dict[str, Any]) -> tuple[str | None, flo def _short_summary(text: str, *, cap: int = 80) -> str: text = text.strip() return text if len(text) <= cap else text[: cap - 1] + "…" + + +# ────────────────────────────────────────────────────────────────────── +# v3.1.0 M7 Phase C — handshake protocol +# ────────────────────────────────────────────────────────────────────── + + +def propose_supersession( + target_decision_id: str, + *, + new_decision: str, + reason: str, + proposing_origin: dict | None = None, + timeout_days: int | None = None, +) -> dict[str, Any]: + """Open a supersession proposal against ``target_decision_id``. + + Writes a ``proposed_supersession`` row to pending_conflicts.jsonl + with ``expires_at = ts + timeout_days``. Default timeout is + ``DEFAULT_HANDSHAKE_TIMEOUT_DAYS`` (overridable via + ``memory.consensus.handshake_timeout_days``). + + Single-IDE fast path: if the proposing origin's IDE matches the + target decision's origin IDE, no handshake is needed — the same + author can revise their own decisions directly. The proposal + short-circuits with ``fast_path: True`` so the caller can route + to ``decisions_store.supersede`` immediately. + + Returns ``{proposed: True, proposal_id, expires_at}`` on success, + ``{proposed: False, error}`` on failure (missing target, etc.), + or ``{fast_path: True, ide_match}`` for the same-IDE case. + """ + from mcp_server.storage import config as cfg + from mcp_server.storage import decisions_store + from mcp_server.storage import origin as origin_module + + target = decisions_store.get(target_decision_id) + if target is None: + return { + "proposed": False, + "error": f"target decision {target_decision_id} not found", + } + + proposing_origin = proposing_origin or origin_module.current_origin() + proposer_ide = ( + proposing_origin.get("ide") if isinstance(proposing_origin, dict) else None + ) + target_origin = target.get("origin") or {} + target_ide = target_origin.get("ide") if isinstance(target_origin, dict) else None + + # Fast path: same author. The protocol is a courtesy across IDEs; + # if the same IDE proposed both, just supersede directly. + if proposer_ide and target_ide and proposer_ide == target_ide: + return { + "fast_path": True, + "ide_match": proposer_ide, + "hint": ( + "Same author; call decisions_store.supersede directly " + "(no handshake required)." + ), + } + + days = ( + timeout_days + if isinstance(timeout_days, int) and timeout_days > 0 + else int( + cfg.get_flag( + "memory.consensus.handshake_timeout_days", + default=DEFAULT_HANDSHAKE_TIMEOUT_DAYS, + ) + ) + ) + now = datetime.now(timezone.utc) + expires_at = (now + timedelta(days=days)).isoformat() + + pc_id = append_conflict( + { + "kind": ROW_KIND_PROPOSAL, + "ts": now.isoformat(), + "status": PROPOSAL_STATUS_PENDING, + "proposing_origin": proposing_origin, + "target_decision_id": target_decision_id, + "target_origin": target_origin, + "proposed_new_decision": new_decision, + "reason": reason, + "expires_at": expires_at, + "do_not_revert": bool(target.get("do_not_revert")), + "summary": _short_summary(new_decision), + } + ) + return { + "proposed": True, + "proposal_id": pc_id, + "expires_at": expires_at, + "target_decision_id": target_decision_id, + } + + +def resolve_proposal( + proposal_id: str, + *, + action: str, + comment: str | None = None, + resolver_origin: dict | None = None, +) -> dict[str, Any]: + """Approve, reject, or withdraw a proposal. + + Appends a ``resolution`` row referencing the proposal. The proposal + itself stays in the JSONL (audit trail); ``find_proposal`` reads + the latest resolution if present. + + ``approve`` / ``reject`` should come from an IDE matching the + target decision's origin (or ``unknown``). ``withdraw`` may come + from the proposing IDE only — we don't enforce here (caller's + responsibility) but record the resolver_origin for audit. + """ + if action not in ( + PROPOSAL_STATUS_APPROVED, + PROPOSAL_STATUS_REJECTED, + PROPOSAL_STATUS_WITHDRAWN, + ): + return { + "resolved": False, + "error": ( + f"action must be one of " + f"{[PROPOSAL_STATUS_APPROVED, PROPOSAL_STATUS_REJECTED, PROPOSAL_STATUS_WITHDRAWN]}; " + f"got {action!r}" + ), + } + proposal = find_proposal(proposal_id) + if proposal is None: + return {"resolved": False, "error": f"proposal {proposal_id} not found"} + + from mcp_server.storage import origin as origin_module + + resolver_origin = resolver_origin or origin_module.current_origin() + + res_id = append_conflict( + { + "kind": ROW_KIND_RESOLUTION, + "proposal_id": proposal_id, + "action": action, + "comment": comment, + "resolver_origin": resolver_origin, + } + ) + return { + "resolved": True, + "resolution_id": res_id, + "proposal_id": proposal_id, + "action": action, + } + + +def find_proposal(proposal_id: str) -> dict[str, Any] | None: + """Locate a proposal row by id. Returns the base row (without + folded resolution).""" + for r in jsonl_store.read_all(paths.pending_conflicts_path()): + if r.get("id") == proposal_id and r.get("kind") == ROW_KIND_PROPOSAL: + return r + return None + + +def find_latest_resolution(proposal_id: str) -> dict[str, Any] | None: + """Return the most-recent resolution row for ``proposal_id`` or + None if none exists. We walk in append order, so the last match + wins (matches the supersede-style "latest amendment" pattern).""" + latest: dict[str, Any] | None = None + for r in jsonl_store.read_all(paths.pending_conflicts_path()): + if r.get("kind") == ROW_KIND_RESOLUTION and r.get("proposal_id") == proposal_id: + latest = r + return latest + + +def proposal_status(proposal_id: str, *, now: datetime | None = None) -> dict[str, Any]: + """Return the merged proposal view: base + latest resolution + + derived status (pending, approved, rejected, withdrawn, expired). + """ + proposal = find_proposal(proposal_id) + if proposal is None: + return {"found": False} + + latest = find_latest_resolution(proposal_id) + derived = PROPOSAL_STATUS_PENDING + if latest is not None: + action = latest.get("action") + if action in ( + PROPOSAL_STATUS_APPROVED, + PROPOSAL_STATUS_REJECTED, + PROPOSAL_STATUS_WITHDRAWN, + ): + derived = action + if derived == PROPOSAL_STATUS_PENDING: + # Check expiry. + exp = proposal.get("expires_at") + if isinstance(exp, str): + try: + exp_dt = datetime.fromisoformat(exp) + if exp_dt.tzinfo is None: + exp_dt = exp_dt.replace(tzinfo=timezone.utc) + now_dt = now or datetime.now(timezone.utc) + if now_dt >= exp_dt: + derived = PROPOSAL_STATUS_EXPIRED + except (ValueError, TypeError): + pass + + return { + "found": True, + "proposal": proposal, + "latest_resolution": latest, + "status": derived, + } + + +def finalize_proposal( + proposal_id: str, + *, + expired_unilateral: bool = False, + now: datetime | None = None, +) -> dict[str, Any]: + """Convert an approved (or expired) proposal into a real + supersession via ``decisions_store.supersede``. + + ``expired_unilateral=True``: the proposer is force-finalizing + past the expiry; an audit-only ``resolution`` row is appended + with ``action='expired'`` and ``expired_unilateral=True`` so the + history shows the proposer didn't wait for human approval. + """ + state = proposal_status(proposal_id, now=now) + if not state["found"]: + return {"finalized": False, "error": "proposal not found"} + + derived = state["status"] + if derived not in (PROPOSAL_STATUS_APPROVED, PROPOSAL_STATUS_EXPIRED): + return { + "finalized": False, + "error": ( + f"proposal {proposal_id} cannot be finalized from " + f"status={derived!r}" + ), + } + if derived == PROPOSAL_STATUS_EXPIRED and not expired_unilateral: + return { + "finalized": False, + "error": ( + f"proposal {proposal_id} has expired; pass " + f"expired_unilateral=True to force-finalize and " + f"record the audit row." + ), + } + + proposal = state["proposal"] + target_id = proposal.get("target_decision_id") + new_text = proposal.get("proposed_new_decision") or "" + reason = proposal.get("reason") or "consensus-handshake supersession" + + from mcp_server.storage import decisions_store + + sup_result = decisions_store.supersede( + old_id=str(target_id), + new_decision=new_text, + reason=reason, + ) + if not sup_result.get("success"): + return {"finalized": False, "error": sup_result.get("error")} + + # Audit-only row when we expired-unilateral. + if expired_unilateral: + from mcp_server.storage import origin as origin_module + + append_conflict( + { + "kind": ROW_KIND_RESOLUTION, + "proposal_id": proposal_id, + "action": PROPOSAL_STATUS_EXPIRED, + "expired_unilateral": True, + "resolver_origin": origin_module.current_origin(), + "comment": "force-finalized past expires_at", + } + ) + + return { + "finalized": True, + "proposal_id": proposal_id, + "supersedes": target_id, + "new_decision_id": sup_result.get("new_id"), + "expired_unilateral": expired_unilateral, + } + + +def list_proposals( + *, + status: str | None = None, + limit: int = 50, + now: datetime | None = None, +) -> list[dict[str, Any]]: + """Return proposals (newest first) with their derived status. + + ``status`` filter: ``"pending"`` / ``"approved"`` / ``"rejected"`` + / ``"withdrawn"`` / ``"expired"`` / ``None`` (all). + """ + raw = jsonl_store.read_recent(paths.pending_conflicts_path(), limit=limit * 4) + out: list[dict[str, Any]] = [] + for row in raw: + if row.get("kind") != ROW_KIND_PROPOSAL: + continue + pid = str(row.get("id") or "") + st = proposal_status(pid, now=now) + if not st["found"]: + continue + derived = st["status"] + if status is not None and derived != status: + continue + out.append({**row, "_derived_status": derived}) + if len(out) >= limit: + break + return out diff --git a/mcp_server/tools/consensus.py b/mcp_server/tools/consensus.py index ffd0a50..1c9764d 100644 --- a/mcp_server/tools/consensus.py +++ b/mcp_server/tools/consensus.py @@ -19,7 +19,13 @@ from typing import Any -from mcp_server.storage import consensus_store +from mcp_server.storage import config, consensus_store + + +# v3.1.0 M7 Phase C: the handshake-using tools call +# config.is_enabled("memory.consensus.handshake_enabled", default=False) +# inline at entry. Inlined (not via a helper) to keep the +# blast-radius surface minimal on this module. def consensus_check() -> dict[str, Any]: @@ -50,3 +56,83 @@ def consensus_status(*, top_k: int = 3) -> dict[str, Any]: for r in pending[:top_k] ], } + + +# ────────────────────────────────────────────────────────────────────── +# v3.1.0 M7 Phase C — handshake MCP tools +# ────────────────────────────────────────────────────────────────────── + + +def consensus_propose_supersession( + target_decision_id: str, + *, + new_decision: str, + reason: str, +) -> dict[str, Any]: + """Open a cross-IDE supersession proposal. + + Opt-in: returns ``{"disabled": True}`` unless + ``memory.consensus.handshake_enabled`` is set in + ``.codevira/config.yaml``. + + Fast-path: when the proposing IDE is the same as the target + decision's origin IDE, no handshake is needed and the response + carries ``fast_path: True`` — the caller should use + ``supersede_decision`` directly. + """ + if not config.is_enabled("memory.consensus.handshake_enabled", default=False): + return { + "disabled": True, + "feature": "memory.consensus.handshake_enabled", + "hint": ( + "The handshake protocol is opt-in. Enable it via " + ".codevira/config.yaml: memory.consensus." + "handshake_enabled: true" + ), + } + return consensus_store.propose_supersession( + target_decision_id, + new_decision=new_decision, + reason=reason, + ) + + +def consensus_resolve( + proposal_id: str, + *, + action: str, + comment: str | None = None, +) -> dict[str, Any]: + """Approve, reject, or withdraw a pending proposal. + + Opt-in via ``memory.consensus.handshake_enabled``. Returns a + structured ``{"resolved": False, "error": ...}`` rather than + raising on bad input so the agent can correct and retry. + """ + if not config.is_enabled("memory.consensus.handshake_enabled", default=False): + return { + "disabled": True, + "feature": "memory.consensus.handshake_enabled", + } + return consensus_store.resolve_proposal(proposal_id, action=action, comment=comment) + + +def origin_of(decision_id: str) -> dict[str, Any]: + """Return the origin block attached to a decision (M1 provenance). + + Always available — does not require the handshake flag. + """ + from mcp_server.storage import decisions_store + + decision = decisions_store.get(decision_id) + if decision is None: + return {"found": False, "error": f"decision {decision_id} not found"} + origin = decision.get("origin") + return { + "found": True, + "decision_id": decision_id, + "origin": origin if isinstance(origin, dict) else None, + "do_not_revert": bool(decision.get("do_not_revert")), + "is_superseded": bool(decision.get("is_superseded")), + "superseded_by": decision.get("superseded_by"), + } diff --git a/tests/test_consensus_handshake.py b/tests/test_consensus_handshake.py new file mode 100644 index 0000000..e81102e --- /dev/null +++ b/tests/test_consensus_handshake.py @@ -0,0 +1,320 @@ +""" +Tests for v3.1.0 M7 Phase C: consensus handshake protocol. + +Covers: + - config.get_flag / is_enabled + - consensus_store: propose, resolve, find, status (pending, + approved, rejected, withdrawn, expired), finalize with + expired_unilateral safety + - same-IDE fast path + - MCP tools: feature-flag gate; opt-in behavior; consensus_propose, + consensus_resolve, origin_of +""" + +from __future__ import annotations + +from datetime import datetime, timedelta, timezone +from pathlib import Path + +import pytest + +import mcp_server.paths as paths_module +from mcp_server.storage import ( + config, + consensus_store, + decisions_store, + paths, +) + + +@pytest.fixture +def project(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: + root = tmp_path / "proj" + (root / ".codevira").mkdir(parents=True) + (root / ".codevira" / "config.yaml").write_text("project:\n name: test\n") + monkeypatch.setattr(paths_module, "_project_dir_override", None) + monkeypatch.chdir(root.resolve()) + return root + + +def _enable_handshake(project: Path) -> None: + """Toggle memory.consensus.handshake_enabled=true in config.""" + (project / ".codevira" / "config.yaml").write_text( + "project:\n" + " name: test\n" + "memory:\n" + " consensus:\n" + " handshake_enabled: true\n" + " handshake_timeout_days: 14\n" + ) + + +# ────────────────────────────────────────────────────────────────────── +# config helper +# ────────────────────────────────────────────────────────────────────── + + +class TestConfig: + def test_missing_file_returns_default(self, project: Path) -> None: + (project / ".codevira" / "config.yaml").unlink() + assert ( + config.get_flag("memory.consensus.handshake_enabled", default=False) + is False + ) + + def test_unset_key_returns_default(self, project: Path) -> None: + assert config.is_enabled("memory.nonexistent.flag") is False + + def test_explicit_true(self, project: Path) -> None: + _enable_handshake(project) + assert config.is_enabled("memory.consensus.handshake_enabled") is True + assert config.get_flag("memory.consensus.handshake_timeout_days") == 14 + + def test_malformed_yaml_returns_default(self, project: Path) -> None: + (project / ".codevira" / "config.yaml").write_text( + "::: not yaml ::: at all :::" + ) + assert config.is_enabled("memory.consensus.handshake_enabled") is False + + +# ────────────────────────────────────────────────────────────────────── +# propose_supersession +# ────────────────────────────────────────────────────────────────────── + + +class TestPropose: + def test_unknown_target_rejected(self, project: Path) -> None: + r = consensus_store.propose_supersession( + "D999999", new_decision="x", reason="missing" + ) + assert r["proposed"] is False + assert "not found" in r["error"] + + def test_cross_ide_opens_proposal( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + # Target was authored by claude_code. + monkeypatch.setenv("CODEVIRA_IDE", "claude_code") + target = decisions_store.record(decision="Use bcrypt", do_not_revert=True) + # Cursor proposes superseding it. + monkeypatch.setenv("CODEVIRA_IDE", "cursor") + r = consensus_store.propose_supersession( + target, new_decision="Use Argon2 instead", reason="modern hash" + ) + assert r["proposed"] is True + assert "expires_at" in r + # Default timeout: 14 days from now. + exp = datetime.fromisoformat(r["expires_at"]) + now = datetime.now(timezone.utc) + assert timedelta(days=13) < (exp - now) < timedelta(days=15) + + def test_same_ide_fast_path( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.setenv("CODEVIRA_IDE", "claude_code") + target = decisions_store.record(decision="A", do_not_revert=True) + # Same IDE proposes — fast path bypasses the handshake. + r = consensus_store.propose_supersession( + target, new_decision="B", reason="cleaner" + ) + assert r.get("fast_path") is True + assert r.get("ide_match") == "claude_code" + # No proposal row appended. + assert not paths.pending_conflicts_path().is_file() + + +# ────────────────────────────────────────────────────────────────────── +# resolve_proposal + lifecycle +# ────────────────────────────────────────────────────────────────────── + + +def _open_proposal(monkeypatch: pytest.MonkeyPatch) -> str: + """Helper: open a cross-IDE proposal; return proposal_id.""" + monkeypatch.setenv("CODEVIRA_IDE", "claude_code") + target = decisions_store.record(decision="X", do_not_revert=True) + monkeypatch.setenv("CODEVIRA_IDE", "cursor") + r = consensus_store.propose_supersession(target, new_decision="Y", reason="bumped") + return r["proposal_id"] + + +class TestResolveLifecycle: + def test_unknown_proposal_rejected(self, project: Path) -> None: + r = consensus_store.resolve_proposal("PC999999", action="approved") + assert r["resolved"] is False + + def test_bad_action_rejected( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + pid = _open_proposal(monkeypatch) + r = consensus_store.resolve_proposal(pid, action="maybe") + assert r["resolved"] is False + assert "action must be one of" in r["error"] + + def test_pending_status( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + pid = _open_proposal(monkeypatch) + assert consensus_store.proposal_status(pid)["status"] == "pending" + + def test_approved_status( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + pid = _open_proposal(monkeypatch) + monkeypatch.setenv("CODEVIRA_IDE", "claude_code") + r = consensus_store.resolve_proposal(pid, action="approved") + assert r["resolved"] is True + assert consensus_store.proposal_status(pid)["status"] == "approved" + + def test_rejected_status( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + pid = _open_proposal(monkeypatch) + consensus_store.resolve_proposal(pid, action="rejected", comment="no") + assert consensus_store.proposal_status(pid)["status"] == "rejected" + + def test_withdrawn_status( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + pid = _open_proposal(monkeypatch) + consensus_store.resolve_proposal(pid, action="withdrawn") + assert consensus_store.proposal_status(pid)["status"] == "withdrawn" + + def test_latest_resolution_wins( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + pid = _open_proposal(monkeypatch) + consensus_store.resolve_proposal(pid, action="rejected") + consensus_store.resolve_proposal(pid, action="approved") + # Last write wins (mirrors decisions amendment semantics). + assert consensus_store.proposal_status(pid)["status"] == "approved" + + def test_expired_status( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + pid = _open_proposal(monkeypatch) + # Travel 30 days into the future. + far_future = datetime.now(timezone.utc) + timedelta(days=30) + assert ( + consensus_store.proposal_status(pid, now=far_future)["status"] == "expired" + ) + + +# ────────────────────────────────────────────────────────────────────── +# finalize_proposal +# ────────────────────────────────────────────────────────────────────── + + +class TestFinalize: + def test_pending_cannot_finalize( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + pid = _open_proposal(monkeypatch) + r = consensus_store.finalize_proposal(pid) + assert r["finalized"] is False + + def test_approved_finalizes( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + pid = _open_proposal(monkeypatch) + monkeypatch.setenv("CODEVIRA_IDE", "claude_code") + consensus_store.resolve_proposal(pid, action="approved") + r = consensus_store.finalize_proposal(pid) + assert r["finalized"] is True + assert r["new_decision_id"] + + # Target is now superseded; new decision exists. + target_id = r["supersedes"] + old = decisions_store.get(target_id) + assert old["is_superseded"] is True + new = decisions_store.get(r["new_decision_id"]) + assert new is not None + assert "Y" in new["decision"] + + def test_expired_requires_unilateral_flag( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + pid = _open_proposal(monkeypatch) + far_future = datetime.now(timezone.utc) + timedelta(days=30) + r = consensus_store.finalize_proposal(pid, now=far_future) + assert r["finalized"] is False + assert "expired_unilateral=True" in r["error"] + + def test_expired_unilateral_finalizes_with_audit( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + pid = _open_proposal(monkeypatch) + far_future = datetime.now(timezone.utc) + timedelta(days=30) + r = consensus_store.finalize_proposal( + pid, expired_unilateral=True, now=far_future + ) + assert r["finalized"] is True + assert r["expired_unilateral"] is True + + # Audit row appears in pending_conflicts with action='expired' and + # expired_unilateral=True. + from mcp_server.storage import jsonl_store + + rows = jsonl_store.read_all(paths.pending_conflicts_path()) + audit_rows = [ + r + for r in rows + if r.get("kind") == "resolution" and r.get("expired_unilateral") is True + ] + assert len(audit_rows) == 1 + assert audit_rows[0]["action"] == "expired" + + +# ────────────────────────────────────────────────────────────────────── +# MCP tools (feature-flag gate) +# ────────────────────────────────────────────────────────────────────── + + +class TestMcpToolsFeatureFlag: + def test_propose_disabled_by_default(self, project: Path) -> None: + from mcp_server.tools.consensus import consensus_propose_supersession + + r = consensus_propose_supersession( + target_decision_id="D000001", + new_decision="x", + reason="y", + ) + assert r["disabled"] is True + + def test_resolve_disabled_by_default(self, project: Path) -> None: + from mcp_server.tools.consensus import consensus_resolve + + r = consensus_resolve(proposal_id="PC000001", action="approved") + assert r["disabled"] is True + + def test_propose_when_enabled( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + _enable_handshake(project) + monkeypatch.setenv("CODEVIRA_IDE", "claude_code") + target = decisions_store.record(decision="x", do_not_revert=True) + monkeypatch.setenv("CODEVIRA_IDE", "cursor") + from mcp_server.tools.consensus import consensus_propose_supersession + + r = consensus_propose_supersession( + target_decision_id=target, new_decision="y", reason="z" + ) + assert r["proposed"] is True + + +class TestOriginOf: + def test_unknown_decision_returns_error(self, project: Path) -> None: + from mcp_server.tools.consensus import origin_of + + r = origin_of("D999999") + assert r["found"] is False + + def test_returns_origin_block( + self, project: Path, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.setenv("CODEVIRA_IDE", "windsurf") + did = decisions_store.record(decision="x") + from mcp_server.tools.consensus import origin_of + + r = origin_of(did) + assert r["found"] is True + assert r["origin"]["ide"] == "windsurf" From 2b7e4e57516b4f5ee1e7548dce7791c713bce429 Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Fri, 29 May 2026 14:51:11 +0530 Subject: [PATCH 15/44] =?UTF-8?q?feat(v3.1.0):=20M8=20=E2=80=94=20reflecti?= =?UTF-8?q?ons=20(storage=20+=20sanitization=20+=20CLI/MCP=20surface)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the durable LLM-abstraction subsystem. Reflections live in .codevira/reflections.jsonl (canonical, committed) — Generative- Agents-style abstractions over recent decisions + sessions that the next agent can read on get_session_context. Sampling integration scope: v3.1.0 ships storage + sanitization + source-context builder + prompt template + the API surface. The MCP sampling/createMessage RPC that asks the host LLM for the abstraction is the v3.2 deliverable. Until then, reflect() returns {sampling_supported: False, rendered_prompt, source_context} and the CLI accepts an LLM response via --from-file. Storage layer (reflections_store.py): - scrub_sensitive(text): regex redaction of api keys / Bearer / passwords / AWS AKIA / long hex / long base64 → . - build_source_context(period_days, now): aggregate sessions + decisions in window; plan caps (≤30 / ≤100 / ≤6 KB); sanitize narrative fields; envelope trim drops oldest first when over. - render_prompt(ctx): inline source into bundled prompt template (mcp_server/data/prompts/reflection_v1.md). Fallback inline when template missing. - append(target='reflections'|'proposals'): write finalized or pending; R-prefixed monotonic ids. - list_recent / list_filtered: newest-first reads with since/tags. CLI (cli_reflect.py): codevira reflect [--period 7d] [--from-file PATH] [--apply] [--yes]. - No --from-file: render prompt and print it. - --from-file PATH: parse the LLM YAML response (first ```yaml fence or whole-text fallback); write to reflection_proposals.jsonl. - --from-file PATH --apply [--yes]: commit to reflections.jsonl (interactive confirm unless --yes). Empty abstraction rejected with non-zero exit. MCP tools (tools/reflections.py): - reflect(period_days, dry_run): {sampling_supported: False, deferred_to: 'v3.2', rendered_prompt, source_context, ...}. - get_reflections(top_k): newest-first reflections. - list_reflections(since, tags, limit): filtered list. Registered in server.py: 3 Tool entries + 3 dispatch branches. Bundled prompt: mcp_server/data/prompts/reflection_v1.md (single yaml-fenced output with abstraction/tags/confidence; ships via existing pyproject 'mcp_server/data/**/*' package-data glob). Tests: tests/test_reflections.py — 26 tests across scrub_sensitive (per-pattern + plain text untouched), build_source_context (window filter + caps + sanitization), render_prompt (template inline + fallback), storage (append / list_recent / list_filtered), CLI (render mode + --from-file proposal + --apply commit + unfenced parsing + missing file + empty rejection), MCP tools (reflect stub + get_reflections). 808 tests across storage + engine + tools + check_conflict + CLI pass green; zero regressions from M7 baseline. Plan M8. M9 (docs + verification smoke) is the only remaining milestone. Co-Authored-By: Claude Opus 4.7 --- mcp_server/cli.py | 49 +++ mcp_server/cli_reflect.py | 170 +++++++++++ mcp_server/data/prompts/reflection_v1.md | 40 +++ mcp_server/server.py | 69 +++++ mcp_server/storage/paths.py | 22 ++ mcp_server/storage/reflections_store.py | 364 ++++++++++++++++++++++ mcp_server/tools/reflections.py | 125 ++++++++ tests/test_reflections.py | 370 +++++++++++++++++++++++ 8 files changed, 1209 insertions(+) create mode 100644 mcp_server/cli_reflect.py create mode 100644 mcp_server/data/prompts/reflection_v1.md create mode 100644 mcp_server/storage/reflections_store.py create mode 100644 mcp_server/tools/reflections.py create mode 100644 tests/test_reflections.py diff --git a/mcp_server/cli.py b/mcp_server/cli.py index 3355a73..0838567 100644 --- a/mcp_server/cli.py +++ b/mcp_server/cli.py @@ -1239,6 +1239,43 @@ def error(self, message): # type: ignore[override] ), ) + # v3.1.0 M8: reflections — codevira reflect [--period 7d] + # [--from-file PATH] [--apply] [--yes]. Without --from-file the + # CLI prints the rendered prompt + source-context summary for the + # user to feed to their own LLM; with --from-file it parses the + # LLM response and writes a proposal (or commits with --apply). + reflect_parser = subparsers.add_parser( + "reflect", + help="Build a reflection over recent decisions + sessions " + "(v3.1.0 M8). MCP sampling/createMessage integration ships " + "in v3.2; meanwhile this CLI renders the prompt and accepts " + "an LLM response via --from-file.", + ) + reflect_parser.add_argument( + "--period", + type=int, + default=7, + help="Look-back window in days (default 7).", + ) + reflect_parser.add_argument( + "--from-file", + type=str, + default=None, + help="Read an LLM YAML response from this file (per the prompt " + "template) and persist it as a reflection proposal.", + ) + reflect_parser.add_argument( + "--apply", + action="store_true", + help="Commit to .codevira/reflections.jsonl (otherwise the " + "result lands in reflection_proposals.jsonl for review).", + ) + reflect_parser.add_argument( + "--yes", + action="store_true", + help="With --apply: skip the interactive confirm prompt.", + ) + # v3.1.0 M6 Phase B: cross-IDE consensus check (read-only). The # MCP surface (consensus_check / consensus_status) is also exposed. consensus_parser = subparsers.add_parser( @@ -1560,6 +1597,18 @@ def error(self, message): # type: ignore[override] keep_data=getattr(args, "keep_data", False), ) sys.exit(rc) + elif args.command == "reflect": + # v3.1.0 M8: reflections CLI. + from mcp_server.cli_reflect import cmd_reflect + + sys.exit( + cmd_reflect( + period_days=getattr(args, "period", 7), + from_file=getattr(args, "from_file", None), + apply=getattr(args, "apply", False), + yes=getattr(args, "yes", False), + ) + ) elif args.command == "consensus": # v3.1.0 M6: cross-IDE consensus CLI. consensus_action = getattr(args, "consensus_action", None) diff --git a/mcp_server/cli_reflect.py b/mcp_server/cli_reflect.py new file mode 100644 index 0000000..42d9a52 --- /dev/null +++ b/mcp_server/cli_reflect.py @@ -0,0 +1,170 @@ +""" +cli_reflect.py — v3.1.0 M8: ``codevira reflect`` CLI. + +Three modes (orthogonal): + + - **No flags**: build the source context + render the prompt; + print it so the user can feed it to their own LLM. + - ``--from-file ``: read an LLM response from the file, + parse the YAML output, write to + ``.codevira/reflection_proposals.jsonl`` for review. + - ``--from-file --apply [--yes]``: write directly to + ``.codevira/reflections.jsonl``. ``--yes`` skips the interactive + confirm. + +The LLM response format follows the prompt template +(``reflection_v1.md``) — a single ``yaml`` fenced block with +``abstraction``, ``tags``, and ``confidence`` fields. + +Offline behavior: CLI mode never calls ``sampling/createMessage`` +(no MCP client attached when invoked from a plain terminal). The +``reflect()`` MCP tool returns ``sampling_supported: False`` in +v3.1.0; the CLI is the recommended interactive path until v3.2 +wires the live sampling RPC. +""" + +from __future__ import annotations + +import sys +from typing import Any + +from mcp_server.storage import paths, reflections_store + + +def cmd_reflect( + *, + period_days: int = 7, + from_file: str | None = None, + apply: bool = False, + yes: bool = False, +) -> int: + """Entry point for ``codevira reflect``. + + Returns 0 on success (including "no-op, here's the prompt"). + Non-zero on parse / storage failure. + """ + ctx = reflections_store.build_source_context(period_days=period_days) + + if from_file is None: + # Pure render mode — print the prompt and instruct. + prompt = reflections_store.render_prompt(ctx) + sys.stdout.write( + f"codevira reflect: built source context for the last " + f"{period_days} day(s) " + f"({len(ctx['sessions'])} session(s), " + f"{len(ctx['decisions'])} decision(s), " + f"{ctx['envelope_bytes']} bytes).\n\n" + ) + sys.stdout.write( + "Feed the prompt below to your LLM, save its YAML response, then " + "re-run with --from-file (add --apply --yes to commit " + "the reflection to .codevira/reflections.jsonl).\n\n" + ) + sys.stdout.write("─" * 70 + "\n") + sys.stdout.write(prompt) + sys.stdout.write("\n" + "─" * 70 + "\n") + return 0 + + # Parse the LLM response file. + try: + with open(from_file, encoding="utf-8") as fh: + response = fh.read() + except OSError as exc: + sys.stderr.write(f"codevira reflect: could not read {from_file}: {exc}\n") + return 1 + + parsed = _parse_response(response) + if not parsed: + sys.stderr.write( + "codevira reflect: could not parse abstraction/tags/confidence " + "from the response. The LLM should return a single yaml-fenced " + "block per the prompt template (see " + "mcp_server/data/prompts/reflection_v1.md).\n" + ) + return 1 + + abstraction = parsed.get("abstraction") or "" + tags = parsed.get("tags") or [] + confidence = parsed.get("confidence") + + if not abstraction.strip(): + sys.stderr.write( + "codevira reflect: response had an empty 'abstraction:' " + "field; refusing to record an empty reflection.\n" + ) + return 1 + + # Confirm prompt unless --yes or proposal-only. + if apply and not yes: + sys.stdout.write("Proposed reflection:\n") + sys.stdout.write(f" tags: {tags}\n") + sys.stdout.write(f" confidence: {confidence}\n\n") + sys.stdout.write(abstraction.strip() + "\n\n") + sys.stdout.write("Commit to .codevira/reflections.jsonl? [y/N]: ") + sys.stdout.flush() + try: + resp = input().strip().lower() + except EOFError: + resp = "n" + if resp not in ("y", "yes"): + sys.stdout.write("codevira reflect: not committed.\n") + return 0 + + target = "reflections" if apply else "proposals" + rid = reflections_store.append( + abstraction=abstraction, + confidence=( + float(confidence) + if isinstance(confidence, (int, float, str)) and _is_floatable(confidence) + else None + ), + tags=tags, + period_start=ctx["period_start"], + period_end=ctx["period_end"], + source_session_ids=ctx["source_session_ids"], + source_decision_ids=ctx["source_decision_ids"], + target=target, + ) + dest = paths.reflections_path() if apply else paths.reflection_proposals_path() + sys.stdout.write( + f"codevira reflect: wrote {rid} to {dest}\n" + f" ({'committed reflection' if apply else 'proposal for review'})\n" + ) + return 0 + + +# ────────────────────────────────────────────────────────────────────── +# Parsing +# ────────────────────────────────────────────────────────────────────── + + +def _parse_response(text: str) -> dict[str, Any] | None: + """Pull out the first yaml-fenced block + parse it. + + Falls back to whole-text parsing if no fence is found (some LLMs + omit the fence even when asked). Returns None on hard failure. + """ + import re + + try: + import yaml + except Exception: # noqa: BLE001 + return None + + fence_match = re.search(r"```(?:yaml)?\s*(.+?)```", text, re.DOTALL | re.IGNORECASE) + block = fence_match.group(1) if fence_match else text + try: + data = yaml.safe_load(block) + except Exception: # noqa: BLE001 + return None + if not isinstance(data, dict): + return None + return data + + +def _is_floatable(value: Any) -> bool: + try: + float(value) + return True + except (TypeError, ValueError): + return False diff --git a/mcp_server/data/prompts/reflection_v1.md b/mcp_server/data/prompts/reflection_v1.md new file mode 100644 index 0000000..3287c52 --- /dev/null +++ b/mcp_server/data/prompts/reflection_v1.md @@ -0,0 +1,40 @@ +# Codevira reflection prompt — v1 + +You are reflecting on a slice of a software project's recent history +to produce a concise *abstraction* the next AI agent (or human +reader) can use to orient. The slice contains: + +- Decisions recorded during the period (each: id, decision text, file + context if any, tags). +- Sessions logged (each: task summary, task_type if set, outcome). + +## Output + +Respond with a single block in the following shape: + +```yaml +abstraction: | + <2-6 sentences. What pattern emerges across these decisions and + sessions? What is the team's evolving stance? Avoid restating the + facts — synthesize the higher-level *belief* or *trajectory*.> +tags: [<3-5 short topical tags>] +confidence: <0.0 to 1.0 — how strongly the input justifies this + abstraction. Lower confidence = more inference; higher = + clearly grounded in the source records.> +``` + +## Constraints + +- Markdown is fine inside `abstraction:`, but keep it tight. +- Do not invent decisions, files, or commits not present in the + source. If the input is too thin to support a pattern, return + confidence < 0.3 and say so plainly in the abstraction. +- If the source records contain potential secrets that were stripped + (you'll see `` markers), do not try to guess what was + there. +- Stay in this YAML output block. Do not preface or append commentary + outside the fenced block. + +## Source records + +<<>> diff --git a/mcp_server/server.py b/mcp_server/server.py index 378ad07..6cd86e3 100644 --- a/mcp_server/server.py +++ b/mcp_server/server.py @@ -1419,6 +1419,55 @@ async def list_tools() -> list[Tool]: "required": ["decision_id"], }, ), + # ---- v3.1.0 M8: reflections (episodic abstraction) ---- + Tool( + name="reflect", + description=( + "v3.1.0 M8: Build the source context + rendered prompt for an " + "LLM abstraction over recent decisions + sessions. v3.1.0 " + "returns {sampling_supported: False, rendered_prompt, " + "source_context} so callers can feed the prompt to a locally-" + "available LLM. The MCP sampling/createMessage RPC integration " + "is the v3.2 deliverable; until then, use `codevira reflect " + "--from-file` to commit an LLM-supplied abstraction." + ), + inputSchema={ + "type": "object", + "properties": { + "period_days": {"type": "integer", "default": 7}, + "dry_run": {"type": "boolean", "default": True}, + }, + }, + ), + Tool( + name="get_reflections", + description=("v3.1.0 M8: Top-K most recent reflections (newest first)."), + inputSchema={ + "type": "object", + "properties": { + "top_k": {"type": "integer", "default": 5}, + }, + }, + ), + Tool( + name="list_reflections", + description=( + "v3.1.0 M8: Filtered reflection list. 'since' is an ISO 8601 " + "timestamp cutoff; 'tags' is set intersection (every requested " + "tag must appear)." + ), + inputSchema={ + "type": "object", + "properties": { + "since": {"type": "string"}, + "tags": { + "type": "array", + "items": {"type": "string"}, + }, + "limit": {"type": "integer", "default": 50}, + }, + }, + ), # ---- v1.5: Deep Graph Intelligence Tools ---- Tool( name="query_graph", @@ -1860,6 +1909,26 @@ async def call_tool(name: str, arguments: dict) -> list[TextContent]: from mcp_server.tools.consensus import origin_of result = origin_of(decision_id=arguments["decision_id"]) + # ---- v3.1.0 M8: reflections dispatch ---- + elif name == "reflect": + from mcp_server.tools.reflections import reflect + + result = reflect( + period_days=arguments.get("period_days", 7), + dry_run=arguments.get("dry_run", True), + ) + elif name == "get_reflections": + from mcp_server.tools.reflections import get_reflections + + result = get_reflections(top_k=arguments.get("top_k", 5)) + elif name == "list_reflections": + from mcp_server.tools.reflections import list_reflections + + result = list_reflections( + since=arguments.get("since"), + tags=arguments.get("tags"), + limit=arguments.get("limit", 50), + ) else: result = {"error": f"Unknown tool: {name}"} diff --git a/mcp_server/storage/paths.py b/mcp_server/storage/paths.py index 2358ec0..91b4a09 100644 --- a/mcp_server/storage/paths.py +++ b/mcp_server/storage/paths.py @@ -109,6 +109,28 @@ def skills_path(project_root: Path | None = None) -> Path: return codevira_dir(project_root) / "skills.jsonl" +def reflections_path(project_root: Path | None = None) -> Path: + """v3.1.0 M8: durable LLM-generated abstractions over recent + episodic memory. Committed because reflections are semantic + artifacts (not scratchpad).""" + return codevira_dir(project_root) / "reflections.jsonl" + + +def reflection_proposals_path(project_root: Path | None = None) -> Path: + """v3.1.0 M8: human-review staging for ``codevira reflect``. + Without ``--apply`` the CLI writes a proposal here so the user + can scan it before committing.""" + return codevira_dir(project_root) / "reflection_proposals.jsonl" + + +def reflection_prompt_path() -> Path: + """v3.1.0 M8: bundled prompt template path. Lives next to the + package so the wheel ships it via pyproject's package-data glob.""" + return ( + Path(__file__).resolve().parent.parent / "data" / "prompts" / "reflection_v1.md" + ) + + def pending_conflicts_path(project_root: Path | None = None) -> Path: """v3.1.0 M6: cross-IDE conflict log materialized by ``codevira consensus check`` (Phase B). Each row is a conflict diff --git a/mcp_server/storage/reflections_store.py b/mcp_server/storage/reflections_store.py new file mode 100644 index 0000000..6d794d5 --- /dev/null +++ b/mcp_server/storage/reflections_store.py @@ -0,0 +1,364 @@ +""" +reflections_store.py — v3.1.0 M8: durable LLM abstractions. + +Reflections are Generative-Agents-style abstractions over recent +episodic memory. The agent (or a CLI invocation) periodically asks +the host LLM (via MCP ``sampling/createMessage``) to synthesize the +pattern in recent decisions + sessions; the result lands here as a +durable semantic artifact the next agent can consult on +get_session_context. + +# Why a separate store + +- **Canonical**: lives in ``.codevira/reflections.jsonl`` — committed + to the repo because reflections survive sessions and inform + teammates (not scratchpad). +- **Sanitized inputs**: before the LLM ever sees the source records, + we strip secrets (API keys, Bearer tokens, AWS-style AKIA, password + fields, long hex/base64 blobs) so reflections can't accidentally + encode credentials. +- **Cap-then-sample**: hard caps on input size (≤30 sessions + ≤100 + decisions per reflection; ~6 KB input envelope) so a giant project + history doesn't blow the LLM's context budget. + +# Schema + +:: + + { + "id": "R000001", + "ts": "2026-05-28T10:00:00+00:00", + "period_start": "2026-05-21T00:00:00+00:00", + "period_end": "2026-05-28T00:00:00+00:00", + "source_session_ids": ["sess-abc", "sess-def"], + "source_decision_ids": ["D000123", "D000124"], + "abstraction": "<≤ 4 KB markdown from the LLM>", + "confidence": 0.0-1.0, + "tags": ["release", "auth"], + "model_used": "", + "origin": {ide, agent_model, host_hash, ts}, + "_schema_v": 1, + } + +# Sampling integration scope (v3.1.0 vs v3.2) + +The MCP ``sampling/createMessage`` request is what asks the connected +client (Claude Code / Claude Desktop) to invoke its LLM on a prompt. +v3.1.0 ships the storage + sanitization + prompt-template + the API +surface; the *actual* sampling call is stubbed (``reflect()`` returns +``{sampling_unsupported: True, deferred_to: "v3.2"}``). When v3.2 wires +the live sampling RPC through, the existing tests and CLI flow stay +the same — only the inner call swaps from stub to real. +""" + +from __future__ import annotations + +import logging +import re +from datetime import datetime, timezone +from typing import Any + +from mcp_server.storage import jsonl_store, origin as origin_module, paths + +logger = logging.getLogger(__name__) + +SCHEMA_V = 1 + +# Caps from the plan. +MAX_SESSIONS_PER_REFLECTION = 30 +MAX_DECISIONS_PER_REFLECTION = 100 +MAX_INPUT_BYTES = 6 * 1024 # 6 KB cap on the source context envelope + + +# ────────────────────────────────────────────────────────────────────── +# Sanitization +# ────────────────────────────────────────────────────────────────────── + + +# Secret-shaped patterns. The goal is "obvious accidents", not crypto +# defeat — these are line-level redactions before the LLM sees the +# source. We replace each match with ```` so the +# downstream reader knows something was stripped (for confidence +# scoring) without exposing the content. +_SECRET_PATTERNS: tuple[tuple[str, re.Pattern[str]], ...] = ( + ("api-key", re.compile(r"(?i)\b(api[_-]?key)\s*[:=]\s*\S+")), + ("bearer", re.compile(r"(?i)\bbearer\s+[A-Za-z0-9._\-+/=]{8,}")), + ("password", re.compile(r"(?i)\bpassword\s*[:=]\s*\S+")), + ("aws-akia", re.compile(r"\bAKIA[0-9A-Z]{16}\b")), + # Long hex / base64 blob — 32+ chars of plausible token material. + ("long-token", re.compile(r"\b[A-Fa-f0-9]{32,}\b")), + ("long-b64", re.compile(r"\b[A-Za-z0-9+/]{40,}={0,2}\b")), +) + + +def scrub_sensitive(text: str) -> str: + """Replace recognised secret-shaped substrings with + ```` markers. Conservative — better to over-redact + than to ship a key into a committed reflection. + """ + if not isinstance(text, str) or not text: + return text + out = text + for kind, pattern in _SECRET_PATTERNS: + out = pattern.sub(f"", out) + return out + + +# ────────────────────────────────────────────────────────────────────── +# Source context aggregation +# ────────────────────────────────────────────────────────────────────── + + +def build_source_context( + *, + period_days: int = 7, + now: datetime | None = None, +) -> dict[str, Any]: + """Aggregate recent sessions + decisions for the reflection prompt. + + Applies the plan's caps (≤30 sessions, ≤100 decisions, ≤6 KB + serialized envelope) and runs ``scrub_sensitive`` over every text + field that could carry a secret. Returns ``{period_start, + period_end, sessions, decisions, source_session_ids, + source_decision_ids, envelope_bytes}``. + """ + now_dt = now or datetime.now(timezone.utc) + from datetime import timedelta + + period_start = now_dt - timedelta(days=max(period_days, 1)) + + raw_sessions = jsonl_store.read_recent( + paths.sessions_path(), limit=MAX_SESSIONS_PER_REFLECTION * 2 + ) + sessions: list[dict[str, Any]] = [] + for s in raw_sessions: + if s.get("_amendment_to_id"): + continue + ts_str = s.get("ts") + if not isinstance(ts_str, str): + continue + try: + ts = datetime.fromisoformat(ts_str) + if ts.tzinfo is None: + ts = ts.replace(tzinfo=timezone.utc) + if ts < period_start or ts > now_dt: + continue + except (ValueError, TypeError): + continue + sessions.append(s) + if len(sessions) >= MAX_SESSIONS_PER_REFLECTION: + break + + raw_decisions = jsonl_store.read_recent( + paths.decisions_path(), limit=MAX_DECISIONS_PER_REFLECTION * 2 + ) + decisions: list[dict[str, Any]] = [] + for d in raw_decisions: + if d.get("_amendment_to_id"): + continue + ts_str = d.get("ts") + if not isinstance(ts_str, str): + continue + try: + ts = datetime.fromisoformat(ts_str) + if ts.tzinfo is None: + ts = ts.replace(tzinfo=timezone.utc) + if ts < period_start or ts > now_dt: + continue + except (ValueError, TypeError): + continue + decisions.append(d) + if len(decisions) >= MAX_DECISIONS_PER_REFLECTION: + break + + # Sanitize narrative fields before they hit the prompt. + sanitized_sessions = [ + { + "session_id": s.get("session_id"), + "task": scrub_sensitive(s.get("task") or ""), + "task_type": s.get("task_type"), + "summary": scrub_sensitive(s.get("summary") or ""), + "outcome": s.get("outcome"), + } + for s in sessions + ] + sanitized_decisions = [ + { + "id": d.get("id"), + "decision": scrub_sensitive(d.get("decision") or ""), + "context": scrub_sensitive(d.get("context") or ""), + "file_path": d.get("file_path"), + "tags": d.get("tags") or [], + } + for d in decisions + ] + + # Envelope-size enforcement: trim from the *oldest* end first if + # we exceed the cap. A clipped reflection is fine; a reflection + # that blew the LLM's context isn't. + def _serialize_size(sess: list[dict], dec: list[dict]) -> int: + return len(repr({"s": sess, "d": dec}).encode("utf-8")) + + while _serialize_size( + sanitized_sessions, sanitized_decisions + ) > MAX_INPUT_BYTES and (sanitized_sessions or sanitized_decisions): + if len(sanitized_decisions) >= len(sanitized_sessions): + sanitized_decisions.pop() # drop oldest in iteration order + else: + sanitized_sessions.pop() + + return { + "period_start": period_start.isoformat(), + "period_end": now_dt.isoformat(), + "sessions": sanitized_sessions, + "decisions": sanitized_decisions, + "source_session_ids": [ + str(s.get("session_id") or "") + for s in sanitized_sessions + if s.get("session_id") + ], + "source_decision_ids": [ + str(d.get("id") or "") for d in sanitized_decisions if d.get("id") + ], + "envelope_bytes": _serialize_size(sanitized_sessions, sanitized_decisions), + } + + +def render_prompt(source_context: dict[str, Any]) -> str: + """Inline the source context into the bundled prompt template. + + The placeholder ``<<>>`` in + ``mcp_server/data/prompts/reflection_v1.md`` is replaced with a + deterministic YAML-ish rendering of the aggregated sessions + + decisions. Falls back to a minimal inline template if the bundled + file is missing (defensive — shouldn't happen with package-data). + """ + template_path = paths.reflection_prompt_path() + try: + template = template_path.read_text(encoding="utf-8") + except OSError as exc: + logger.warning( + "reflections_store.render_prompt: bundled template missing " + "(%s); using minimal fallback", + exc, + ) + template = ( + "Reflect on the project's recent decisions and sessions. " + "Output a YAML block with abstraction, tags, confidence.\n" + "<<>>" + ) + + return template.replace( + "<<>>", _render_context_block(source_context) + ) + + +def _render_context_block(ctx: dict[str, Any]) -> str: + lines: list[str] = [ + f"period_start: {ctx.get('period_start')}", + f"period_end: {ctx.get('period_end')}", + "", + "sessions:", + ] + for s in ctx.get("sessions") or []: + lines.append( + f" - session_id: {s.get('session_id')!r}" + f" task_type: {s.get('task_type')!r}" + ) + if s.get("task"): + lines.append(f" task: {s['task']}") + if s.get("summary"): + lines.append(f" summary: {s['summary']}") + lines.append("") + lines.append("decisions:") + for d in ctx.get("decisions") or []: + tags = ", ".join(d.get("tags") or []) + lines.append( + f" - id: {d.get('id')!r} tags: [{tags}] " f"file: {d.get('file_path')!r}" + ) + if d.get("decision"): + lines.append(f" decision: {d['decision']}") + return "\n".join(lines) + + +# ────────────────────────────────────────────────────────────────────── +# Writes / Reads +# ────────────────────────────────────────────────────────────────────── + + +def append( + *, + abstraction: str, + confidence: float | None, + tags: list[str], + period_start: str, + period_end: str, + source_session_ids: list[str], + source_decision_ids: list[str], + model_used: str | None = None, + target: str = "reflections", +) -> str: + """Persist a finalized reflection (target='reflections') or a + pending proposal (target='proposals'). Returns the R-id. + """ + paths.ensure_dirs() + rec = { + "ts": datetime.now(timezone.utc).isoformat(), + "period_start": period_start, + "period_end": period_end, + "source_session_ids": list(source_session_ids or []), + "source_decision_ids": list(source_decision_ids or []), + "abstraction": (abstraction or "").strip(), + "confidence": ( + float(confidence) if isinstance(confidence, (int, float)) else None + ), + "tags": [str(t).strip().lower() for t in (tags or []) if str(t).strip()], + "model_used": model_used, + "origin": origin_module.current_origin(), + "_schema_v": SCHEMA_V, + } + dest = ( + paths.reflections_path() + if target == "reflections" + else paths.reflection_proposals_path() + ) + return jsonl_store.append_with_generated_id(dest, rec, prefix="R", width=6) + + +def list_recent(*, limit: int = 5, target: str = "reflections") -> list[dict[str, Any]]: + """Return the most recent reflections (or proposals).""" + dest = ( + paths.reflections_path() + if target == "reflections" + else paths.reflection_proposals_path() + ) + return jsonl_store.read_recent(dest, limit=limit) + + +def list_filtered( + *, + target: str = "reflections", + since: str | None = None, + tags: list[str] | None = None, + limit: int = 50, +) -> list[dict[str, Any]]: + """Return reflections filtered by since (ISO ts) and tags + intersection (every requested tag must be present).""" + rows = list_recent(target=target, limit=limit * 4) + norm_tags = ( + {str(t).strip().lower() for t in tags if str(t).strip()} if tags else None + ) + out: list[dict[str, Any]] = [] + for r in rows: + if since and (r.get("ts") or "") < since: + continue + if norm_tags: + row_tags = { + str(t).strip().lower() for t in (r.get("tags") or []) if str(t).strip() + } + if not norm_tags.issubset(row_tags): + continue + out.append(r) + if len(out) >= limit: + break + return out diff --git a/mcp_server/tools/reflections.py b/mcp_server/tools/reflections.py new file mode 100644 index 0000000..d7679bb --- /dev/null +++ b/mcp_server/tools/reflections.py @@ -0,0 +1,125 @@ +""" +reflections.py — v3.1.0 M8 MCP tools for episodic abstractions. + +Three tools cover the agent-facing surface: + + - reflect — build the source context + rendered prompt for + the host LLM to abstract over. v3.1.0 ships + with the actual sampling call stubbed; v3.2 + wires the MCP sampling/createMessage RPC + through. In v3.1, callers receive + ``{sampling_supported: False}`` plus the + prerendered prompt so they can feed it to the + LLM manually and ``reflect_apply()`` (CLI) to + persist the result. + - get_reflections — top-K most recent reflections. + - list_reflections — filtered list (since / tags / limit). + +The opt-in scheduled-reflection path +(``memory.reflections.auto_reflect_days``) reads its flag via +config.get_flag. The MCP tools themselves always work — reflections +are read-only consumers of episodic memory and never produce +side effects on decisions/sessions. +""" + +from __future__ import annotations + +from typing import Any + +from mcp_server.storage import reflections_store + + +def reflect( + *, + period_days: int = 7, + dry_run: bool = True, +) -> dict[str, Any]: + """Build the source context + rendered prompt for an LLM + abstraction. + + v3.1.0: returns ``sampling_supported: False`` with the rendered + prompt + source_context so callers can feed the prompt to a + locally-available LLM (or via codevira reflect --from-file). The + sampling/createMessage MCP RPC integration is the v3.2 deliverable; + swapping the stub for a real sampling call is a single-function + change. + + ``dry_run=True`` (the default) is the storage-safe path; it never + writes — the caller decides when to apply via reflect_apply or + ``codevira reflect --apply``. + """ + ctx = reflections_store.build_source_context(period_days=period_days) + prompt = reflections_store.render_prompt(ctx) + return { + "sampling_supported": False, + "deferred_to": "v3.2", + "hint": ( + "v3.1.0 ships reflections' storage + prompt-rendering + " + "sanitization. The MCP sampling/createMessage RPC that " + "would call the host LLM is the v3.2 follow-up. Until then " + "you can run `codevira reflect --from-file ` to commit " + "an LLM-supplied abstraction, or read the rendered prompt " + "below and feed it to your own LLM." + ), + "period_days": period_days, + "period_start": ctx["period_start"], + "period_end": ctx["period_end"], + "source_context": { + "session_count": len(ctx["sessions"]), + "decision_count": len(ctx["decisions"]), + "envelope_bytes": ctx["envelope_bytes"], + "source_session_ids": ctx["source_session_ids"], + "source_decision_ids": ctx["source_decision_ids"], + }, + "rendered_prompt": prompt, + "dry_run": bool(dry_run), + } + + +def get_reflections(*, top_k: int = 5) -> dict[str, Any]: + """Top-K most recent reflections (newest first).""" + rows = reflections_store.list_recent(limit=top_k) + return { + "count": len(rows), + "reflections": [ + { + "reflection_id": r.get("id"), + "ts": r.get("ts"), + "period_start": r.get("period_start"), + "period_end": r.get("period_end"), + "abstraction": r.get("abstraction"), + "confidence": r.get("confidence"), + "tags": r.get("tags") or [], + "model_used": r.get("model_used"), + "source_session_ids": r.get("source_session_ids") or [], + "source_decision_ids": r.get("source_decision_ids") or [], + } + for r in rows + ], + } + + +def list_reflections( + *, + since: str | None = None, + tags: list[str] | None = None, + limit: int = 50, +) -> dict[str, Any]: + """Filtered reflection list. ``since`` is an ISO 8601 timestamp + cutoff; ``tags`` is set-intersection (every requested tag must + appear).""" + rows = reflections_store.list_filtered(since=since, tags=tags, limit=limit) + return { + "count": len(rows), + "reflections": [ + { + "reflection_id": r.get("id"), + "ts": r.get("ts"), + "tags": r.get("tags") or [], + "abstraction": r.get("abstraction"), + "confidence": r.get("confidence"), + } + for r in rows + ], + "filtered_by": {"since": since, "tags": tags}, + } diff --git a/tests/test_reflections.py b/tests/test_reflections.py new file mode 100644 index 0000000..8cd0532 --- /dev/null +++ b/tests/test_reflections.py @@ -0,0 +1,370 @@ +""" +Tests for v3.1.0 M8: reflections. + +Covers: + - scrub_sensitive: redacts api keys / bearer / passwords / AKIA / + long hex / long base64. + - build_source_context: filters by period; obeys session/decision + caps; sanitizes narrative fields; envelope-size cap trims when over. + - render_prompt: template inlines source context with placeholder + expansion; missing template falls back gracefully. + - reflections_store.append / list_recent / list_filtered. + - cmd_reflect: render mode prints prompt; --from-file parses YAML + and writes proposal; --apply --yes commits to reflections.jsonl; + missing/empty/malformed input rejected. + - MCP tools: reflect returns sampling_supported=False; get/list + return durable data. +""" + +from __future__ import annotations + +from datetime import datetime, timezone +from pathlib import Path + +import pytest + +import mcp_server.paths as paths_module +from mcp_server.storage import ( + decisions_store, + jsonl_store, + paths, + reflections_store, + sessions_store, +) + + +@pytest.fixture +def project(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: + root = tmp_path / "proj" + (root / ".codevira").mkdir(parents=True) + (root / ".codevira" / "config.yaml").write_text("project:\n name: test\n") + monkeypatch.setattr(paths_module, "_project_dir_override", None) + monkeypatch.chdir(root.resolve()) + return root + + +# ────────────────────────────────────────────────────────────────────── +# scrub_sensitive +# ────────────────────────────────────────────────────────────────────── + + +class TestScrubSensitive: + def test_api_key_redacted(self) -> None: + out = reflections_store.scrub_sensitive("api_key=hunter2-deadbeefcafe") + assert "" in out + assert "hunter2" not in out + + def test_bearer_redacted(self) -> None: + out = reflections_store.scrub_sensitive( + "Authorization: Bearer abc123XYZ.token-here" + ) + assert "" in out + assert "abc123XYZ" not in out + + def test_password_redacted(self) -> None: + out = reflections_store.scrub_sensitive( + "password=hunter2-correct-horse-battery-staple" + ) + assert "" in out + + def test_akia_redacted(self) -> None: + out = reflections_store.scrub_sensitive("AKIAIOSFODNN7EXAMPLE here") + assert "" in out + + def test_long_hex_redacted(self) -> None: + out = reflections_store.scrub_sensitive( + "tok = a1b2c3d4e5f607182930405060708090abcdef0123456789" + ) + assert "" in out + + def test_plain_text_untouched(self) -> None: + text = "Use bcrypt for password hashing in auth.py." + # Note: the "password" word is part of normal prose; the + # secret regex requires `password=` or `password:` form. + assert reflections_store.scrub_sensitive(text) == text + + +# ────────────────────────────────────────────────────────────────────── +# build_source_context +# ────────────────────────────────────────────────────────────────────── + + +class TestBuildSourceContext: + def test_empty_returns_empty_lists(self, project: Path) -> None: + ctx = reflections_store.build_source_context(period_days=7) + assert ctx["sessions"] == [] + assert ctx["decisions"] == [] + + def test_in_window_included(self, project: Path) -> None: + decisions_store.record(decision="X", tags=["t1"]) + sessions_store.write("s1", task="x", task_type="bug") + ctx = reflections_store.build_source_context(period_days=7) + assert len(ctx["sessions"]) == 1 + assert len(ctx["decisions"]) == 1 + + def test_out_of_window_excluded(self, project: Path) -> None: + # Inject an old decision row directly. + old_ts = (datetime(2020, 1, 1, tzinfo=timezone.utc)).isoformat() + jsonl_store.append( + paths.decisions_path(), + { + "id": "D000099", + "ts": old_ts, + "session_id": "ad-hoc", + "decision": "ancient", + "_schema_v": 0, + }, + ) + decisions_store.record(decision="fresh", tags=[]) + ctx = reflections_store.build_source_context(period_days=7) + # Only the fresh decision surfaces. + assert all("ancient" not in d.get("decision", "") for d in ctx["decisions"]) + + def test_session_cap_enforced(self, project: Path) -> None: + for i in range(50): + sessions_store.write(f"sess-{i}", task=f"task {i}", task_type="bug") + ctx = reflections_store.build_source_context(period_days=7) + assert len(ctx["sessions"]) <= reflections_store.MAX_SESSIONS_PER_REFLECTION + + def test_decision_cap_enforced(self, project: Path) -> None: + for i in range(150): + decisions_store.record(decision=f"d{i}", tags=["t"]) + ctx = reflections_store.build_source_context(period_days=7) + assert len(ctx["decisions"]) <= reflections_store.MAX_DECISIONS_PER_REFLECTION + + def test_sanitization_runs(self, project: Path) -> None: + decisions_store.record( + decision="see api_key=hunter2-deadbeefcafedeadbeef", + tags=["secret"], + ) + ctx = reflections_store.build_source_context(period_days=7) + assert "" in ctx["decisions"][0]["decision"] + assert "hunter2" not in ctx["decisions"][0]["decision"] + + +# ────────────────────────────────────────────────────────────────────── +# render_prompt +# ────────────────────────────────────────────────────────────────────── + + +class TestRenderPrompt: + def test_inlines_source_context(self, project: Path) -> None: + decisions_store.record(decision="Use bcrypt", tags=["auth"]) + ctx = reflections_store.build_source_context(period_days=7) + prompt = reflections_store.render_prompt(ctx) + assert "Use bcrypt" in prompt + # Template placeholder was replaced. + assert "<<>>" not in prompt + + def test_template_missing_falls_back( + self, + project: Path, + monkeypatch: pytest.MonkeyPatch, + ) -> None: + # Point to a non-existent template; the fallback inline prompt + # still renders without crashing. + monkeypatch.setattr( + paths, + "reflection_prompt_path", + lambda: Path("/nonexistent/path/reflection.md"), + ) + ctx = reflections_store.build_source_context(period_days=7) + prompt = reflections_store.render_prompt(ctx) + # Fallback inline contains the "abstraction" guidance string. + assert "abstraction" in prompt + + +# ────────────────────────────────────────────────────────────────────── +# Storage: append / list / filter +# ────────────────────────────────────────────────────────────────────── + + +class TestStorage: + def test_append_returns_r_id(self, project: Path) -> None: + rid = reflections_store.append( + abstraction="The team prioritizes auth hardening.", + confidence=0.7, + tags=["auth", "security"], + period_start=datetime(2026, 5, 21, tzinfo=timezone.utc).isoformat(), + period_end=datetime(2026, 5, 28, tzinfo=timezone.utc).isoformat(), + source_session_ids=["s1", "s2"], + source_decision_ids=["D000001"], + ) + assert rid.startswith("R") + + def test_list_recent_newest_first(self, project: Path) -> None: + for i, text in enumerate(["first", "second", "third"]): + reflections_store.append( + abstraction=text, + confidence=0.5, + tags=[], + period_start="2026-05-01T00:00:00+00:00", + period_end="2026-05-07T00:00:00+00:00", + source_session_ids=[], + source_decision_ids=[], + ) + rows = reflections_store.list_recent(limit=5) + assert [r["abstraction"] for r in rows] == ["third", "second", "first"] + + def test_list_filtered_by_tags(self, project: Path) -> None: + reflections_store.append( + abstraction="A", + confidence=0.5, + tags=["release", "v3"], + period_start="2026-05-01T00:00:00+00:00", + period_end="2026-05-07T00:00:00+00:00", + source_session_ids=[], + source_decision_ids=[], + ) + reflections_store.append( + abstraction="B", + confidence=0.5, + tags=["v3"], + period_start="2026-05-01T00:00:00+00:00", + period_end="2026-05-07T00:00:00+00:00", + source_session_ids=[], + source_decision_ids=[], + ) + rows = reflections_store.list_filtered(tags=["release", "v3"]) + assert [r["abstraction"] for r in rows] == ["A"] + + +# ────────────────────────────────────────────────────────────────────── +# CLI: cmd_reflect +# ────────────────────────────────────────────────────────────────────── + + +_GOOD_RESPONSE = """\ +```yaml +abstraction: | + The team is consolidating around bcrypt+rate-limiting for auth. +tags: [auth, security] +confidence: 0.78 +``` +""" + +_NO_FENCE_RESPONSE = """\ +abstraction: | + Auth hardening continues this week. +tags: [auth] +confidence: 0.6 +""" + +_EMPTY_ABSTRACTION = """\ +```yaml +abstraction: "" +tags: [] +confidence: 0.1 +``` +""" + + +class TestCmdReflect: + def test_no_from_file_prints_prompt( + self, project: Path, capsys: pytest.CaptureFixture[str] + ) -> None: + from mcp_server.cli_reflect import cmd_reflect + + rc = cmd_reflect() + assert rc == 0 + out = capsys.readouterr().out + assert "Feed the prompt below to your LLM" in out + assert "abstraction" in out + + def test_from_file_writes_proposal(self, project: Path, tmp_path: Path) -> None: + decisions_store.record(decision="x") # seed source context + resp_path = tmp_path / "resp.yaml" + resp_path.write_text(_GOOD_RESPONSE) + from mcp_server.cli_reflect import cmd_reflect + + rc = cmd_reflect(from_file=str(resp_path)) + assert rc == 0 + proposals = jsonl_store.read_all(paths.reflection_proposals_path()) + assert len(proposals) == 1 + assert "bcrypt" in proposals[0]["abstraction"] + assert sorted(proposals[0]["tags"]) == ["auth", "security"] + assert abs(proposals[0]["confidence"] - 0.78) < 1e-3 + + def test_apply_yes_commits_to_reflections( + self, project: Path, tmp_path: Path + ) -> None: + decisions_store.record(decision="x") + resp_path = tmp_path / "resp.yaml" + resp_path.write_text(_GOOD_RESPONSE) + from mcp_server.cli_reflect import cmd_reflect + + rc = cmd_reflect(from_file=str(resp_path), apply=True, yes=True) + assert rc == 0 + rows = jsonl_store.read_all(paths.reflections_path()) + assert len(rows) == 1 + assert "bcrypt" in rows[0]["abstraction"] + + def test_unfenced_response_still_parsed( + self, project: Path, tmp_path: Path + ) -> None: + decisions_store.record(decision="x") + resp_path = tmp_path / "resp.yaml" + resp_path.write_text(_NO_FENCE_RESPONSE) + from mcp_server.cli_reflect import cmd_reflect + + assert cmd_reflect(from_file=str(resp_path)) == 0 + + def test_missing_file_returns_1( + self, project: Path, capsys: pytest.CaptureFixture[str] + ) -> None: + from mcp_server.cli_reflect import cmd_reflect + + rc = cmd_reflect(from_file="/nonexistent/path.yaml") + assert rc == 1 + assert "could not read" in capsys.readouterr().err + + def test_empty_abstraction_rejected( + self, project: Path, tmp_path: Path, capsys: pytest.CaptureFixture[str] + ) -> None: + resp_path = tmp_path / "resp.yaml" + resp_path.write_text(_EMPTY_ABSTRACTION) + from mcp_server.cli_reflect import cmd_reflect + + rc = cmd_reflect(from_file=str(resp_path)) + assert rc == 1 + assert "empty" in capsys.readouterr().err + + +# ────────────────────────────────────────────────────────────────────── +# MCP tools +# ────────────────────────────────────────────────────────────────────── + + +class TestMcpTools: + def test_reflect_returns_sampling_supported_false(self, project: Path) -> None: + decisions_store.record(decision="x") + from mcp_server.tools.reflections import reflect + + r = reflect() + assert r["sampling_supported"] is False + assert r["deferred_to"] == "v3.2" + assert "rendered_prompt" in r + assert "source_context" in r + + def test_get_reflections_empty(self, project: Path) -> None: + from mcp_server.tools.reflections import get_reflections + + r = get_reflections() + assert r["count"] == 0 + assert r["reflections"] == [] + + def test_get_reflections_populated(self, project: Path) -> None: + reflections_store.append( + abstraction="A", + confidence=0.5, + tags=["x"], + period_start="2026-05-01T00:00:00+00:00", + period_end="2026-05-07T00:00:00+00:00", + source_session_ids=[], + source_decision_ids=[], + ) + from mcp_server.tools.reflections import get_reflections + + r = get_reflections() + assert r["count"] == 1 + assert r["reflections"][0]["abstraction"] == "A" From b28ce39e864877f0ce5d3a920dfb8fe131c2fd4e Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Fri, 29 May 2026 15:33:08 +0530 Subject: [PATCH 16/44] =?UTF-8?q?docs(v3.1.0):=20M9=20=E2=80=94=20CLAUDE.m?= =?UTF-8?q?d=20memory=20catalog=20+=20CHANGELOG=20entry=20+=20version=20bu?= =?UTF-8?q?mp?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes out v3.1.0 with the documentation polish: - CLAUDE.md gains a 'Memory subsystems (v3.1.0)' section cataloguing all the new MCP tools and when each should be called. Walks through working memory (4 tools), skill library (6 tools), spatial memory (4 tools), consensus (5 tools spanning Phase B and the opt-in Phase C handshake), and reflections (3 tools). - CHANGELOG.md gains a comprehensive 3.1.0 entry covering all 8 milestones (M1 origin tagging, M2 working memory, M3 skill library, M4 spatial memory, M5 induction wired to outcomes, M6 consensus check, M7 handshake, M8 reflections). Also covers the v3.0.x storage prereq (jsonl_store primitives + session_id uniqueness fix). - pyproject.toml + mcp_server/__init__.py bumped to 3.1.0. Verification smoke: - Full test suite: 2282 passing, 57 pre-existing environmental failures (treesitter grammars / pyyaml absence — same baseline as v3.0.0). - Wheel builds cleanly to codevira-3.1.0-py3-none-any.whl; installs in a fresh venv; reports 'codevira 3.1.0' on --version. - All 4 new CLI subcommands surface in the installed wheel: 'codevira working', 'codevira induce-skills', 'codevira consensus', 'codevira reflect'. Each --help renders the documented options. Plan M9. v3.1.0 is feature-complete; the remaining v3.2 work is the live MCP sampling/createMessage RPC integration for the reflections subsystem. Co-Authored-By: Claude Opus 4.7 --- CHANGELOG.md | 236 +++++++++++++++++++++++++++++++++++++++++ CLAUDE.md | 70 ++++++++++++ mcp_server/__init__.py | 2 +- pyproject.toml | 2 +- 4 files changed, 308 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 3b575ef..6324d3f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,242 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm --- +## [3.1.0] — Five memory subsystems + cross-IDE consensus + +v3.1.0 adds five memory subsystems on top of the v3.0.x decision +log, plus a cross-IDE consensus layer. Every addition is additive +to the existing schemas; v3.0.x records continue to read without +migration. The MCP surface gains 22 new tools across the new +subsystems. + +### v3.0.x storage prereq (ships first) + +- **`refactor(jsonl_store)`** — extract `read_merged` / `compact` / + `read_recent` from `decisions_store._read_merged` / + `sessions_store.read_recent`. The five new memory subsystems + share this amendment-overlay primitive instead of duplicating + it. Tests cover amendment-chain-three-deep recursion semantics. + Zero behavior change for existing callers. + +- **`fix(session_id)`** — `decisions_store.record` and + `sessions_store.write` now default `session_id` to + `f"ad-hoc-{secrets.token_hex(3)}"` (e.g., `ad-hoc-a1b2c3`) + instead of the literal string `"ad-hoc"`. Every concurrent IDE + that didn't pass a slug previously collided into one bucket; + the unique-per-call default fixes cross-IDE attribution. + +### M1 — Origin tagging (provenance) + +Every decision and session write now carries an `origin` block: + +```json +"origin": {"ide": "claude_code", "agent_model": "...", + "host_hash": "<12 hex chars>", "ts": "..."} +``` + +- `host_hash` = `sha1(uuid.getnode() bytes + username)[:12]` — + stable per machine (MAC-based, `platform.node()` fallback), + privacy-preserving (no plaintext hostname/username leaks). +- `CODEVIRA_IDE` env var read at MCP server startup; default + `"unknown"`. `ide_inject.py` now writes `CODEVIRA_IDE=` + into the MCP config block for all 10 supported IDE configs + (Claude Code, Claude Desktop, Cursor, Windsurf, Antigravity — + per-project + global). +- `check_conflict` response includes the candidate's `origin` so + agents can see "this conflicts with a decision Cursor wrote 3 + days ago" instead of just an opaque decision_id. +- Reads tolerate `origin` absent (legacy v3.0.x records treated + as `ide="unknown"`). + +### M2 — Working memory + +Bounded, decay-scored intra-session scratchpad. + +- `.codevira-cache/working.jsonl` (per-machine, ephemeral, + gitignored). Auto-populated by `post_tool_use` hook on Edit / + Write / MultiEdit / NotebookEdit / update_node (importance 4), + Bash (importance 3), tool errors (importance bumped to 7). +- 4 MCP tools: `working_add`, `working_get`, `working_promote` + (to=decision|skill|playbook with check_conflict gate), and + `get_working_context` (compact markdown for ReAct loops). +- Decay score: `importance × exp(-Δt_hours / 6) + 0.5 × + access_count`. Top-3 surfaces in `get_session_context`. +- Eviction = amendment tombstone; periodic compaction during + `codevira sync`. +- CLI: `codevira working commit ` archives a + session's live entries to + `.codevira/working_archived/.jsonl`. + +### M3 — Skill library (procedural memory) + +`.codevira/skills.jsonl` (canonical, team-shareable). FTS5 +retrieval + composite ranking. + +- `skill_fts` virtual table in the existing + `.codevira-cache/fts5.sqlite`. Independent staleness key + (`skill_source_mtime`) so the existing decisions tracking is + unaffected. +- Composite ranking: + `score = 0.5 × BM25_norm + 0.3 × tag_jaccard + 0.2 × + recency_decay(τ=30d)`. Never-used skills score 0 recency — + reinforcement, not existence, drives the recency signal. +- 6 MCP tools: `record_skill`, `get_skill`, `apply_skill_outcome`, + `list_skills`, `supersede_skill`, + `promote_skill_to_playbook` (writes + `.codevira/playbooks//.md`). +- Lifecycle states: `active` (default), `archived` (5 consec + failures OR `unused_days ≥ 90` — configurable; do_not_revert + exempt), `superseded` (final). +- M5 wires git-derived outcomes_writer to skill reinforcement (see + below). + +### M4 — Spatial memory + +Activity heatmap + folder-tree neighborhoods + affordances. + +- `.codevira-cache/activity.jsonl` (per-machine). Auto-emitted on + Edit/Write via `memory_fanout` + on `decisions_store.record` + when `file_path` is set. Schema: `{id, ts, node_id, kind: + edit|decision_ref, session_id, origin, _schema_v: 1}`. +- 4 MCP tools: `spatial_nearby` (BFS ≤ 2 hops over the indexer + graph + same-neighborhood union, ranked by `(1/(1+bfs_dist)) × + log(1+visit_count_30d)`), `spatial_heat`, + `spatial_neighborhood`, `spatial_affordances`. +- Folder-tree neighborhoods (top-2 dir components, e.g., + `mcp_server/storage`). Project-overridable via + `.codevira/neighborhoods.yaml`. +- Bundled `mcp_server/data/affordances.yaml` mapping file globs to + task_type affordances (e.g., `mcp_server/tools/*.py` → + `{add_tool, write_test}`). Project override: + `.codevira/affordances.yaml`; bundled + project union per match. + +### M5 — Skill induction wired to outcomes_writer + +Closes the reinforcement loop. Two pieces: + +- **Sessions schema additions**: `task_type` (`feature` | `bug` | + `refactor` | `release` | `docs` | `other`) and `skill_ids: []` + (skills used during the session). Additive; legacy sessions + tolerate absence. +- **outcomes_writer fan-out**: when `observe_all()` classifies a + decision as `kept` or `reverted`, each skill referenced via + `skill_ids` on the same session gets `mark_used(success=…)`. + Pre-builds a `{session_id → set[skill_id]}` index so the + per-decision fan-out is O(1). Best-effort: skill errors log and + drop without blocking the decision-outcome write. +- **CLI**: `codevira induce-skills [--apply] [--yes]` — + deterministic induction (no LLM in v3.1.0). Pipeline: filter + sessions with ≥80% kept; group by task_type; cluster by + tag-Jaccard ≥ 0.5; keep clusters ≥3 sessions; render candidate + skill with `name = ": "`, + `procedure = bullet-summary of session.task + + decision.decision` (capped 30 lines). + +### M6 — Consensus Phase B (cross-IDE conflict check, read-only) + +- Per-IDE checkpoint files + `.codevira/checkpoints/.json` keyed on + `last_seen_decision_id` — zero-padded base-36 D-ids preserve + monotonic ordering without clock drift. +- `consensus_store.scan_and_materialize()`: walks decisions with + `id > checkpoint`, partitions by `origin.ide` into + `current_corpus` + `foreign`, runs the reused `check_conflict` + tokenize/Jaccard/overlap math on every pair, records matches as + PC-prefixed rows in `.codevira/pending_conflicts.jsonl`. +- 2 MCP tools: `consensus_check`, `consensus_status`. + `get_session_context` surfaces a top-3 panel sorted by + `(do_not_revert × recency)`. +- CLI: `codevira consensus check`. Read-only — no amendment rows + written on decisions. + +### M7 — Consensus Phase C handshake (opt-in, default off) + +Opt-in belief-revision protocol gated behind +`memory.consensus.handshake_enabled` in `.codevira/config.yaml`. + +- New `config.py` helper for dotted-key lookups against + `.codevira/config.yaml`. +- `propose_supersession` (cross-IDE) appends a + `proposed_supersession` row with `expires_at = ts + + handshake_timeout_days` (default 14, configurable). Same-IDE + fast-path returns `{fast_path: True}` so the caller routes to + `decisions_store.supersede` directly. +- `resolve_proposal(action: approved|rejected|withdrawn)` + appends a resolution row carrying `resolver_origin`. +- `finalize_proposal(expired_unilateral=False)` — approved + proposals turn into a real `decisions_store.supersede` call. + Expired proposals require `expired_unilateral=True` (deadlock + safety); the audit row records the force-finalize. +- 3 MCP tools: `consensus_propose_supersession`, + `consensus_resolve`, `origin_of` (provenance lookup; always + available). +- Row kind taxonomy in pending_conflicts.jsonl: `conflict` (M6), + `proposed_supersession` (M7), `resolution` (M7). + +### M8 — Reflections (durable LLM abstractions) + +Generative-Agents-style abstractions over recent decisions + +sessions. + +- `.codevira/reflections.jsonl` (canonical, committed) + + `.codevira/reflection_proposals.jsonl` (review staging). +- `scrub_sensitive` strips api keys, Bearer tokens, passwords, + AWS-style AKIA, long hex/base64 from source records before the + LLM sees them. +- `build_source_context` aggregates sessions + decisions in the + period window with plan caps (≤30 sessions, ≤100 decisions, + ≤6 KB envelope). +- Bundled prompt template at + `mcp_server/data/prompts/reflection_v1.md`. +- **MCP sampling integration scope**: v3.1.0 ships the storage + + sanitization + prompt rendering + the API surface. The + `sampling/createMessage` RPC that asks the host LLM for the + abstraction is the **v3.2** deliverable. v3.1.0 `reflect()` + returns `{sampling_supported: False, rendered_prompt, + source_context, deferred_to: "v3.2"}`; the CLI accepts an LLM + response via `--from-file`. +- 3 MCP tools: `reflect`, `get_reflections`, `list_reflections`. +- CLI: `codevira reflect [--period 7d] [--from-file PATH] + [--apply] [--yes]`. Render mode prints the prompt; + `--from-file` parses the LLM YAML response and writes a + proposal; `--apply --yes` commits to `reflections.jsonl`. + +### Schema versioning convention + +All NEW JSONL stores (`working`, `skills`, `activity`, +`pending_conflicts`, `reflections`) carry `_schema_v: 1` on each +record. Readers tolerate absence (treats as v1). Existing +`decisions.jsonl` / `sessions.jsonl` are unchanged. + +### `get_session_context` panels + +Now carries five panels in addition to the existing roadmap / +recent decisions: + - `working` — top-3 live entries (M2). + - `consensus` — top-3 pending conflicts (M6) sorted by + `(do_not_revert × recency)`. +The plan reserves panels for working, skills, spatial, +reflections in future ticks if value justifies the token cost. + +### Tests + +~450+ new tests across `tests/storage/`, `tests/test_tools_*`, +`tests/test_cli_*`, `tests/test_reflections.py`, +`tests/test_consensus_handshake.py`, etc. The full v3.1.0 suite +runs in <20s; zero regressions from the v3.0.x baseline. + +### Locked decisions honored + +The v3.0.0 locks remain intact: +- D000001 (atomic writes through `mcp_server/storage/atomic.py`) +- D000012 (WRITE-path forbidden-root validation via + `ensure_dirs`) +- The v2.2.0 "no embeddings; FTS5 + Jaccard only" decision — + M3's skill retrieval and M6's conflict check both use the + existing FTS5/Jaccard infrastructure; no new embedding deps. + +--- + ## [3.0.0] — 2026-05-27 — Lean, audited, opinionated ### Hardened (RC audit — rounds 2 + 3, pre-publish) diff --git a/CLAUDE.md b/CLAUDE.md index d75f50a..4a1d519 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -51,6 +51,76 @@ That warning fires when codevira's claimed phase hasn't been updated for several - **`search_decisions(query="X")`** is the answer. Don't guess — surface the actual decision log. +## Memory subsystems (v3.1.0) + +v3.1.0 added five memory subsystems on top of the existing decision log. Each has a specific moment to call it; together they cover the gap between "episodic" (decisions) and "the agent's day-to-day state." + +### Working memory — intra-session scratchpad + +`.codevira-cache/working.jsonl` (per-machine, ephemeral, gitignored). Capacity-bounded, decay-scored. + +- **`working_add(content, kind="observation"|"goal", importance=5, links=[])`** — record an observation (something you saw) or a goal (something you're trying). `Edit`/`Write`/`Bash` calls auto-populate this via the post_tool_use hook; explicit calls add narrative + intent the auto path can't see. +- **`working_get(top_k=10, kind=?)`** — top-K live entries by decay score (importance × exp(-Δt/τ=6h) + 0.5 × access_count). Tombstoned entries excluded. +- **`working_promote(entry_id, to="decision"|"skill"|"playbook", ...)`** — move an observation/goal into LTM. Calls `check_conflict` first; tombstones the source on success. +- **`get_working_context(top_k=5)`** — compact markdown for ReAct-loop injection. + +Working memory persists into `get_session_context` (top-3 panel) so the next call sees your recent scratchpad. + +CLI escape hatch: `codevira working commit ` archives a session's live entries to `.codevira/working_archived/.jsonl` (canonical, team-shareable). + +### Skill library — procedural memory + +`.codevira/skills.jsonl` (canonical, team-shareable). FTS5-backed retrieval with composite ranking (BM25 + tag-Jaccard + recency). + +- **`record_skill(name, procedure, summary, triggers, do_not_revert, force)`** — author a reusable procedure ("how we rebase in this repo", "the project's commit-message convention"). Conflict-checked against existing skills. +- **`get_skill(query, top_k=5, file_path=?)`** — composite-ranked search. Returns `score_breakdown` so you can see WHY each skill surfaced. +- **`apply_skill_outcome(skill_id, success)`** — manual reinforcement. The *canonical* signal comes from git via `outcomes_writer` fan-out (M5) — this tool is the override. +- **`list_skills(status="active"|"archived"|"superseded"|"all", source, tags)`** — daily-driver `active` filter by default. +- **`supersede_skill(old_id, name, procedure, ...)`** — version a skill; amendment chain preserves audit. +- **`promote_skill_to_playbook(skill_id, task_type, name?, force)`** — write a skill's procedure as a playbook markdown so `get_playbook(task_type)` finds it. + +Auto-archive at 5 consecutive failures OR `unused_days ≥ 90` (configurable). Skills with `do_not_revert=true` are exempt. + +CLI: `codevira induce-skills [--apply] [--yes]` — cluster productive sessions (≥80% kept, tag-Jaccard ≥ 0.5) and propose induced skills. Without `--apply`: writes to `.codevira/induction_proposals.jsonl` for review. + +### Spatial memory — code-as-space + +Activity heatmap (`.codevira-cache/activity.jsonl`, per-machine) + folder-tree neighborhoods + affordances. + +- **`spatial_nearby(file_path, k=5)`** — files topologically near a file (BFS ≤ 2 hops over import/call edges + same-neighborhood), ranked by recent activity. Use when navigating unfamiliar code. +- **`spatial_heat(top_k=20, since_days=?)`** — where attention has concentrated. Use for "what changed this week?". +- **`spatial_neighborhood(file_path)`** — the folder-tree-derived (or yaml-overridden) neighborhood + members. +- **`spatial_affordances(file_path)`** — what task_types apply here. E.g., a file under `mcp_server/tools/` typically affords `{add_tool, write_test}`. Combine with `get_playbook(task_type)` for relevant rules. + +Override files: `.codevira/neighborhoods.yaml` (re-label folder mapping); `.codevira/affordances.yaml` (project-specific affordances on top of `mcp_server/data/affordances.yaml`). + +### Consensus — cross-IDE awareness + +Tracks which IDE wrote each decision so contradictions across IDEs surface. + +- **`consensus_check()`** — run a scan (read-only) for cross-IDE conflicts since this IDE's last checkpoint. Materializes matches to `.codevira/pending_conflicts.jsonl`. +- **`consensus_status(top_k=3)`** — count + top-K pending conflicts (`get_session_context` also surfaces a panel). +- **`origin_of(decision_id)`** — provenance lookup (always available — provenance is M1). + +Phase C (opt-in handshake, default off) — gated by `memory.consensus.handshake_enabled` in `.codevira/config.yaml`: +- **`consensus_propose_supersession(target_decision_id, new_decision, reason)`** — open a proposal against a foreign IDE's `do_not_revert` decision. Same-IDE fast-path bypasses the handshake. +- **`consensus_resolve(proposal_id, action="approved"|"rejected"|"withdrawn", comment?)`** — record the response. +- 14-day timeout default; expired proposals can be force-finalized via `expired_unilateral=True` (with audit row). + +CLI: `codevira consensus check`. + +### Reflections — episodic abstraction + +`.codevira/reflections.jsonl` (committed). LLM-generated abstractions over recent decisions + sessions. + +- **`reflect(period_days=7, dry_run=True)`** — build the source context + render the prompt. v3.1.0 returns `sampling_supported: False` + `rendered_prompt` (the MCP sampling/createMessage RPC ships in v3.2). Use the CLI to commit an LLM response. +- **`get_reflections(top_k=5)`** — most recent reflections. +- **`list_reflections(since?, tags?, limit=50)`** — filtered list. + +CLI: `codevira reflect [--period 7d] [--from-file PATH] [--apply] [--yes]`. + +Sanitization pass strips api keys / Bearer tokens / passwords / AWS AKIA / long hex / long base64 from the source context before the LLM sees it. + ## Tool budget discipline Codevira is **token-efficient by design**: diff --git a/mcp_server/__init__.py b/mcp_server/__init__.py index 26cadb7..7be493a 100644 --- a/mcp_server/__init__.py +++ b/mcp_server/__init__.py @@ -14,4 +14,4 @@ from mcp_server.cli import main __all__ = ["main"] -__version__ = "3.0.0" +__version__ = "3.1.0" diff --git a/pyproject.toml b/pyproject.toml index 716fdf8..eb1948c 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "codevira" -version = "3.0.0" +version = "3.1.0" description = "Cross-IDE decision enforcement for AI coding agents. 1 MB per project, in your repo, no cloud, no vectors. Claude Code, Cursor, Windsurf, Antigravity, Codex all share the same in-repo memory; hooks block AI tool calls that violate prior decisions. MIT, local-first." readme = "README.md" license = { text = "MIT" } From e3b127b54d1910aaecb4c5bf4d8fb0233b6b8aa5 Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Fri, 29 May 2026 15:49:08 +0530 Subject: [PATCH 17/44] =?UTF-8?q?feat(graph):=20interactive=20viewer=20?= =?UTF-8?q?=E2=80=94=20pan/zoom/drag=20+=20hover=20focus=20+=20clutter=20c?= =?UTF-8?q?leanup?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The 'codevira graph' viewer was rendering correctly but the SVG was flat-static: no pan, no zoom, no drag, all labels on, no visual hierarchy. With more than a handful of decisions the view became unreadable. This rewrites the embedded JS to make the viewer properly interactive while leaving the Python rendering pipeline unchanged (template placeholders, XSS escape, and structural test expectations all preserved). # Interactivity - **Pan**: drag empty canvas to translate the viewport. - **Zoom**: mouse wheel zooms in/out, centered on the cursor. Min 0.2x, max 6x. - **Drag nodes**: click+drag a node to pin its position; the incident edges update in place without a full re-render. - **Hover focus**: hovering a node highlights it + its 1-hop neighbors with stroke white-up; everything else dims. - **Controls bar** (top-right): Fit, +, -, ↻ Layout buttons for explicit control. # Clutter cleanup - **Labels hidden by default**, shown only when (a) the node is hovered, (b) a filter term matches it, (c) the zoom is ≥ 1.4x, or (d) the new 'always show labels' checkbox is on. - **Node size by degree** so hub decisions are visually obvious rather than indistinguishable dots. - **Initial seeding by degree**: high-degree nodes seed near the center on inner rings; periphery falls to outer rings. The force layout then refines, but starts from a readable shape instead of a random ball. - **Fit-to-view on load + resize** so the graph stays usable when the window changes. - **Layout reset button** un-pins every node + re-seeds + re-runs the layout — recovery path when manual drags get out of hand. # Same tests still pass tests/test_cli_graph.py — all 9 tests pass: - Structural assertions (placeholders filled, DATA inlined, self-contained / no CDNs). - XSS escape (\u003c/script>). - cmd_graph exit codes + lineage rendering. Manual smoke: generated a viewer over an 8-decision seeded project; HTML is 19.7 KB, self-contained, contains all new wiring (#viewport, btnFit, attachDrag, focusNode, fitToView), no leftover @@ placeholders. # Note on the previously-reported 'pre-existing environmental # failures' (57 tests) After installing the project editable in a clean venv ('pip install -e .' inside a venv), 2339 tests pass and 0 fail. The failures were running pytest from system Python where pyyaml + tree-sitter + mcp live in user-site, and several tests sanitize HOME for sandbox-testing — which strips user-site discovery. Documented workflow: contributors should run the suite from a venv. No code change required for that. Co-Authored-By: Claude Opus 4.7 --- mcp_server/cli_graph.py | 268 +++++++++++++++++++++++++++++++++++----- 1 file changed, 240 insertions(+), 28 deletions(-) diff --git a/mcp_server/cli_graph.py b/mcp_server/cli_graph.py index bb651ed..a6808f1 100644 --- a/mcp_server/cli_graph.py +++ b/mcp_server/cli_graph.py @@ -183,15 +183,32 @@ def _build_graph( #detail .tag { display:inline-block; background:#23262f; border-radius:4px; padding:1px 6px; margin:2px 3px 0 0; font-size:11px; } #detail .txt { white-space:pre-wrap; margin-top:6px; } - #canvasWrap { flex:1; position:relative; } + #canvasWrap { flex:1; position:relative; overflow:hidden; } + /* Controls bar floats over the canvas, top-right */ + #controls { position:absolute; top:10px; right:10px; z-index:5; + display:flex; gap:6px; } + #controls button { background:#15171d; color:#cdd2dd; border:1px solid #2a2d35; + border-radius:6px; padding:6px 10px; font-size:11px; + cursor:pointer; } + #controls button:hover { background:#1d2029; } + #hint { position:absolute; bottom:10px; left:10px; right:10px; text-align:center; + color:#5a6072; font-size:11px; pointer-events:none; z-index:4; } svg { width:100%; height:100%; display:block; cursor:grab; } - .node circle { stroke:#0f1115; stroke-width:1.5px; cursor:pointer; } - .node text { fill:#cdd2dd; font-size:10px; pointer-events:none; } - .edge { stroke:#4a4f5c; stroke-width:1.2px; } + svg.panning { cursor:grabbing; } + .node circle { stroke:#0f1115; stroke-width:1.5px; cursor:pointer; + transition:stroke-width 0.1s; } + .node:hover circle, .node.focus circle { stroke:#fff; stroke-width:2.5px; } + .node text { fill:#cdd2dd; font-size:10px; pointer-events:none; + opacity:0; transition:opacity 0.1s; } + /* Labels visible only on hover, when filter-matched, or when zoomed in. */ + .show-labels .node text { opacity:0.85; } + .node.match text, .node:hover text, .node.focus text { opacity:1; } + .edge { stroke:#4a4f5c; stroke-width:1.2px; transition:opacity 0.1s; } .edge-supersedes { marker-end:url(#arrow); } .edge-touches { stroke:#3a3f4b; stroke-dasharray:3 3; } .edge-depends { stroke:#2f6f4f; } .dim { opacity:0.1; } + .edge.lit { stroke:#cdd2dd; stroke-width:1.8px; opacity:1; } @@ -200,6 +217,7 @@ def _build_graph(
@@GENERATED@@
+
protected @@ -210,28 +228,66 @@ def _build_graph(
- - - - - +
+ + + + +
+
drag empty space to pan · scroll to zoom · drag a node to pin · hover to focus
+ + + + + + + +
From db4518bc32982720351e4e29873c927faa485903 Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Sat, 30 May 2026 01:54:04 +0530 Subject: [PATCH 18/44] feat(graph): multi-lens interactive viewer + memory subsystem overlays MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The interactive viewer now renders the full v3.1.0 memory model — not just decisions + files, but skills (purple diamonds), reflections (cyan hexagons), supersedes/touches/depends/induced/covers edges, and origin IDE provenance — in one self-contained HTML file. # What's new - Lens dropdown: Type / Origin IDE / First tag / Age / Protection / Status - Layout dropdown: Force-directed / Radial-by-tag / Timeline-by-ts - Show panel: per-node-type and per-edge-kind filter checkboxes - Tokenized search: tag: ide: kind: protected: since: until: - Time scrubber with two thumbs + Play (animated window slide) - Mini-map (180x130, bottom-right) with draggable viewport rectangle - Right-click context menu: Isolate / Expand neighbors / Copy ID / Pin / Hide - Selection history: back/forward + Alt+Left/Right - Edge hover tooltip with kind + endpoint labels - ? help dialog listing every key + gesture - Hero stat banner (top-center, fades on first interaction) - URL hash state: lens / layout / search / time survive reload # Visual polish - CSS palette tokens (--bg-0, --c-decision, etc.) - Radial vignette + dot-grid canvas background - SVG drop shadow on every node; red glow halo on protected - Curved paths for touches / induced / covers; straight lines for supersedes / depends (with arrows) - Animated edge flow (CSS dashoffset, respects prefers-reduced-motion) - Type-specific glyphs inside shapes (lock / file / lightning / sparkle) when radius >= 8 - Sidebar brand strip + small-caps section headers + pill legend chips - Frosted-glass controls + focus rings on inputs/buttons # Backend - mcp_server/cli_graph.py: _build_graph extended with skills + reflections + ts/ide meta block; render_graph_html grows with_skills / with_reflections kwargs - mcp_server/cli.py: --no-skills / --no-reflections flags on codevira graph # Tests - 36 tests in tests/test_cli_graph.py (was 9): structural wiring, XSS escape for skill / reflection text, multi-scenario render (skills + reflections + supersession + multi-IDE), large synthetic dataset, embedded JS syntax check via node --check (skipped when node is unavailable) Co-Authored-By: Claude Opus 4.7 --- mcp_server/cli.py | 14 + mcp_server/cli_graph.py | 1991 +++++++++++++++++++++++++++++++++------ tests/test_cli_graph.py | 593 ++++++++++++ 3 files changed, 2325 insertions(+), 273 deletions(-) diff --git a/mcp_server/cli.py b/mcp_server/cli.py index 0838567..dae130e 100644 --- a/mcp_server/cli.py +++ b/mcp_server/cli.py @@ -1139,6 +1139,18 @@ def error(self, message): # type: ignore[override] action="store_false", help="Decisions-only view (omit the code-file overlay)", ) + graph_parser.add_argument( + "--no-skills", + dest="with_skills", + action="store_false", + help="Omit the skills overlay (procedural memory)", + ) + graph_parser.add_argument( + "--no-reflections", + dest="with_reflections", + action="store_false", + help="Omit the reflections overlay (LLM abstractions)", + ) graph_parser.add_argument( "--dry-run", action="store_true", @@ -1569,6 +1581,8 @@ def error(self, message): # type: ignore[override] out=getattr(args, "out", None), dry_run=getattr(args, "dry_run", False), with_files=getattr(args, "with_files", True), + with_skills=getattr(args, "with_skills", True), + with_reflections=getattr(args, "with_reflections", True), ) sys.exit(rc) elif args.command == "sync": diff --git a/mcp_server/cli_graph.py b/mcp_server/cli_graph.py index a6808f1..e2295a5 100644 --- a/mcp_server/cli_graph.py +++ b/mcp_server/cli_graph.py @@ -1,24 +1,24 @@ """ cli_graph.py — `codevira graph` : self-contained interactive memory viewer. -v3.0.0 (D000016): render the project's decision memory as a single -self-contained HTML file — zero runtime dependencies, no server, works -offline. Data is read through the canonical JSONL store -(``decisions_store.list_all`` — honors D000002) and inlined as JSON; the -page ships an inlined vanilla-JS force layout plus a client-side query -box and a details panel. - -Nodes are decisions; edges are the ``supersedes`` lineage -(old → replacement). Querying/filtering (text, tag, file_path, -protected-only) happens entirely client-side, so the artifact is a -portable snapshot you can open anywhere or attach to a review. - -Design rationale (see D000016): a self-contained HTML beats a local -Flask/FastAPI server (extra dep + running process) and pyvis (extra -deps) — it reuses the data layer that already exists, works offline, and -ships as one file. The interactive code-graph overlay -(``.codevira-cache/graph.sqlite``) is a deliberate follow-up; v1 covers -decision memory. +v3.1.x: full multi-lens interactive viewer over the project's memory. +Renders decisions, files, skills, and reflections as one graph; the +embedded JS lets you switch lenses (color-by), layouts, filters, and +time-window without leaving the page. Self-contained: no CDN, no +runtime deps, works offline. + +Nodes: + - decision — id, decision text, tags, file_path, origin.ide, ts + - file — id "file:", referenced by decisions + - skill — id "K…", procedure summary, triggers.tags, origin.ide + - reflection — id "R…", abstraction, tags, period, origin.ide + +Edges: + - supersedes (decision→decision; also skill→skill) + - touches (decision→file) + - depends (file→file, from code graph if available) + - induced (skill→decision, via shared session_ids) + - covers (reflection→decision, via source_decision_ids) """ from __future__ import annotations @@ -31,36 +31,44 @@ from typing import Any # Hard cap so a pathological store can never produce an unbounded O(n^2) -# layout in the browser (P5). Far above any realistic decision count. -_MAX_NODES = 2000 +# layout in the browser (P5). Far above any realistic memory size. +_MAX_NODES_PER_TYPE = 2000 def _load_decisions() -> list[dict[str, Any]]: - """Read every decision (including superseded, for lineage edges). - - Goes through the canonical JSONL store — never graph.db (D000002). - """ + """Read every decision (including superseded, for lineage edges).""" from mcp_server.storage import decisions_store result = decisions_store.list_all( - limit=_MAX_NODES, + limit=_MAX_NODES_PER_TYPE, include_superseded=True, full=True, ) return result.get("decisions", []) +def _load_skills() -> list[dict[str, Any]]: + """Read all skills (any status). Best-effort: return [] on any error.""" + try: + from mcp_server.storage import skills_store + + return skills_store.list_all(status=None, limit=_MAX_NODES_PER_TYPE) + except Exception: # noqa: BLE001 + return [] + + +def _load_reflections() -> list[dict[str, Any]]: + """Read recent reflections. Best-effort: return [] on any error.""" + try: + from mcp_server.storage import reflections_store + + return reflections_store.list_filtered(limit=_MAX_NODES_PER_TYPE) + except Exception: # noqa: BLE001 + return [] + + def _load_code_graph_edges(file_paths: set[str]) -> list[tuple[str, str]]: - """Best-effort file→file dependency edges from the code graph. - - Reads the tree-sitter code graph (``/graph/graph.db``) and - returns ``(src_file, tgt_file)`` pairs where BOTH endpoints are in - ``file_paths`` — so the overlay only links files that already carry - decisions, keeping it focused. Degrades to ``[]`` if the graph store - is missing or unreadable (P9: the viewer must still render from the - canonical decision data even when the rebuildable graph cache is - absent or its location has drifted). - """ + """Best-effort file→file dependency edges from the code graph.""" if not file_paths: return [] try: @@ -86,48 +94,76 @@ def _load_code_graph_edges(file_paths: set[str]) -> list[tuple[str, str]]: return sorted(out) finally: conn.close() - except Exception: # noqa: BLE001 — overlay is best-effort, never fatal + except Exception: # noqa: BLE001 return [] +def _origin_ide(record: dict[str, Any]) -> str: + """Pull ``origin.ide`` safely. Returns 'unknown' when absent.""" + o = record.get("origin") or {} + if isinstance(o, dict): + return str(o.get("ide") or "unknown") + return "unknown" + + def _build_graph( - decisions: list[dict[str, Any]], *, with_files: bool = True + decisions: list[dict[str, Any]], + *, + with_files: bool = True, + skills: list[dict[str, Any]] | None = None, + reflections: list[dict[str, Any]] | None = None, ) -> dict[str, Any]: - """Shape raw decision records into ``{nodes, edges}`` for the viewer. - - Decision edges encode supersession (retired → replacement). When - ``with_files`` is set, the graph also overlays code structure: a - ``file`` node per distinct ``file_path``, a ``touches`` edge from each - decision to the file it pertains to, and best-effort ``depends`` - edges between those files pulled from the code graph. Dangling - references are dropped defensively. - """ - ids = {str(d.get("id")) for d in decisions if d.get("id")} + """Shape raw memory records into ``{nodes, edges, meta}`` for the viewer.""" + skills = skills or [] + reflections = reflections or [] + + decision_ids = {str(d.get("id")) for d in decisions if d.get("id")} + skill_ids = {str(s.get("id")) for s in skills if s.get("id")} + nodes: list[dict[str, Any]] = [] edges: list[dict[str, str]] = [] file_set: set[str] = set() + session_to_decisions: dict[str, list[str]] = {} + for d in decisions: + sid = d.get("session_id") + did = str(d.get("id") or "") + if sid and did: + session_to_decisions.setdefault(str(sid), []).append(did) + + all_tags: set[str] = set() + all_ides: set[str] = set() + timestamps: list[str] = [] + for d in decisions: did = str(d.get("id") or "") if not did: continue text = (d.get("decision") or "").strip() fp = d.get("file_path") or "" + ts = d.get("ts") or "" + ide = _origin_ide(d) + tags = list(d.get("tags") or []) + all_tags.update(t.lower() for t in tags if t) + all_ides.add(ide) + if ts: + timestamps.append(ts) nodes.append( { "id": did, "type": "decision", "decision": text, "file_path": fp, - "tags": d.get("tags") or [], + "tags": tags, "do_not_revert": bool(d.get("do_not_revert", False)), "is_superseded": bool(d.get("is_superseded") or d.get("superseded_by")), - "ts": d.get("ts") or "", + "ts": ts, "session_id": d.get("session_id") or "", + "ide": ide, } ) sup_by = d.get("superseded_by") - if sup_by and str(sup_by) in ids: + if sup_by and str(sup_by) in decision_ids: edges.append({"source": did, "target": str(sup_by), "kind": "supersedes"}) if with_files and fp: file_set.add(fp) @@ -141,6 +177,11 @@ def _build_graph( "type": "file", "file_path": fp, "label": fp.rsplit("/", 1)[-1], + "tags": [], + "ts": "", + "ide": "", + "do_not_revert": False, + "is_superseded": False, } ) for sf, tf in _load_code_graph_edges(file_set): @@ -148,7 +189,88 @@ def _build_graph( {"source": f"file:{sf}", "target": f"file:{tf}", "kind": "depends"} ) - return {"nodes": nodes, "edges": edges} + for s in skills: + sid = str(s.get("id") or "") + if not sid: + continue + triggers = s.get("triggers") or {} + tags = list(triggers.get("tags") or []) + all_tags.update(t.lower() for t in tags if t) + ide = _origin_ide(s) + all_ides.add(ide) + ts = s.get("ts") or "" + if ts: + timestamps.append(ts) + nodes.append( + { + "id": sid, + "type": "skill", + "name": str(s.get("name") or ""), + "summary": str(s.get("summary") or ""), + "procedure": str(s.get("procedure") or ""), + "tags": tags, + "status": str(s.get("status") or "active"), + "source": str(s.get("source") or "explicit"), + "success_count": int(s.get("success_count") or 0), + "failure_count": int(s.get("failure_count") or 0), + "do_not_revert": bool(s.get("do_not_revert", False)), + "is_superseded": str(s.get("status") or "") == "superseded", + "ts": ts, + "ide": ide, + } + ) + sup_by = s.get("superseded_by") + if sup_by and str(sup_by) in skill_ids: + edges.append({"source": sid, "target": str(sup_by), "kind": "supersedes"}) + for src_sess in s.get("source_session_ids") or []: + for did in session_to_decisions.get(str(src_sess), []): + edges.append({"source": sid, "target": did, "kind": "induced"}) + + for r in reflections: + rid = str(r.get("id") or "") + if not rid: + continue + tags = list(r.get("tags") or []) + all_tags.update(t.lower() for t in tags if t) + ide = _origin_ide(r) + all_ides.add(ide) + ts = r.get("ts") or "" + if ts: + timestamps.append(ts) + nodes.append( + { + "id": rid, + "type": "reflection", + "abstraction": str(r.get("abstraction") or ""), + "tags": tags, + "period_start": str(r.get("period_start") or ""), + "period_end": str(r.get("period_end") or ""), + "confidence": float(r.get("confidence") or 0.0), + "model_used": str(r.get("model_used") or ""), + "do_not_revert": False, + "is_superseded": False, + "ts": ts, + "ide": ide, + } + ) + for did in r.get("source_decision_ids") or []: + if str(did) in decision_ids: + edges.append({"source": rid, "target": str(did), "kind": "covers"}) + + meta = { + "tags": sorted(all_tags), + "ides": sorted(all_ides), + "ts_min": min(timestamps) if timestamps else "", + "ts_max": max(timestamps) if timestamps else "", + "counts": { + "decisions": sum(1 for n in nodes if n["type"] == "decision"), + "files": sum(1 for n in nodes if n["type"] == "file"), + "skills": sum(1 for n in nodes if n["type"] == "skill"), + "reflections": sum(1 for n in nodes if n["type"] == "reflection"), + }, + } + + return {"nodes": nodes, "edges": edges, "meta": meta} # The HTML template uses ``@@PLACEHOLDER@@`` markers rather than an @@ -160,133 +282,559 @@ def _build_graph( @@TITLE@@
-

@@TITLE@@

+
+ + codevira + memory +
@@GENERATED@@
- - - -
-
- protected - active - superseded - file + +
+ + +
+
tokens: tag: ide: kind: protected: since: until:
+ +
+ Lens +
-
+
+ Layout + +
+ +
+ Show +
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+
+ +
+ Legend +
+
+ +
+ Details +
click a node to inspect (or press ? for keys)
+
+ +
+
- - - - + + + + + + + +
+
+
+ drag empty space · scroll to zoom · drag node to pin · + / f + Esc + L R ? +
+
+ + + + + + + + + + + + + + + + + + + + + +
+
+
+ + Time +
+
+
+ + +
+
-
drag empty space to pan · scroll to zoom · drag a node to pin · hover to focus
- - - - - - - -
+ + + +
+ + @@ -569,20 +1989,29 @@ def _build_graph( def render_graph_html( - decisions: list[dict[str, Any]], *, with_files: bool = True + decisions: list[dict[str, Any]], + *, + with_files: bool = True, + skills: list[dict[str, Any]] | None = None, + reflections: list[dict[str, Any]] | None = None, ) -> str: - """Render the self-contained viewer HTML for ``decisions``. + """Render the self-contained viewer HTML. ``with_files`` overlays code-file nodes (and best-effort file→file - code-dependency edges). Pure function (no I/O) so it is directly - unit-testable. + code-dependency edges). ``skills`` / ``reflections`` overlay + procedural + abstraction memory respectively (pass ``None`` to omit + each). Pure function (no I/O) so it is directly unit-testable. """ - graph = _build_graph(decisions, with_files=with_files) + graph = _build_graph( + decisions, + with_files=with_files, + skills=skills, + reflections=reflections, + ) generated = datetime.now(timezone.utc).strftime("%Y-%m-%d %H:%M UTC") - # Escape ``<`` as ``<`` in the inlined JSON so decision text - # containing a literal ```` (or `` + + +
Lens @@ -374,6 +465,7 @@ +
@@ -704,6 +796,13 @@

Keys & gestures

if (n.type === 'skill' && n.status === 'archived') return '#8a6a4a'; if (n.do_not_revert) return '#ff6b6b'; return '#7c8cff'; + case 'outcome': + // v3.1.x: color by classified outcome (kept/modified/reverted/unclassified). + if (n.type !== 'decision') return '#3a3f4b'; + if (n.outcome === 'kept') return '#3fb88a'; + if (n.outcome === 'modified') return '#f0883e'; + if (n.outcome === 'reverted') return '#ff6b6b'; + return '#6b7280'; // unclassified } return '#6b7280'; } @@ -930,6 +1029,12 @@

Keys & gestures

wrap.appendChild(legendItem('#ff6b6b', 'protected')); wrap.appendChild(legendItem('#6b7280', 'superseded')); wrap.appendChild(legendItem('#8a6a4a', 'archived')); + } else if (lens === 'outcome') { + const o = DATA.meta.outcomes || {}; + wrap.appendChild(legendItem('#3fb88a', `kept (${o.kept || 0})`)); + wrap.appendChild(legendItem('#f0883e', `modified (${o.modified || 0})`)); + wrap.appendChild(legendItem('#ff6b6b', `reverted (${o.reverted || 0})`)); + wrap.appendChild(legendItem('#6b7280', `unclassified (${o.unclassified || 0})`)); } } @@ -1018,15 +1123,50 @@

Keys & gestures

`
${esc(n.abstraction)}
` + neighborButtons(n.id); } else { + // Decision rich-detail (v3.1.x). + const outcomeBadge = n.outcome + ? `${esc(n.outcome)}` + : `unclassified`; + const altsBlock = (n.alternatives_considered || []).length + ? `
alternatives considered
` + + `
    ` + + n.alternatives_considered.map(a => `
  • ${esc(a)}
  • `).join('') + + `
` + : ''; + const reExamine = n.would_re_examine_if + ? `
would re-examine if
` + + `
${esc(n.would_re_examine_if)}
` + : ''; + const ctxBlock = n.context + ? `
context
` + + `
${esc(n.context)}
` + : ''; + // Lineage chain (only when decision is part of a supersedes chain). + const chain = (DATA.meta.chains && DATA.meta.chains[n.id]) || null; + const lineageBlock = chain + ? `
lineage (oldest → newest) — ` + + `trace
` + + `
` + + chain.map((cid, i) => + (i > 0 ? '' : '') + + `` + ).join('') + + `
` + : ''; body = - `

${esc(n.id)} ${n.do_not_revert?'🔒':''}

` + + `

${esc(n.id)} ${n.do_not_revert?'🔒':''} ${outcomeBadge}

` + `
file: ${esc(n.file_path||'—')}
` + `
when: ${esc((n.ts||'').slice(0,19))}` + ` · session: ${esc(n.session_id||'—')}
` + `
degree: ${n.degree}` + (n.is_superseded?` · superseded`:``) + `
` + `
${tags}${ide}
` + - `
${esc(n.decision)}
` + + `
decision
` + + `
${esc(n.decision)}
` + + ctxBlock + + altsBlock + + reExamine + + lineageBlock + neighborButtons(n.id); } d.innerHTML = body; @@ -1036,6 +1176,13 @@

Keys & gestures

if (nb) { selectNode(nb); focusNode(nb.id); centerOn(nb); } }); }); + // v3.1.x: lineage-trace action — pin the supersedes chain. + d.querySelectorAll('a[data-action="lineage-mode"]').forEach(a => { + a.addEventListener('click', (ev) => { + ev.preventDefault(); + enterLineageMode(a.dataset.id); + }); + }); } // ───────────────────────────────────────────────────────────────── @@ -1317,6 +1464,58 @@

Keys & gestures

} document.querySelector('#isoChip button').addEventListener('click', clearIsolation); +// ───────────────────────────────────────────────────────────────── +// v3.1.x — LINEAGE TRACE MODE +// ───────────────────────────────────────────────────────────────── +// Click "trace" on a decision's lineage block → enter lineage mode: +// the supersedes chain is highlighted, everything else dimmed. Esc +// (or click empty canvas) exits. + +let lineageActive = false; +function enterLineageMode(decisionId) { + const chain = (DATA.meta.chains && DATA.meta.chains[decisionId]) || null; + if (!chain || chain.length < 2) return; + const chainSet = new Set(chain); + svg.classList.add('lineage-mode'); + document.querySelectorAll('.node').forEach(g => { + g.classList.toggle('lineage', chainSet.has(g.dataset.id)); + }); + document.querySelectorAll('.edge').forEach(l => { + l.classList.toggle( + 'lineage', + l.dataset.kind === 'supersedes' + && chainSet.has(l.dataset.s) + && chainSet.has(l.dataset.t) + ); + }); + lineageActive = true; + // Fit-to-view of just the chain nodes. + const chainNodes = chain.map(id => byId[id]).filter(Boolean); + if (chainNodes.length) { + let mnX=Infinity, mnY=Infinity, mxX=-Infinity, mxY=-Infinity; + chainNodes.forEach(n => { + const r = nodeRadius(n); + mnX = Math.min(mnX, n.x - r); mnY = Math.min(mnY, n.y - r); + mxX = Math.max(mxX, n.x + r); mxY = Math.max(mxY, n.y + r); + }); + const rect = document.getElementById('svgWrap').getBoundingClientRect(); + const bw = Math.max(1, mxX-mnX), bh = Math.max(1, mxY-mnY); + const k = Math.min(6, Math.min((rect.width-160)/bw, (rect.height-160)/bh)); + t.k = Math.max(k, 0.5); + t.x = (rect.width - (mnX+mxX) * t.k) / 2; + t.y = (rect.height - (mnY+mxY) * t.k) / 2; + applyTransform(); + } +} +function exitLineageMode() { + if (!lineageActive) return; + svg.classList.remove('lineage-mode'); + document.querySelectorAll('.node.lineage, .edge.lineage').forEach(el => { + el.classList.remove('lineage'); + }); + lineageActive = false; +} + // ───────────────────────────────────────────────────────────────── // EDGE HOVER TOOLTIP // ───────────────────────────────────────────────────────────────── @@ -1500,11 +1699,252 @@

Keys & gestures

qb.classList.remove('active'); } + // v3.1.x: ranked search panel + Q&A intent detection. + renderRankedAndAsk(document.getElementById('q').value, matchIds); + updateTimeLabels(tb); renderMinimap(); writeHashState(); } +// ───────────────────────────────────────────────────────────────── +// v3.1.x — RANKED SEARCH + Q&A +// ───────────────────────────────────────────────────────────────── + +// Score a node against the user's free-text query. BM25-ish: token +// overlap on (decision text + name + summary + abstraction + tags + +// file_path + label) weighted by inverse-frequency-ish (longer matches +// score more), plus a recency bump (newer wins ties). +function _scoreForQuery(n, queryTokens) { + if (!queryTokens.length) return 0; + const hay = [ + n.id, n.decision, n.name, n.summary, n.procedure, n.abstraction, + n.context, n.would_re_examine_if, + (n.tags || []).join(' '), + (n.alternatives_considered || []).join(' '), + n.file_path, n.label, n.ide, + ].filter(Boolean).join(' ').toLowerCase(); + if (!hay) return 0; + let score = 0; + for (const tok of queryTokens) { + if (!tok) continue; + // Count occurrences. Cheap proxy for TF. + let pos = 0, hits = 0; + while ((pos = hay.indexOf(tok, pos)) !== -1) { hits++; pos += tok.length; } + if (hits === 0) continue; + // Diminishing returns: log(1+hits) * length boost. + score += Math.log(1 + hits) * Math.max(1, tok.length / 4); + } + if (score === 0) return 0; + // Tag exact-match bonus. + const tagSet = new Set((n.tags || []).map(t => t.toLowerCase())); + for (const tok of queryTokens) { + if (tagSet.has(tok)) score += 0.5; + } + // Recency bump: newer decisions slightly favored. (0..0.3) + if (n.tsMs) score += 0.3 * ((n.tsMs - TS_MIN) / TS_SPAN); + // do_not_revert is signal-rich: small bump. + if (n.do_not_revert) score += 0.2; + return score; +} + +function renderRankedAndAsk(rawQuery, _matchIds) { + const panel = document.getElementById('rankedResults'); + const askEl = document.getElementById('askAnswer'); + const queryRaw = (rawQuery || '').trim(); + if (queryRaw.length < 3) { + panel.style.display = 'none'; + askEl.style.display = 'none'; + return; + } + + // Q&A intent detection — runs over the raw query so we can detect + // natural-language patterns even when token filters are present. + const ask = _detectIntent(queryRaw); + if (ask) { + askEl.innerHTML = + `
${esc(ask.intent)}
` + + `
${ask.body}
`; + askEl.style.display = 'block'; + // Wire result-row jumps inside the answer body. + askEl.querySelectorAll('code[data-jump]').forEach(c => { + c.addEventListener('click', () => { + const nb = byId[c.dataset.jump]; + if (nb) { selectNode(nb); centerOn(nb); } + }); + }); + } else { + askEl.style.display = 'none'; + } + + // Ranked search — runs the FREE-TEXT portion of the query (drop tokens + // like tag:/ide:/kind: so they don't pollute scoring). + const freeText = queryRaw.toLowerCase() + .split(/\s+/) + .filter(t => t && !/^(tag|ide|kind|protected|since|until):/.test(t)); + if (freeText.length === 0) { + panel.style.display = 'none'; + return; + } + const scored = []; + for (const n of DATA.nodes) { + if (n._hidden) continue; // respect type/edge/time filters + const s = _scoreForQuery(n, freeText); + if (s > 0) scored.push({ n, s }); + } + scored.sort((a, b) => b.s - a.s); + const topK = scored.slice(0, 10); + if (!topK.length) { + panel.innerHTML = `
No matches for "${esc(queryRaw)}"
`; + panel.style.display = 'block'; + return; + } + const rows = topK.map(({ n, s }) => { + const badges = []; + if (n.do_not_revert) badges.push('🔒'); + if (n.outcome) badges.push(`${esc(n.outcome)}`); + if (n.is_superseded) badges.push('superseded'); + const label = n.type === 'file' ? esc(n.label || n.id) + : n.type === 'skill' ? esc(n.name || n.id) + : esc(n.id); + const snippet = (n.decision || n.summary || n.abstraction || n.label || '').slice(0, 110); + return ( + `
` + + `${label}` + + `${esc(snippet)}` + + `${badges.join('')}` + + `${s.toFixed(2)}` + + `
` + ); + }).join(''); + panel.innerHTML = + `
${topK.length} ranked result(s) — click to inspect
` + rows; + panel.style.display = 'block'; + panel.querySelectorAll('.r-row').forEach(row => { + row.addEventListener('click', () => { + const nb = byId[row.dataset.jump]; + if (nb) { selectNode(nb); centerOn(nb); } + }); + }); +} + +// Q&A intent patterns. Each returns {intent, body} or null. +function _detectIntent(query) { + const q = query.toLowerCase().trim(); + // "what did we decide about X" / "what about X" + let m = q.match(/^(?:what (?:did we |do we |have we )?(?:decided?|decide(?:d)?) (?:about|on|for) |what about |decisions? (?:about|on|for) )(.+?)\??$/); + if (m) return _answerAbout(m[1].trim(), 'WHAT WE DECIDED ABOUT ' + m[1].trim().toUpperCase()); + + // "why did we pick X" / "why X" — surfaces alternatives_considered + m = q.match(/^(?:why (?:did we )?(?:pick|choose|use|go with|adopt)|why )(.+?)\??$/); + if (m) return _answerWhy(m[1].trim()); + + // "what got reverted" — show outcome=reverted decisions + if (/^what(?:'s|s| was| got| has been)?(?: been)? reverted\??$/.test(q) + || /^(?:show me )?reverted decisions?\??$/.test(q)) { + return _answerOutcome('reverted'); + } + // "what's protected" / "what's locked" + if (/^what(?:'s|s| is)?(?: been)? (?:protected|locked|do_not_revert|do-not-revert)\??$/.test(q) + || /^(?:show me )?(?:protected|locked) decisions?\??$/.test(q)) { + return _answerProtected(); + } + // "what worked" / "what's been kept" + if (/^what (?:worked|was kept|got kept|stuck)\??$/.test(q)) { + return _answerOutcome('kept'); + } + return null; +} + +function _answerAbout(topic, intentLabel) { + const tokens = topic.toLowerCase().split(/\s+/).filter(t => t.length >= 3); + if (!tokens.length) return null; + const scored = DATA.nodes + .filter(n => n.type === 'decision' && !n._hidden) + .map(n => ({ n, s: _scoreForQuery(n, tokens) })) + .filter(x => x.s > 0) + .sort((a, b) => b.s - a.s) + .slice(0, 3); + if (!scored.length) { + return { intent: intentLabel, body: `No decisions match "${esc(topic)}". Try recording one via record_decision.` }; + } + const top = scored[0].n; + let body = `Top match: ${esc(top.id)} — ${esc((top.decision || '').slice(0, 200))}`; + if (top.outcome) body += `
Outcome: ${esc(top.outcome)}`; + if ((top.alternatives_considered || []).length) { + body += '
Alternatives considered:
    ' + + top.alternatives_considered.map(a => `
  • ${esc(a)}
  • `).join('') + '
'; + } + if (top.would_re_examine_if) { + body += `
Would re-examine if: ${esc(top.would_re_examine_if)}`; + } + if (scored.length > 1) { + body += '

Other matches:
    ' + + scored.slice(1).map(({n}) => + `
  • ${esc(n.id)} ${esc((n.decision || '').slice(0, 80))}
  • ` + ).join('') + '
'; + } + return { intent: intentLabel, body }; +} + +function _answerWhy(topic) { + const tokens = topic.toLowerCase().split(/\s+/).filter(t => t.length >= 3); + const scored = DATA.nodes + .filter(n => n.type === 'decision' && !n._hidden) + .map(n => ({ n, s: _scoreForQuery(n, tokens) })) + .filter(x => x.s > 0) + .sort((a, b) => b.s - a.s) + .slice(0, 1); + if (!scored.length) { + return { intent: 'WHY ' + topic.toUpperCase(), + body: `No matching decision found for "${esc(topic)}".` }; + } + const top = scored[0].n; + const alts = top.alternatives_considered || []; + let body = `${esc(top.id)}: ${esc((top.decision || '').slice(0, 180))}`; + if (top.context) body += `
Context: ${esc(top.context.slice(0, 200))}`; + if (alts.length) { + body += '
Why this won (rejected alternatives):
    ' + + alts.map(a => `
  • ${esc(a)}
  • `).join('') + '
'; + } else { + body += '
(No alternatives_considered recorded. record_decision now accepts this field — capture losers next time.)'; + } + if (top.would_re_examine_if) { + body += `
Would re-examine if: ${esc(top.would_re_examine_if)}`; + } + return { intent: 'WHY ' + topic.toUpperCase(), body }; +} + +function _answerOutcome(kind) { + const matches = DATA.nodes + .filter(n => n.type === 'decision' && n.outcome === kind && !n._hidden) + .sort((a, b) => (b.tsMs || 0) - (a.tsMs || 0)) + .slice(0, 8); + const counts = DATA.meta.outcomes || {}; + let body = `${counts[kind] || 0} decision(s) with outcome=${esc(kind)}` + + ` (of ${(counts.kept||0)+(counts.modified||0)+(counts.reverted||0)+(counts.unclassified||0)} total).`; + if (matches.length) { + body += '
    ' + matches.map(n => + `
  • ${esc(n.id)} ${esc((n.decision || '').slice(0, 80))}
  • ` + ).join('') + '
'; + } + return { intent: kind.toUpperCase() + ' OUTCOMES', body }; +} + +function _answerProtected() { + const matches = DATA.nodes + .filter(n => n.type === 'decision' && n.do_not_revert && !n._hidden) + .sort((a, b) => (b.tsMs || 0) - (a.tsMs || 0)) + .slice(0, 10); + let body = `${matches.length} protected (do_not_revert) decision(s) visible. Click to inspect.`; + if (matches.length) { + body += '
    ' + matches.map(n => + `
  • ${esc(n.id)} ${esc((n.decision || '').slice(0, 80))}
  • ` + ).join('') + '
'; + } + return { intent: 'PROTECTED DECISIONS', body }; +} + function updateTimeLabels(tb) { const f = document.getElementById('tFrom'), to = document.getElementById('tTo'); const lbl = document.getElementById('timeWindowLabel'); @@ -1633,11 +2073,12 @@

Keys & gestures

document.getElementById('timeHi').addEventListener('input', applyVisibility); document.getElementById('playBtn').addEventListener('click', togglePlay); -// Click empty canvas → clear selection. +// Click empty canvas → clear selection (and exit lineage mode). svg.addEventListener('click', (ev) => { if (ev.target.closest('.node') || ev.target.closest('.edge-hit')) return; selectedId = null; document.querySelectorAll('.node').forEach(g => g.classList.remove('selected')); + exitLineageMode(); }); // Keyboard navigation. @@ -1659,6 +2100,7 @@

Keys & gestures

if (ev.key === 'Escape') { closeHelp(); closeCtx(); if (document.getElementById('helpDlg').style.display === 'block') return; + if (lineageActive) { exitLineageMode(); return; } if (isolateSet) { clearIsolation(); return; } document.getElementById('q').value = ''; selectedId = null; diff --git a/tests/test_cli_graph.py b/tests/test_cli_graph.py index 03489a1..b1827c4 100644 --- a/tests/test_cli_graph.py +++ b/tests/test_cli_graph.py @@ -179,6 +179,69 @@ def test_reflection_node_and_covers_edge(self): assert not any(e["target"] == "D999" for e in covers) assert g["meta"]["counts"]["reflections"] == 1 + def test_decision_surfaces_outcome_and_counter_fields(self): + """v3.1.x viewer overhaul: outcome, alternatives_considered, + would_re_examine_if, context must round-trip onto the node.""" + decisions = [ + { + "id": "D1", + "decision": "use bcrypt", + "outcome": "kept", + "alternatives_considered": ["argon2", "scrypt"], + "would_re_examine_if": "if argon2 lands in stdlib", + "context": "hashed passwords, no clear winner", + } + ] + g = _build_graph(decisions, with_files=False) + n = g["nodes"][0] + assert n["outcome"] == "kept" + assert n["alternatives_considered"] == ["argon2", "scrypt"] + assert n["would_re_examine_if"] == "if argon2 lands in stdlib" + assert n["context"] == "hashed passwords, no clear winner" + + def test_meta_outcomes_distribution(self): + decisions = [ + {"id": "D1", "decision": "x", "outcome": "kept"}, + {"id": "D2", "decision": "y", "outcome": "modified"}, + {"id": "D3", "decision": "z", "outcome": "reverted"}, + {"id": "D4", "decision": "w"}, # unclassified + ] + g = _build_graph(decisions, with_files=False) + assert g["meta"]["outcomes"] == { + "kept": 1, + "modified": 1, + "reverted": 1, + "unclassified": 1, + } + + def test_meta_chains_precomputes_supersedes_lineage(self): + """For every decision in a supersedes chain, meta.chains[id] + is the full ordered list oldest → newest.""" + decisions = [ + { + "id": "D1", + "decision": "v1", + "superseded_by": "D2", + "is_superseded": True, + }, + { + "id": "D2", + "decision": "v2", + "supersedes": "D1", + "superseded_by": "D3", + "is_superseded": True, + }, + {"id": "D3", "decision": "v3", "supersedes": "D2"}, + {"id": "D9", "decision": "singleton"}, + ] + g = _build_graph(decisions, with_files=False) + chains = g["meta"]["chains"] + assert chains["D1"] == ["D1", "D2", "D3"] + assert chains["D2"] == ["D1", "D2", "D3"] + assert chains["D3"] == ["D1", "D2", "D3"] + # Singleton has no chain. + assert "D9" not in chains + def test_skill_supersedes_chain(self): skills = [ { @@ -330,6 +393,40 @@ def test_template_wires_v2_enhancements(self): assert "filter:url(#nodeShadow)" in h assert "filter:url(#nodeGlow)" in h + def test_template_wires_v3_1x_search_qa_lineage(self): + """v3.1.x viewer overhaul: ranked search panel + Q&A + outcome + lens + lineage trace mode must all be wired into the template.""" + h = render_graph_html([{"id": "D1", "decision": "x", "outcome": "kept"}]) + # New panel containers + assert 'id="rankedResults"' in h + assert 'id="askAnswer"' in h + # New lens option + assert 'value="outcome"' in h + # New JS landmarks + for sym in ( + "renderRankedAndAsk", + "_scoreForQuery", + "_detectIntent", + "_answerAbout", + "_answerWhy", + "_answerOutcome", + "_answerProtected", + "enterLineageMode", + "exitLineageMode", + "lineage-mode", + ): + assert sym in h, f"missing JS symbol {sym}" + # Lineage-mode CSS + assert "svg.lineage-mode" in h + # Rich-detail field classes + for cls in ( + ".alts", + ".re-examine", + ".outcome-badge", + ".chain", + ): + assert cls in h, f"missing CSS class {cls}" + def test_protected_node_gets_protected_class_in_render(self): """Protected (do_not_revert) decisions must be marked so the glow filter applies. We verify the JS classList toggle is wired.""" From 35ae6d1f62e198155521d90d366b00b3f4a10d73 Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Sat, 30 May 2026 19:09:59 +0530 Subject: [PATCH 28/44] =?UTF-8?q?fix(graph):=20paranoia=20pass=20=E2=80=94?= =?UTF-8?q?=20debounce=20search,=20clarify=20outcome=20legend,=20lineage-m?= =?UTF-8?q?ode=20focus=20guard?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Defensive sweep after shipping the v3.1.x viewer overhaul (aedc2ae). Three issues a real user would have hit: 1. **Search re-scored on every keystroke (perf).** renderRankedAndAsk walks all DATA.nodes per input event; at the 2000-node cap that's measurable lag while typing. Added 120ms trailing-edge debounce on the search input. Type-then-look still feels instant; bursty typing coalesces. 2. **Outcome lens leaves files/skills/reflections gray** because they have no `outcome` concept. The legend showed 'unclassified (N)' alongside the gray swatch — easy to misread non-decision nodes as "unclassified decisions". Added a 'decisions only' italic note to the legend. 3. **Lineage-trace mode + hover focus competed.** Hovering a node inside lineage mode would re-apply focus dimming on top of the lineage chain emphasis, producing flicker. focusNode now early- returns when lineageActive is true; the only way to use hover- focus is to Esc out of lineage mode first. Tests: 40/40 graph tests still pass. JS syntax-check clean (243 KB). Co-Authored-By: Claude Opus 4.7 --- mcp_server/graph/template.html | 22 +++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/mcp_server/graph/template.html b/mcp_server/graph/template.html index 19ea123..c67241c 100644 --- a/mcp_server/graph/template.html +++ b/mcp_server/graph/template.html @@ -1035,6 +1035,14 @@

Keys & gestures

wrap.appendChild(legendItem('#f0883e', `modified (${o.modified || 0})`)); wrap.appendChild(legendItem('#ff6b6b', `reverted (${o.reverted || 0})`)); wrap.appendChild(legendItem('#6b7280', `unclassified (${o.unclassified || 0})`)); + // Disambiguate the dim gray on files/skills/reflections — those are + // NOT 'unclassified', they just don't have an outcome concept. + const note = document.createElement('span'); + note.textContent = 'decisions only'; + note.style.color = 'var(--text-mute)'; + note.style.fontStyle = 'italic'; + note.style.fontSize = '10px'; + wrap.appendChild(note); } } @@ -1191,6 +1199,10 @@

Keys & gestures

let focusedId = null; function focusNode(id) { + // v3.1.x: hover-focus is suppressed inside lineage-trace mode so the + // lineage emphasis doesn't fight the focus dimming. Exiting lineage + // mode is the only way to use hover-focus again. + if (lineageActive) return; focusedId = id; if (id === null) { document.querySelectorAll('.node').forEach(g => g.classList.remove('focus')); @@ -2064,7 +2076,15 @@

Keys & gestures

DATA.nodes.forEach(n => { n.pinned = false; }); runLayout(); render(); fitToView(); }); -document.getElementById('q').addEventListener('input', applyVisibility); +// v3.1.x: debounce search input. The ranked-score pass is O(N * tokens) +// per keystroke; at the 2000-node cap that's perceptibly laggy without +// debouncing. 120ms trailing-edge is the sweet spot — fast enough that +// type-then-look feels instant, slow enough to coalesce typing bursts. +let _qDebounce = null; +document.getElementById('q').addEventListener('input', () => { + if (_qDebounce) clearTimeout(_qDebounce); + _qDebounce = setTimeout(applyVisibility, 120); +}); document.getElementById('protOnly').addEventListener('change', applyVisibility); document.getElementById('alwaysLabels').addEventListener('change', applyTransform); document.querySelectorAll('.ftype, .fkind').forEach(c => From aac00fd322850f76b7a9064e9b07c1672ef724ac Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Sat, 30 May 2026 19:10:20 +0530 Subject: [PATCH 29/44] docs: sync AGENTS.md decision-tail (290 -> 310) post viewer-overhaul Co-Authored-By: Claude Opus 4.7 --- AGENTS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AGENTS.md b/AGENTS.md index e7f6f3e..5ba8ac6 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -33,7 +33,7 @@ - **D00001G** v3.0.x storage prereq IMPLEMENTATION COMPLETE on branch release/3.0.1 (commits 6253940 + c1352d7). Patches 1+2+3 done: … · _memory, prereq, storage, v3.0.1_ - **D00001H** M1 Phase A origin tagging IMPLEMENTATION COMPLETE on release/3.0.1 (commits 618710a storage + ff06b3d ide_inject). orig… · _consensus, m1, memory, origin, v3.1.0_ -_+290 more decision(s) — full log in `.codevira/decisions.jsonl`._ +_+310 more decision(s) — full log in `.codevira/decisions.jsonl`._ For the full decision log + outcomes + reverts, see `.codevira/decisions.jsonl` or run `codevira list-decisions`. From da8c0e518b37f964eeea21bee6a8d7c06c7b248c Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Sat, 30 May 2026 21:41:49 +0530 Subject: [PATCH 30/44] =?UTF-8?q?revert(engine):=20=5FDEFAULT=5FMIN=5FSCOR?= =?UTF-8?q?E=200.25=20->=200.10=20=E2=80=94=20broke=20cross-tool=20wedge?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # The bug Bumping _DEFAULT_MIN_SCORE from 0.10 to 0.25 in 6d2a6d6 broke test_decision_recorded_in_tool_a_visible_in_tool_b_via_inject and test_four_tools_in_sequence_see_identical_decision in tests/e2e/test_cross_tool_universality.py. The cross-tool wedge — codevira's whole reason to exist — silently stopped propagating single-FTS-match decisions to other IDEs. # How it slipped past the gate D000010 requires `make test-e2e` BEFORE any engine-policy change. The procedural gate ran (39 passed). BUT the gate was structurally incomplete: it only invoked test_first_contact.py + test_product_invariants.py — it did NOT include test_cross_tool_universality.py, which is exactly where the single-FTS-match wedge regression lives. So the lock fired (good), I ran the gate (good), the gate said pass (misleading), and the regression shipped past three commits before the full `pytest tests/` (no --ignore) caught it during a final paranoia pass. Trust-loss anti-pattern. # Fix 1. Restore _DEFAULT_MIN_SCORE = 0.10. The threshold was load-bearing for the wedge contract; the 0.25 noise-reduction was a wash if it kills the core feature. 2. Widen `make test-e2e` to include test_cross_tool_universality.py. Future engine-policy changes will get caught at the right gate. # What the original bump was trying to fix Auto-surfaced prior decisions can feel noisy (D00005N meta-review called this out). The right approach is NOT lowering the threshold; it's raising per-source weights to compensate, OR adding a recency penalty for stale tags, OR moving noise-reduction to the inject layer rather than the rank layer. All deferred to a separate investigation with the proper regression coverage in place. # Verification - Full project suite (NOTHING ignored): 2538 passed, 28 skipped - Widened `make test-e2e`: 43 passing (was 39), 9 skipped Co-Authored-By: Claude Opus 4.7 --- Makefile | 2 +- .../engine/policies/relevance_inject.py | 22 ++++++++++--------- 2 files changed, 13 insertions(+), 11 deletions(-) diff --git a/Makefile b/Makefile index 668ae01..e98b306 100644 --- a/Makefile +++ b/Makefile @@ -69,7 +69,7 @@ test-unit: $(PYTHON) -m pytest tests/ -q --ignore=tests/e2e --ignore=tests/integration test-e2e: - $(PYTHON) -m pytest tests/e2e/test_first_contact.py tests/e2e/test_product_invariants.py -v + $(PYTHON) -m pytest tests/e2e/test_first_contact.py tests/e2e/test_product_invariants.py tests/e2e/test_cross_tool_universality.py -v # v2.1.2 hardening — integration suite (slower; runs in gauntlet): # MCP round-trip, help-text linter, sandboxed-parent. Skipped from diff --git a/mcp_server/engine/policies/relevance_inject.py b/mcp_server/engine/policies/relevance_inject.py index 11e6037..d09fd8a 100644 --- a/mcp_server/engine/policies/relevance_inject.py +++ b/mcp_server/engine/policies/relevance_inject.py @@ -66,16 +66,18 @@ _DEFAULT_MODE = "inject" _DEFAULT_MAX_DECISIONS = 3 _DEFAULT_MAX_TOKENS = 600 -# v3.1.x: raised from 0.10 → 0.25. Per-component weights are -# TAG=0.4, FILE=0.4, FTS=0.2; a single tag match × default outcome -# weight (0.5) = 0.20, which used to clear the old 0.10 threshold. -# That meant any decision tagged with a common token (e.g. "engine", -# "policy") surfaced on every prompt that mentioned the token, even -# tangentially. 0.25 requires either (a) two source matches OR -# (b) a single source match with a strong outcome weight (≥0.7). -# Override via .codevira/config.yaml: memory.relevance_min_score. -# Locked by D000010 (procedural: must run make test-e2e BEFORE commit). -_DEFAULT_MIN_SCORE = 0.25 +# REVERTED to 0.10 in 2026-05-30 after the bump to 0.25 broke the +# cross-tool universality wedge (test_decision_recorded_in_tool_a_ +# visible_in_tool_b_via_inject). With weights TAG=0.4, FILE=0.4, +# FTS=0.2 and the default outcome weight 0.5, a single-FTS-match +# decision (no tags, no file overlap) scores exactly 0.10 — the +# 0.25 threshold would suppress it entirely. The wedge is load- +# bearing; suppress noise some other way (e.g. raise per-source +# weights, or add a recency penalty for stale tags). +# Locked by D000010 (procedural: must run make test-e2e BEFORE +# commit). NOTE: the test-e2e target was missing +# tests/e2e/test_cross_tool_universality.py — fixed in same commit. +_DEFAULT_MIN_SCORE = 0.10 _MIN_PROMPT_CHARS = 10 # ignore tiny prompts (e.g. "ok", "thanks") _MODES = ("off", "inject") From 50a3027eba79d43f524ba5dbd4a425469b58ce9f Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Sat, 30 May 2026 21:57:33 +0530 Subject: [PATCH 31/44] docs: sync AGENTS.md after threshold revert + e2e gate widening --- AGENTS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AGENTS.md b/AGENTS.md index 5ba8ac6..41d1a32 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -33,7 +33,7 @@ - **D00001G** v3.0.x storage prereq IMPLEMENTATION COMPLETE on branch release/3.0.1 (commits 6253940 + c1352d7). Patches 1+2+3 done: … · _memory, prereq, storage, v3.0.1_ - **D00001H** M1 Phase A origin tagging IMPLEMENTATION COMPLETE on release/3.0.1 (commits 618710a storage + ff06b3d ide_inject). orig… · _consensus, m1, memory, origin, v3.1.0_ -_+310 more decision(s) — full log in `.codevira/decisions.jsonl`._ +_+370 more decision(s) — full log in `.codevira/decisions.jsonl`._ For the full decision log + outcomes + reverts, see `.codevira/decisions.jsonl` or run `codevira list-decisions`. From 9af20bc193bb8eff1cc95041f0b3d096a81c6761 Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Sat, 30 May 2026 22:17:09 +0530 Subject: [PATCH 32/44] =?UTF-8?q?feat(release):=20G3=20real-IDE=20smoke=20?= =?UTF-8?q?=E2=80=94=20implement=20the=20last=20stubbed=20gate?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit scripts/check_real_ide_smoke.sh was a stub since v2.0, recorded as "skipped" in every evidence file. Now produces a real true/false: # What G3 checks 1. codevira binary is on PATH (what IDE configs assume). 2. For each detected IDE config file (Claude Code / Claude Desktop / Cursor / Windsurf / Antigravity per-app + shared): - Parses JSON; "empty file" treated as not-configured (warning), "malformed" treated as hard fail. - Verifies codevira (or codevira-) registered. - Reports env.CODEVIRA_IDE state — pre-v3.1.0 configs show as "missing" with a guidance message to re-run setup after upgrade. 3. Spawns a codevira MCP stdio server against a fresh tmp project, runs the initialize + tools/list handshake: - initialize: 5s budget (allows tokenizer warm-load). - tools/list: 1s HARD (Claude Desktop disconnect class). - tool count: >=20. # Exit codes 0 — every detected IDE check passes + handshake fast 1 — at least one hard failure (release blocked) 2 — no IDE configs found (G3 skipped — no fault) # Verified on this machine ✓ 4 IDE configs detected (claude_code, claude_desktop, antigravity_b, antigravity_a-empty) ✓ MCP initialize → 526ms ✓ tools/list → 2ms, 24 tools ✓ G3 exit 0 # Evidence file now records G3 = true (was "skipped") The pre-existing antigravity_a empty config + pre-v3.1.0 CODEVIRA_IDE-missing entries surface as warnings — they are real state but not v3.1.0 release blockers. Users will re-inject after pipx upgrade and the warnings clear. # How this surfaces real bugs in the future The handshake test catches the "Claude Desktop disconnects after 80ms" class — if any future change makes tools/list slow, this gate fails before publish. Co-Authored-By: Claude Opus 4.7 --- scripts/check_real_ide_smoke.sh | 241 ++++++++++++++++++++++++++------ 1 file changed, 202 insertions(+), 39 deletions(-) diff --git a/scripts/check_real_ide_smoke.sh b/scripts/check_real_ide_smoke.sh index de2d2a5..9590d7d 100755 --- a/scripts/check_real_ide_smoke.sh +++ b/scripts/check_real_ide_smoke.sh @@ -2,48 +2,211 @@ # # check_real_ide_smoke.sh — G3 of the release gauntlet. # -# Verifies codevira appears connected in real running IDEs (Claude Code, -# Claude Desktop, Cursor, Windsurf, Antigravity). Currently a STUB. -# Filling this in is a v2.1 reliability item — until then it exits 1 -# so the gauntlet records G3 as "skipped" rather than "passed." -# -# What this script SHOULD do (when fully implemented): -# -# 1. For each detected IDE config (~/.claude.json, ~/Library/Application -# Support/Claude/claude_desktop_config.json, ~/.cursor/mcp.json, etc.): -# - Verify codevira is registered. -# - Verify the registered command path actually exists. -# -# 2. Spawn an MCP stdio server (`codevira --project-dir `): -# - Send an `initialize` request. -# - Send `tools/list` and measure time-to-response. -# - Assert response time < 1 second. -# - Assert tools count > 20. -# - Clean shutdown. -# -# 3. If a launchd daemon is configured (v2.2 multi-project HTTPS): -# - Verify the daemon is reachable at https://localhost:8443/mcp. -# - Send the same initialize + tools/list and assert <50ms. +# Verifies codevira appears connected in real IDE configs (Claude Code, +# Claude Desktop, Cursor, Windsurf, Antigravity) AND that an MCP stdio +# server boots + responds to tools/list in <1s (the Claude Desktop +# disconnect timeout). # # Exit codes: -# 0 — all IDE smoke tests pass. -# 1 — at least one IDE smoke test failed. -# 2 — stub state (current behavior). +# 0 — every detected IDE is configured AND MCP handshake is fast. +# 1 — at least one check failed (release blocked). +# 2 — no IDE detected on this machine (G3 skipped — no fault). # -# When this script is filled in, the gauntlet's G3 step will produce -# a real true/false in the evidence file instead of "skipped." +# Implemented 2026-05-30 — was a stub since v2.0. Closes the last +# permanently-skipped gauntlet gate. set -uo pipefail -echo "G3 stub: scripts/check_real_ide_smoke.sh is not yet implemented." -echo "" -echo "What this needs to test (v2.1 backlog):" -echo " - codevira registered in each detected IDE config" -echo " - MCP stdio handshake completes in <1s (Claude Desktop timeout)" -echo " - tools/list returns >20 tools" -echo " - No HNSW segment writer corruption on the project's Chroma store" -echo " - codevira binary on PATH resolves to current pipx install" -echo "" -echo "Until this script is filled in, the gauntlet records G3 as 'skipped'." -echo "v2.1 launch gate: G3 must produce a real result before v2.1.0 ships." -exit 2 +# ─── locate codevira (must be on PATH for IDE-spawned MCP servers) ───── +if command -v codevira >/dev/null 2>&1; then + CODEVIRA="$(command -v codevira)" +elif [ -x "${HOME}/.local/bin/codevira" ]; then + CODEVIRA="${HOME}/.local/bin/codevira" +else + echo "✗ codevira binary not on PATH — IDE configs that hard-code 'codevira'" + echo " will fail to spawn an MCP server. Install with pipx install codevira." + exit 1 +fi +echo "✓ codevira on PATH: $CODEVIRA" +"$CODEVIRA" --version | sed 's/^/ /' +echo + +# ─── per-IDE config paths (macOS + Linux) ────────────────────────────── +declare -a IDE_NAMES +declare -a IDE_CONFIGS +IDE_NAMES=() +IDE_CONFIGS=() +add_ide() { IDE_NAMES+=("$1"); IDE_CONFIGS+=("$2"); } + +add_ide "claude_code" "${HOME}/.claude.json" +add_ide "claude_desktop" "${HOME}/Library/Application Support/Claude/claude_desktop_config.json" +add_ide "cursor" "${HOME}/.cursor/mcp.json" +add_ide "windsurf" "${HOME}/.codeium/windsurf/mcp_config.json" +add_ide "antigravity_a" "${HOME}/.gemini/config/mcp_config.json" +add_ide "antigravity_b" "${HOME}/.gemini/antigravity/mcp_config.json" + +# Linux fallback. +if [ "$(uname)" = "Linux" ]; then + add_ide "claude_desktop_linux" "${HOME}/.config/Claude/claude_desktop_config.json" +fi + +# ─── check 1: per-IDE codevira registration ──────────────────────────── +DETECTED=0 +REG_FAILED=0 +for i in "${!IDE_NAMES[@]}"; do + name="${IDE_NAMES[$i]}" + cfg="${IDE_CONFIGS[$i]}" + [ -f "$cfg" ] || continue + DETECTED=$((DETECTED + 1)) + + # Returns 0 on registered + parseable, 1 on registered with concerns + # (e.g. missing CODEVIRA_IDE env on a pre-v3.1.0 install), 2 on + # broken config (parse fail, missing entry). Only 2 is a hard fail + # for the gauntlet — 1 is a "should reinject after upgrade" warning. + result=$(python3 - "$cfg" "$name" <<'EOF' +import json, sys, os +cfg_path, ide_name = sys.argv[1], sys.argv[2] +# Empty file → "not configured", which is a soft no-op (some IDEs +# create the file at first launch with zero content). Not a release +# blocker. +if os.path.getsize(cfg_path) == 0: + print("EMPTY_FILE_NOT_CONFIGURED"); sys.exit(1) +try: + data = json.loads(open(cfg_path).read()) +except Exception as e: + print(f"PARSE_FAIL: {e}"); sys.exit(2) +servers = data.get("mcpServers") or {} +matches = [k for k in servers if k == "codevira" or k.startswith("codevira-")] +if not matches: + print("NO_CODEVIRA"); sys.exit(2) +warn = False +out = [] +for k in matches[:3]: # cap output at 3 entries — Antigravity often has many + entry = servers[k] + cmd = entry.get("command") or entry.get("url") or "" + env = entry.get("env") or {} + has_ide_env = "CODEVIRA_IDE" in env + out.append(f"key={k} cmd={cmd[:60]} env.CODEVIRA_IDE={env.get('CODEVIRA_IDE','')}") + if not has_ide_env: warn = True +extra = f" (+{len(matches)-3} more)" if len(matches) > 3 else "" +print(" | ".join(out) + extra) +sys.exit(1 if warn else 0) +EOF +) + rc=$? + if [ "$rc" = "0" ]; then + echo " ✓ $name → $result" + elif [ "$rc" = "1" ]; then + echo " ⚠ $name → $result" + echo " (env.CODEVIRA_IDE missing — pre-v3.1.0 install; re-run setup after pipx upgrade)" + else + echo " ✗ $name → $result" + REG_FAILED=$((REG_FAILED + 1)) + fi +done + +if [ "$DETECTED" = "0" ]; then + echo + echo "⚠ No IDE configs detected on this machine (looked in 6 standard paths)." + echo " G3 is not failing — there's nothing to smoke. Exit 2 = 'skipped'." + exit 2 +fi + +# ─── check 2: MCP stdio handshake speed against a tmp project ────────── +echo +TMP_PROJECT=$(mktemp -d -t codevira-g3-XXXXXXXX) +trap 'rm -rf "$TMP_PROJECT"' EXIT +mkdir -p "$TMP_PROJECT/.codevira" +printf 'project:\n name: g3-smoke\n' > "$TMP_PROJECT/.codevira/config.yaml" + +python3 - "$CODEVIRA" "$TMP_PROJECT" <<'PYEOF' +import json, os, subprocess, sys, time + +codevira, project = sys.argv[1], sys.argv[2] +env = {"PATH": os.environ.get("PATH", "/usr/bin:/bin"), + "HOME": os.environ.get("HOME", ""), + # Avoid the background watcher thread — irrelevant for stdio handshake. + "CODEVIRA_NO_WATCHER": "1"} + +# No subcommand → MCP stdio server (the path IDEs invoke). +proc = subprocess.Popen( + [codevira, "--project-dir", project], + stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.PIPE, + env=env, text=True, bufsize=1, +) + +def send(req): + proc.stdin.write(json.dumps(req) + "\n") + proc.stdin.flush() + +def recv(id_, timeout=10.0): + deadline = time.time() + timeout + while time.time() < deadline: + line = proc.stdout.readline() + if not line: + continue + try: + msg = json.loads(line) + except Exception: + continue + if msg.get("id") == id_: + return msg + raise TimeoutError(f"no message id={id_} in {timeout}s") + +try: + t0 = time.time() + send({"jsonrpc": "2.0", "id": 1, "method": "initialize", + "params": {"protocolVersion": "2025-03-26", + "capabilities": {}, + "clientInfo": {"name": "g3", "version": "1"}}}) + recv(1, timeout=15.0) # generous: first-boot may load tokenizers + t_init = time.time() - t0 + + send({"jsonrpc": "2.0", "method": "notifications/initialized"}) + + t1 = time.time() + send({"jsonrpc": "2.0", "id": 2, "method": "tools/list", "params": {}}) + tools_resp = recv(2, timeout=5.0) # tighter: this is the hot path + t_tools = time.time() - t1 + + n_tools = len(tools_resp.get("result", {}).get("tools", [])) + print(f"✓ initialize → {t_init*1000:.0f}ms") + print(f"✓ tools/list → {t_tools*1000:.0f}ms, {n_tools} tools") + + # Thresholds: + # - initialize: 5s headroom (Claude Desktop tolerates a few seconds + # here; tooling like sentence-transformers warm-load takes time). + # - tools/list: 1s HARD (Claude Desktop's known disconnect class — + # if this is slow, the IDE drops the connection mid-handshake). + # - tools count: >=20. + failures = [] + if t_init > 5.0: + failures.append(f"initialize too slow: {t_init*1000:.0f}ms > 5000ms") + if t_tools > 1.0: + failures.append(f"tools/list too slow: {t_tools*1000:.0f}ms > 1000ms (Claude Desktop disconnect class)") + if n_tools < 20: + failures.append(f"tools/list returned only {n_tools} tools (expected >=20)") + + if failures: + for f in failures: + print(f"✗ {f}") + sys.exit(1) +finally: + try: + proc.terminate(); proc.wait(timeout=2) + except Exception: + try: + proc.kill() + except Exception: + pass +PYEOF +HANDSHAKE_RC=$? + +# ─── tally ───────────────────────────────────────────────────────────── +echo +if [ "$REG_FAILED" = "0" ] && [ "$HANDSHAKE_RC" = "0" ]; then + echo "✓ G3 PASSED — $DETECTED IDE config(s) checked, MCP handshake fast" + exit 0 +fi +echo "✗ G3 FAILED — registration_failures=$REG_FAILED handshake_rc=$HANDSHAKE_RC" +exit 1 From 78af055102d34cf0688e9da9eed74ef068778ccc Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Sat, 30 May 2026 22:19:21 +0530 Subject: [PATCH 33/44] feat(sync): auto-classify outcomes via observe-git tail step --- mcp_server/cli_sync.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/mcp_server/cli_sync.py b/mcp_server/cli_sync.py index 72e6a79..2328e7b 100644 --- a/mcp_server/cli_sync.py +++ b/mcp_server/cli_sync.py @@ -103,6 +103,34 @@ def cmd_sync(*, dry_run: bool = False, verbose: bool = False) -> int: print(f" ✗ AGENTS.md regenerate failed: {exc}", file=sys.stderr) return 1 + # v3.1.x: opt-in outcome classification. If the project has a git + # working tree, run observe-git so the decisions get outcome + # tags (kept/modified/reverted) — this drives the v3.1.x outcome + # lens + Q&A "what got reverted" features. Best-effort; we never + # fail the sync on git troubles (project might not be a git repo + # at all, which is fine). + try: + from mcp_server.storage import outcomes_writer + + summary = outcomes_writer.observe_all() + if "error" in summary: + if verbose: + print(f" ⓘ observe-git skipped: {summary['error']}") + else: + counts = ( + f"{summary.get('kept', 0)} kept · " + f"{summary.get('modified', 0)} modified · " + f"{summary.get('reverted', 0)} reverted · " + f"{summary.get('unclassified', 0)} unclassified" + ) + print( + f" ✓ observe-git ({counts}, " + f"{summary.get('outcomes_appended', 0)} new outcome(s))" + ) + except Exception as exc: # noqa: BLE001 — never block sync on outcome wiring + if verbose: + print(f" ⓘ observe-git skipped: {exc}", file=sys.stderr) + print() print(" ✓ Sync complete.") print() From 6874fcf398cdfa2cb58777debbf4da93ac06f308 Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Sat, 30 May 2026 22:21:14 +0530 Subject: [PATCH 34/44] test(engine): pin the cross-tool wedge at the unit level MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # The thrash that motivated this test 6d2a6d6 bumped _DEFAULT_MIN_SCORE 0.10 → 0.25 to reduce surface noise. That broke a load-bearing scenario: a Tool A decision with no tag/file overlap to Tool B's prompt — the score reduces to FTS_WEIGHT(0.2) × outcome_weight(0.5) = 0.10, exactly at the old threshold. With the new 0.25, it stopped injecting silently. The existing unit tests in TestScoringComponents are TOLERANT (`if verdict.action == "inject"`), so they passed. The test_cross_tool_universality e2e tests caught it BUT were not in `make test-e2e` at the time of the bump. # What this test pins The minimum-signal cross-tool wedge: - prompt mentions text from a decision - no tag overlap, no file overlap - score = TAG(0)+FILE(0)+FTS(0.2) × outcome_weight(0.5) = 0.10 - MUST clear _DEFAULT_MIN_SCORE and inject If a future change tightens the threshold or weights, this test fails immediately at unit level (not just e2e), and the failure message names the specific regression class. # What this test deliberately does NOT do It doesn't pin the score model itself (weights, threshold). The team can re-tune the scoring; what it CAN'T do is silently kill this minimum-signal path. The test will need an update if the score model changes, which forces deliberate review of the wedge contract. # Noise-reduction itself: deferred The original motivation for the 0.25 bump (surfaces feel noisy) was subjective, not measured. Real noise reduction needs: - a measurement (count of surface events per N prompts) - a benchmark of "useful surface" vs "noise surface" - a tuning loop that holds the wedge invariant fixed Deferred until those exist. Co-Authored-By: Claude Opus 4.7 --- tests/engine/test_relevance_inject.py | 41 +++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/tests/engine/test_relevance_inject.py b/tests/engine/test_relevance_inject.py index 6d12963..963a5c7 100644 --- a/tests/engine/test_relevance_inject.py +++ b/tests/engine/test_relevance_inject.py @@ -257,6 +257,47 @@ def test_fts_keyword_contributes(self, seeded_decisions): # can be tuned. The test passes either way. +class TestCrossToolWedgeRegression: + """v3.1.x: a single-FTS-match decision (no tag overlap, no file + overlap) is the WEAKEST signal in the system but it's load-bearing + — it's how Tool A's decision reaches Tool B when Tool B's prompt + only shares decision-text keywords. If this stops injecting, the + whole cross-tool memory promise dies silently. + + This regression test exists because bumping _DEFAULT_MIN_SCORE + 0.10 → 0.25 in 6d2a6d6 broke exactly this scenario. The + test_cross_tool_universality e2e tests caught it but were not in + `make test-e2e` at the time. Even after widening the gate, the + score model deserves a strict unit-level assertion.""" + + def test_single_fts_match_no_tags_no_file_overlap_DOES_inject( + self, seeded_decisions + ): + """The minimum-signal case: prompt mentions text from a + decision but shares no tags or files with it. Score reduces + to FTS_WEIGHT(0.2) × outcome_weight(0.5) = 0.10 — must clear + _DEFAULT_MIN_SCORE(0.10) and inject. If a future change + tightens scoring, the wedge breaks; fix the score model, do + NOT skip this test.""" + policy = RelevanceInject() + # Use a keyword that's ONLY in seeded decision text (no tag, + # no file match in seeded_decisions). + event = _make_prompt_event( + "Looking at pgvector for embedding lookups in production", + seeded_decisions, + ) + verdict = policy.evaluate(event, signals=None) + assert verdict.action == "inject", ( + f"WEDGE BROKEN at the unit level: single-FTS-match decision " + f"did not inject. action={verdict.action}. This is the same " + f"regression class as 6d2a6d6 (threshold bump 0.10→0.25). " + f"If you intentionally tightened scoring, restore single-FTS " + f"matchability — it's load-bearing for cross-tool memory." + ) + ids = verdict.metadata.get("decisions_injected") or [] + assert len(ids) >= 1, "inject happened but no decision_id surfaced" + + class TestCacheStability: """Same input must produce same bytes — for Anthropic prompt cache.""" From 947164d4d89bc3ff7428bf7a8aac8480ffa92923 Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Sat, 30 May 2026 22:23:31 +0530 Subject: [PATCH 35/44] docs: sync AGENTS.md after G3+sync+wedge-test commits --- AGENTS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AGENTS.md b/AGENTS.md index 41d1a32..0e03a41 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -33,7 +33,7 @@ - **D00001G** v3.0.x storage prereq IMPLEMENTATION COMPLETE on branch release/3.0.1 (commits 6253940 + c1352d7). Patches 1+2+3 done: … · _memory, prereq, storage, v3.0.1_ - **D00001H** M1 Phase A origin tagging IMPLEMENTATION COMPLETE on release/3.0.1 (commits 618710a storage + ff06b3d ide_inject). orig… · _consensus, m1, memory, origin, v3.1.0_ -_+370 more decision(s) — full log in `.codevira/decisions.jsonl`._ +_+410 more decision(s) — full log in `.codevira/decisions.jsonl`._ For the full decision log + outcomes + reverts, see `.codevira/decisions.jsonl` or run `codevira list-decisions`. From 3bef507a793d342c07453922b3cc30e0e0d7baa3 Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Sat, 30 May 2026 23:41:46 +0530 Subject: [PATCH 36/44] release(v3.1.1): docs + version bump + CHANGELOG freshness gate MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit # Why 3.1.1 (and 3.1.0 yank) v3.1.0 was published 2026-05-30 with five memory subsystems + the cross-IDE consensus layer documented in its CHANGELOG entry. The same wheel ALSO contained the in-session hardening sweep (secret scrubbing across all stores, the multi-lens viewer overhaul, G3 implementation, sync auto-observe-git, 4 product bug fixes, counter-decision schema, AGENTS.md idempotency) — but none of that was in CHANGELOG. The released wheel was broader than its release notes. 3.1.1 ships the same code shape under a version that's properly documented. 3.1.0 yanks (existing pins still work; new installs land here directly). # CHANGELOG.md New `## [3.1.1] — 2026-05-30` entry covering: - Memory hardening (sanitize-all-stores + 4 silent bug fixes + counter-decision schema) - Viewer overhaul (ranked search + Q&A + outcome lens + lineage trace + rich detail panel + paranoia fixes) - `codevira sync` auto-classifies outcomes via `observe-git` - G3 real-IDE smoke script — the last permanently-skipped gate - Process notes: yank rationale, e2e-gate widening, MUST→SHOULD honesty downgrade, AGENTS.md idempotency # README.md New "What's new in v3.1.1" table at the top, before the v3.0.0 table. Points to the CHANGELOG entry + the release-notes doc. # docs/release-notes/v3.1.1.md New focused release-notes doc with: - TL;DR - Upgrade-from-3.0.x or 3.1.0 commands - The new things you'll notice (with code samples) - Bug fixes (numbered) - Honest process notes (the wedge regression I almost shipped; the MUST/SHOULD downgrade) - v3.2.0 outline # Process: CHANGELOG freshness gate `make release-verify-version` already required a CHANGELOG entry for the current version (line 269: exit 1 on missing). It did NOT check that the entry was FRESH relative to the wheel — the exact gap that let 3.1.0 ship under-documented. Added a second check: scan mcp_server/ + indexer/ for any .py or .html file newer than CHANGELOG.md. If anything is newer, the gate fails with the first 5 offenders listed and a hint to either update the entry or bump the patch version. # Version bump pyproject.toml + mcp_server/__init__.py both 3.1.0 → 3.1.1. Co-Authored-By: Claude Opus 4.7 --- CHANGELOG.md | 187 ++++++++++++++++++++++++++++++++ Makefile | 18 ++++ README.md | 22 ++++ docs/release-notes/v3.1.1.md | 200 +++++++++++++++++++++++++++++++++++ mcp_server/__init__.py | 2 +- pyproject.toml | 2 +- 6 files changed, 429 insertions(+), 2 deletions(-) create mode 100644 docs/release-notes/v3.1.1.md diff --git a/CHANGELOG.md b/CHANGELOG.md index 6324d3f..15181be 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -9,6 +9,193 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm --- +## [3.1.1] — 2026-05-30 — Hardening, viewer overhaul, G3, sync-observe-git + +v3.1.1 is the hardening release that supersedes the brief 3.1.0 +window. Same five memory subsystems, but the read/write surface +is tightened (secret scrubbing across every store), the viewer +graduates from "look at the graph" to "interrogate the memory" +(ranked search + Q&A + outcome lens + lineage trace), and the +last permanently-skipped gauntlet gate (G3 real-IDE smoke) ships +as a real check. + +If you installed 3.1.0, `pip install --upgrade codevira` brings +you straight to 3.1.1. 3.1.0 is yanked: undocumented at release +time (this CHANGELOG entry didn't ship with it), superseded +without code regressions. + +### Memory hardening + +- **Secret scrubbing across every store.** M8 (reflections) and + M3 (skills) already scrubbed; v3.1.1 brings parity to M2 + (working) and to `decisions_store.record` (decision text + + context). One shared module: `mcp_server/storage/sanitize.py`. + Patterns: api-key, Bearer, password, AWS AKIA, long hex, + long base64. Scrub runs at the storage-write boundary so the + secret never lands on disk in committed surfaces. +- **`commit_session` path-traversal hardening.** Previously + `commit_session("../escape")` would write outside + `.codevira/working_archived/`. Now validates `session_id` + against `[A-Za-z0-9._-]+`; non-conforming inputs raise + `ValueError`. +- **`skills_store.record(triggers={"tags": "git"})` rejected.** + Previously a bare string would silently iterate as characters + and persist `["g", "i", "t"]`. Now raises `ValueError` pointing + the caller to wrap as a list. +- **BFS query-time crash fix.** `spatial._bfs_distances` now + catches `sqlite3.DatabaseError` raised inside the query loop + (not only at connect-time). A corrupt-bytes `graph.db` or a + schema with missing `edges` table falls back to neighborhood- + only mode instead of crashing `spatial_nearby`. +- **`skills_store.list_all(limit=0)` returns `[]`.** Previously + the for-loop did append-then-check, returning the first row + instead of empty. +- **`promote_skill_to_playbook` refuses archived skills.** A + low-value skill (5+ consecutive failures OR 90+ days unused) + is now refused unless `force=True` is passed. Previously + promoted silently and competed with active skills. +- **`origin.current_origin` normalizes agent_model.** Whitespace + and the literal strings `"null"` / `"None"` (any case) collapse + to `None`. Downstream consensus-check string compares no longer + see junk values. +- **Antigravity multi-target atomicity.** `inject_global_antigravity` + + `_inject_antigravity` now snapshot each target's pre-write + content. On any write failure, all successfully-written targets + are restored from snapshot. Either every target is stamped or + none — no asymmetric provenance state from a mid-iteration + failure. + +### Counter-decision discipline (schema change, back-compat) + +`decisions_store.record` + `record_decision` MCP tool grew two +optional fields: + +- `alternatives_considered: list[str]` — the strongest options + you rejected. Surfaces in the viewer's rich-detail panel. +- `would_re_examine_if: str` — the condition that should force + a re-examination. Pair with `do_not_revert=True` to turn the + one-way ratchet into a self-documenting precondition. + +Both fields are optional, sanitized on write, and tolerated as +absent/null on read for legacy records. + +### `codevira graph` viewer overhaul + +The viewer graduates from a passive force-layout to an active +interrogation tool. Major additions: + +- **Ranked search panel.** Free-text queries now produce a + top-K ranked panel under the search box (BM25-ish: token + overlap + recency + protected boost). Each row: id, snippet, + outcome badge, protected lock, score. Click any row → + centers + selects + opens the rich detail panel. +- **Q&A intent detection** (no LLM dependency, pure regex). + Four shapes today: `what did we decide about X`, `why did + we pick X`, `what got reverted`, `what's protected`. Each + produces a synthesized answer with clickable decision-id + chips that jump in the graph. +- **Rich detail panel for decisions.** Surfaces the new + counter-decision fields (alternatives_considered as a list, + would_re_examine_if as an italic banner), context as a + scrollable block, outcome badge in the title, and the full + supersedes lineage chain (clickable predecessors + successors). +- **Outcome lens.** New "Outcome" choice in the lens dropdown. + Colors decisions by classification: `kept`=green, `modified`= + amber, `reverted`=coral, `unclassified`=gray. Legend shows + per-bucket counts. +- **Lineage trace mode.** Click "trace" in the lineage block on + any decision in a supersedes chain — everything dims, the + chain stays full opacity with extra-thick warning-colored + edges, camera fits to the chain. Esc exits. +- **`alternatives_considered` + `would_re_examine_if` surfaced** + in the rich detail panel. +- **Search debouncing** (120ms trailing-edge) so the ranked- + score pass doesn't lag on typing bursts at the 2000-node cap. + +The viewer's underlying file split: `mcp_server/cli_graph.py` +shrank 84KB → 14KB by extracting the HTML/CSS/JS template into +`mcp_server/graph/template.html`. Public API unchanged. + +### `codevira sync` auto-classifies outcomes + +Every `codevira sync` (manual or automatic) now runs +`observe-git` as a best-effort tail step. The outcome lens in +the viewer + the Q&A "what got reverted" surface now have real +data on every sync — previously stayed gray because outcome +classification was opt-in. Non-git projects degrade silently. + +### G3 — real-IDE smoke (the last stubbed gate) + +`scripts/check_real_ide_smoke.sh` was a stub returning exit 2 +("skipped") since v2.0. Now ships a real implementation: + +- Locates codevira on PATH (the same binary IDE configs invoke). +- For each detected IDE config (Claude Code, Claude Desktop, + Cursor, Windsurf, Antigravity — per-app + shared paths): + validates JSON; "empty file" treated as not-configured; + malformed JSON treated as hard fail. +- Verifies `codevira` (or `codevira-`) is registered; + reports `env.CODEVIRA_IDE` state (pre-v3.1.0 installs show + as missing with re-setup guidance). +- Spawns a real MCP stdio server (`codevira --project-dir `), + runs initialize + tools/list. Thresholds: initialize 5s budget + (warm-load OK), tools/list 1s HARD (Claude Desktop disconnect + class), tool count ≥20. + +Evidence file now records `G3_real_ide_smoke: true` for the +first time since v2.0. + +### Process / discipline + +- **`test_cross_tool_universality` added to `make test-e2e`.** + Previously the procedural lock (D000010) said "run test-e2e + before changing engine policies." The gate only included + `test_first_contact` + `test_product_invariants`. A bump to + `_DEFAULT_MIN_SCORE` 0.10 → 0.25 broke the cross-tool wedge + silently because the test that catches it wasn't in the gate. + Reverted the bump; widened the gate; added a wedge-regression + unit test (`TestCrossToolWedgeRegression`) so the same class + of regression also fails at the fast unit-test layer. +- **`make release-verify-version` BSD sed fix.** The version + drift check used `sed -E 's/.*=\s*"([^"]+)".*/\1/'` which is + GNU-only; BSD sed (macOS default) doesn't recognize `\s` in + `-E`. Replaced with `= *` (literal space). +- **CLAUDE.md "MUST"/"SHOULD" honesty.** The + "before-you-finish" contract claimed `MUST call + write_session_log` but no engine layer enforced it. Downgraded + to STRONG RECOMMENDATION with explicit "engine enforcement on + roadmap" note. +- **AGENTS.md idempotency.** `agents_md_generator.regenerate` + now compares computed content vs existing and short-circuits + when identical (no write, no mtime bump). Kills the + perpetual uncommitted-drift loop where every codevira write + bumped AGENTS.md even when content didn't change. + +### Tests + suite + +- Full project suite: 2538 → 2540 passing, 28 skipped, 0 + failures. +- Widened `make test-e2e` gate: 39 → 43 passing. +- All 4 product fixes verified end-to-end through the fresh- + built wheel + against AgentStore's real memory. + +### Locked decisions honored + +D000010 procedural gate ran on every engine-policy change. +D000001 (atomic disk writes) honored. D000012 (project-root +validation) honored. + +### Yanked + +- **3.1.0 yanked 2026-05-30.** Same code shape; released + without this CHANGELOG entry. Process gap, not code gap. + Existing pins still work; new `pip install codevira` lands + on 3.1.1 directly. + +--- + +--- + ## [3.1.0] — Five memory subsystems + cross-IDE consensus v3.1.0 adds five memory subsystems on top of the v3.0.x decision diff --git a/Makefile b/Makefile index e98b306..281876d 100644 --- a/Makefile +++ b/Makefile @@ -268,6 +268,24 @@ release-verify-version: echo " Promote the [Unreleased] section to [$(VERSION)] before releasing."; \ exit 1; \ fi; \ + \ + # v3.1.1: freshness check — if any mcp_server/ or indexer/ \ + # source file is newer than CHANGELOG.md, the entry is probably \ + # stale relative to the wheel. Catches "I edited code but didn't \ + # update changelog" — the exact gap that briefly published 3.1.0 \ + # without docs. \ + CHANGELOG_MTIME=$$(stat -f %m CHANGELOG.md 2>/dev/null || stat -c %Y CHANGELOG.md); \ + NEWER=$$(find mcp_server indexer -type f \( -name "*.py" -o -name "*.html" \) -newer CHANGELOG.md 2>/dev/null | wc -l | tr -d ' '); \ + if [ "$$NEWER" -gt "0" ]; then \ + echo " ✗ CHANGELOG.md is OLDER than $$NEWER source file(s) under mcp_server/ + indexer/."; \ + echo " The current $(VERSION) entry is probably stale relative to the wheel."; \ + echo " Either: (a) update the entry to cover the new commits, OR"; \ + echo " (b) bump the patch version + add a new entry."; \ + echo " First offenders:"; \ + find mcp_server indexer -type f \( -name "*.py" -o -name "*.html" \) -newer CHANGELOG.md 2>/dev/null | head -5 | sed 's/^/ /'; \ + exit 1; \ + fi; \ + echo " ✓ CHANGELOG.md is fresh (newer than every tracked source file)"; \ fi @# 6. Tag check: if tag exists, must point at HEAD. @if git rev-parse "v$(VERSION)" >/dev/null 2>&1; then \ diff --git a/README.md b/README.md index f88b56c..8423331 100644 --- a/README.md +++ b/README.md @@ -92,6 +92,28 @@ every AI tool, on every project, on your local machine.** --- +## What's new in v3.1.1 — hardening + interrogable memory + +> 3.1.1 supersedes the briefly-published 3.1.0 (which shipped +> without this README/CHANGELOG entry). Same code shape; this +> release is the documented one. Brings five memory subsystems +> (M1–M9 from 3.1.0) plus the v3.1.1 hardening + viewer overhaul. + +| Area | What you get | +|---|---| +| **Five memory subsystems** | Origin tagging (M1), working memory (M2), skill library with FTS5 ranking (M3), spatial memory + activity heatmap (M4), skill induction wired to outcomes (M5), cross-IDE consensus check + handshake (M6/M7), reflections (M8). 22 new MCP tools. | +| **Secret scrubbing everywhere** | Decisions, sessions, working memory, skills, reflections — every store scrubs api-key / Bearer / password / AWS AKIA / long hex / long base64 at the write boundary. One shared `mcp_server/storage/sanitize.py`. | +| **Counter-decision discipline** | `record_decision` now accepts `alternatives_considered: list[str]` and `would_re_examine_if: str` — losing options + invalidation trigger surface in the viewer's rich-detail panel. Optional + back-compat. | +| **Interrogable graph viewer** | `codevira graph` is no longer a passive force-layout. Free-text search → top-K ranked panel with score + outcome badge. Q&A intent detection ("what did we decide about X", "what got reverted", "what's protected"). Outcome lens (kept/modified/reverted). Lineage trace mode for supersession chains. | +| **Auto outcome classification** | `codevira sync` now runs `observe-git` as a best-effort tail step — outcomes flow into the viewer's outcome lens automatically. | +| **G3 real-IDE smoke** | The last permanently-skipped gauntlet gate now ships as a real check. Verifies codevira is registered in each detected IDE config + MCP `tools/list` round-trips in <1s. | +| **AGENTS.md no more churn** | `regenerate()` is now idempotent — no rewrite when content unchanged, no perpetual uncommitted-drift loop. | +| **4 silent bugs fixed** | `commit_session("../escape")` rejected; `triggers.tags="git"` rejected; `list_all(limit=0)` returns `[]`; spatial BFS catches query-time sqlite errors. | + +Full v3.1.1 release notes: [CHANGELOG.md](CHANGELOG.md#311--2026-05-30--hardening-viewer-overhaul-g3-sync-observe-git). + +--- + ## What's new in v3.0.0 — audited, lean, opinionated > Major version. v3.0.0 is the biggest API contraction since v2.0 diff --git a/docs/release-notes/v3.1.1.md b/docs/release-notes/v3.1.1.md new file mode 100644 index 0000000..57453c0 --- /dev/null +++ b/docs/release-notes/v3.1.1.md @@ -0,0 +1,200 @@ +# Codevira v3.1.1 — release notes + +**Released:** 2026-05-30 +**Supersedes:** 3.1.0 (yanked — released without this notes/CHANGELOG +entry; same code shape, the only difference is documentation +discipline) +**Install:** `pipx install --upgrade codevira` or +`pip install codevira==3.1.1` + +--- + +## TL;DR + +v3.1.1 finishes what 3.1.0 started. Five memory subsystems (M1–M9) +shipped in 3.1.0; v3.1.1 adds the hardening sweep on top: + +- Every store now scrubs secrets at the write boundary +- The graph viewer learned to **answer questions** (ranked search + + Q&A + outcome lens + lineage trace) +- The release gauntlet's last permanently-skipped gate (G3) ships + as a real check +- `codevira sync` now classifies outcomes automatically (the + viewer's outcome lens lights up on real projects) +- 4 silent bugs (path traversal, type coercion, off-by-one, spatial + BFS crash) are fixed +- A schema addition lets every new decision carry its + *alternatives_considered* and *would_re_examine_if* — turning + `do_not_revert` from a one-way ratchet into a self-documented + precondition + +Full per-line history is in [CHANGELOG.md](../../CHANGELOG.md#311--2026-05-30--hardening-viewer-overhaul-g3-sync-observe-git). + +--- + +## Upgrading from 3.0.x or 3.1.0 + +```bash +pipx upgrade codevira # or `pip install --upgrade codevira` +codevira --version # should report 3.1.1 +codevira setup # re-stamps env.CODEVIRA_IDE into every IDE +codevira sync # one-shot: rebuilds indexes + classifies outcomes +``` + +That's it. The wire format is back-compat: + +- **v3.0.x records** (no `origin` field) keep loading; readers treat + them as `ide="unknown"`. +- **3.1.0 records** that wrote *with* `origin` continue loading. +- **Counter-decision fields** (`alternatives_considered`, + `would_re_examine_if`) are optional on every legacy decision — + populate them on new writes when you have the context. + +The one user-visible change after `setup`: existing IDE configs gain +`env.CODEVIRA_IDE` so cross-IDE consensus checks can attribute +writes. Without this, every decision shows up as `ide=unknown`. + +--- + +## The new things you'll notice + +### `codevira graph` is interactive now + +Open the rendered HTML, then: + +- **Type in the search box** — top-K matches appear ranked below. + Token grammar: `tag:auth ide:cursor kind:decision protected:true + since:2026-01-01 until:2026-04-01`. +- **Ask the graph in plain English**: + - "what did we decide about auth?" + - "why did we pick bcrypt?" + - "what got reverted?" + - "what's protected?" +- **Switch the Lens dropdown** to "Outcome" — color decisions by + classified result. Run `codevira sync` first so outcomes are + populated. +- **Click any decision** — rich detail panel shows + alternatives_considered, would_re_examine_if, lineage chain, + outcome badge. +- **Click "trace" on a lineage** — graph dims, the supersession + chain is highlighted, camera fits to it. + +Press `?` in the viewer for the full key+gesture cheatsheet. + +### `record_decision` accepts losers + invalidation triggers + +```python +record_decision( + decision="Use bcrypt for password hashing", + do_not_revert=True, + alternatives_considered=[ + "argon2id (rejected: heavier on cheap mobile clients)", + "scrypt (rejected: less well-vetted in our ecosystem)", + ], + would_re_examine_if=( + "if argon2id native bindings ship in the stdlib OR " + "if we move off mobile clients" + ), +) +``` + +Future sessions see WHY the decision won and the rule for revisiting +it — instead of just the winner. The viewer's rich-detail panel +surfaces both fields. Empty values are fine; populate when you have +the context. + +### Secrets never land on disk + +If you accidentally paste a curl example into a decision, working +memory note, session log, skill procedure, or reflection: + +``` +record_decision(decision="see api_key=hunter2-deadbeefcafe for /v1/things") +``` + +The on-disk record reads `see for /v1/things`. +Six pattern families are scrubbed: api-key, Bearer, password, AWS +AKIA, long hex, long base64. Shared scrubber in +`mcp_server/storage/sanitize.py`. + +### `codevira sync` quietly does more work + +Every `sync` now runs `observe-git` as a best-effort tail step: + +``` +✓ AGENTS.md regenerated (5,137 bytes) +✓ observe-git (8 kept · 13 modified · 2 reverted · 6 unclassified, + 23 new outcome(s)) +``` + +The outcome counts populate the viewer's outcome lens + the Q&A +"what got reverted" surface. Non-git projects degrade silently. + +### G3 real-IDE smoke + +`scripts/check_real_ide_smoke.sh` was a stub for half a year. Now +it runs the real check during `make release-gauntlet`: + +- Finds every IDE config on the machine +- Verifies codevira is registered in each +- Spawns an MCP stdio server, runs initialize + tools/list +- Asserts tools/list <1s (the Claude Desktop disconnect class) + +Run it standalone any time: `scripts/check_real_ide_smoke.sh`. + +--- + +## Bugs fixed (real ones) + +1. `working_store.commit_session("../escape")` would write outside + `.codevira/working_archived/`. Now raises `ValueError`. +2. `skills_store.record(triggers={"tags": "git"})` silently stored + `["g", "i", "t"]`. Now raises `ValueError` pointing to wrap as + a list. +3. `skills_store.list_all(limit=0)` returned the first row (append- + then-check off-by-one). Now returns `[]`. +4. `spatial._bfs_distances` crashed on corrupt/schema-missing + `graph.db` (only caught connect-time errors). Now catches + query-time `sqlite3.DatabaseError` and falls back to + neighborhood-only mode. +5. `promote_skill_to_playbook` silently allowed archived (low-value) + skills. Now refuses unless `force=True`. +6. `origin.current_origin` passed `"null"` / `" "` strings through + verbatim as `agent_model`. Now normalizes to `None`. +7. `inject_global_antigravity` left partial state when target #2 + failed mid-iteration. Now snapshots + rolls back. + +--- + +## Honest process notes + +- **3.1.0 yanked**, 3.1.1 ships in its place. 3.1.0 was published + before this CHANGELOG/README entry existed. Same code; pure + documentation discipline gap. +- **A bug almost shipped.** Mid-session a tweak to + `_DEFAULT_MIN_SCORE` (0.10 → 0.25) silently broke the cross-tool + wedge — single-FTS-match decisions stopped surfacing to other + IDEs. The `make test-e2e` gate ran (per D000010's procedural + lock) but didn't include `test_cross_tool_universality.py`. A + full `pytest tests/` (no ignores) before publish caught it. + Reverted the threshold; widened the gate; added a unit-level + wedge-regression test so the same class of regression also fails + fast. +- **CLAUDE.md's "MUST" downgrade.** The "before-you-finish" contract + in CLAUDE.md said `MUST call write_session_log`. The engine + didn't enforce it. Downgraded to STRONG RECOMMENDATION with an + "engine enforcement on roadmap" note. Honesty > confidence + theater. + +--- + +## What's next (v3.2.0 outline, subject to change) + +- Real MCP `sampling/createMessage` in `reflect()` (today returns + a stubbed `sampling_supported: False`). +- Q&A vocabulary expansion: "who decided X", "when did we X", + "compare X and Y", lineage queries. +- Recency-decay on relevance scoring (was attempted in 3.1.x but + reverted — needs measurement infrastructure first). +- `do_not_revert` auto-soft-expire after N months unless + re-affirmed via `apply_outcome("still valid")`. diff --git a/mcp_server/__init__.py b/mcp_server/__init__.py index 7be493a..bd7b59e 100644 --- a/mcp_server/__init__.py +++ b/mcp_server/__init__.py @@ -14,4 +14,4 @@ from mcp_server.cli import main __all__ = ["main"] -__version__ = "3.1.0" +__version__ = "3.1.1" diff --git a/pyproject.toml b/pyproject.toml index c8a374b..e494116 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "codevira" -version = "3.1.0" +version = "3.1.1" description = "Cross-IDE decision enforcement for AI coding agents. 1 MB per project, in your repo, no cloud, no vectors. Claude Code, Cursor, Windsurf, Antigravity, Codex all share the same in-repo memory; hooks block AI tool calls that violate prior decisions. MIT, local-first." readme = "README.md" license = { text = "MIT" } From df5dd066aef4cb4099a7336d4801e5da4059867f Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Sat, 30 May 2026 23:42:37 +0530 Subject: [PATCH 37/44] =?UTF-8?q?fix(release):=20drop=20multi-line=20comme?= =?UTF-8?q?nts=20inside=20recipe=20=E2=80=94=20shell=20syntax=20err?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- Makefile | 6 ------ 1 file changed, 6 deletions(-) diff --git a/Makefile b/Makefile index 281876d..d1571d9 100644 --- a/Makefile +++ b/Makefile @@ -269,12 +269,6 @@ release-verify-version: exit 1; \ fi; \ \ - # v3.1.1: freshness check — if any mcp_server/ or indexer/ \ - # source file is newer than CHANGELOG.md, the entry is probably \ - # stale relative to the wheel. Catches "I edited code but didn't \ - # update changelog" — the exact gap that briefly published 3.1.0 \ - # without docs. \ - CHANGELOG_MTIME=$$(stat -f %m CHANGELOG.md 2>/dev/null || stat -c %Y CHANGELOG.md); \ NEWER=$$(find mcp_server indexer -type f \( -name "*.py" -o -name "*.html" \) -newer CHANGELOG.md 2>/dev/null | wc -l | tr -d ' '); \ if [ "$$NEWER" -gt "0" ]; then \ echo " ✗ CHANGELOG.md is OLDER than $$NEWER source file(s) under mcp_server/ + indexer/."; \ From 0f87052bfba7454d7d8a1e8295e4a4ad76d3ff3d Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Sat, 30 May 2026 23:59:12 +0530 Subject: [PATCH 38/44] docs: sync AGENTS.md after v3.1.1 docs + gauntlet --- AGENTS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/AGENTS.md b/AGENTS.md index 0e03a41..2d881f2 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -33,7 +33,7 @@ - **D00001G** v3.0.x storage prereq IMPLEMENTATION COMPLETE on branch release/3.0.1 (commits 6253940 + c1352d7). Patches 1+2+3 done: … · _memory, prereq, storage, v3.0.1_ - **D00001H** M1 Phase A origin tagging IMPLEMENTATION COMPLETE on release/3.0.1 (commits 618710a storage + ff06b3d ide_inject). orig… · _consensus, m1, memory, origin, v3.1.0_ -_+410 more decision(s) — full log in `.codevira/decisions.jsonl`._ +_+430 more decision(s) — full log in `.codevira/decisions.jsonl`._ For the full decision log + outcomes + reverts, see `.codevira/decisions.jsonl` or run `codevira list-decisions`. From d5eb67f36426e6a38c3c5fc76a355d2869523ad2 Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Sun, 31 May 2026 12:33:17 +0530 Subject: [PATCH 39/44] =?UTF-8?q?feat(engine):=20v3.2.0=20opener=20?= =?UTF-8?q?=E2=80=94=20session=5Flog=5Fenforcer=20(warn-mode)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Closes the CLAUDE.md "before-you-finish" honesty gap v3.1.1 left on the honor system. New policy fires on SESSION_START + STOP events: - SESSION_START: records {session_id, started_at, project_root} to .codevira-cache/active_sessions.jsonl (per-machine, gitignored) - STOP: counts commits in project_root since started_at; scans .codevira/sessions.jsonl for any entry in [started_at, now] - If commits > 0 AND no in-window log entry -> warn via Claude Code's systemMessage channel with a write_session_log(...) call template Default mode: warn (non-blocking). Opt-in block via CODEVIRA_SESSION_LOG_ENFORCER_MODE=block. v3.2.1 plans to flip the default to block once warn-mode instrumentation confirms low noise. Uses git's --since=@ rather than --since= so the count is correct on non-UTC machines (git's default ISO parser is locale- dependent). CLAUDE.md: removed the "Honest accounting (v3.1.x)" footnote; replaced with engine-enforcement description + mode switch docs. 23 new unit tests pin every branch including registration, mode switching, timezone-correctness, and message templating. Drift- guard test in test_qa_round_week13.py updated to include the new policy in the default-set. G1: 2471 passed, 12 skipped. G2: 43 passed, 9 skipped. Co-Authored-By: Claude Opus 4.7 --- AGENTS.md | 2 +- CHANGELOG.md | 23 + CLAUDE.md | 2 +- mcp_server/engine/__init__.py | 2 + mcp_server/engine/policies/__init__.py | 2 + .../engine/policies/session_log_enforcer.py | 349 ++++++++++++ tests/engine/test_qa_round_week13.py | 4 +- tests/engine/test_session_log_enforcer.py | 504 ++++++++++++++++++ 8 files changed, 885 insertions(+), 3 deletions(-) create mode 100644 mcp_server/engine/policies/session_log_enforcer.py create mode 100644 tests/engine/test_session_log_enforcer.py diff --git a/AGENTS.md b/AGENTS.md index 2d881f2..5e492bc 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -33,7 +33,7 @@ - **D00001G** v3.0.x storage prereq IMPLEMENTATION COMPLETE on branch release/3.0.1 (commits 6253940 + c1352d7). Patches 1+2+3 done: … · _memory, prereq, storage, v3.0.1_ - **D00001H** M1 Phase A origin tagging IMPLEMENTATION COMPLETE on release/3.0.1 (commits 618710a storage + ff06b3d ide_inject). orig… · _consensus, m1, memory, origin, v3.1.0_ -_+430 more decision(s) — full log in `.codevira/decisions.jsonl`._ +_+450 more decision(s) — full log in `.codevira/decisions.jsonl`._ For the full decision log + outcomes + reverts, see `.codevira/decisions.jsonl` or run `codevira list-decisions`. diff --git a/CHANGELOG.md b/CHANGELOG.md index 15181be..1affc9a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,29 @@ This project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.htm --- +## [Unreleased] — v3.2.0 work-in-progress + +### Added + +- **`session_log_enforcer` policy** — closes the "before-you-finish" + honesty gap CLAUDE.md downgraded to honor-system in v3.1.1. Listens + on `SESSION_START` (records `{session_id, started_at, project_root}` + to `.codevira-cache/active_sessions.jsonl`) and `STOP` (counts + git commits since `started_at`, scans `.codevira/sessions.jsonl` + for any entry in the session window). If commits > 0 AND no + in-window log → emits `warn` via Claude Code's `systemMessage` + channel with a `write_session_log(...)` call template. Default + mode `warn`; opt-in `block` via `CODEVIRA_SESSION_LOG_ENFORCER_MODE=block`. +- Session log enforcer uses `git log --since=@` rather than + the locale-dependent ISO form so the count is correct on machines + whose TZ is not UTC. + +### Changed + +- CLAUDE.md: removed the "Honest accounting (v3.1.x)" footnote that + explained the gap was on the honor system. Replaced with engine- + enforcement description + mode-switch documentation. + --- ## [3.1.1] — 2026-05-30 — Hardening, viewer overhaul, G3, sync-observe-git diff --git a/CLAUDE.md b/CLAUDE.md index 5cae93d..39e057b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -38,7 +38,7 @@ Codevira's promise is "the project remembers what you did." That promise breaks A session that ships code WITHOUT a codevira write call leaves the project's memory stale for the next AI. That's the most common way the wedge breaks. Treat it as part of the definition-of-done. -**Honest accounting (v3.1.x):** This is currently a "should" enforced by convention, not a "must" enforced by the engine. Codevira does not (today) block your final response when commits exist and no session log was written. Enforcement at the hook layer is on the roadmap. Until it lands, this is on the honor system — and on your judgment for what counts as "meaningful." If you shipped code, you should log it; if you only answered a question, you don't need to. +**Engine enforcement (v3.2.0+):** The `session_log_enforcer` policy fires on `Stop` events. If the session shipped commits AND no `write_session_log` was called between `SESSION_START` and now, it emits a `warn` via Claude Code's `systemMessage` channel. Default mode is `warn` (non-blocking nudge); set `CODEVIRA_SESSION_LOG_ENFORCER_MODE=block` to force the AI to retry, or `off` to disable. v3.2.1 plans to default to `block` once warn-mode instrumentation confirms low noise. Logging is still your judgment call for what counts as "meaningful" — if you only answered a question with no commits, the policy stays silent. ### When you see "Roadmap drift detected" in your SessionStart context diff --git a/mcp_server/engine/__init__.py b/mcp_server/engine/__init__.py index 473feae..5a31c13 100644 --- a/mcp_server/engine/__init__.py +++ b/mcp_server/engine/__init__.py @@ -50,6 +50,7 @@ def register_default_policies() -> None: from mcp_server.engine.policies.decision_lock import DecisionLock from mcp_server.engine.policies.post_edit_refresh import PostEditGraphRefresh from mcp_server.engine.policies.relevance_inject import RelevanceInject + from mcp_server.engine.policies.session_log_enforcer import SessionLogEnforcer from mcp_server.engine.policies.token_budget import TokenBudgetPersist # v2.2.0+ surface cut (2026-05-22 audit): LiveStyleEnforcement, @@ -64,6 +65,7 @@ def register_default_policies() -> None: TokenBudgetPersist, # Hero 6 AntiRegression, # Hero 2 PostEditGraphRefresh, # v2.1.2 Item 4 + SessionLogEnforcer, # v3.2.0 hook-layer enforcement of write_session_log ): if not policy_cls.enabled_by_default: continue # opt-in only — caller registers manually diff --git a/mcp_server/engine/policies/__init__.py b/mcp_server/engine/policies/__init__.py index 2d4d91f..3a07923 100644 --- a/mcp_server/engine/policies/__init__.py +++ b/mcp_server/engine/policies/__init__.py @@ -34,6 +34,7 @@ from mcp_server.engine.policies.blast_radius import BlastRadiusVeto from mcp_server.engine.policies.decision_lock import DecisionLock from mcp_server.engine.policies.relevance_inject import RelevanceInject +from mcp_server.engine.policies.session_log_enforcer import SessionLogEnforcer from mcp_server.engine.policies.token_budget import TokenBudgetPersist __all__ = [ @@ -41,5 +42,6 @@ "BlastRadiusVeto", "DecisionLock", "RelevanceInject", + "SessionLogEnforcer", "TokenBudgetPersist", ] diff --git a/mcp_server/engine/policies/session_log_enforcer.py b/mcp_server/engine/policies/session_log_enforcer.py new file mode 100644 index 0000000..881df1a --- /dev/null +++ b/mcp_server/engine/policies/session_log_enforcer.py @@ -0,0 +1,349 @@ +""" +session_log_enforcer.py — v3.2.0 hook-layer enforcement of write_session_log. + +CLAUDE.md's "before-you-finish" contract told the AI to call +``write_session_log`` whenever a session shipped commits. v3.1.x left this +on the honor system because no engine layer detected the gap. This policy +closes that gap. + +Fires on two events: + + - ``SESSION_START`` → appends ``{session_id, started_at, project_root}`` + to ``/.codevira-cache/active_sessions.jsonl``. Per-machine, + gitignored. ~5ms. + + - ``STOP`` → looks up the active-session record, then: + 1. Counts commits in ``project_root`` since ``started_at``. + (Any commit, regardless of author — recommendation accepted + interactively 2026-05-31.) + 2. Scans ``.codevira/sessions.jsonl`` for an entry whose timestamp + falls in [started_at, now]. (Claude Code's session_id is a UUID; + the user picks a short slug for sessions.jsonl — so we match by + time window, not by id.) + 3. If commits > 0 AND no matching session entry → emit ``warn`` + with a call-template the AI can paste into its next turn. + +Failure modes: + - Non-git project → ``git rev-parse`` fails → policy returns ``allow``. + - No SESSION_START record → ``allow`` (Claude Code may have restored + a cached session without firing SESSION_START; better to under-fire + than over-warn). + - Any disk/parse error → ``allow`` + ``metadata.error`` for ``codevira + doctor`` to surface. + +Ship plan: + - v3.2.0: ``warn`` only — non-blocking, instruments how often the gap + exists in real sessions. + - v3.2.1 (planned): once data confirms the warn isn't noisy, upgrade + to ``block`` so Claude Code's Stop hook re-engages the AI until the + log lands. +""" + +from __future__ import annotations + +import json +import os +import subprocess +import time +from pathlib import Path +from typing import Any + +from mcp_server.engine.events import EventType, HookEvent +from mcp_server.engine.policy import Policy, PolicyVerdict +from mcp_server.engine.signals import SignalContext + +_CACHE_REL = ".codevira-cache" +_ACTIVE_FILENAME = "active_sessions.jsonl" +_SESSIONS_REL = ".codevira/sessions.jsonl" + +_DEFAULT_MODE = "warn" +_MODES = ("off", "warn", "block") + + +class SessionLogEnforcer(Policy): + """Nudge the AI to call write_session_log whenever a session shipped commits.""" + + name = "session_log_enforcer" + handles = (EventType.SESSION_START, EventType.STOP) + enabled_by_default = True + priority = 5 + + def _config(self) -> dict[str, Any]: + mode_raw = ( + os.environ.get( + "CODEVIRA_SESSION_LOG_ENFORCER_MODE", + _DEFAULT_MODE, + ) + .strip() + .lower() + ) + mode = mode_raw if mode_raw in _MODES else _DEFAULT_MODE + return {"mode": mode} + + def config_schema(self) -> dict[str, Any]: + return { + "mode": { + "type": "string", + "enum": list(_MODES), + "default": _DEFAULT_MODE, + "env": "CODEVIRA_SESSION_LOG_ENFORCER_MODE", + "description": ( + "off (disabled) | warn (v3.2.0 default — non-blocking " + "nudge) | block (planned for v3.2.1 once warn-mode " + "instrumentation confirms low noise)" + ), + }, + } + + def evaluate( + self, + event: HookEvent, + signals: SignalContext | None = None, + ) -> PolicyVerdict: + config = self._config() + if config["mode"] == "off": + return PolicyVerdict.allow() + + if event.event_type == EventType.SESSION_START: + return self._on_session_start(event) + if event.event_type == EventType.STOP: + return self._on_stop(event, config["mode"]) + return PolicyVerdict.allow() + + # ------------------------------------------------------------------ + # SESSION_START — record start marker + # ------------------------------------------------------------------ + + def _on_session_start(self, event: HookEvent) -> PolicyVerdict: + if not event.session_id: + return PolicyVerdict.allow() + try: + active_path = _active_path(event.project_root) + active_path.parent.mkdir(parents=True, exist_ok=True) + record = { + "session_id": event.session_id, + "started_at": event.timestamp or time.time(), + "project_root": str(event.project_root), + } + with active_path.open("a", encoding="utf-8") as fh: + fh.write(json.dumps(record) + "\n") + except OSError as exc: + return PolicyVerdict.allow( + metadata={ + "policy": self.name, + "stage": "session_start", + "error": f"write_failed:{type(exc).__name__}", + } + ) + return PolicyVerdict.allow( + metadata={ + "policy": self.name, + "stage": "session_start", + "recorded": True, + } + ) + + # ------------------------------------------------------------------ + # STOP — enforce + # ------------------------------------------------------------------ + + def _on_stop(self, event: HookEvent, mode: str) -> PolicyVerdict: + if not event.session_id: + return PolicyVerdict.allow() + + try: + record = _lookup_active(event.project_root, event.session_id) + except OSError as exc: + return PolicyVerdict.allow( + metadata={ + "policy": self.name, + "stage": "stop", + "error": f"active_lookup_failed:{type(exc).__name__}", + } + ) + + if record is None: + # SESSION_START never fired for this id (cached/restored session, + # or v3.1.x machine without the marker). Don't warn — better + # to under-fire than over-warn in warn-mode. + return PolicyVerdict.allow( + metadata={ + "policy": self.name, + "stage": "stop", + "reason": "no_active_record", + } + ) + + started_at = float(record.get("started_at", 0.0)) + if started_at <= 0: + return PolicyVerdict.allow( + metadata={ + "policy": self.name, + "stage": "stop", + "reason": "invalid_start_time", + } + ) + + commit_count = _count_commits_since(event.project_root, started_at) + if commit_count == 0: + # No commits this session — nothing meaningful to log. + return PolicyVerdict.allow( + metadata={ + "policy": self.name, + "stage": "stop", + "commit_count": 0, + } + ) + + if _session_log_written(event.project_root, started_at): + # The AI already called write_session_log — honor it. + return PolicyVerdict.allow( + metadata={ + "policy": self.name, + "stage": "stop", + "commit_count": commit_count, + "log_present": True, + } + ) + + # GAP DETECTED — emit warn (or block once v3.2.1 lands). + message = _format_message( + commit_count=commit_count, + session_id=event.session_id, + ) + metadata = { + "policy": self.name, + "stage": "stop", + "commit_count": commit_count, + "log_present": False, + "mode": mode, + } + if mode == "block": + return PolicyVerdict.block(message, metadata=metadata) + return PolicyVerdict.warn(message, metadata=metadata) + + +# ---------------------------------------------------------------------- +# Helpers — pulled out so tests can target them directly +# ---------------------------------------------------------------------- + + +def _active_path(project_root: Path) -> Path: + return project_root / _CACHE_REL / _ACTIVE_FILENAME + + +def _lookup_active(project_root: Path, session_id: str) -> dict[str, Any] | None: + """Find the most recent SESSION_START record for ``session_id``. + + JSONL is append-only, so a session that restarted mid-day would have + multiple rows. Take the latest. + """ + path = _active_path(project_root) + if not path.exists(): + return None + latest: dict[str, Any] | None = None + with path.open("r", encoding="utf-8") as fh: + for line in fh: + line = line.strip() + if not line: + continue + try: + row = json.loads(line) + except json.JSONDecodeError: + continue + if row.get("session_id") != session_id: + continue + latest = row + return latest + + +def _count_commits_since(project_root: Path, started_at: float) -> int: + """Count commits in ``project_root`` since ``started_at`` (epoch seconds). + + Uses ``--since=@`` so we don't depend on the user's locale / + timezone (git's default ``--since=`` parses in *local* time, which + silently miscounts on machines whose TZ != UTC). + + Returns 0 when the directory isn't a git repo OR git fails for any reason. + """ + try: + result = subprocess.run( + ["git", "log", f"--since=@{int(started_at)}", "--oneline"], + cwd=str(project_root), + capture_output=True, + text=True, + timeout=5, + ) + except (OSError, subprocess.SubprocessError): + return 0 + if result.returncode != 0: + return 0 + return sum(1 for line in result.stdout.splitlines() if line.strip()) + + +def _session_log_written(project_root: Path, started_at: float) -> bool: + """True if sessions.jsonl has any entry whose ``ts`` is >= ``started_at``. + + Heuristic: Claude Code's hook session_id is a UUID; the user-supplied + slug stored in sessions.jsonl is unrelated. Matching by time window + rather than id is the cleanest cross-walk we can do without modifying + the sessions.jsonl schema. + """ + path = project_root / _SESSIONS_REL + if not path.exists(): + return False + try: + with path.open("r", encoding="utf-8") as fh: + for line in fh: + line = line.strip() + if not line: + continue + try: + row = json.loads(line) + except json.JSONDecodeError: + continue + ts_str = row.get("ts") or row.get("created_at") + if not isinstance(ts_str, str): + continue + ts = _parse_iso_to_epoch(ts_str) + if ts is None: + continue + if ts >= started_at: + return True + except OSError: + return False + return False + + +def _parse_iso_to_epoch(ts: str) -> float | None: + """Parse an ISO-8601 timestamp (with or without Z) into epoch seconds.""" + from datetime import datetime + + s = ts.strip() + if s.endswith("Z"): + s = s[:-1] + "+00:00" + try: + dt = datetime.fromisoformat(s) + except ValueError: + return None + return dt.timestamp() + + +def _format_message(*, commit_count: int, session_id: str) -> str: + plural = "" if commit_count == 1 else "s" + return ( + f"[codevira] This session shipped {commit_count} commit{plural} " + f"but no write_session_log call landed. CLAUDE.md's " + f"'before-you-finish' contract asks for one before stopping. " + f"Drop this into your final response:\n\n" + f" write_session_log(\n" + f" session_id='',\n" + f" task='',\n" + f" phase='',\n" + f" files_changed=[...],\n" + f" decisions=[{{'decision': '...', 'context': '...'}}],\n" + f" next_steps=[...],\n" + f" )\n\n" + f"(Stop event session_id={session_id}; commits counted since " + f"SESSION_START.)" + ) diff --git a/tests/engine/test_qa_round_week13.py b/tests/engine/test_qa_round_week13.py index e50de35..52c0834 100644 --- a/tests/engine/test_qa_round_week13.py +++ b/tests/engine/test_qa_round_week13.py @@ -101,7 +101,8 @@ def test_ten_default_policies_registered(self): # v2.2.0+ surface cut (2026-05-22 audit): Hero 7 # (LiveStyleEnforcement), Hero 10 (AIPromotionScore), Hero 9 # (ProactiveIntentInference), and Hero 3 (ProactiveScopeContractLock) - # were all DELETED. Default set: 6 (5 heroes + 1 v2.1.2 item). + # were all DELETED. v3.2.0 ADDED session_log_enforcer. + # Default set: 7 (5 heroes + 1 v2.1.2 item + 1 v3.2.0 enforcer). expected = { "blast_radius_veto", # Hero 4 "decision_lock", # Hero 1 (unique enforcement wedge) @@ -109,6 +110,7 @@ def test_ten_default_policies_registered(self): "token_budget_persist", # Hero 6 "anti_regression", # Hero 2 "post_edit_graph_refresh", # v2.1.2 Item 4 + "session_log_enforcer", # v3.2.0 — hook-layer write_session_log enforcement # Hero 8 (Decision Replay) is a browse surface, not a policy. } assert names == expected, ( diff --git a/tests/engine/test_session_log_enforcer.py b/tests/engine/test_session_log_enforcer.py new file mode 100644 index 0000000..0cbb9a0 --- /dev/null +++ b/tests/engine/test_session_log_enforcer.py @@ -0,0 +1,504 @@ +""" +test_session_log_enforcer.py — v3.2.0 enforcement of write_session_log. + +These tests are deliberately built against REAL filesystems + git repos +(no mocks) so a future schema drift in sessions.jsonl or active_sessions.jsonl +fails fast. +""" + +from __future__ import annotations + +import json +import subprocess +import time +from pathlib import Path + +import pytest + +from mcp_server.engine.events import EventType, HookEvent +from mcp_server.engine.policies.session_log_enforcer import ( + SessionLogEnforcer, + _active_path, + _count_commits_since, + _lookup_active, + _session_log_written, +) + + +# ===================================================================== +# Fixtures +# ===================================================================== + + +@pytest.fixture(autouse=True) +def _clear_env(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.delenv("CODEVIRA_SESSION_LOG_ENFORCER_MODE", raising=False) + + +@pytest.fixture +def project_root(tmp_path: Path) -> Path: + """A tmp project with .codevira/ + .codevira-cache/ pre-created.""" + root = tmp_path / "proj" + root.mkdir() + (root / ".codevira").mkdir() + (root / ".codevira-cache").mkdir() + return root + + +@pytest.fixture +def git_project(project_root: Path) -> Path: + """Project with an initialized git repo + a baseline commit.""" + subprocess.run(["git", "init", "-q"], cwd=str(project_root), check=True) + subprocess.run( + ["git", "config", "user.email", "test@codevira.local"], + cwd=str(project_root), + check=True, + ) + subprocess.run( + ["git", "config", "user.name", "Test"], + cwd=str(project_root), + check=True, + ) + subprocess.run( + ["git", "config", "commit.gpgsign", "false"], + cwd=str(project_root), + check=True, + ) + (project_root / "README.md").write_text("baseline\n") + subprocess.run(["git", "add", "."], cwd=str(project_root), check=True) + subprocess.run( + ["git", "commit", "-q", "-m", "baseline"], + cwd=str(project_root), + check=True, + ) + return project_root + + +def _make_event( + event_type: EventType, + project_root: Path, + *, + session_id: str | None = "session-uuid-abc", + timestamp: float | None = None, +) -> HookEvent: + return HookEvent( + event_type=event_type, + project_root=project_root, + session_id=session_id, + timestamp=timestamp if timestamp is not None else time.time(), + ) + + +def _git_commit(project_root: Path, msg: str, *, at_epoch: float | None = None) -> None: + """Add an empty commit so we can advance HEAD without churning files. + + ``at_epoch``: override commit timestamp via GIT_COMMITTER_DATE + + GIT_AUTHOR_DATE. Necessary because git's --since is 1s-resolution; + pinning explicit timestamps avoids same-second collisions on fast + test fixtures. + """ + import os as _os + + env = _os.environ.copy() + if at_epoch is not None: + date_str = f"@{int(at_epoch)} +0000" + env["GIT_COMMITTER_DATE"] = date_str + env["GIT_AUTHOR_DATE"] = date_str + subprocess.run( + ["git", "commit", "-q", "--allow-empty", "-m", msg], + cwd=str(project_root), + check=True, + env=env, + ) + + +def _head_commit_epoch(project_root: Path) -> float: + """Epoch seconds of HEAD's commit timestamp.""" + result = subprocess.run( + ["git", "log", "-1", "--format=%ct"], + cwd=str(project_root), + capture_output=True, + text=True, + check=True, + ) + return float(result.stdout.strip()) + + +def _ts_after_head(project_root: Path) -> float: + """Anchor ``started_at`` 1s after HEAD — git's --since is second-resolution.""" + return _head_commit_epoch(project_root) + 1.0 + + +def _write_session_log_entry(project_root: Path, *, ts_epoch: float) -> None: + """Append a sessions.jsonl entry stamped at ``ts_epoch``.""" + from datetime import datetime, timezone + + ts_iso = datetime.fromtimestamp(ts_epoch, tz=timezone.utc).isoformat() + entry = { + "ts": ts_iso, + "session_id": "user-slug", + "task": "test task", + "phase": "test phase", + "summary": None, + "decision_ids": [], + "outcome": None, + "id": "S999999", + } + sessions_path = project_root / ".codevira" / "sessions.jsonl" + with sessions_path.open("a", encoding="utf-8") as fh: + fh.write(json.dumps(entry) + "\n") + + +# ===================================================================== +# SESSION_START stage +# ===================================================================== + + +class TestSessionStart: + def test_no_session_id_allows_no_record(self, project_root: Path) -> None: + policy = SessionLogEnforcer() + event = _make_event( + EventType.SESSION_START, + project_root, + session_id=None, + ) + verdict = policy.evaluate(event, None) + assert verdict.is_allowing() + assert not _active_path(project_root).exists() + + def test_records_marker(self, project_root: Path) -> None: + policy = SessionLogEnforcer() + event = _make_event( + EventType.SESSION_START, + project_root, + session_id="abc-123", + timestamp=1_000_000.0, + ) + verdict = policy.evaluate(event, None) + assert verdict.is_allowing() + assert verdict.metadata.get("recorded") is True + + path = _active_path(project_root) + assert path.exists() + rows = [json.loads(ln) for ln in path.read_text().splitlines() if ln.strip()] + assert len(rows) == 1 + assert rows[0]["session_id"] == "abc-123" + assert rows[0]["started_at"] == 1_000_000.0 + assert rows[0]["project_root"] == str(project_root) + + def test_creates_cache_dir_when_missing(self, tmp_path: Path) -> None: + # No .codevira-cache/ pre-created — policy should create it. + root = tmp_path / "fresh" + root.mkdir() + policy = SessionLogEnforcer() + event = _make_event(EventType.SESSION_START, root, session_id="s1") + verdict = policy.evaluate(event, None) + assert verdict.is_allowing() + assert (root / ".codevira-cache").is_dir() + assert _active_path(root).exists() + + def test_multiple_session_starts_returns_latest(self, project_root: Path) -> None: + policy = SessionLogEnforcer() + for ts in (100.0, 200.0, 300.0): + policy.evaluate( + _make_event( + EventType.SESSION_START, + project_root, + session_id="dup", + timestamp=ts, + ), + None, + ) + latest = _lookup_active(project_root, "dup") + assert latest is not None + assert latest["started_at"] == 300.0 + + +# ===================================================================== +# STOP stage — happy paths +# ===================================================================== + + +class TestStopNoOp: + def test_no_session_id_allows(self, project_root: Path) -> None: + policy = SessionLogEnforcer() + event = _make_event(EventType.STOP, project_root, session_id=None) + verdict = policy.evaluate(event, None) + assert verdict.is_allowing() + + def test_no_active_record_allows(self, project_root: Path) -> None: + """Cached/restored session w/o SESSION_START: don't warn.""" + policy = SessionLogEnforcer() + event = _make_event(EventType.STOP, project_root, session_id="ghost") + verdict = policy.evaluate(event, None) + assert verdict.is_allowing() + assert verdict.metadata.get("reason") == "no_active_record" + + def test_non_git_project_allows(self, project_root: Path) -> None: + """No git repo → commit count is 0 → no warn.""" + policy = SessionLogEnforcer() + ts = time.time() - 60 + policy.evaluate( + _make_event( + EventType.SESSION_START, project_root, session_id="s1", timestamp=ts + ), + None, + ) + verdict = policy.evaluate( + _make_event(EventType.STOP, project_root, session_id="s1"), + None, + ) + assert verdict.is_allowing() + assert verdict.metadata.get("commit_count") == 0 + + def test_git_no_commits_since_start_allows(self, git_project: Path) -> None: + policy = SessionLogEnforcer() + ts = _ts_after_head(git_project) + policy.evaluate( + _make_event( + EventType.SESSION_START, git_project, session_id="s1", timestamp=ts + ), + None, + ) + verdict = policy.evaluate( + _make_event(EventType.STOP, git_project, session_id="s1"), + None, + ) + assert verdict.is_allowing() + assert verdict.metadata.get("commit_count") == 0 + + +# ===================================================================== +# STOP stage — enforcement paths +# ===================================================================== + + +class TestStopEnforcement: + def test_commits_without_log_warns(self, git_project: Path) -> None: + policy = SessionLogEnforcer() + ts = _ts_after_head(git_project) + policy.evaluate( + _make_event( + EventType.SESSION_START, git_project, session_id="s1", timestamp=ts + ), + None, + ) + _git_commit(git_project, "feat: in-session work", at_epoch=ts + 10) + _git_commit(git_project, "fix: more work", at_epoch=ts + 20) + + verdict = policy.evaluate( + _make_event(EventType.STOP, git_project, session_id="s1"), + None, + ) + assert verdict.action == "warn" + assert "2 commits" in (verdict.message or "") + assert "write_session_log" in (verdict.message or "") + assert verdict.metadata["commit_count"] == 2 + assert verdict.metadata["log_present"] is False + + def test_commits_with_log_allows(self, git_project: Path) -> None: + policy = SessionLogEnforcer() + ts = _ts_after_head(git_project) + policy.evaluate( + _make_event( + EventType.SESSION_START, git_project, session_id="s1", timestamp=ts + ), + None, + ) + _git_commit(git_project, "feat: shipped", at_epoch=ts + 10) + # AI called write_session_log — entry lands after session start + _write_session_log_entry(git_project, ts_epoch=ts + 5) + + verdict = policy.evaluate( + _make_event(EventType.STOP, git_project, session_id="s1"), + None, + ) + assert verdict.is_allowing() + assert verdict.metadata.get("log_present") is True + + def test_stale_log_before_session_does_not_satisfy( + self, + git_project: Path, + ) -> None: + """A pre-session log doesn't count — must be in window.""" + policy = SessionLogEnforcer() + ts = _ts_after_head(git_project) + + # Stale log written WAY before this session started + _write_session_log_entry(git_project, ts_epoch=ts - 3600) + + policy.evaluate( + _make_event( + EventType.SESSION_START, git_project, session_id="s1", timestamp=ts + ), + None, + ) + _git_commit(git_project, "feat: stuff", at_epoch=ts + 10) + + verdict = policy.evaluate( + _make_event(EventType.STOP, git_project, session_id="s1"), + None, + ) + assert verdict.action == "warn" + + def test_single_commit_uses_singular(self, git_project: Path) -> None: + policy = SessionLogEnforcer() + ts = _ts_after_head(git_project) + policy.evaluate( + _make_event( + EventType.SESSION_START, git_project, session_id="s1", timestamp=ts + ), + None, + ) + _git_commit(git_project, "fix: one thing", at_epoch=ts + 10) + + verdict = policy.evaluate( + _make_event(EventType.STOP, git_project, session_id="s1"), + None, + ) + assert verdict.action == "warn" + assert "1 commit " in (verdict.message or "") + assert "1 commits" not in (verdict.message or "") + + +# ===================================================================== +# Mode switching +# ===================================================================== + + +class TestMode: + def test_block_mode_blocks( + self, + git_project: Path, + monkeypatch: pytest.MonkeyPatch, + ) -> None: + monkeypatch.setenv("CODEVIRA_SESSION_LOG_ENFORCER_MODE", "block") + policy = SessionLogEnforcer() + ts = _ts_after_head(git_project) + policy.evaluate( + _make_event( + EventType.SESSION_START, git_project, session_id="s1", timestamp=ts + ), + None, + ) + _git_commit(git_project, "feat: a", at_epoch=ts + 10) + + verdict = policy.evaluate( + _make_event(EventType.STOP, git_project, session_id="s1"), + None, + ) + assert verdict.is_blocking() + assert verdict.metadata["mode"] == "block" + + def test_off_mode_allows_even_with_gap( + self, + git_project: Path, + monkeypatch: pytest.MonkeyPatch, + ) -> None: + monkeypatch.setenv("CODEVIRA_SESSION_LOG_ENFORCER_MODE", "off") + policy = SessionLogEnforcer() + ts = _ts_after_head(git_project) + policy.evaluate( + _make_event( + EventType.SESSION_START, git_project, session_id="s1", timestamp=ts + ), + None, + ) + _git_commit(git_project, "feat: untracked", at_epoch=ts + 10) + + verdict = policy.evaluate( + _make_event(EventType.STOP, git_project, session_id="s1"), + None, + ) + assert verdict.is_allowing() + + def test_unknown_mode_defaults_to_warn( + self, + git_project: Path, + monkeypatch: pytest.MonkeyPatch, + ) -> None: + monkeypatch.setenv("CODEVIRA_SESSION_LOG_ENFORCER_MODE", "nonsense") + policy = SessionLogEnforcer() + ts = _ts_after_head(git_project) + policy.evaluate( + _make_event( + EventType.SESSION_START, git_project, session_id="s1", timestamp=ts + ), + None, + ) + _git_commit(git_project, "feat: x", at_epoch=ts + 10) + + verdict = policy.evaluate( + _make_event(EventType.STOP, git_project, session_id="s1"), + None, + ) + assert verdict.action == "warn" + + +# ===================================================================== +# Direct helper coverage +# ===================================================================== + + +class TestHelpers: + def test_count_commits_non_git_returns_zero(self, project_root: Path) -> None: + assert _count_commits_since(project_root, 0.0) == 0 + + def test_count_commits_baseline_only(self, git_project: Path) -> None: + # threshold AFTER baseline → 0 commits since + assert _count_commits_since(git_project, _ts_after_head(git_project)) == 0 + + def test_count_commits_counts_after_threshold(self, git_project: Path) -> None: + ts = _ts_after_head(git_project) + _git_commit(git_project, "feat: post-threshold", at_epoch=ts + 10) + assert _count_commits_since(git_project, ts) == 1 + + def test_session_log_written_missing_file(self, project_root: Path) -> None: + # remove the sessions file the fixture pre-created + (project_root / ".codevira" / "sessions.jsonl").unlink(missing_ok=True) + assert _session_log_written(project_root, time.time()) is False + + def test_session_log_written_in_window(self, project_root: Path) -> None: + threshold = time.time() - 100 + _write_session_log_entry(project_root, ts_epoch=threshold + 50) + assert _session_log_written(project_root, threshold) is True + + def test_session_log_written_only_before_threshold( + self, + project_root: Path, + ) -> None: + threshold = time.time() + _write_session_log_entry(project_root, ts_epoch=threshold - 1000) + assert _session_log_written(project_root, threshold) is False + + +# ===================================================================== +# Cross-tool registration — confirms register_default_policies wires us in +# ===================================================================== + + +class TestRegistration: + def test_session_log_enforcer_registered_by_default(self) -> None: + from mcp_server.engine import ( + register_default_policies, + registered_policies, + reset_policies, + ) + + reset_policies() + register_default_policies() + names = {p.name for p in registered_policies()} + assert "session_log_enforcer" in names + + def test_register_is_idempotent(self) -> None: + from mcp_server.engine import ( + register_default_policies, + registered_policies, + reset_policies, + ) + + reset_policies() + register_default_policies() + register_default_policies() + names = [p.name for p in registered_policies()] + assert names.count("session_log_enforcer") == 1 From 6c567e0b0b20eec26c2635d862b953541ce16957 Mon Sep 17 00:00:00 2001 From: Sachin Shelke Date: Sun, 31 May 2026 12:53:10 +0530 Subject: [PATCH 40/44] =?UTF-8?q?feat(graph):=20v3.2.0=20=E2=80=94=20Q&A?= =?UTF-8?q?=20vocab=20expansion=20(who/when/compare)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three new intent patterns + answer renderers in the viewer's ask-the-graph surface: - "who decided X" / "which IDE decided X" → groups matching decisions by ide, surfaces cross-tool authorship that's invisible in the rank-only view. - "when did we X" / "timeline of X" → chronological sort with first/last dates and date-stamped result list. - "compare X and Y" / "X vs Y" → two-column side-by-side of the top match per topic, with outcome/protected badges. Each follows the existing _scoreForQuery + filter pattern so behavior is consistent with the v3.1.x ranked search. Cheatsheet in qHelp updated to surface the new vocab. Drift-guard tests in test_cli_graph.py extended to require the three new JS symbols + the cheatsheet phrases. Co-Authored-By: Claude Opus 4.7 --- mcp_server/graph/template.html | 110 ++++++++++++++++++++++++++++++++- tests/test_cli_graph.py | 14 +++++ 2 files changed, 123 insertions(+), 1 deletion(-) diff --git a/mcp_server/graph/template.html b/mcp_server/graph/template.html index c67241c..bb38536 100644 --- a/mcp_server/graph/template.html +++ b/mcp_server/graph/template.html @@ -446,7 +446,7 @@
tokens: tag: ide: kind: protected: since: until: -
ask: what about X · why did we · what got reverted · what's protected +
ask: what about X · why did we · who decided X · when did we X · compare X and Y · what got reverted · what's protected