diff --git a/cheetahclaws/commands/advanced.py b/cheetahclaws/commands/advanced.py index 433d06c..1134e25 100644 --- a/cheetahclaws/commands/advanced.py +++ b/cheetahclaws/commands/advanced.py @@ -2126,7 +2126,7 @@ def cmd_summarize(args: str, _state, config) -> bool: def cmd_memory(args: str, _state, config) -> bool: from cheetahclaws.memory import search_memory, load_index - from cheetahclaws.memory.scan import scan_all_memories, format_memory_manifest, memory_freshness_text + from cheetahclaws.memory.scan import scan_all_memories, format_memory_manifest, memory_freshness_text, verified_epoch stripped = args.strip() @@ -2160,7 +2160,7 @@ def cmd_memory(args: str, _state, config) -> bool: return True info(f" {len(headers)} memory/memories (newest first):") for h in headers: - fresh_warn = " ⚠ stale" if memory_freshness_text(h.mtime_s) else "" + fresh_warn = " ⚠ stale" if memory_freshness_text(verified_epoch(h.last_verified, h.created, h.mtime_s)) else "" tag = f"[{h.type or '?':9s}|{h.scope:7s}]" info(f" {tag} {h.filename}{fresh_warn}") if h.description: diff --git a/cheetahclaws/memory/context.py b/cheetahclaws/memory/context.py index d875f38..115df8b 100644 --- a/cheetahclaws/memory/context.py +++ b/cheetahclaws/memory/context.py @@ -19,7 +19,7 @@ load_entries, search_memory, ) -from .scan import scan_all_memories, format_memory_manifest, memory_freshness_text +from .scan import scan_all_memories, format_memory_manifest, memory_freshness_text, verified_epoch from .types import MEMORY_SYSTEM_PROMPT @@ -126,7 +126,9 @@ def find_relevant_memories( return [] if not use_ai or not config: - # Return top max_results by recency (newest first) + # Return top max_results by recency-of-verification (most recently + # verified first). mtime is kept only as a legacy fallback inside + # verified_epoch for files that predate the date fields. from .scan import scan_all_memories headers = scan_all_memories() path_to_mtime = {h.file_path: h.mtime_s for h in headers} @@ -134,6 +136,7 @@ def find_relevant_memories( results = [] for entry in keyword_results[:max_results * 3]: mtime_s = path_to_mtime.get(entry.file_path, 0) + vepoch = verified_epoch(entry.last_verified, entry.created, mtime_s) results.append({ "name": entry.name, "description": entry.description, @@ -142,11 +145,13 @@ def find_relevant_memories( "content": entry.content, "file_path": entry.file_path, "mtime_s": mtime_s, - "freshness_text": memory_freshness_text(mtime_s), + "verified_s": vepoch, + "last_verified": entry.last_verified or entry.created, + "freshness_text": memory_freshness_text(vepoch), "confidence": entry.confidence, "source": entry.source, }) - results.sort(key=lambda r: r["mtime_s"], reverse=True) + results.sort(key=lambda r: r["verified_s"], reverse=True) return results[:max_results] # Step 2: AI-powered relevance selection (optional, lightweight) @@ -210,6 +215,7 @@ def _ai_select_memories( continue entry = candidates[i] mtime_s = path_to_mtime.get(entry.file_path, 0) if "path_to_mtime" in dir() else 0 + vepoch = verified_epoch(entry.last_verified, entry.created, mtime_s) results.append({ "name": entry.name, "description": entry.description, @@ -218,7 +224,9 @@ def _ai_select_memories( "content": entry.content, "file_path": entry.file_path, "mtime_s": mtime_s, - "freshness_text": memory_freshness_text(mtime_s), + "verified_s": vepoch, + "last_verified": entry.last_verified or entry.created, + "freshness_text": memory_freshness_text(vepoch), "confidence": entry.confidence, "source": entry.source, }) diff --git a/cheetahclaws/memory/scan.py b/cheetahclaws/memory/scan.py index b982770..85d2e22 100644 --- a/cheetahclaws/memory/scan.py +++ b/cheetahclaws/memory/scan.py @@ -31,6 +31,9 @@ class MemoryHeader: description: value from frontmatter `description:` field type: value from frontmatter `type:` field scope: "user" or "project" + created: value from frontmatter `created:` field (may be "") + last_verified: value from frontmatter `last_verified:` field, falling back + to `created`. Anchors staleness (see verified_epoch). """ filename: str file_path: str @@ -38,6 +41,8 @@ class MemoryHeader: description: str type: str scope: str + created: str = "" + last_verified: str = "" # ── Scanning ─────────────────────────────────────────────────────────────── @@ -68,6 +73,8 @@ def scan_memory_dir(mem_dir: Path, scope: str) -> list[MemoryHeader]: description=meta.get("description", ""), type=meta.get("type", ""), scope=scope, + created=meta.get("created", ""), + last_verified=meta.get("last_verified", "") or meta.get("created", ""), )) except Exception: continue @@ -111,6 +118,12 @@ def memory_freshness_text(mtime_s: float) -> str: Motivated by user reports of stale code-state memories (file:line citations to code that has since changed) being asserted as fact. + + Note: callers should pass a *verification* timestamp (see verified_epoch), + not a raw filesystem mtime — otherwise simply retrieving a memory (which + can rewrite the file) would suppress this warning. The parameter name is + kept for backward compatibility; semantically it is "seconds since the + memory's claim was last established". """ d = memory_age_days(mtime_s) if d <= 1: @@ -123,6 +136,55 @@ def memory_freshness_text(mtime_s: float) -> str: ) +# ── Verification-anchored staleness ──────────────────────────────────────── + +def parse_date_epoch(date_str: str) -> float: + """Parse a 'YYYY-MM-DD' (or full ISO) date string to an epoch in seconds. + + Returns 0.0 if the string is empty or unparseable. Day granularity is + sufficient here: both the staleness warning and the recency decay work in + whole days. + """ + if not date_str: + return 0.0 + try: + from datetime import datetime + return datetime.fromisoformat(str(date_str).strip()).timestamp() + except (ValueError, TypeError): + return 0.0 + + +def verified_epoch(last_verified: str, created: str, mtime_s: float = 0.0) -> float: + """Resolve the timestamp that anchors a memory's staleness. + + Preference order: last_verified -> created -> filesystem mtime. + + The filesystem mtime is only a last-resort fallback for legacy memory + files written before the date fields existed. Because an explicit date is + always preferred over mtime, a *read* of the memory (which may rewrite the + file, and which we additionally guard with os.utime in touch_last_used) + can never reset its staleness once a date is present. This is the fix for + the "retrieval resets staleness" bug: freshness reflects when the claim + was last *verified*, not when the file was last *touched*. + """ + e = parse_date_epoch(last_verified) or parse_date_epoch(created) + return e if e else (mtime_s or 0.0) + + +def trust_recency(verified_s: float, now: float | None = None) -> float: + """Exponential recency weight from the last-verified time. + + exp(-age_days / 30) → half-life ≈ 21 days. Older-since-verified yields a + smaller weight. Used as the recency factor in confidence × recency + retrieval ranking, replacing the previous mtime-based recency that a read + could reset to 1.0. + """ + if now is None: + now = time.time() + age_days = max(0.0, (now - (verified_s or 0.0)) / 86_400.0) + return math.exp(-age_days / 30.0) + + # ── Manifest formatting ──────────────────────────────────────────────────── def format_memory_manifest(headers: list[MemoryHeader]) -> str: diff --git a/cheetahclaws/memory/store.py b/cheetahclaws/memory/store.py index 74a9fdf..3b556a0 100644 --- a/cheetahclaws/memory/store.py +++ b/cheetahclaws/memory/store.py @@ -58,7 +58,11 @@ class MemoryEntry: scope: "user" | "project" — which directory this was loaded from confidence: 0.0–1.0 reliability score (default 1.0 = explicit user statement) source: origin: "user" | "model" | "tool" | "consolidator" - last_used_at: ISO date of last retrieval (updated on MemorySearch hits) + last_used_at: ISO date of last *retrieval* (updated on MemorySearch hits; + utility/analytics signal only — does NOT affect staleness) + last_verified: ISO date the memory's claim was last *re-checked* against the + live environment. Anchors the staleness clock. Defaults to + `created` at save time; refreshed only via mark_verified(). conflict_group: tag linking related/conflicting memories (e.g. "writing_style") """ name: str @@ -71,6 +75,7 @@ class MemoryEntry: confidence: float = 1.0 source: str = "user" last_used_at: str = "" + last_verified: str = "" conflict_group: str = "" @@ -111,6 +116,12 @@ def _format_entry_md(entry: MemoryEntry) -> str: f"type: {entry.type}", f"created: {entry.created}", ] + # last_verified anchors the staleness clock. A freshly written memory is + # "verified now", so if it is not set explicitly it defaults to the + # creation date. It is refreshed ONLY by mark_verified(), never by a read. + lv = entry.last_verified or entry.created + if lv: + lines.append(f"last_verified: {lv}") if entry.confidence != 1.0: lines.append(f"confidence: {entry.confidence:.2f}") if entry.source and entry.source != "user": @@ -187,6 +198,7 @@ def load_entries(scope: str = "user") -> list[MemoryEntry]: confidence=float(meta.get("confidence", 1.0)), source=meta.get("source", "user"), last_used_at=meta.get("last_used_at", ""), + last_verified=meta.get("last_verified", "") or meta.get("created", ""), conflict_group=meta.get("conflict_group", ""), )) return entries @@ -272,14 +284,22 @@ def check_conflict(entry: "MemoryEntry", scope: str = "user") -> dict | None: def touch_last_used(file_path: str) -> None: """Update the last_used_at frontmatter field of a memory file to today. - Called by MemorySearch when a memory is returned so staleness/utility + Called by MemorySearch when a memory is returned, so retrieval/utility tracking stays current. Silent on any error. + + Importantly, this is a *read*-side bookkeeping write: it must NOT make the + memory look freshly verified. We therefore (a) never touch last_verified, + and (b) restore the file's original mtime after rewriting, so any legacy + mtime-based staleness consumer is not reset merely because the memory was + retrieved. Staleness is anchored to last_verified (see mark_verified). """ from datetime import date + import os fp = Path(file_path) if not fp.exists(): return try: + st = fp.stat() # capture original (a,m)time before the bookkeeping write text = fp.read_text() meta, body = parse_frontmatter(text) today = date.today().isoformat() @@ -288,13 +308,50 @@ def touch_last_used(file_path: str) -> None: meta["last_used_at"] = today # Rebuild frontmatter fm_lines = ["---"] - for k in ("name", "description", "type", "created", "confidence", - "source", "last_used_at", "conflict_group"): + for k in ("name", "description", "type", "created", "last_verified", + "confidence", "source", "last_used_at", "conflict_group"): v = meta.get(k) if v is not None and str(v): fm_lines.append(f"{k}: {v}") fm_lines.append("---") new_text = "\n".join(fm_lines) + "\n" + body + "\n" fp.write_text(new_text) + # A read must not look like a write: restore the original mtime. + os.utime(fp, (st.st_atime, st.st_mtime)) except Exception: pass + + +def mark_verified(file_path: str) -> bool: + """Stamp last_verified = today after a memory's claim was re-checked against + the live environment. + + This is the ONLY operation that refreshes the staleness clock; plain + retrieval (touch_last_used) deliberately does not. Call this once the + agent has confirmed the memory still holds (e.g. the file/function/flag it + cites still exists). Unlike touch_last_used, this is a genuine freshness + event, so the file's mtime is allowed to advance. + + Returns True if the field is set to today, False on any error. + """ + from datetime import date + fp = Path(file_path) + if not fp.exists(): + return False + try: + meta, body = parse_frontmatter(fp.read_text()) + today = date.today().isoformat() + if meta.get("last_verified") == today: + return True + meta["last_verified"] = today + fm_lines = ["---"] + for k in ("name", "description", "type", "created", "last_verified", + "confidence", "source", "last_used_at", "conflict_group"): + v = meta.get(k) + if v is not None and str(v): + fm_lines.append(f"{k}: {v}") + fm_lines.append("---") + fp.write_text("\n".join(fm_lines) + "\n" + body + "\n") + return True + except Exception: + return False diff --git a/cheetahclaws/memory/tools.py b/cheetahclaws/memory/tools.py index ad3cbb5..238bcd9 100644 --- a/cheetahclaws/memory/tools.py +++ b/cheetahclaws/memory/tools.py @@ -7,9 +7,12 @@ from datetime import datetime from cheetahclaws.tool_registry import ToolDef, register_tool -from .store import MemoryEntry, save_memory, delete_memory, load_index, check_conflict, touch_last_used +from .store import ( + MemoryEntry, save_memory, delete_memory, load_index, check_conflict, + touch_last_used, mark_verified, +) from .context import find_relevant_memories -from .scan import scan_all_memories, format_memory_manifest +from .scan import scan_all_memories, format_memory_manifest, trust_recency # ── Tool implementations ─────────────────────────────────────────────────── @@ -57,9 +60,13 @@ def _memory_delete(params: dict, config: dict) -> str: def _memory_search(params: dict, config: dict) -> str: """Search memories by keyword query with optional AI relevance filtering. - Results are ranked by: confidence × recency (30-day exponential decay). + Results are ranked by confidence × recency, where recency decays from the + time the memory was last *verified* against the environment (half-life + ≈ 21 days) — not from when the file was last touched. Retrieving a memory + updates last_used_at for analytics but does NOT make a stale memory look + fresh. """ - import math, time as _time + import time as _time query = params["query"] use_ai = params.get("use_ai", False) max_results = params.get("max_results", 5) @@ -71,16 +78,15 @@ def _memory_search(params: dict, config: dict) -> str: if not results: return f"No memories found matching '{query}'." - # Re-rank by confidence × recency score + # Re-rank by confidence × verification-anchored recency. now = _time.time() for r in results: - age_days = max(0, (now - r["mtime_s"]) / 86400) - recency = math.exp(-age_days / 30) # half-life ≈ 21 days - r["_rank"] = r.get("confidence", 1.0) * recency + verified_s = r.get("verified_s", r.get("mtime_s", 0.0)) + r["_rank"] = r.get("confidence", 1.0) * trust_recency(verified_s, now) results.sort(key=lambda r: r["_rank"], reverse=True) results = results[:max_results] - # Touch last_used_at for returned memories + # Touch last_used_at for returned memories (does not affect staleness). for r in results: if r.get("file_path"): touch_last_used(r["file_path"]) @@ -129,6 +135,31 @@ def _memory_list(params: dict, config: dict) -> str: return "\n".join(lines) +def _memory_verify(params: dict, config: dict) -> str: + """Refresh a memory's staleness clock after re-checking it against reality. + + Call this AFTER confirming the memory's claim still holds (e.g. the file, + function, or flag it cites still exists). This is the only thing that + resets staleness — plain MemorySearch does not. Keeps trustworthy memory a + runtime decision rather than a property of a stored item. + """ + from .store import get_memory_dir, _slugify + name = params["name"] + scope = params.get("scope", "all") + scopes = ["user", "project"] if scope == "all" else [scope] + slug = _slugify(name) + for s in scopes: + fp = get_memory_dir(s) / f"{slug}.md" + if fp.exists(): + if mark_verified(str(fp)): + return ( + f"Memory verified: '{name}' [{s}] — staleness clock reset to today. " + "Its retrieval ranking and freshness warning now reflect this re-check." + ) + return f"Memory '{name}' found in {s} scope but could not be updated." + return f"No memory named '{name}' found to verify (scope: {scope})." + + # ── Tool registrations ───────────────────────────────────────────────────── register_tool(ToolDef( @@ -286,3 +317,33 @@ def _memory_list(params: dict, config: dict) -> str: read_only=True, concurrent_safe=True, )) + +register_tool(ToolDef( + name="MemoryVerify", + schema={ + "name": "MemoryVerify", + "description": ( + "Mark a memory as re-verified against the live environment, refreshing " + "its staleness clock. Call this AFTER you have confirmed the memory's " + "claim still holds (e.g. the file/function/flag it references still " + "exists, or you re-read the current code). Plain MemorySearch does NOT " + "refresh staleness — only this does. Use it to keep a still-correct but " + "old memory ranked highly and free of the stale-memory warning." + ), + "input_schema": { + "type": "object", + "properties": { + "name": {"type": "string", "description": "Name of the memory that was re-checked"}, + "scope": { + "type": "string", + "enum": ["user", "project", "all"], + "description": "Which scope to look in (default: 'all')", + }, + }, + "required": ["name"], + }, + }, + func=_memory_verify, + read_only=False, + concurrent_safe=False, +)) diff --git a/tests/test_memory_staleness.py b/tests/test_memory_staleness.py new file mode 100644 index 0000000..7c0d563 --- /dev/null +++ b/tests/test_memory_staleness.py @@ -0,0 +1,199 @@ +"""Regression tests for the "retrieval resets staleness" memory bug. + +Before the fix, both the retrieval recency score and the staleness warning +were computed from a memory file's filesystem mtime, while ``touch_last_used`` +rewrote the file on every MemorySearch hit — bumping that mtime. The effect: +a single *read* of a stale, never-re-verified memory reset its recency to ~1.0 +and suppressed its "verify against current code" warning, exactly the +"stale-but-confident" failure the project's own design warns against. + +The fix anchors staleness to a ``last_verified`` date (falling back to +``created``, then mtime for legacy files), refreshes it only via +``mark_verified`` / the MemoryVerify tool, and makes ``touch_last_used`` +preserve the file mtime. These tests would FAIL against the pre-fix code and +PASS after it. +""" +import os +import time +from datetime import date, timedelta + +import pytest + +import cheetahclaws.memory.store as _store +from cheetahclaws.memory.store import ( + parse_frontmatter, + touch_last_used, + mark_verified, +) +from cheetahclaws.memory.context import find_relevant_memories +from cheetahclaws.memory.scan import ( + verified_epoch, + trust_recency, + memory_freshness_text, + parse_date_epoch, +) + + +# ── Fixtures / helpers ────────────────────────────────────────────────────── + +@pytest.fixture(autouse=True) +def redirect_memory_dirs(tmp_path, monkeypatch): + """Point user + project memory dirs at a temp location for every test.""" + user_mem = tmp_path / "user_memory" + user_mem.mkdir() + proj_mem = tmp_path / "project_memory" + proj_mem.mkdir() + monkeypatch.setattr(_store, "USER_MEMORY_DIR", user_mem) + monkeypatch.setattr(_store, "get_project_memory_dir", lambda: proj_mem) + return user_mem, proj_mem + + +def _write_memory(mem_dir, slug, *, content, created, last_verified, + mtime_days_ago=None, confidence=1.0): + """Create a memory .md file with explicit dates and (optionally) a + backdated filesystem mtime. Returns the file path.""" + fp = mem_dir / f"{slug}.md" + fm = ["---", f"name: {slug}", f"description: {slug} desc", "type: project", + f"created: {created}"] + if last_verified: + fm.append(f"last_verified: {last_verified}") + if confidence != 1.0: + fm.append(f"confidence: {confidence:.2f}") + fm.append("---") + fp.write_text("\n".join(fm) + "\n" + content + "\n") + if mtime_days_ago is not None: + epoch = time.time() - mtime_days_ago * 86_400 + os.utime(fp, (epoch, epoch)) + return fp + + +def _find_one(query, name): + """Run keyword retrieval and return the single result dict for ``name``.""" + results = find_relevant_memories(query, max_results=5, use_ai=False) + matches = [r for r in results if r["name"] == name] + assert matches, f"expected to retrieve {name!r}, got {[r['name'] for r in results]}" + return matches[0] + + +# ── The bug: a read must not reset staleness ──────────────────────────────── + +class TestRetrievalDoesNotResetStaleness: + def test_stale_memory_stays_stale_after_retrieval(self, redirect_memory_dirs): + user_mem, _ = redirect_memory_dirs + old = (date.today() - timedelta(days=60)).isoformat() + fp = _write_memory( + user_mem, "loader_location", + content="The data loader is defined in utils/loader.py", + created=old, last_verified=old, mtime_days_ago=60, + ) + + # Before any retrieval: clearly stale. + before = _find_one("loader", "loader_location") + assert before["freshness_text"], "a 60-day-old memory should warn as stale" + assert trust_recency(before["verified_s"]) < 0.2 + + # Simulate a MemorySearch hit writing last_used_at bookkeeping. + touch_last_used(str(fp)) + + # After retrieval: STILL stale. (Pre-fix, mtime was bumped to now and + # both signals were derived from mtime, so this assertion failed.) + after = _find_one("loader", "loader_location") + assert after["freshness_text"], "retrieval must not suppress the stale warning" + assert trust_recency(after["verified_s"]) < 0.2 + assert after["verified_s"] == before["verified_s"] + + def test_touch_last_used_preserves_file_mtime(self, redirect_memory_dirs): + user_mem, _ = redirect_memory_dirs + old = (date.today() - timedelta(days=45)).isoformat() + fp = _write_memory(user_mem, "note", content="loader note", + created=old, last_verified=old, mtime_days_ago=45) + mtime_before = fp.stat().st_mtime + + touch_last_used(str(fp)) # a read-side write + + # last_used_at was recorded... + meta, _ = parse_frontmatter(fp.read_text()) + assert meta.get("last_used_at") == date.today().isoformat() + # ...but the mtime was restored, and last_verified untouched. + assert abs(fp.stat().st_mtime - mtime_before) < 1.0 + assert meta.get("last_verified") == old + + +# ── The correct refresh path: explicit verification ───────────────────────── + +class TestExplicitVerificationRefreshes: + def test_mark_verified_clears_staleness(self, redirect_memory_dirs): + user_mem, _ = redirect_memory_dirs + old = (date.today() - timedelta(days=90)).isoformat() + fp = _write_memory(user_mem, "loader_location", + content="loader lives in utils/loader.py", + created=old, last_verified=old, mtime_days_ago=90) + + assert _find_one("loader", "loader_location")["freshness_text"] + + assert mark_verified(str(fp)) is True + meta, _ = parse_frontmatter(fp.read_text()) + assert meta.get("last_verified") == date.today().isoformat() + + after = _find_one("loader", "loader_location") + assert after["freshness_text"] == "", "a just-verified memory is fresh" + assert trust_recency(after["verified_s"]) > 0.95 + + +# ── Ranking: fresh-verified beats stale, even after the stale one is read ──── + +class TestVerificationAnchoredRanking: + def test_recently_verified_outranks_stale(self, redirect_memory_dirs): + user_mem, _ = redirect_memory_dirs + old = (date.today() - timedelta(days=120)).isoformat() + recent = date.today().isoformat() + stale_fp = _write_memory(user_mem, "stale_loader", + content="loader info (old)", + created=old, last_verified=old, mtime_days_ago=120) + _write_memory(user_mem, "fresh_loader", + content="loader info (verified today)", + created=old, last_verified=recent, mtime_days_ago=120) + + # Read the stale one (which, pre-fix, would have made it look fresh). + touch_last_used(str(stale_fp)) + + results = find_relevant_memories("loader", max_results=5, use_ai=False) + ranked = sorted( + results, + key=lambda r: r.get("confidence", 1.0) * trust_recency(r["verified_s"]), + reverse=True, + ) + assert ranked[0]["name"] == "fresh_loader" + assert ranked[-1]["name"] == "stale_loader" + + +# ── Backward compatibility: legacy files without date fields ──────────────── + +class TestLegacyFallback: + def test_missing_dates_fall_back_to_mtime(self, redirect_memory_dirs): + user_mem, _ = redirect_memory_dirs + fp = user_mem / "legacy.md" + # No created / last_verified frontmatter at all. + fp.write_text("---\nname: legacy\ndescription: loader legacy\ntype: project\n---\nloader legacy body\n") + os.utime(fp, ((time.time() - 50 * 86_400,) * 2)) + + r = _find_one("loader", "legacy") + # verified_epoch falls back to mtime → recognised as ~50 days stale, + # and nothing crashes on the empty date fields. + assert r["freshness_text"] + assert 0.1 < trust_recency(r["verified_s"]) < 0.3 + + def test_parse_date_epoch_handles_garbage(self): + assert parse_date_epoch("") == 0.0 + assert parse_date_epoch("not-a-date") == 0.0 + assert parse_date_epoch("2026-04-02") > 0.0 + + def test_verified_epoch_preference_order(self): + lv = parse_date_epoch("2026-06-01") + cr = parse_date_epoch("2026-01-01") + # last_verified wins over created + assert verified_epoch("2026-06-01", "2026-01-01", 123.0) == lv + # created used when last_verified missing + assert verified_epoch("", "2026-01-01", 123.0) == cr + # mtime only as last resort + assert verified_epoch("", "", 123.0) == 123.0