diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/learn/SKILL.md b/platform-integrations/claude/plugins/evolve-lite/skills/learn/SKILL.md index 0d90a265..21e70613 100644 --- a/platform-integrations/claude/plugins/evolve-lite/skills/learn/SKILL.md +++ b/platform-integrations/claude/plugins/evolve-lite/skills/learn/SKILL.md @@ -116,38 +116,6 @@ The script will: - Deduplicate against existing entities - Display confirmation with the total count -### Step 5: Assess Influence of Recalled Entities - -Regardless of whether Step 4 saved new entities, judge whether the guidelines the recall hook served to *this* session were actually followed, contradicted, or simply irrelevant. This closes the provenance loop: the recall hook records *what* was served; this step records *what effect* it had. - -1. Derive this session's `session_id` from the `saved_trajectory_path` extracted in Step 0: strip the directory prefix and the `claude-transcript_` / `.jsonl` affixes. For `.evolve/trajectories/claude-transcript_abc-123.jsonl` the `session_id` is `abc-123`. - -2. Read `.evolve/audit.log` (JSONL, one object per line). Find every line where `event == "recall"` and `session_id` matches. Take the union of their `entities` arrays — that is the set of guideline identifiers served to this session. Each identifier is a relative path from `.evolve/entities/` without the `.md` suffix (e.g. `guideline/foo` for a local entity, or `subscribed/alice/guideline/foo` for a subscribed one), so it unambiguously names one file. If the set is empty, skip this step. - -3. For each identifier, open `.evolve/entities/.md` with the Read tool. Read its content + trigger — that is the guideline's intent. Skip the identifier (log it as an assessment-less entry) if the file is not found. - -4. Compare against the transcript loaded in Step 0. For each identifier, pick one verdict: - - `followed` — the agent's actual actions are consistent with the guideline's recommendation. - - `contradicted` — the guideline's trigger matched the task but the agent did the opposite, or hit the dead end the guideline would have prevented. - - `not_applicable` — the guideline's trigger didn't match what this session was about. - - Keep `evidence` to one short sentence citing a specific action or tool call from the transcript. - -5. Emit one JSON payload and pipe it to the helper: - -```bash -echo '{ - "session_id": "", - "assessments": [ - {"entity": "guideline/", "verdict": "followed", "evidence": "Agent imported struct and parsed APP1 directly"} - ] -}' | python3 ${CLAUDE_PLUGIN_ROOT}/skills/learn/scripts/log_influence.py -``` - -The `entity` value must match exactly what appeared in the recall event — include the `subscribed//` prefix if the entity came from a subscribed repo. - -Emit zero assessments (empty `assessments` list) when no recall events exist for this session. - ## Quality Gate Before saving, review each entity against this checklist: diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/learn/scripts/log_influence.py b/platform-integrations/claude/plugins/evolve-lite/skills/learn/scripts/log_influence.py deleted file mode 100644 index 9a00ddee..00000000 --- a/platform-integrations/claude/plugins/evolve-lite/skills/learn/scripts/log_influence.py +++ /dev/null @@ -1,79 +0,0 @@ -#!/usr/bin/env python3 -"""Append post-hoc influence assessments to .evolve/audit.log. - -Reads JSON from stdin of the form: - { - "session_id": "", - "assessments": [ - {"entity": "", "verdict": "followed|contradicted|not_applicable", - "evidence": ""}, - ... - ] - } -""" - -import json -import sys -from pathlib import Path - -sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent.parent / "lib")) -from entity_io import get_evolve_dir, log as _log # noqa: E402 -import audit # noqa: E402 - - -_ALLOWED_VERDICTS = {"followed", "contradicted", "not_applicable"} - - -def log(message): - _log("influence", message) - - -def main(): - try: - payload = json.load(sys.stdin) - except json.JSONDecodeError as exc: - log(f"Invalid JSON input: {exc}") - print(f"Error: invalid JSON input - {exc}", file=sys.stderr) - sys.exit(1) - - if not isinstance(payload, dict): - log(f"Bad payload type: {type(payload).__name__}") - print("Error: payload must be a JSON object.", file=sys.stderr) - sys.exit(1) - - session_id = payload.get("session_id") - assessments = payload.get("assessments", []) - if not session_id or not isinstance(assessments, list): - log(f"Bad payload shape: session_id={session_id!r} assessments_type={type(assessments).__name__}") - print("Error: payload must include `session_id` and a list `assessments`.", file=sys.stderr) - sys.exit(1) - - project_root = str(get_evolve_dir().resolve().parent) - - written = 0 - for a in assessments: - if not isinstance(a, dict): - log(f"Skipping non-dict assessment item: {a!r}") - continue - entity = a.get("entity") - verdict = a.get("verdict") - evidence = a.get("evidence", "") - if not entity or verdict not in _ALLOWED_VERDICTS: - log(f"Skipping invalid assessment: {a}") - continue - audit.append( - project_root=project_root, - event="influence", - session_id=session_id, - entity=entity, - verdict=verdict, - evidence=evidence, - ) - written += 1 - - log(f"Wrote {written} influence record(s) for session {session_id}") - print(f"Recorded {written} influence assessment(s).") - - -if __name__ == "__main__": - main() diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/recall/scripts/retrieve_entities.py b/platform-integrations/claude/plugins/evolve-lite/skills/recall/scripts/retrieve_entities.py index 1cdae7fa..6043c345 100644 --- a/platform-integrations/claude/plugins/evolve-lite/skills/recall/scripts/retrieve_entities.py +++ b/platform-integrations/claude/plugins/evolve-lite/skills/recall/scripts/retrieve_entities.py @@ -8,8 +8,7 @@ # Add lib to path so we can import entity_io sys.path.insert(0, str(Path(__file__).resolve().parent.parent.parent.parent / "lib")) -from entity_io import find_recall_entity_dirs, get_evolve_dir, markdown_to_entity, log as _log -import audit +from entity_io import find_recall_entity_dirs, markdown_to_entity, log as _log def log(message): @@ -83,13 +82,6 @@ def load_entities_with_source(entities_dir): entity = markdown_to_entity(md) if not entity.get("content"): continue - # Record the on-disk path relative to entities_dir (without the - # .md suffix) as a qualified identifier. This distinguishes - # same-named entities in different trees — e.g. - # "guideline/foo" (local) vs "subscribed/alice/guideline/foo" - # (from a subscribed repo) — so downstream auditing doesn't - # collapse them into one. - entity["_id"] = str(md.relative_to(entities_dir).with_suffix("")) # Detect subscribed entities by path: .../entities/subscribed/{name}/... parts = md.parts try: @@ -137,24 +129,6 @@ def main(): print(output) log(f"Output {len(output)} chars to stdout") - # Audit: record which entities were served to which session. Must not - # fail the hook if logging errors — recall is the user-visible path. - try: - transcript_path = input_data.get("transcript_path", "") - session_id = Path(transcript_path).stem if transcript_path else None - entity_ids = sorted({e["_id"] for e in entities if e.get("_id")}) - if session_id and entity_ids: - project_root = get_evolve_dir().resolve().parent - audit.append( - project_root=str(project_root), - event="recall", - session_id=session_id, - entities=entity_ids, - ) - log(f"Audit: recall session_id={session_id} entities={len(entity_ids)}") - except Exception as exc: - log(f"Audit append failed (non-fatal): {exc}") - if __name__ == "__main__": main() diff --git a/tests/e2e/test_sandbox_learn_recall.py b/tests/e2e/test_sandbox_learn_recall.py index 0731730a..d39293a0 100644 --- a/tests/e2e/test_sandbox_learn_recall.py +++ b/tests/e2e/test_sandbox_learn_recall.py @@ -164,36 +164,3 @@ def test_learn_then_recall_flow(sandbox_ready, sandbox_workspace): # pip-installed). Other libraries (PIL, piexif, exifread) may appear in a # valid guideline as "install via pip and use", so we don't ban them. assert not re.search(r"\bexiftool\b", joined), "session 2 invoked exiftool despite recall guideline:\n" + "\n".join(commands) - - # --- Usage provenance: audit.log should record recall + influence --- - audit_log = sandbox_workspace / ".evolve" / "audit.log" - assert audit_log.is_file(), f"{audit_log} was not created — recall did not append audit events" - - events = [] - for line in audit_log.read_text().splitlines(): - line = line.strip() - if not line: - continue - events.append(json.loads(line)) - - session2_id = session2_transcript.stem.removeprefix("claude-transcript_") - # Recall audit records qualified ids — path relative to .evolve/entities/ - # without the .md suffix — so we match session 1's entities the same way. - session1_ids = {str(p.relative_to(entities_dir).with_suffix("")) for p in entity_files} - - recall_events = [e for e in events if e.get("event") == "recall" and e.get("session_id") == session2_id] - assert recall_events, f"no recall audit event for session 2 ({session2_id}). all events: {events}" - recalled_ids = {eid for e in recall_events for eid in e.get("entities", [])} - assert recalled_ids & session1_ids, f"recall event entities {recalled_ids} did not include any id from session 1 ({session1_ids})" - log.info(f"session 2: audit recorded recall of {recalled_ids}") - - influence_events = [e for e in events if e.get("event") == "influence" and e.get("session_id") == session2_id] - assert influence_events, ( - f"no influence audit event for session 2 ({session2_id}). recall events exist but learn did not emit assessments." - ) - for ie in influence_events: - assert ie.get("verdict") in {"followed", "contradicted", "not_applicable"}, f"influence event has invalid verdict: {ie}" - log.info( - f"session 2: audit recorded {len(influence_events)} influence assessment(s): " - f"{[(e['entity'], e['verdict']) for e in influence_events]}" - ) diff --git a/tests/platform_integrations/test_log_influence.py b/tests/platform_integrations/test_log_influence.py deleted file mode 100644 index 09745054..00000000 --- a/tests/platform_integrations/test_log_influence.py +++ /dev/null @@ -1,199 +0,0 @@ -"""Tests for the Claude plugin's skills/learn/scripts/log_influence.py.""" - -import json -import os -import subprocess -import sys -from pathlib import Path - -import pytest - -pytestmark = [pytest.mark.platform_integrations, pytest.mark.e2e] - -_PLUGIN_ROOT = Path(__file__).parent.parent.parent / "platform-integrations/claude/plugins/evolve-lite" -LOG_INFLUENCE_SCRIPT = _PLUGIN_ROOT / "skills/learn/scripts/log_influence.py" - - -def run_log_influence(project_dir, payload, *, raw_input=None, evolve_dir=None): - """Invoke log_influence.py with the given payload (dict) or raw_input (str).""" - env = {**os.environ} - if evolve_dir: - env["EVOLVE_DIR"] = str(evolve_dir) - stdin = raw_input if raw_input is not None else json.dumps(payload) - return subprocess.run( - [sys.executable, str(LOG_INFLUENCE_SCRIPT)], - input=stdin, - capture_output=True, - text=True, - cwd=str(project_dir), - env=env, - check=False, - ) - - -def read_audit(evolve_dir): - path = evolve_dir / "audit.log" - if not path.is_file(): - return [] - return [json.loads(line) for line in path.read_text().splitlines() if line.strip()] - - -class TestLogInfluence: - def test_writes_single_assessment(self, temp_project_dir): - evolve_dir = temp_project_dir / ".evolve" - result = run_log_influence( - temp_project_dir, - { - "session_id": "abc-123", - "assessments": [ - {"entity": "slug-a", "verdict": "followed", "evidence": "because"}, - ], - }, - evolve_dir=evolve_dir, - ) - assert result.returncode == 0, result.stderr - events = read_audit(evolve_dir) - assert len(events) == 1 - assert events[0] == { - "event": "influence", - "session_id": "abc-123", - "entity": "slug-a", - "verdict": "followed", - "evidence": "because", - "ts": events[0]["ts"], - } - - def test_writes_multiple_assessments(self, temp_project_dir): - evolve_dir = temp_project_dir / ".evolve" - result = run_log_influence( - temp_project_dir, - { - "session_id": "sess-1", - "assessments": [ - {"entity": "slug-a", "verdict": "followed", "evidence": "e1"}, - {"entity": "slug-b", "verdict": "not_applicable", "evidence": "e2"}, - {"entity": "slug-c", "verdict": "contradicted", "evidence": "e3"}, - ], - }, - evolve_dir=evolve_dir, - ) - assert result.returncode == 0, result.stderr - events = read_audit(evolve_dir) - assert len(events) == 3 - verdicts = {e["entity"]: e["verdict"] for e in events} - assert verdicts == {"slug-a": "followed", "slug-b": "not_applicable", "slug-c": "contradicted"} - - def test_skips_assessments_with_invalid_verdict(self, temp_project_dir): - evolve_dir = temp_project_dir / ".evolve" - result = run_log_influence( - temp_project_dir, - { - "session_id": "sess-1", - "assessments": [ - {"entity": "slug-a", "verdict": "bogus", "evidence": "no"}, - {"entity": "slug-b", "verdict": "followed", "evidence": "yes"}, - ], - }, - evolve_dir=evolve_dir, - ) - assert result.returncode == 0, result.stderr - events = read_audit(evolve_dir) - assert len(events) == 1 - assert events[0]["entity"] == "slug-b" - - def test_skips_assessments_missing_entity(self, temp_project_dir): - evolve_dir = temp_project_dir / ".evolve" - result = run_log_influence( - temp_project_dir, - { - "session_id": "sess-1", - "assessments": [ - {"verdict": "followed", "evidence": "no entity"}, - {"entity": "slug-b", "verdict": "followed", "evidence": "ok"}, - ], - }, - evolve_dir=evolve_dir, - ) - assert result.returncode == 0, result.stderr - events = read_audit(evolve_dir) - assert len(events) == 1 - assert events[0]["entity"] == "slug-b" - - def test_skips_non_dict_assessment_items(self, temp_project_dir): - """Non-dict items in the assessments list must not raise AttributeError.""" - evolve_dir = temp_project_dir / ".evolve" - result = run_log_influence( - temp_project_dir, - { - "session_id": "sess-1", - "assessments": [ - "not-a-dict", - 42, - None, - {"entity": "slug-ok", "verdict": "followed", "evidence": "yes"}, - ], - }, - evolve_dir=evolve_dir, - ) - assert result.returncode == 0, result.stderr - events = read_audit(evolve_dir) - assert len(events) == 1 - assert events[0]["entity"] == "slug-ok" - - def test_empty_assessments_list_is_ok(self, temp_project_dir): - evolve_dir = temp_project_dir / ".evolve" - result = run_log_influence( - temp_project_dir, - {"session_id": "sess-1", "assessments": []}, - evolve_dir=evolve_dir, - ) - assert result.returncode == 0, result.stderr - assert read_audit(evolve_dir) == [] - - def test_evidence_defaults_to_empty_string(self, temp_project_dir): - evolve_dir = temp_project_dir / ".evolve" - result = run_log_influence( - temp_project_dir, - { - "session_id": "sess-1", - "assessments": [{"entity": "slug-a", "verdict": "followed"}], - }, - evolve_dir=evolve_dir, - ) - assert result.returncode == 0, result.stderr - events = read_audit(evolve_dir) - assert events[0]["evidence"] == "" - - def test_rejects_non_dict_payload(self, temp_project_dir): - evolve_dir = temp_project_dir / ".evolve" - result = run_log_influence(temp_project_dir, ["not", "a", "dict"], evolve_dir=evolve_dir) - assert result.returncode == 1 - assert "payload" in result.stderr.lower() - assert read_audit(evolve_dir) == [] - - def test_rejects_missing_session_id(self, temp_project_dir): - evolve_dir = temp_project_dir / ".evolve" - result = run_log_influence( - temp_project_dir, - {"assessments": [{"entity": "a", "verdict": "followed"}]}, - evolve_dir=evolve_dir, - ) - assert result.returncode == 1 - assert read_audit(evolve_dir) == [] - - def test_rejects_non_list_assessments(self, temp_project_dir): - evolve_dir = temp_project_dir / ".evolve" - result = run_log_influence( - temp_project_dir, - {"session_id": "sess-1", "assessments": "oops"}, - evolve_dir=evolve_dir, - ) - assert result.returncode == 1 - assert read_audit(evolve_dir) == [] - - def test_rejects_invalid_json(self, temp_project_dir): - evolve_dir = temp_project_dir / ".evolve" - result = run_log_influence(temp_project_dir, None, raw_input="{not valid json", evolve_dir=evolve_dir) - assert result.returncode == 1 - assert "json" in result.stderr.lower() - assert read_audit(evolve_dir) == []