From 5805abc3dc62946eda4ba5a7e5d5cb78c50f372a Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Wed, 6 May 2026 11:58:49 -0500 Subject: [PATCH 01/13] feat(platform-integrations): add provenance usage audits Restores and extends the PR 239 usage-provenance flow on top of the unified plugin source. Adds offline provenance analysis for recalled guidelines, stores trajectories for the supported harnesses, and adds Docker e2e coverage for Claude and Codex learn/recall/provenance flows. The audit path now writes recall and influence events under the configured EVOLVE_DIR instead of deriving a parent project root, so custom evolve data directories keep recall, entities, and provenance together. Influence writes are also idempotent per session/entity so rerunning provenance does not double-count usage. --- .../commands/evolve-lite-provenance.md | 4 + .../bob/evolve-lite/lib/audit.py | 7 +- .../skills/evolve-lite-learn/SKILL.md | 13 +- .../skills/evolve-lite-provenance/SKILL.md | 63 ++++ .../scripts/log_influence.py | 117 ++++++++ .../scripts/retrieve_entities.py | 29 +- .../claude/plugins/evolve-lite/lib/audit.py | 7 +- .../skills/evolve-lite/learn/SKILL.md | 7 +- .../skills/evolve-lite/provenance/SKILL.md | 63 ++++ .../provenance/scripts/log_influence.py | 117 ++++++++ .../recall/scripts/retrieve_entities.py | 29 +- .../evolve-lite/save-trajectory/SKILL.md | 2 +- .../plugins/evolve-lite/lib/audit.py | 7 +- .../skills/evolve-lite/learn/SKILL.md | 13 +- .../skills/evolve-lite/provenance/SKILL.md | 63 ++++ .../provenance/scripts/log_influence.py | 117 ++++++++ .../recall/scripts/retrieve_entities.py | 29 +- .../evolve-lite/.codex-plugin/plugin.json | 1 + .../codex/plugins/evolve-lite/lib/audit.py | 7 +- .../skills/evolve-lite/learn/SKILL.md | 13 +- .../skills/evolve-lite/provenance/SKILL.md | 63 ++++ .../provenance/scripts/log_influence.py | 117 ++++++++ .../recall/scripts/retrieve_entities.py | 29 +- .../evolve-lite/save-trajectory/SKILL.md | 2 +- 
plugin-source/lib/audit.py | 7 +- plugin-source/plugin.toml | 1 + .../skills/evolve-lite/learn/SKILL.md.j2 | 14 +- .../skills/evolve-lite/provenance/SKILL.md.j2 | 64 ++++ .../provenance/scripts/log_influence.py | 117 ++++++++ .../recall/scripts/retrieve_entities.py | 29 +- .../evolve-lite/save-trajectory/SKILL.md.j2 | 4 +- sandbox/README.md | 30 +- ...py => test_claude_sandbox_learn_recall.py} | 58 +++- tests/e2e/test_codex_sandbox_learn_recall.py | 279 ++++++++++++++++++ tests/platform_integrations/test_codex.py | 4 + .../test_log_influence.py | 243 +++++++++++++++ .../test_plugin_structure.py | 7 + tests/platform_integrations/test_retrieve.py | 77 +++++ .../test_skill_directory_names.py | 6 + tests/smoke_skills.py | 29 +- 40 files changed, 1848 insertions(+), 40 deletions(-) create mode 100644 platform-integrations/bob/evolve-lite/commands/evolve-lite-provenance.md create mode 100644 platform-integrations/bob/evolve-lite/skills/evolve-lite-provenance/SKILL.md create mode 100644 platform-integrations/bob/evolve-lite/skills/evolve-lite-provenance/scripts/log_influence.py create mode 100644 platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md create mode 100644 platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py create mode 100644 platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md create mode 100644 platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py create mode 100644 platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md create mode 100644 platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py create mode 100644 plugin-source/skills/evolve-lite/provenance/SKILL.md.j2 create mode 100644 plugin-source/skills/evolve-lite/provenance/scripts/log_influence.py rename tests/e2e/{test_sandbox_learn_recall.py => 
test_claude_sandbox_learn_recall.py} (67%) create mode 100644 tests/e2e/test_codex_sandbox_learn_recall.py create mode 100644 tests/platform_integrations/test_log_influence.py diff --git a/platform-integrations/bob/evolve-lite/commands/evolve-lite-provenance.md b/platform-integrations/bob/evolve-lite/commands/evolve-lite-provenance.md new file mode 100644 index 00000000..2d9bd0fe --- /dev/null +++ b/platform-integrations/bob/evolve-lite/commands/evolve-lite-provenance.md @@ -0,0 +1,4 @@ +--- +description: Analyze saved trajectories and recall audit events offline to record whether recalled guidelines influenced completed sessions. +--- +Use the `evolve-lite-provenance` skill on the current conversation. Follow the skill's instructions exactly. diff --git a/platform-integrations/bob/evolve-lite/lib/audit.py b/platform-integrations/bob/evolve-lite/lib/audit.py index fd5c535a..fa43846b 100644 --- a/platform-integrations/bob/evolve-lite/lib/audit.py +++ b/platform-integrations/bob/evolve-lite/lib/audit.py @@ -5,14 +5,17 @@ import pathlib -def append(project_root=".", **fields): +def append(project_root=".", evolve_dir=None, **fields): """Append a JSON audit entry to .evolve/audit.log. Args: project_root: Root directory that contains .evolve/. + evolve_dir: Explicit evolve data directory. When set, writes directly + to ``<evolve_dir>/audit.log`` instead of deriving it from + ``project_root``. **fields: Arbitrary key-value fields to include in the log entry. 
""" - path = pathlib.Path(project_root) / ".evolve" / "audit.log" + path = pathlib.Path(evolve_dir) / "audit.log" if evolve_dir is not None else pathlib.Path(project_root) / ".evolve" / "audit.log" path.parent.mkdir(parents=True, exist_ok=True) entry = {**fields, "ts": datetime.datetime.now(datetime.UTC).isoformat().replace("+00:00", "Z")} with path.open("a", encoding="utf-8") as f: diff --git a/platform-integrations/bob/evolve-lite/skills/evolve-lite-learn/SKILL.md b/platform-integrations/bob/evolve-lite/skills/evolve-lite-learn/SKILL.md index b2f82264..cab3f129 100644 --- a/platform-integrations/bob/evolve-lite/skills/evolve-lite-learn/SKILL.md +++ b/platform-integrations/bob/evolve-lite/skills/evolve-lite-learn/SKILL.md @@ -33,9 +33,15 @@ Unless that artifact happens to be: ## Workflow +### Step 0: Save and Load the Conversation + +First, use the evolve-lite:save-trajectory skill to save the current conversation to `.evolve/trajectories/`. Capture the exact path from its output as `saved_trajectory_path`. You will attach this exact path to each entity's `trajectory` field in Step 6. + +After saving, read `saved_trajectory_path` with the Read tool and analyze that saved trajectory rather than relying only on live context. If the trajectory cannot be saved or read, output zero entities and exit. Do not invent a trajectory path. + ### Step 1: Analyze the Conversation -Identify from your current conversation: +Identify from the saved trajectory loaded in Step 0: - **Task/Request**: What was the user asking for? - **Steps Taken**: What reasoning, actions, and observations occurred? @@ -76,6 +82,11 @@ Prefer one of these artifact forms: - a small script, saved to a stable path in the workspace or plugin, such as `scripts/`, `tools/`, or another obvious helper location. 
- a documented local workflow if code is not appropriate +When turning an ad hoc command or script into a reusable artifact, remove +incidental one-off inputs such as literal file names, IDs, answer values, or +temporary paths. Keep the reusable procedure that was actually exercised in the +session, and do not add capabilities that were not validated by the work. + If you create an artifact, record: - its path - what it does diff --git a/platform-integrations/bob/evolve-lite/skills/evolve-lite-provenance/SKILL.md b/platform-integrations/bob/evolve-lite/skills/evolve-lite-provenance/SKILL.md new file mode 100644 index 00000000..47e234d6 --- /dev/null +++ b/platform-integrations/bob/evolve-lite/skills/evolve-lite-provenance/SKILL.md @@ -0,0 +1,63 @@ +--- +name: evolve-lite:provenance +description: Analyze saved trajectories and recall audit events offline to record whether recalled guidelines influenced completed sessions. +--- + +# Provenance Analyzer + +## Overview + +This skill runs after one or more sessions have completed. It reads saved trajectories from `.evolve/trajectories/`, matches them to `recall` events in `.evolve/audit.log`, and records post-hoc `influence` events for recalled guidelines. + +Use this skill when you want to compute usage provenance without coupling the work to the live learn step. + +## Workflow + +### Step 1: Load Recall Events + +Read `.evolve/audit.log` as JSONL. Find entries where `event == "recall"` and `entities` is a non-empty list. + +Skip any recall event that already has `influence` entries for the same `session_id` and entity ids. Do not write duplicate influence records. + +### Step 2: Locate Saved Trajectories + +List `.evolve/trajectories/` and match each recall event to a trajectory by `session_id`. + +Supported trajectory names: +- `claude-transcript_<session_id>.jsonl` +- `trajectory_*.json` when its content corresponds to the session being assessed + +If you cannot confidently match a recall event to a trajectory, skip it. 
+ +### Step 3: Read Recalled Entities + +For each recalled entity id, open `.evolve/entities/<id>.md`. The id is a path relative to `.evolve/entities/` without the `.md` suffix, such as `guideline/foo` or `subscribed/alice/guideline/foo`. + +Read the entity content and trigger. Skip ids whose files are missing. + +### Step 4: Assess Influence + +Compare each recalled entity with the matched trajectory. Pick exactly one verdict: + +- `followed` - the agent's actual actions are consistent with the guideline. +- `contradicted` - the guideline applied, but the agent did the opposite or repeated the avoidable dead end. +- `not_applicable` - the guideline was recalled but did not apply to this session. + +Keep `evidence` to one short sentence citing a concrete action, tool call, or absence in the trajectory. + +### Step 5: Write Influence Events + +Pipe one JSON payload per assessed session to the helper: + +```bash +echo '{ + "session_id": "<session_id>", + "assessments": [ + {"entity": "guideline/<name>", "verdict": "followed", "evidence": "Agent used the saved parser before trying shell fallbacks."} + ] +}' | python3 .bob/skills/evolve-lite-provenance/scripts/log_influence.py +``` + +The `entity` value must match exactly what appeared in the recall event, including any `subscribed/<source>/` prefix. + +It is valid to emit an empty `assessments` list when recall events exist but no recalled guideline can be assessed. diff --git a/platform-integrations/bob/evolve-lite/skills/evolve-lite-provenance/scripts/log_influence.py b/platform-integrations/bob/evolve-lite/skills/evolve-lite-provenance/scripts/log_influence.py new file mode 100644 index 00000000..c22c6870 --- /dev/null +++ b/platform-integrations/bob/evolve-lite/skills/evolve-lite-provenance/scripts/log_influence.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +"""Append post-hoc influence assessments to .evolve/audit.log. 
+ +Reads JSON from stdin of the form: + { + "session_id": "", + "assessments": [ + {"entity": "", "verdict": "followed|contradicted|not_applicable", + "evidence": ""}, + ... + ] + } +""" + +import json +import sys +from pathlib import Path + +# Walk up from the script location to find the installed plugin lib directory. +# claude/claw-code/codex/bob all ship a sibling lib/ next to skills/; bob's +# installer copies it to .bob/evolve-lib/, hence both names are checked. +_script = Path(__file__).resolve() +_lib = None +for _ancestor in _script.parents: + for _candidate in (_ancestor / "lib", _ancestor / "evolve-lib"): + if (_candidate / "entity_io.py").is_file(): + _lib = _candidate + break + if _lib is not None: + break +if _lib is None: + raise ImportError(f"Cannot find plugin lib directory above {_script}") +sys.path.insert(0, str(_lib)) +from entity_io import get_evolve_dir, log as _log # noqa: E402 +import audit # noqa: E402 + + +_ALLOWED_VERDICTS = {"followed", "contradicted", "not_applicable"} + + +def log(message): + _log("influence", message) + + +def existing_influence_keys(evolve_dir): + audit_log = Path(evolve_dir) / "audit.log" + if not audit_log.is_file(): + return set() + + keys = set() + for line in audit_log.read_text(encoding="utf-8").splitlines(): + if not line.strip(): + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + continue + if event.get("event") == "influence" and event.get("session_id") and event.get("entity"): + keys.add((event["session_id"], event["entity"])) + return keys + + +def main(): + try: + payload = json.load(sys.stdin) + except json.JSONDecodeError as exc: + log(f"Invalid JSON input: {exc}") + print(f"Error: invalid JSON input - {exc}", file=sys.stderr) + sys.exit(1) + + if not isinstance(payload, dict): + log(f"Bad payload type: {type(payload).__name__}") + print("Error: payload must be a JSON object.", file=sys.stderr) + sys.exit(1) + + session_id = payload.get("session_id") + assessments = 
payload.get("assessments", []) + if not session_id or not isinstance(assessments, list): + log(f"Bad payload shape: session_id={session_id!r} assessments_type={type(assessments).__name__}") + print("Error: payload must include `session_id` and a list `assessments`.", file=sys.stderr) + sys.exit(1) + + evolve_dir = get_evolve_dir().resolve() + existing_keys = existing_influence_keys(evolve_dir) + + written = 0 + for assessment in assessments: + if not isinstance(assessment, dict): + log(f"Skipping non-dict assessment item: {assessment!r}") + continue + entity = assessment.get("entity") + verdict = assessment.get("verdict") + evidence = assessment.get("evidence", "") + if not entity or verdict not in _ALLOWED_VERDICTS: + log(f"Skipping invalid assessment: {assessment}") + continue + key = (session_id, entity) + if key in existing_keys: + log(f"Skipping duplicate influence assessment: session_id={session_id} entity={entity}") + continue + audit.append( + evolve_dir=str(evolve_dir), + event="influence", + session_id=session_id, + entity=entity, + verdict=verdict, + evidence=evidence, + ) + existing_keys.add(key) + written += 1 + + log(f"Wrote {written} influence record(s) for session {session_id}") + print(f"Recorded {written} influence assessment(s).") + + +if __name__ == "__main__": + main() diff --git a/platform-integrations/bob/evolve-lite/skills/evolve-lite-recall/scripts/retrieve_entities.py b/platform-integrations/bob/evolve-lite/skills/evolve-lite-recall/scripts/retrieve_entities.py index ade892fe..2d54e439 100644 --- a/platform-integrations/bob/evolve-lite/skills/evolve-lite-recall/scripts/retrieve_entities.py +++ b/platform-integrations/bob/evolve-lite/skills/evolve-lite-recall/scripts/retrieve_entities.py @@ -21,7 +21,8 @@ if _lib is None: raise ImportError(f"Cannot find plugin lib directory above {_script}") sys.path.insert(0, str(_lib)) -from entity_io import find_entities_dir, markdown_to_entity, log as _log # noqa: E402 +from entity_io import 
find_entities_dir, get_evolve_dir, markdown_to_entity, log as _log # noqa: E402 +import audit # noqa: E402 def log(message): @@ -81,6 +82,7 @@ def load_entities_with_source(entities_dir): continue entity.pop("_source", None) + entity["_id"] = str(md.relative_to(entities_dir).with_suffix("")) parts = md.relative_to(entities_dir).parts if parts and parts[0] == "subscribed" and len(parts) > 1: entity["_source"] = parts[1] @@ -139,6 +141,31 @@ def main(): print(output) log(f"Output {len(output)} chars to stdout") + # Audit which entity ids were served to this session. Logging is + # intentionally best-effort so recall never fails because provenance + # recording could not append to audit.log. + try: + if isinstance(input_data, dict): + transcript_path = input_data.get("transcript_path", "") + else: + transcript_path = "" + session_id = None + if transcript_path: + session_id = Path(transcript_path).stem.removeprefix("claude-transcript_") + elif isinstance(input_data.get("session_id"), str): + session_id = input_data["session_id"] + entity_ids = sorted({entity["_id"] for entity in entities if entity.get("_id")}) + if session_id and entity_ids: + audit.append( + evolve_dir=str(get_evolve_dir().resolve()), + event="recall", + session_id=session_id, + entities=entity_ids, + ) + log(f"Audit: recall session_id={session_id} entities={len(entity_ids)}") + except Exception as exc: + log(f"Audit append failed (non-fatal): {exc}") + if __name__ == "__main__": main() diff --git a/platform-integrations/claude/plugins/evolve-lite/lib/audit.py b/platform-integrations/claude/plugins/evolve-lite/lib/audit.py index fd5c535a..fa43846b 100644 --- a/platform-integrations/claude/plugins/evolve-lite/lib/audit.py +++ b/platform-integrations/claude/plugins/evolve-lite/lib/audit.py @@ -5,14 +5,17 @@ import pathlib -def append(project_root=".", **fields): +def append(project_root=".", evolve_dir=None, **fields): """Append a JSON audit entry to .evolve/audit.log. 
Args: project_root: Root directory that contains .evolve/. + evolve_dir: Explicit evolve data directory. When set, writes directly + to ``<evolve_dir>/audit.log`` instead of deriving it from + ``project_root``. **fields: Arbitrary key-value fields to include in the log entry. """ - path = pathlib.Path(project_root) / ".evolve" / "audit.log" + path = pathlib.Path(evolve_dir) / "audit.log" if evolve_dir is not None else pathlib.Path(project_root) / ".evolve" / "audit.log" path.parent.mkdir(parents=True, exist_ok=True) entry = {**fields, "ts": datetime.datetime.now(datetime.UTC).isoformat().replace("+00:00", "Z")} with path.open("a", encoding="utf-8") as f: diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/learn/SKILL.md b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/learn/SKILL.md index 1de6b643..5e33e376 100644 --- a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/learn/SKILL.md +++ b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/learn/SKILL.md @@ -48,7 +48,7 @@ The transcript is JSONL: each line is a separate JSON object. Filter for `"type" ### Step 1: Analyze the Conversation -Identify from your current conversation (loaded from the transcript): +Identify from the saved trajectory loaded in Step 0: - **Task/Request**: What was the user asking for? - **Steps Taken**: What reasoning, actions, and observations occurred? @@ -89,6 +89,11 @@ Prefer one of these artifact forms: - a small script, saved to a stable path in the workspace or plugin, such as `scripts/`, `tools/`, or another obvious helper location. - a documented local workflow if code is not appropriate +When turning an ad hoc command or script into a reusable artifact, remove +incidental one-off inputs such as literal file names, IDs, answer values, or +temporary paths. Keep the reusable procedure that was actually exercised in the +session, and do not add capabilities that were not validated by the work. 
+ If you create an artifact, record: - its path - what it does diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md new file mode 100644 index 00000000..0d8eaa75 --- /dev/null +++ b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md @@ -0,0 +1,63 @@ +--- +name: provenance +description: Analyze saved trajectories and recall audit events offline to record whether recalled guidelines influenced completed sessions. +--- + +# Provenance Analyzer + +## Overview + +This skill runs after one or more sessions have completed. It reads saved trajectories from `.evolve/trajectories/`, matches them to `recall` events in `.evolve/audit.log`, and records post-hoc `influence` events for recalled guidelines. + +Use this skill when you want to compute usage provenance without coupling the work to the live learn step. + +## Workflow + +### Step 1: Load Recall Events + +Read `.evolve/audit.log` as JSONL. Find entries where `event == "recall"` and `entities` is a non-empty list. + +Skip any recall event that already has `influence` entries for the same `session_id` and entity ids. Do not write duplicate influence records. + +### Step 2: Locate Saved Trajectories + +List `.evolve/trajectories/` and match each recall event to a trajectory by `session_id`. + +Supported trajectory names: +- `claude-transcript_.jsonl` +- `trajectory_*.json` when its content corresponds to the session being assessed + +If you cannot confidently match a recall event to a trajectory, skip it. + +### Step 3: Read Recalled Entities + +For each recalled entity id, open `.evolve/entities/.md`. The id is a path relative to `.evolve/entities/` without the `.md` suffix, such as `guideline/foo` or `subscribed/alice/guideline/foo`. + +Read the entity content and trigger. Skip ids whose files are missing. 
+ +### Step 4: Assess Influence + +Compare each recalled entity with the matched trajectory. Pick exactly one verdict: + +- `followed` - the agent's actual actions are consistent with the guideline. +- `contradicted` - the guideline applied, but the agent did the opposite or repeated the avoidable dead end. +- `not_applicable` - the guideline was recalled but did not apply to this session. + +Keep `evidence` to one short sentence citing a concrete action, tool call, or absence in the trajectory. + +### Step 5: Write Influence Events + +Pipe one JSON payload per assessed session to the helper: + +```bash +echo '{ + "session_id": "", + "assessments": [ + {"entity": "guideline/", "verdict": "followed", "evidence": "Agent used the saved parser before trying shell fallbacks."} + ] +}' | python3 ${CLAUDE_PLUGIN_ROOT}/skills/evolve-lite/provenance/scripts/log_influence.py +``` + +The `entity` value must match exactly what appeared in the recall event, including any `subscribed//` prefix. + +It is valid to emit an empty `assessments` list when recall events exist but no recalled guideline can be assessed. diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py new file mode 100644 index 00000000..c22c6870 --- /dev/null +++ b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +"""Append post-hoc influence assessments to .evolve/audit.log. + +Reads JSON from stdin of the form: + { + "session_id": "", + "assessments": [ + {"entity": "", "verdict": "followed|contradicted|not_applicable", + "evidence": ""}, + ... + ] + } +""" + +import json +import sys +from pathlib import Path + +# Walk up from the script location to find the installed plugin lib directory. 
+# claude/claw-code/codex/bob all ship a sibling lib/ next to skills/; bob's +# installer copies it to .bob/evolve-lib/, hence both names are checked. +_script = Path(__file__).resolve() +_lib = None +for _ancestor in _script.parents: + for _candidate in (_ancestor / "lib", _ancestor / "evolve-lib"): + if (_candidate / "entity_io.py").is_file(): + _lib = _candidate + break + if _lib is not None: + break +if _lib is None: + raise ImportError(f"Cannot find plugin lib directory above {_script}") +sys.path.insert(0, str(_lib)) +from entity_io import get_evolve_dir, log as _log # noqa: E402 +import audit # noqa: E402 + + +_ALLOWED_VERDICTS = {"followed", "contradicted", "not_applicable"} + + +def log(message): + _log("influence", message) + + +def existing_influence_keys(evolve_dir): + audit_log = Path(evolve_dir) / "audit.log" + if not audit_log.is_file(): + return set() + + keys = set() + for line in audit_log.read_text(encoding="utf-8").splitlines(): + if not line.strip(): + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + continue + if event.get("event") == "influence" and event.get("session_id") and event.get("entity"): + keys.add((event["session_id"], event["entity"])) + return keys + + +def main(): + try: + payload = json.load(sys.stdin) + except json.JSONDecodeError as exc: + log(f"Invalid JSON input: {exc}") + print(f"Error: invalid JSON input - {exc}", file=sys.stderr) + sys.exit(1) + + if not isinstance(payload, dict): + log(f"Bad payload type: {type(payload).__name__}") + print("Error: payload must be a JSON object.", file=sys.stderr) + sys.exit(1) + + session_id = payload.get("session_id") + assessments = payload.get("assessments", []) + if not session_id or not isinstance(assessments, list): + log(f"Bad payload shape: session_id={session_id!r} assessments_type={type(assessments).__name__}") + print("Error: payload must include `session_id` and a list `assessments`.", file=sys.stderr) + sys.exit(1) + + evolve_dir = 
get_evolve_dir().resolve() + existing_keys = existing_influence_keys(evolve_dir) + + written = 0 + for assessment in assessments: + if not isinstance(assessment, dict): + log(f"Skipping non-dict assessment item: {assessment!r}") + continue + entity = assessment.get("entity") + verdict = assessment.get("verdict") + evidence = assessment.get("evidence", "") + if not entity or verdict not in _ALLOWED_VERDICTS: + log(f"Skipping invalid assessment: {assessment}") + continue + key = (session_id, entity) + if key in existing_keys: + log(f"Skipping duplicate influence assessment: session_id={session_id} entity={entity}") + continue + audit.append( + evolve_dir=str(evolve_dir), + event="influence", + session_id=session_id, + entity=entity, + verdict=verdict, + evidence=evidence, + ) + existing_keys.add(key) + written += 1 + + log(f"Wrote {written} influence record(s) for session {session_id}") + print(f"Recorded {written} influence assessment(s).") + + +if __name__ == "__main__": + main() diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py index ade892fe..2d54e439 100644 --- a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py +++ b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py @@ -21,7 +21,8 @@ if _lib is None: raise ImportError(f"Cannot find plugin lib directory above {_script}") sys.path.insert(0, str(_lib)) -from entity_io import find_entities_dir, markdown_to_entity, log as _log # noqa: E402 +from entity_io import find_entities_dir, get_evolve_dir, markdown_to_entity, log as _log # noqa: E402 +import audit # noqa: E402 def log(message): @@ -81,6 +82,7 @@ def load_entities_with_source(entities_dir): continue entity.pop("_source", None) + entity["_id"] = 
str(md.relative_to(entities_dir).with_suffix("")) parts = md.relative_to(entities_dir).parts if parts and parts[0] == "subscribed" and len(parts) > 1: entity["_source"] = parts[1] @@ -139,6 +141,31 @@ def main(): print(output) log(f"Output {len(output)} chars to stdout") + # Audit which entity ids were served to this session. Logging is + # intentionally best-effort so recall never fails because provenance + # recording could not append to audit.log. + try: + if isinstance(input_data, dict): + transcript_path = input_data.get("transcript_path", "") + else: + transcript_path = "" + session_id = None + if transcript_path: + session_id = Path(transcript_path).stem.removeprefix("claude-transcript_") + elif isinstance(input_data.get("session_id"), str): + session_id = input_data["session_id"] + entity_ids = sorted({entity["_id"] for entity in entities if entity.get("_id")}) + if session_id and entity_ids: + audit.append( + evolve_dir=str(get_evolve_dir().resolve()), + event="recall", + session_id=session_id, + entities=entity_ids, + ) + log(f"Audit: recall session_id={session_id} entities={len(entity_ids)}") + except Exception as exc: + log(f"Audit append failed (non-fatal): {exc}") + if __name__ == "__main__": main() diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md index 0c518694..7ce54252 100644 --- a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md +++ b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md @@ -115,7 +115,7 @@ Write the trajectory JSON to a temporary file using the **Write** tool, then pas 2. 
Run the helper script with the file path as an argument: ```bash -tmp=.evolve/tmp/trajectory_input.json; mkdir -p .evolve/tmp; trap 'rm -f "$tmp"' EXIT; python3 "${CLAUDE_PLUGIN_ROOT}/skills/save-trajectory/scripts/save_trajectory.py" "$tmp" +tmp=.evolve/tmp/trajectory_input.json; mkdir -p .evolve/tmp; trap 'rm -f "$tmp"' EXIT; python3 "${CLAUDE_PLUGIN_ROOT}/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py" "$tmp" ``` **Important**: Do NOT use inline Python scripts, heredocs, or stdin piping to pass the trajectory JSON. Always use the Write tool to create a temp file first. This avoids escaping issues with backslashes, quotes, and newlines in conversation content. diff --git a/platform-integrations/claw-code/plugins/evolve-lite/lib/audit.py b/platform-integrations/claw-code/plugins/evolve-lite/lib/audit.py index fd5c535a..fa43846b 100644 --- a/platform-integrations/claw-code/plugins/evolve-lite/lib/audit.py +++ b/platform-integrations/claw-code/plugins/evolve-lite/lib/audit.py @@ -5,14 +5,17 @@ import pathlib -def append(project_root=".", **fields): +def append(project_root=".", evolve_dir=None, **fields): """Append a JSON audit entry to .evolve/audit.log. Args: project_root: Root directory that contains .evolve/. + evolve_dir: Explicit evolve data directory. When set, writes directly + to ``/audit.log`` instead of deriving it from + ``project_root``. **fields: Arbitrary key-value fields to include in the log entry. 
""" - path = pathlib.Path(project_root) / ".evolve" / "audit.log" + path = pathlib.Path(evolve_dir) / "audit.log" if evolve_dir is not None else pathlib.Path(project_root) / ".evolve" / "audit.log" path.parent.mkdir(parents=True, exist_ok=True) entry = {**fields, "ts": datetime.datetime.now(datetime.UTC).isoformat().replace("+00:00", "Z")} with path.open("a", encoding="utf-8") as f: diff --git a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/learn/SKILL.md b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/learn/SKILL.md index ad0fef58..e6174e3a 100644 --- a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/learn/SKILL.md +++ b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/learn/SKILL.md @@ -33,9 +33,15 @@ Unless that artifact happens to be: ## Workflow +### Step 0: Save and Load the Conversation + +First, use the /evolve-lite:save-trajectory skill to save the current conversation to `.evolve/trajectories/`. Capture the exact path from its output as `saved_trajectory_path`. You will attach this exact path to each entity's `trajectory` field in Step 6. + +After saving, read `saved_trajectory_path` with the Read tool and analyze that saved trajectory rather than relying only on live context. If the trajectory cannot be saved or read, output zero entities and exit. Do not invent a trajectory path. + ### Step 1: Analyze the Conversation -Identify from your current conversation: +Identify from the saved trajectory loaded in Step 0: - **Task/Request**: What was the user asking for? - **Steps Taken**: What reasoning, actions, and observations occurred? @@ -76,6 +82,11 @@ Prefer one of these artifact forms: - a small script, saved to a stable path in the workspace or plugin, such as `scripts/`, `tools/`, or another obvious helper location. 
- a documented local workflow if code is not appropriate +When turning an ad hoc command or script into a reusable artifact, remove +incidental one-off inputs such as literal file names, IDs, answer values, or +temporary paths. Keep the reusable procedure that was actually exercised in the +session, and do not add capabilities that were not validated by the work. + If you create an artifact, record: - its path - what it does diff --git a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md new file mode 100644 index 00000000..e885d4ee --- /dev/null +++ b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md @@ -0,0 +1,63 @@ +--- +name: provenance +description: Analyze saved trajectories and recall audit events offline to record whether recalled guidelines influenced completed sessions. +--- + +# Provenance Analyzer + +## Overview + +This skill runs after one or more sessions have completed. It reads saved trajectories from `.evolve/trajectories/`, matches them to `recall` events in `.evolve/audit.log`, and records post-hoc `influence` events for recalled guidelines. + +Use this skill when you want to compute usage provenance without coupling the work to the live learn step. + +## Workflow + +### Step 1: Load Recall Events + +Read `.evolve/audit.log` as JSONL. Find entries where `event == "recall"` and `entities` is a non-empty list. + +Skip any recall event that already has `influence` entries for the same `session_id` and entity ids. Do not write duplicate influence records. + +### Step 2: Locate Saved Trajectories + +List `.evolve/trajectories/` and match each recall event to a trajectory by `session_id`. 
+ +Supported trajectory names: +- `claude-transcript_<session-id>.jsonl` +- `trajectory_*.json` when its content corresponds to the session being assessed + +If you cannot confidently match a recall event to a trajectory, skip it. + +### Step 3: Read Recalled Entities + +For each recalled entity id, open `.evolve/entities/<entity-id>.md`. The id is a path relative to `.evolve/entities/` without the `.md` suffix, such as `guideline/foo` or `subscribed/alice/guideline/foo`. + +Read the entity content and trigger. Skip ids whose files are missing. + +### Step 4: Assess Influence + +Compare each recalled entity with the matched trajectory. Pick exactly one verdict: + +- `followed` - the agent's actual actions are consistent with the guideline. +- `contradicted` - the guideline applied, but the agent did the opposite or repeated the avoidable dead end. +- `not_applicable` - the guideline was recalled but did not apply to this session. + +Keep `evidence` to one short sentence citing a concrete action, tool call, or absence in the trajectory. + +### Step 5: Write Influence Events + +Pipe one JSON payload per assessed session to the helper: + +```bash +echo '{ + "session_id": "<session-id>", + "assessments": [ + {"entity": "guideline/<slug>", "verdict": "followed", "evidence": "Agent used the saved parser before trying shell fallbacks."} + ] +}' | sh -lc 'real_home="$(python3 -c "import os,pwd; print(pwd.getpwuid(os.getuid()).pw_dir)")"; config_home="${CLAW_CONFIG_HOME:-$real_home/.claw}"; script=".claw/skills/evolve-lite:provenance/scripts/log_influence.py"; [ -f "$script" ] || script="$config_home/skills/evolve-lite:provenance/scripts/log_influence.py"; python3 "$script"' +``` + +The `entity` value must match exactly what appeared in the recall event, including any `subscribed/<source>/` prefix. + +It is valid to emit an empty `assessments` list when recall events exist but no recalled guideline can be assessed.
diff --git a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py new file mode 100644 index 00000000..c22c6870 --- /dev/null +++ b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +"""Append post-hoc influence assessments to .evolve/audit.log. + +Reads JSON from stdin of the form: + { + "session_id": "", + "assessments": [ + {"entity": "", "verdict": "followed|contradicted|not_applicable", + "evidence": ""}, + ... + ] + } +""" + +import json +import sys +from pathlib import Path + +# Walk up from the script location to find the installed plugin lib directory. +# claude/claw-code/codex/bob all ship a sibling lib/ next to skills/; bob's +# installer copies it to .bob/evolve-lib/, hence both names are checked. +_script = Path(__file__).resolve() +_lib = None +for _ancestor in _script.parents: + for _candidate in (_ancestor / "lib", _ancestor / "evolve-lib"): + if (_candidate / "entity_io.py").is_file(): + _lib = _candidate + break + if _lib is not None: + break +if _lib is None: + raise ImportError(f"Cannot find plugin lib directory above {_script}") +sys.path.insert(0, str(_lib)) +from entity_io import get_evolve_dir, log as _log # noqa: E402 +import audit # noqa: E402 + + +_ALLOWED_VERDICTS = {"followed", "contradicted", "not_applicable"} + + +def log(message): + _log("influence", message) + + +def existing_influence_keys(evolve_dir): + audit_log = Path(evolve_dir) / "audit.log" + if not audit_log.is_file(): + return set() + + keys = set() + for line in audit_log.read_text(encoding="utf-8").splitlines(): + if not line.strip(): + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + continue + if event.get("event") == "influence" and event.get("session_id") and event.get("entity"): + 
keys.add((event["session_id"], event["entity"])) + return keys + + +def main(): + try: + payload = json.load(sys.stdin) + except json.JSONDecodeError as exc: + log(f"Invalid JSON input: {exc}") + print(f"Error: invalid JSON input - {exc}", file=sys.stderr) + sys.exit(1) + + if not isinstance(payload, dict): + log(f"Bad payload type: {type(payload).__name__}") + print("Error: payload must be a JSON object.", file=sys.stderr) + sys.exit(1) + + session_id = payload.get("session_id") + assessments = payload.get("assessments", []) + if not session_id or not isinstance(assessments, list): + log(f"Bad payload shape: session_id={session_id!r} assessments_type={type(assessments).__name__}") + print("Error: payload must include `session_id` and a list `assessments`.", file=sys.stderr) + sys.exit(1) + + evolve_dir = get_evolve_dir().resolve() + existing_keys = existing_influence_keys(evolve_dir) + + written = 0 + for assessment in assessments: + if not isinstance(assessment, dict): + log(f"Skipping non-dict assessment item: {assessment!r}") + continue + entity = assessment.get("entity") + verdict = assessment.get("verdict") + evidence = assessment.get("evidence", "") + if not entity or verdict not in _ALLOWED_VERDICTS: + log(f"Skipping invalid assessment: {assessment}") + continue + key = (session_id, entity) + if key in existing_keys: + log(f"Skipping duplicate influence assessment: session_id={session_id} entity={entity}") + continue + audit.append( + evolve_dir=str(evolve_dir), + event="influence", + session_id=session_id, + entity=entity, + verdict=verdict, + evidence=evidence, + ) + existing_keys.add(key) + written += 1 + + log(f"Wrote {written} influence record(s) for session {session_id}") + print(f"Recorded {written} influence assessment(s).") + + +if __name__ == "__main__": + main() diff --git a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py 
b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py index ade892fe..2d54e439 100644 --- a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py +++ b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py @@ -21,7 +21,8 @@ if _lib is None: raise ImportError(f"Cannot find plugin lib directory above {_script}") sys.path.insert(0, str(_lib)) -from entity_io import find_entities_dir, markdown_to_entity, log as _log # noqa: E402 +from entity_io import find_entities_dir, get_evolve_dir, markdown_to_entity, log as _log # noqa: E402 +import audit # noqa: E402 def log(message): @@ -81,6 +82,7 @@ def load_entities_with_source(entities_dir): continue entity.pop("_source", None) + entity["_id"] = str(md.relative_to(entities_dir).with_suffix("")) parts = md.relative_to(entities_dir).parts if parts and parts[0] == "subscribed" and len(parts) > 1: entity["_source"] = parts[1] @@ -139,6 +141,31 @@ def main(): print(output) log(f"Output {len(output)} chars to stdout") + # Audit which entity ids were served to this session. Logging is + # intentionally best-effort so recall never fails because provenance + # recording could not append to audit.log. 
+ try: + if isinstance(input_data, dict): + transcript_path = input_data.get("transcript_path", "") + else: + transcript_path = "" + session_id = None + if transcript_path: + session_id = Path(transcript_path).stem.removeprefix("claude-transcript_") + elif isinstance(input_data.get("session_id"), str): + session_id = input_data["session_id"] + entity_ids = sorted({entity["_id"] for entity in entities if entity.get("_id")}) + if session_id and entity_ids: + audit.append( + evolve_dir=str(get_evolve_dir().resolve()), + event="recall", + session_id=session_id, + entities=entity_ids, + ) + log(f"Audit: recall session_id={session_id} entities={len(entity_ids)}") + except Exception as exc: + log(f"Audit append failed (non-fatal): {exc}") + if __name__ == "__main__": main() diff --git a/platform-integrations/codex/plugins/evolve-lite/.codex-plugin/plugin.json b/platform-integrations/codex/plugins/evolve-lite/.codex-plugin/plugin.json index bf5ab1dd..0632a6d5 100644 --- a/platform-integrations/codex/plugins/evolve-lite/.codex-plugin/plugin.json +++ b/platform-integrations/codex/plugins/evolve-lite/.codex-plugin/plugin.json @@ -28,6 +28,7 @@ "defaultPrompt": [ "Recall Evolve entities for this task.", "Save new Evolve learnings from this session.", + "Analyze saved trajectories for Evolve guideline provenance.", "Show me the entities stored for this repo.", "Publish one of my Evolve guidelines.", "Subscribe to a teammate's Evolve guidelines repo." diff --git a/platform-integrations/codex/plugins/evolve-lite/lib/audit.py b/platform-integrations/codex/plugins/evolve-lite/lib/audit.py index fd5c535a..fa43846b 100644 --- a/platform-integrations/codex/plugins/evolve-lite/lib/audit.py +++ b/platform-integrations/codex/plugins/evolve-lite/lib/audit.py @@ -5,14 +5,17 @@ import pathlib -def append(project_root=".", **fields): +def append(project_root=".", evolve_dir=None, **fields): """Append a JSON audit entry to .evolve/audit.log. 
Args: project_root: Root directory that contains .evolve/. + evolve_dir: Explicit evolve data directory. When set, writes directly + to ``/audit.log`` instead of deriving it from + ``project_root``. **fields: Arbitrary key-value fields to include in the log entry. """ - path = pathlib.Path(project_root) / ".evolve" / "audit.log" + path = pathlib.Path(evolve_dir) / "audit.log" if evolve_dir is not None else pathlib.Path(project_root) / ".evolve" / "audit.log" path.parent.mkdir(parents=True, exist_ok=True) entry = {**fields, "ts": datetime.datetime.now(datetime.UTC).isoformat().replace("+00:00", "Z")} with path.open("a", encoding="utf-8") as f: diff --git a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/learn/SKILL.md b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/learn/SKILL.md index 086cf355..13d436e4 100644 --- a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/learn/SKILL.md +++ b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/learn/SKILL.md @@ -33,9 +33,15 @@ Unless that artifact happens to be: ## Workflow +### Step 0: Save and Load the Conversation + +First, use the evolve-lite:save-trajectory skill to save the current conversation to `.evolve/trajectories/`. Capture the exact path from its output as `saved_trajectory_path`. You will attach this exact path to each entity's `trajectory` field in Step 6. + +After saving, read `saved_trajectory_path` with the Read tool and analyze that saved trajectory rather than relying only on live context. If the trajectory cannot be saved or read, output zero entities and exit. Do not invent a trajectory path. + ### Step 1: Analyze the Conversation -Identify from your current conversation: +Identify from the saved trajectory loaded in Step 0: - **Task/Request**: What was the user asking for? - **Steps Taken**: What reasoning, actions, and observations occurred? 
@@ -76,6 +82,11 @@ Prefer one of these artifact forms: - a small script, saved to a stable path in the workspace or plugin, such as `scripts/`, `tools/`, or another obvious helper location. - a documented local workflow if code is not appropriate +When turning an ad hoc command or script into a reusable artifact, remove +incidental one-off inputs such as literal file names, IDs, answer values, or +temporary paths. Keep the reusable procedure that was actually exercised in the +session, and do not add capabilities that were not validated by the work. + If you create an artifact, record: - its path - what it does diff --git a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md new file mode 100644 index 00000000..3e1d8ab2 --- /dev/null +++ b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md @@ -0,0 +1,63 @@ +--- +name: provenance +description: Analyze saved trajectories and recall audit events offline to record whether recalled guidelines influenced completed sessions. +--- + +# Provenance Analyzer + +## Overview + +This skill runs after one or more sessions have completed. It reads saved trajectories from `.evolve/trajectories/`, matches them to `recall` events in `.evolve/audit.log`, and records post-hoc `influence` events for recalled guidelines. + +Use this skill when you want to compute usage provenance without coupling the work to the live learn step. + +## Workflow + +### Step 1: Load Recall Events + +Read `.evolve/audit.log` as JSONL. Find entries where `event == "recall"` and `entities` is a non-empty list. + +Skip any recall event that already has `influence` entries for the same `session_id` and entity ids. Do not write duplicate influence records. + +### Step 2: Locate Saved Trajectories + +List `.evolve/trajectories/` and match each recall event to a trajectory by `session_id`. 
+ +Supported trajectory names: +- `claude-transcript_<session-id>.jsonl` +- `trajectory_*.json` when its content corresponds to the session being assessed + +If you cannot confidently match a recall event to a trajectory, skip it. + +### Step 3: Read Recalled Entities + +For each recalled entity id, open `.evolve/entities/<entity-id>.md`. The id is a path relative to `.evolve/entities/` without the `.md` suffix, such as `guideline/foo` or `subscribed/alice/guideline/foo`. + +Read the entity content and trigger. Skip ids whose files are missing. + +### Step 4: Assess Influence + +Compare each recalled entity with the matched trajectory. Pick exactly one verdict: + +- `followed` - the agent's actual actions are consistent with the guideline. +- `contradicted` - the guideline applied, but the agent did the opposite or repeated the avoidable dead end. +- `not_applicable` - the guideline was recalled but did not apply to this session. + +Keep `evidence` to one short sentence citing a concrete action, tool call, or absence in the trajectory. + +### Step 5: Write Influence Events + +Pipe one JSON payload per assessed session to the helper: + +```bash +echo '{ + "session_id": "<session-id>", + "assessments": [ + {"entity": "guideline/<slug>", "verdict": "followed", "evidence": "Agent used the saved parser before trying shell fallbacks."} + ] +}' | python3 "$(git rev-parse --show-toplevel 2>/dev/null || pwd)/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py" +``` + +The `entity` value must match exactly what appeared in the recall event, including any `subscribed/<source>/` prefix. + +It is valid to emit an empty `assessments` list when recall events exist but no recalled guideline can be assessed.
diff --git a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py new file mode 100644 index 00000000..c22c6870 --- /dev/null +++ b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +"""Append post-hoc influence assessments to .evolve/audit.log. + +Reads JSON from stdin of the form: + { + "session_id": "", + "assessments": [ + {"entity": "", "verdict": "followed|contradicted|not_applicable", + "evidence": ""}, + ... + ] + } +""" + +import json +import sys +from pathlib import Path + +# Walk up from the script location to find the installed plugin lib directory. +# claude/claw-code/codex/bob all ship a sibling lib/ next to skills/; bob's +# installer copies it to .bob/evolve-lib/, hence both names are checked. +_script = Path(__file__).resolve() +_lib = None +for _ancestor in _script.parents: + for _candidate in (_ancestor / "lib", _ancestor / "evolve-lib"): + if (_candidate / "entity_io.py").is_file(): + _lib = _candidate + break + if _lib is not None: + break +if _lib is None: + raise ImportError(f"Cannot find plugin lib directory above {_script}") +sys.path.insert(0, str(_lib)) +from entity_io import get_evolve_dir, log as _log # noqa: E402 +import audit # noqa: E402 + + +_ALLOWED_VERDICTS = {"followed", "contradicted", "not_applicable"} + + +def log(message): + _log("influence", message) + + +def existing_influence_keys(evolve_dir): + audit_log = Path(evolve_dir) / "audit.log" + if not audit_log.is_file(): + return set() + + keys = set() + for line in audit_log.read_text(encoding="utf-8").splitlines(): + if not line.strip(): + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + continue + if event.get("event") == "influence" and event.get("session_id") and event.get("entity"): + 
keys.add((event["session_id"], event["entity"])) + return keys + + +def main(): + try: + payload = json.load(sys.stdin) + except json.JSONDecodeError as exc: + log(f"Invalid JSON input: {exc}") + print(f"Error: invalid JSON input - {exc}", file=sys.stderr) + sys.exit(1) + + if not isinstance(payload, dict): + log(f"Bad payload type: {type(payload).__name__}") + print("Error: payload must be a JSON object.", file=sys.stderr) + sys.exit(1) + + session_id = payload.get("session_id") + assessments = payload.get("assessments", []) + if not session_id or not isinstance(assessments, list): + log(f"Bad payload shape: session_id={session_id!r} assessments_type={type(assessments).__name__}") + print("Error: payload must include `session_id` and a list `assessments`.", file=sys.stderr) + sys.exit(1) + + evolve_dir = get_evolve_dir().resolve() + existing_keys = existing_influence_keys(evolve_dir) + + written = 0 + for assessment in assessments: + if not isinstance(assessment, dict): + log(f"Skipping non-dict assessment item: {assessment!r}") + continue + entity = assessment.get("entity") + verdict = assessment.get("verdict") + evidence = assessment.get("evidence", "") + if not entity or verdict not in _ALLOWED_VERDICTS: + log(f"Skipping invalid assessment: {assessment}") + continue + key = (session_id, entity) + if key in existing_keys: + log(f"Skipping duplicate influence assessment: session_id={session_id} entity={entity}") + continue + audit.append( + evolve_dir=str(evolve_dir), + event="influence", + session_id=session_id, + entity=entity, + verdict=verdict, + evidence=evidence, + ) + existing_keys.add(key) + written += 1 + + log(f"Wrote {written} influence record(s) for session {session_id}") + print(f"Recorded {written} influence assessment(s).") + + +if __name__ == "__main__": + main() diff --git a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py 
b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py index ade892fe..2d54e439 100644 --- a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py +++ b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py @@ -21,7 +21,8 @@ if _lib is None: raise ImportError(f"Cannot find plugin lib directory above {_script}") sys.path.insert(0, str(_lib)) -from entity_io import find_entities_dir, markdown_to_entity, log as _log # noqa: E402 +from entity_io import find_entities_dir, get_evolve_dir, markdown_to_entity, log as _log # noqa: E402 +import audit # noqa: E402 def log(message): @@ -81,6 +82,7 @@ def load_entities_with_source(entities_dir): continue entity.pop("_source", None) + entity["_id"] = str(md.relative_to(entities_dir).with_suffix("")) parts = md.relative_to(entities_dir).parts if parts and parts[0] == "subscribed" and len(parts) > 1: entity["_source"] = parts[1] @@ -139,6 +141,31 @@ def main(): print(output) log(f"Output {len(output)} chars to stdout") + # Audit which entity ids were served to this session. Logging is + # intentionally best-effort so recall never fails because provenance + # recording could not append to audit.log. 
+ try: + if isinstance(input_data, dict): + transcript_path = input_data.get("transcript_path", "") + else: + transcript_path = "" + session_id = None + if transcript_path: + session_id = Path(transcript_path).stem.removeprefix("claude-transcript_") + elif isinstance(input_data.get("session_id"), str): + session_id = input_data["session_id"] + entity_ids = sorted({entity["_id"] for entity in entities if entity.get("_id")}) + if session_id and entity_ids: + audit.append( + evolve_dir=str(get_evolve_dir().resolve()), + event="recall", + session_id=session_id, + entities=entity_ids, + ) + log(f"Audit: recall session_id={session_id} entities={len(entity_ids)}") + except Exception as exc: + log(f"Audit append failed (non-fatal): {exc}") + if __name__ == "__main__": main() diff --git a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md index ad37821b..aa0b1bdb 100644 --- a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md +++ b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md @@ -114,7 +114,7 @@ Write the trajectory JSON to a temporary file using the **Write** tool, then pas 2. Run the helper script with the file path as an argument: ```bash - +tmp=.evolve/tmp/trajectory_input.json; mkdir -p .evolve/tmp; trap 'rm -f "$tmp"' EXIT; python3 "$(git rev-parse --show-toplevel 2>/dev/null || pwd)/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py" "$tmp" ``` **Important**: Do NOT use inline Python scripts, heredocs, or stdin piping to pass the trajectory JSON. Always use the Write tool to create a temp file first. This avoids escaping issues with backslashes, quotes, and newlines in conversation content. 
diff --git a/plugin-source/lib/audit.py b/plugin-source/lib/audit.py index fd5c535a..fa43846b 100644 --- a/plugin-source/lib/audit.py +++ b/plugin-source/lib/audit.py @@ -5,14 +5,17 @@ import pathlib -def append(project_root=".", **fields): +def append(project_root=".", evolve_dir=None, **fields): """Append a JSON audit entry to .evolve/audit.log. Args: project_root: Root directory that contains .evolve/. + evolve_dir: Explicit evolve data directory. When set, writes directly + to ``/audit.log`` instead of deriving it from + ``project_root``. **fields: Arbitrary key-value fields to include in the log entry. """ - path = pathlib.Path(project_root) / ".evolve" / "audit.log" + path = pathlib.Path(evolve_dir) / "audit.log" if evolve_dir is not None else pathlib.Path(project_root) / ".evolve" / "audit.log" path.parent.mkdir(parents=True, exist_ok=True) entry = {**fields, "ts": datetime.datetime.now(datetime.UTC).isoformat().replace("+00:00", "Z")} with path.open("a", encoding="utf-8") as f: diff --git a/plugin-source/plugin.toml b/plugin-source/plugin.toml index aa920e48..35881f22 100644 --- a/plugin-source/plugin.toml +++ b/plugin-source/plugin.toml @@ -51,6 +51,7 @@ brand_color = "#2563EB" default_prompt = [ "Recall Evolve entities for this task.", "Save new Evolve learnings from this session.", + "Analyze saved trajectories for Evolve guideline provenance.", "Show me the entities stored for this repo.", "Publish one of my Evolve guidelines.", "Subscribe to a teammate's Evolve guidelines repo.", diff --git a/plugin-source/skills/evolve-lite/learn/SKILL.md.j2 b/plugin-source/skills/evolve-lite/learn/SKILL.md.j2 index 8cfaa975..ee7d6c51 100644 --- a/plugin-source/skills/evolve-lite/learn/SKILL.md.j2 +++ b/plugin-source/skills/evolve-lite/learn/SKILL.md.j2 @@ -50,10 +50,17 @@ If the saved trajectory file does not exist (e.g., the save-trajectory hook did The transcript is JSONL: each line is a separate JSON object. 
Filter for `"type": "assistant"` and `"type": "human"` lines, then reconstruct the flow from `message.content`. Look for tool calls, errors in tool results, and user corrections. +{% else -%} +### Step 0: Save and Load the Conversation + +First, use the {{ skill_ref("save-trajectory") }} skill to save the current conversation to `.evolve/trajectories/`. Capture the exact path from its output as `saved_trajectory_path`. You will attach this exact path to each entity's `trajectory` field in Step 6. + +After saving, read `saved_trajectory_path` with the Read tool and analyze that saved trajectory rather than relying only on live context. If the trajectory cannot be saved or read, output zero entities and exit. Do not invent a trajectory path. + {% endif -%} ### Step 1: Analyze the Conversation -Identify from your current conversation{% if forked_context | default(false) %} (loaded from the transcript){% endif %}: +Identify from the saved trajectory loaded in Step 0: - **Task/Request**: What was the user asking for? - **Steps Taken**: What reasoning, actions, and observations occurred? @@ -94,6 +101,11 @@ Prefer one of these artifact forms: - a small script, saved to a stable path in the workspace or plugin, such as `scripts/`, `tools/`, or another obvious helper location. - a documented local workflow if code is not appropriate +When turning an ad hoc command or script into a reusable artifact, remove +incidental one-off inputs such as literal file names, IDs, answer values, or +temporary paths. Keep the reusable procedure that was actually exercised in the +session, and do not add capabilities that were not validated by the work. 
+ If you create an artifact, record: - its path - what it does diff --git a/plugin-source/skills/evolve-lite/provenance/SKILL.md.j2 b/plugin-source/skills/evolve-lite/provenance/SKILL.md.j2 new file mode 100644 index 00000000..7c72298d --- /dev/null +++ b/plugin-source/skills/evolve-lite/provenance/SKILL.md.j2 @@ -0,0 +1,64 @@ +{%- from "_macros.j2" import invoke with context -%} +--- +name: {% if platform == "bob" %}evolve-lite:{% endif %}provenance +description: Analyze saved trajectories and recall audit events offline to record whether recalled guidelines influenced completed sessions. +--- + +# Provenance Analyzer + +## Overview + +This skill runs after one or more sessions have completed. It reads saved trajectories from `.evolve/trajectories/`, matches them to `recall` events in `.evolve/audit.log`, and records post-hoc `influence` events for recalled guidelines. + +Use this skill when you want to compute usage provenance without coupling the work to the live learn step. + +## Workflow + +### Step 1: Load Recall Events + +Read `.evolve/audit.log` as JSONL. Find entries where `event == "recall"` and `entities` is a non-empty list. + +Skip any recall event that already has `influence` entries for the same `session_id` and entity ids. Do not write duplicate influence records. + +### Step 2: Locate Saved Trajectories + +List `.evolve/trajectories/` and match each recall event to a trajectory by `session_id`. + +Supported trajectory names: +- `claude-transcript_<session-id>.jsonl` +- `trajectory_*.json` when its content corresponds to the session being assessed + +If you cannot confidently match a recall event to a trajectory, skip it. + +### Step 3: Read Recalled Entities + +For each recalled entity id, open `.evolve/entities/<entity-id>.md`. The id is a path relative to `.evolve/entities/` without the `.md` suffix, such as `guideline/foo` or `subscribed/alice/guideline/foo`. + +Read the entity content and trigger. Skip ids whose files are missing.
+ +### Step 4: Assess Influence + +Compare each recalled entity with the matched trajectory. Pick exactly one verdict: + +- `followed` - the agent's actual actions are consistent with the guideline. +- `contradicted` - the guideline applied, but the agent did the opposite or repeated the avoidable dead end. +- `not_applicable` - the guideline was recalled but did not apply to this session. + +Keep `evidence` to one short sentence citing a concrete action, tool call, or absence in the trajectory. + +### Step 5: Write Influence Events + +Pipe one JSON payload per assessed session to the helper: + +```bash +echo '{ + "session_id": "", + "assessments": [ + {"entity": "guideline/", "verdict": "followed", "evidence": "Agent used the saved parser before trying shell fallbacks."} + ] +}' | {{ invoke("provenance", "log_influence.py") }} +``` + +The `entity` value must match exactly what appeared in the recall event, including any `subscribed//` prefix. + +It is valid to emit an empty `assessments` list when recall events exist but no recalled guideline can be assessed. diff --git a/plugin-source/skills/evolve-lite/provenance/scripts/log_influence.py b/plugin-source/skills/evolve-lite/provenance/scripts/log_influence.py new file mode 100644 index 00000000..c22c6870 --- /dev/null +++ b/plugin-source/skills/evolve-lite/provenance/scripts/log_influence.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python3 +"""Append post-hoc influence assessments to .evolve/audit.log. + +Reads JSON from stdin of the form: + { + "session_id": "", + "assessments": [ + {"entity": "", "verdict": "followed|contradicted|not_applicable", + "evidence": ""}, + ... + ] + } +""" + +import json +import sys +from pathlib import Path + +# Walk up from the script location to find the installed plugin lib directory. +# claude/claw-code/codex/bob all ship a sibling lib/ next to skills/; bob's +# installer copies it to .bob/evolve-lib/, hence both names are checked. 
+_script = Path(__file__).resolve() +_lib = None +for _ancestor in _script.parents: + for _candidate in (_ancestor / "lib", _ancestor / "evolve-lib"): + if (_candidate / "entity_io.py").is_file(): + _lib = _candidate + break + if _lib is not None: + break +if _lib is None: + raise ImportError(f"Cannot find plugin lib directory above {_script}") +sys.path.insert(0, str(_lib)) +from entity_io import get_evolve_dir, log as _log # noqa: E402 +import audit # noqa: E402 + + +_ALLOWED_VERDICTS = {"followed", "contradicted", "not_applicable"} + + +def log(message): + _log("influence", message) + + +def existing_influence_keys(evolve_dir): + audit_log = Path(evolve_dir) / "audit.log" + if not audit_log.is_file(): + return set() + + keys = set() + for line in audit_log.read_text(encoding="utf-8").splitlines(): + if not line.strip(): + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + continue + if event.get("event") == "influence" and event.get("session_id") and event.get("entity"): + keys.add((event["session_id"], event["entity"])) + return keys + + +def main(): + try: + payload = json.load(sys.stdin) + except json.JSONDecodeError as exc: + log(f"Invalid JSON input: {exc}") + print(f"Error: invalid JSON input - {exc}", file=sys.stderr) + sys.exit(1) + + if not isinstance(payload, dict): + log(f"Bad payload type: {type(payload).__name__}") + print("Error: payload must be a JSON object.", file=sys.stderr) + sys.exit(1) + + session_id = payload.get("session_id") + assessments = payload.get("assessments", []) + if not session_id or not isinstance(assessments, list): + log(f"Bad payload shape: session_id={session_id!r} assessments_type={type(assessments).__name__}") + print("Error: payload must include `session_id` and a list `assessments`.", file=sys.stderr) + sys.exit(1) + + evolve_dir = get_evolve_dir().resolve() + existing_keys = existing_influence_keys(evolve_dir) + + written = 0 + for assessment in assessments: + if not isinstance(assessment, 
dict): + log(f"Skipping non-dict assessment item: {assessment!r}") + continue + entity = assessment.get("entity") + verdict = assessment.get("verdict") + evidence = assessment.get("evidence", "") + if not entity or verdict not in _ALLOWED_VERDICTS: + log(f"Skipping invalid assessment: {assessment}") + continue + key = (session_id, entity) + if key in existing_keys: + log(f"Skipping duplicate influence assessment: session_id={session_id} entity={entity}") + continue + audit.append( + evolve_dir=str(evolve_dir), + event="influence", + session_id=session_id, + entity=entity, + verdict=verdict, + evidence=evidence, + ) + existing_keys.add(key) + written += 1 + + log(f"Wrote {written} influence record(s) for session {session_id}") + print(f"Recorded {written} influence assessment(s).") + + +if __name__ == "__main__": + main() diff --git a/plugin-source/skills/evolve-lite/recall/scripts/retrieve_entities.py b/plugin-source/skills/evolve-lite/recall/scripts/retrieve_entities.py index ade892fe..2d54e439 100644 --- a/plugin-source/skills/evolve-lite/recall/scripts/retrieve_entities.py +++ b/plugin-source/skills/evolve-lite/recall/scripts/retrieve_entities.py @@ -21,7 +21,8 @@ if _lib is None: raise ImportError(f"Cannot find plugin lib directory above {_script}") sys.path.insert(0, str(_lib)) -from entity_io import find_entities_dir, markdown_to_entity, log as _log # noqa: E402 +from entity_io import find_entities_dir, get_evolve_dir, markdown_to_entity, log as _log # noqa: E402 +import audit # noqa: E402 def log(message): @@ -81,6 +82,7 @@ def load_entities_with_source(entities_dir): continue entity.pop("_source", None) + entity["_id"] = str(md.relative_to(entities_dir).with_suffix("")) parts = md.relative_to(entities_dir).parts if parts and parts[0] == "subscribed" and len(parts) > 1: entity["_source"] = parts[1] @@ -139,6 +141,31 @@ def main(): print(output) log(f"Output {len(output)} chars to stdout") + # Audit which entity ids were served to this session. 
Logging is + # intentionally best-effort so recall never fails because provenance + # recording could not append to audit.log. + try: + if isinstance(input_data, dict): + transcript_path = input_data.get("transcript_path", "") + else: + transcript_path = "" + session_id = None + if transcript_path: + session_id = Path(transcript_path).stem.removeprefix("claude-transcript_") + elif isinstance(input_data.get("session_id"), str): + session_id = input_data["session_id"] + entity_ids = sorted({entity["_id"] for entity in entities if entity.get("_id")}) + if session_id and entity_ids: + audit.append( + evolve_dir=str(get_evolve_dir().resolve()), + event="recall", + session_id=session_id, + entities=entity_ids, + ) + log(f"Audit: recall session_id={session_id} entities={len(entity_ids)}") + except Exception as exc: + log(f"Audit append failed (non-fatal): {exc}") + if __name__ == "__main__": main() diff --git a/plugin-source/skills/evolve-lite/save-trajectory/SKILL.md.j2 b/plugin-source/skills/evolve-lite/save-trajectory/SKILL.md.j2 index 5ce79420..33541a48 100644 --- a/plugin-source/skills/evolve-lite/save-trajectory/SKILL.md.j2 +++ b/plugin-source/skills/evolve-lite/save-trajectory/SKILL.md.j2 @@ -118,9 +118,11 @@ Write the trajectory JSON to a temporary file using the **Write** tool, then pas ```bash {% if platform == "claude" -%} -tmp=.evolve/tmp/trajectory_input.json; mkdir -p .evolve/tmp; trap 'rm -f "$tmp"' EXIT; python3 "${CLAUDE_PLUGIN_ROOT}/skills/save-trajectory/scripts/save_trajectory.py" "$tmp" +tmp=.evolve/tmp/trajectory_input.json; mkdir -p .evolve/tmp; trap 'rm -f "$tmp"' EXIT; python3 "${CLAUDE_PLUGIN_ROOT}/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py" "$tmp" {%- elif platform == "claw-code" -%} tmp=.evolve/tmp/trajectory_input.json; mkdir -p .evolve/tmp; trap 'rm -f "$tmp"' EXIT; real_home="$(python3 -c "import os,pwd; print(pwd.getpwuid(os.getuid()).pw_dir)")"; config_home="${CLAW_CONFIG_HOME:-$real_home/.claw}"; 
script=".claw/skills/evolve-lite:save-trajectory/scripts/save_trajectory.py"; [ -f "$script" ] || script="$config_home/skills/evolve-lite:save-trajectory/scripts/save_trajectory.py"; python3 "$script" "$tmp" +{%- elif platform == "codex" -%} +tmp=.evolve/tmp/trajectory_input.json; mkdir -p .evolve/tmp; trap 'rm -f "$tmp"' EXIT; python3 "$(git rev-parse --show-toplevel 2>/dev/null || pwd)/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py" "$tmp" {%- elif platform == "bob" -%} tmp=.evolve/tmp/trajectory_input.json; mkdir -p .evolve/tmp; trap 'rm -f "$tmp"' EXIT; python3 .bob/skills/evolve-lite-save-trajectory/scripts/save_trajectory.py "$tmp" {%- endif %} diff --git a/sandbox/README.md b/sandbox/README.md index 8ca8151d..877cebc9 100644 --- a/sandbox/README.md +++ b/sandbox/README.md @@ -1,6 +1,7 @@ -# Claude Code Sandbox +# Claude Code / Codex Sandbox -A Docker image for running Claude Code in a sandboxed Debian environment with Python and common Linux tools. +Docker images for running Claude Code or Codex in a sandboxed Debian +environment with Python and common Linux tools. ## Build @@ -32,7 +33,7 @@ docker run --rm --env-file sandbox/myenv claude-sandbox claude -p "who are you" ## Automated E2E Test -`tests/e2e/test_sandbox_learn_recall.py` exercises the full evolve-lite +`tests/e2e/test_claude_sandbox_learn_recall.py` exercises the full evolve-lite learn + recall loop end-to-end inside this sandbox. 
It runs two Claude sessions: @@ -78,11 +79,11 @@ CLAUDE_CODE_SKIP_BEDROCK_AUTH=1 ```bash # If creds live in an env file: dotenv -e path/to/your.env -- \ - uv run pytest tests/e2e/test_sandbox_learn_recall.py \ + uv run pytest tests/e2e/test_claude_sandbox_learn_recall.py \ --run-e2e -m e2e -v --log-cli-level=INFO # Or, with vars already exported: -uv run pytest tests/e2e/test_sandbox_learn_recall.py \ +uv run pytest tests/e2e/test_claude_sandbox_learn_recall.py \ --run-e2e -m e2e -v --log-cli-level=INFO ``` @@ -90,3 +91,22 @@ The `--log-cli-level=INFO` flag streams per-session progress lines live (~4 minutes total). The test skips if Docker, the sandbox image, or credentials are missing. +## Codex Automated E2E Test + +`tests/e2e/test_codex_sandbox_learn_recall.py` runs the same learn + recall +flow against the Dockerized Codex sandbox. Build the image, then load the +Codex sandbox env file with `dotenv`: + +```bash +just sandbox-build codex + +dotenv -e ~/data/creds/codex-sandbox.env -- \ + uv run pytest tests/e2e/test_codex_sandbox_learn_recall.py \ + --run-e2e -m e2e -v --log-cli-level=INFO +``` + +The env file should export the provider credential and Codex provider settings +as environment variables, for example `CODEX_MODEL_PROVIDER`, +`CODEX_MODEL_PROVIDER_BASE_URL`, `CODEX_MODEL_PROVIDER_ENV_KEY`, and +`CODEX_MODEL_PROVIDER_WIRE_API`. The test forwards only environment variable +values into Docker; it does not mount host credential or Codex config files. diff --git a/tests/e2e/test_sandbox_learn_recall.py b/tests/e2e/test_claude_sandbox_learn_recall.py similarity index 67% rename from tests/e2e/test_sandbox_learn_recall.py rename to tests/e2e/test_claude_sandbox_learn_recall.py index d39293a0..b7368605 100644 --- a/tests/e2e/test_sandbox_learn_recall.py +++ b/tests/e2e/test_claude_sandbox_learn_recall.py @@ -6,6 +6,8 @@ transcript and extracts a guideline. 2. 
Ask about focal length — UserPromptSubmit recall hook injects the guideline from session 1, so Claude should skip the dead ends. + 3. Run the offline provenance skill to record whether the recalled + guideline influenced session 2. Assertions: - Session 1 produces a guideline file under .evolve/entities/. @@ -19,6 +21,7 @@ import logging import os import re +import shlex import shutil import subprocess import time @@ -76,6 +79,7 @@ def sandbox_workspace(tmp_path): def _run_sandbox_prompt(workspace: Path, prompt: str) -> subprocess.CompletedProcess: plugins = REPO_ROOT / "platform-integrations" / "claude" / "plugins" + command = "claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions -p " + shlex.quote(prompt) cmd = ["docker", "run", "--rm"] for var in FORWARDED_ENV_VARS: if os.environ.get(var): @@ -90,7 +94,7 @@ def _run_sandbox_prompt(workspace: Path, prompt: str) -> subprocess.CompletedPro SANDBOX_IMAGE, "bash", "-c", - f'claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions -p "{prompt}"', + command, ] return subprocess.run(cmd, capture_output=True, text=True, timeout=SESSION_TIMEOUT_SECONDS) @@ -116,8 +120,8 @@ def _bash_commands(transcript_path: Path) -> list[str]: @pytest.mark.e2e -def test_learn_then_recall_flow(sandbox_ready, sandbox_workspace): - """Session 1 extracts a guideline; session 2 benefits from recall.""" +def test_claude_learn_then_recall_flow(sandbox_ready, sandbox_workspace): + """Session 1 learns, session 2 recalls, session 3 records influence.""" del sandbox_ready # only used for its skip side effect # --- Session 1: location query — expected dead ends then recovery --- @@ -164,3 +168,51 @@ def test_learn_then_recall_flow(sandbox_ready, sandbox_workspace): # pip-installed). Other libraries (PIL, piexif, exifread) may appear in a # valid guideline as "install via pip and use", so we don't ban them. 
assert not re.search(r"\bexiftool\b", joined), "session 2 invoked exiftool despite recall guideline:\n" + "\n".join(commands) + + # --- Usage provenance: audit.log should record recall --- + audit_log = sandbox_workspace / ".evolve" / "audit.log" + assert audit_log.is_file(), f"{audit_log} was not created — recall did not append audit events" + + events = [] + for line in audit_log.read_text().splitlines(): + line = line.strip() + if line: + events.append(json.loads(line)) + + session2_id = session2_transcript.stem.removeprefix("claude-transcript_") + session1_ids = {str(path.relative_to(entities_dir).with_suffix("")) for path in entity_files} + + recall_events = [event for event in events if event.get("event") == "recall" and event.get("session_id") == session2_id] + assert recall_events, f"no recall audit event for session 2 ({session2_id}). all events: {events}" + recalled_ids = {entity_id for event in recall_events for entity_id in event.get("entities", [])} + assert recalled_ids & session1_ids, f"recall event entities {recalled_ids} did not include any id from session 1 ({session1_ids})" + log.info(f"session 2: audit recorded recall of {recalled_ids}") + + # --- Offline provenance: audit.log should record usefulness verdicts --- + log.info("session 3: running offline provenance analysis...") + t2 = time.time() + result3 = _run_sandbox_prompt( + sandbox_workspace, + ( + "Run /evolve-lite:provenance now. Analyze the saved trajectories and " + "the recall events in .evolve/audit.log. Record influence verdicts " + "for any recalled guideline that can be matched to the focal-length " + "photo session. Do not modify source files." 
+ ), + ) + log.info(f"session 3: exited {result3.returncode} after {time.time() - t2:.0f}s") + assert result3.returncode == 0, f"session 3 exited {result3.returncode}\nstderr:\n{result3.stderr[-2000:]}" + + events = [] + for line in audit_log.read_text().splitlines(): + line = line.strip() + if line: + events.append(json.loads(line)) + + influence_events = [event for event in events if event.get("event") == "influence"] + assert influence_events, f"no influence audit event recorded. all events: {events}" + influenced_ids = {event.get("entity") for event in influence_events} + assert influenced_ids & recalled_ids, f"influence events {influence_events} did not assess any recalled ids {recalled_ids}" + for event in influence_events: + assert event.get("verdict") in {"followed", "contradicted", "not_applicable"} + assert event.get("evidence"), f"influence event missing evidence: {event}" diff --git a/tests/e2e/test_codex_sandbox_learn_recall.py b/tests/e2e/test_codex_sandbox_learn_recall.py new file mode 100644 index 00000000..9bd9af69 --- /dev/null +++ b/tests/e2e/test_codex_sandbox_learn_recall.py @@ -0,0 +1,279 @@ +"""End-to-end test of the evolve-lite learn + recall flow in the Codex sandbox. + +Runs two sequential Codex sessions against the Dockerized Codex sandbox: + 1. Session 1 performs an EXIF task, then explicitly runs save-trajectory + and learn so a trajectory and guideline are saved. + 2. Session 2 asks a related EXIF question. The Codex UserPromptSubmit hook + should inject recalled guidance before the prompt is handled. + 3. Session 3 runs the offline provenance skill so the recall audit gets + follow-up influence verdicts. + +Requires Docker, the `evolve-codex-sandbox` image built, and Codex credentials +exported in the environment. 
+""" + +import json +import logging +import os +import shutil +import subprocess +import time +from pathlib import Path +from typing import Iterable + +import pytest + + +log = logging.getLogger(__name__) + + +SANDBOX_IMAGE = "evolve-codex-sandbox" +REPO_ROOT = Path(__file__).resolve().parents[2] +SESSION_TIMEOUT_SECONDS = 600 +FORWARDED_ENV_VARS = ( + "OPENAI_API_KEY", + "OPENAI_BASE_URL", + "OPENAI_ORG_ID", + "OPENAI_PROJECT_ID", + "CODEX_MODEL", +) +CODEX_PROVIDER_ENV_KEY_VAR = "CODEX_MODEL_PROVIDER_ENV_KEY" + + +@pytest.fixture(scope="session") +def codex_sandbox_ready(): + """Skip if Docker, the Codex sandbox image, or credentials aren't available.""" + if shutil.which("docker") is None: + pytest.skip("docker not installed") + + if subprocess.run(["docker", "info"], capture_output=True).returncode != 0: + pytest.skip("docker daemon not running") + + image_check = subprocess.run( + ["docker", "image", "inspect", SANDBOX_IMAGE], + capture_output=True, + ) + if image_check.returncode != 0: + pytest.skip(f"sandbox image {SANDBOX_IMAGE!r} not built - run `just sandbox-build codex`") + + credential_env_var = os.environ.get(CODEX_PROVIDER_ENV_KEY_VAR, "OPENAI_API_KEY") + if not os.environ.get(credential_env_var): + pytest.skip(f"{credential_env_var} not set in environment") + + return True + + +@pytest.fixture +def codex_workspace(tmp_path): + """Copy demo/workspace and install the Codex plugin into it.""" + src = REPO_ROOT / "demo" / "workspace" + workspace = tmp_path / "workspace" + shutil.copytree(src, workspace, ignore=shutil.ignore_patterns(".evolve", "backup", "sandbox-backup")) + + install_script = REPO_ROOT / "platform-integrations" / "install.sh" + result = subprocess.run( + ["bash", str(install_script), "install", "--platform", "codex", "--dir", str(workspace)], + capture_output=True, + text=True, + check=False, + ) + assert result.returncode == 0, f"codex install failed\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}" + + 
_register_codex_plugin_for_container(workspace) + return workspace + + +def _toml_str(value: str) -> str: + return json.dumps(value) + + +def _codex_config_lines() -> list[str]: + lines: list[str] = [] + if model := os.environ.get("CODEX_MODEL"): + lines.append(f"model = {_toml_str(model)}") + + provider = os.environ.get("CODEX_MODEL_PROVIDER") + if provider: + lines.append(f"model_provider = {_toml_str(provider)}") + + base_url = os.environ.get("CODEX_MODEL_PROVIDER_BASE_URL") or os.environ.get("OPENAI_BASE_URL") + if provider and base_url: + provider_name = os.environ.get("CODEX_MODEL_PROVIDER_NAME", provider) + provider_env_key = os.environ.get(CODEX_PROVIDER_ENV_KEY_VAR, "OPENAI_API_KEY") + lines.extend( + [ + "", + f"[model_providers.{_toml_str(provider)}]", + f"name = {_toml_str(provider_name)}", + f"base_url = {_toml_str(base_url)}", + f"env_key = {_toml_str(provider_env_key)}", + ] + ) + if wire_api := os.environ.get("CODEX_MODEL_PROVIDER_WIRE_API"): + lines.append(f"wire_api = {_toml_str(wire_api)}") + + if lines: + lines.append("") + return lines + + +def _register_codex_plugin_for_container(workspace: Path) -> None: + """Pre-populate /codex-home with a local marketplace plugin cache. + + This mirrors the headless registration used by tests/smoke_skills.py, but + writes paths as the container sees them: workspace is mounted at /workspace + and CODEX_HOME is mounted at /codex-home. 
+ """ + codex_home = workspace / ".codex-home" + codex_home.mkdir(parents=True, exist_ok=True) + + plugin_src = workspace / "plugins" / "evolve-lite" + plugin_json = plugin_src / ".codex-plugin" / "plugin.json" + version = json.loads(plugin_json.read_text(encoding="utf-8")).get("version", "0.0.0") + cache_dir = codex_home / "plugins" / "cache" / "evolve-local" / "evolve-lite" / version + cache_dir.mkdir(parents=True, exist_ok=True) + + shutil.copytree(plugin_src / ".codex-plugin", cache_dir / ".codex-plugin", dirs_exist_ok=True) + shutil.copytree(plugin_src / "lib", cache_dir / "lib", dirs_exist_ok=True) + shutil.copytree(plugin_src / "skills" / "evolve-lite", cache_dir / "skills", dirs_exist_ok=True) + + config = "\n".join(_codex_config_lines()) + config += """[marketplaces.evolve-local] +source = "/workspace" + +[plugins."evolve-lite@evolve-local"] +enabled = true +""" + (codex_home / "config.toml").write_text(config, encoding="utf-8") + + +def _forwarded_env_vars() -> Iterable[str]: + yield from FORWARDED_ENV_VARS + provider_env_key = os.environ.get(CODEX_PROVIDER_ENV_KEY_VAR) + if provider_env_key: + yield provider_env_key + + +def _run_codex_prompt(workspace: Path, prompt: str, *, enable_hooks: bool = True) -> subprocess.CompletedProcess: + codex_home = workspace / ".codex-home" + cmd = ["docker", "run", "--rm"] + for var in _forwarded_env_vars(): + if os.environ.get(var): + cmd += ["-e", var] + cmd += [ + "-e", + "EVOLVE_DEBUG=1", + "-e", + "TMPDIR=/workspace/.evolve/tmp", + "-v", + f"{workspace}:/workspace", + "-v", + f"{codex_home}:/codex-home", + SANDBOX_IMAGE, + "codex", + "exec", + "--skip-git-repo-check", + "--ephemeral", + "--dangerously-bypass-approvals-and-sandbox", + "-c", + f"features.codex_hooks={str(enable_hooks).lower()}", + "-C", + "/workspace", + prompt, + ] + return subprocess.run(cmd, capture_output=True, text=True, timeout=SESSION_TIMEOUT_SECONDS) + + +def _audit_events(evolve_dir: Path) -> list[dict]: + audit_log = evolve_dir / "audit.log" 
+ if not audit_log.is_file(): + return [] + return [json.loads(line) for line in audit_log.read_text().splitlines() if line.strip()] + + +@pytest.mark.e2e +def test_codex_learn_then_recall_flow(codex_sandbox_ready, codex_workspace): + """Session 1 learns, session 2 recalls, session 3 records influence.""" + del codex_sandbox_ready + + evolve_dir = codex_workspace / ".evolve" + + log.info("codex session 1: running seed task with save-trajectory + learn...") + t0 = time.time() + result1 = _run_codex_prompt( + codex_workspace, + ( + "Where was the photo @sample.jpg taken? Use EXIF metadata. " + "When done, invoke the evolve-lite save-trajectory skill, then invoke the evolve-lite learn skill. " + "Do not skip either evolve-lite skill." + ), + ) + log.info(f"codex session 1: exited {result1.returncode} after {time.time() - t0:.0f}s") + assert result1.returncode == 0, ( + f"session 1 exited {result1.returncode}\nstdout:\n{result1.stdout[-2000:]}\nstderr:\n{result1.stderr[-2000:]}" + ) + + trajectories_dir = evolve_dir / "trajectories" + entities_dir = evolve_dir / "entities" + assert trajectories_dir.is_dir(), f"{trajectories_dir} was not created" + trajectories = list(trajectories_dir.glob("*.json")) + assert trajectories, f"no Codex trajectory JSON files found in {trajectories_dir}" + assert entities_dir.is_dir(), f"{entities_dir} was not created" + entity_files = list(entities_dir.rglob("*.md")) + assert entity_files, f"no guideline files found in {entities_dir}" + + log.info("codex session 2: running related task to exercise recall hook...") + t1 = time.time() + result2 = _run_codex_prompt( + codex_workspace, + ( + "What focal length was used to take the photo @sample.jpg? Use EXIF metadata. " + "When done, invoke the evolve-lite save-trajectory skill. Do not invoke the learn skill." 
+ ), + ) + log.info(f"codex session 2: exited {result2.returncode} after {time.time() - t1:.0f}s") + assert result2.returncode == 0, ( + f"session 2 exited {result2.returncode}\nstdout:\n{result2.stdout[-2000:]}\nstderr:\n{result2.stderr[-2000:]}" + ) + + session2_trajectories = {path for path in trajectories_dir.glob("*.json")} - set(trajectories) + assert session2_trajectories, f"no Codex trajectory saved for session 2 in {trajectories_dir}" + + events = _audit_events(evolve_dir) + recall_events = [event for event in events if event.get("event") == "recall"] + assert recall_events, f"no recall audit event recorded. all events: {events}" + task_recall_event = recall_events[-1] + task_session_id = task_recall_event["session_id"] + recalled_ids = {entity_id for event in recall_events for entity_id in event.get("entities", [])} + task_recalled_ids = set(task_recall_event.get("entities", [])) + learned_ids = {str(path.relative_to(entities_dir).with_suffix("")) for path in entity_files} + assert recalled_ids & learned_ids, f"recalled ids {recalled_ids} did not include learned ids {learned_ids}" + + log.info("codex session 3: running offline provenance analysis...") + t2 = time.time() + result3 = _run_codex_prompt( + codex_workspace, + ( + "Run the evolve-lite provenance skill now. Analyze the saved trajectories and " + "the recall events in .evolve/audit.log. Record influence verdicts " + f"for recalled guidelines in session {task_session_id}, the focal-length " + "photo session. Do not modify source files." 
+ ), + enable_hooks=False, + ) + log.info(f"codex session 3: exited {result3.returncode} after {time.time() - t2:.0f}s") + assert result3.returncode == 0, ( + f"session 3 exited {result3.returncode}\nstdout:\n{result3.stdout[-2000:]}\nstderr:\n{result3.stderr[-2000:]}" + ) + + events = _audit_events(evolve_dir) + influence_events = [event for event in events if event.get("event") == "influence" and event.get("session_id") == task_session_id] + assert influence_events, f"no influence audit event recorded. all events: {events}" + influenced_ids = {event.get("entity") for event in influence_events} + assert influenced_ids & task_recalled_ids, f"influence events {influence_events} did not assess task recall ids {task_recalled_ids}" + assert any(event.get("verdict") == "followed" for event in influence_events), ( + f"no recalled guideline was followed. influence events: {influence_events}" + ) + for event in influence_events: + assert event.get("verdict") in {"followed", "contradicted", "not_applicable"} + assert event.get("evidence"), f"influence event missing evidence: {event}" diff --git a/tests/platform_integrations/test_codex.py b/tests/platform_integrations/test_codex.py index 1bbc6d8d..03a5dd59 100644 --- a/tests/platform_integrations/test_codex.py +++ b/tests/platform_integrations/test_codex.py @@ -62,12 +62,16 @@ def test_install_creates_expected_files(self, temp_project_dir, install_runner, file_assertions.assert_dir_exists(plugin_dir / "skills" / "evolve-lite" / "learn") file_assertions.assert_dir_exists(plugin_dir / "skills" / "evolve-lite" / "recall") file_assertions.assert_dir_exists(plugin_dir / "skills" / "evolve-lite" / "publish") + file_assertions.assert_dir_exists(plugin_dir / "skills" / "evolve-lite" / "provenance") + file_assertions.assert_dir_exists(plugin_dir / "skills" / "evolve-lite" / "save-trajectory") file_assertions.assert_dir_exists(plugin_dir / "skills" / "evolve-lite" / "subscribe") file_assertions.assert_dir_exists(plugin_dir / "skills" 
/ "evolve-lite" / "unsubscribe") file_assertions.assert_dir_exists(plugin_dir / "skills" / "evolve-lite" / "sync") file_assertions.assert_file_exists(plugin_dir / "skills" / "evolve-lite" / "learn" / "scripts" / "save_entities.py") file_assertions.assert_file_exists(plugin_dir / "skills" / "evolve-lite" / "recall" / "scripts" / "retrieve_entities.py") file_assertions.assert_file_exists(plugin_dir / "skills" / "evolve-lite" / "publish" / "scripts" / "publish.py") + file_assertions.assert_file_exists(plugin_dir / "skills" / "evolve-lite" / "provenance" / "scripts" / "log_influence.py") + file_assertions.assert_file_exists(plugin_dir / "skills" / "evolve-lite" / "save-trajectory" / "scripts" / "save_trajectory.py") file_assertions.assert_file_exists(plugin_dir / "skills" / "evolve-lite" / "subscribe" / "scripts" / "subscribe.py") file_assertions.assert_file_exists(plugin_dir / "skills" / "evolve-lite" / "unsubscribe" / "scripts" / "unsubscribe.py") file_assertions.assert_file_exists(plugin_dir / "skills" / "evolve-lite" / "sync" / "scripts" / "sync.py") diff --git a/tests/platform_integrations/test_log_influence.py b/tests/platform_integrations/test_log_influence.py new file mode 100644 index 00000000..91b1f625 --- /dev/null +++ b/tests/platform_integrations/test_log_influence.py @@ -0,0 +1,243 @@ +"""Tests for skills/evolve-lite/provenance/scripts/log_influence.py.""" + +import json +import os +import subprocess +import sys +from pathlib import Path + +import pytest + +pytestmark = [pytest.mark.platform_integrations, pytest.mark.e2e] + +_REPO_ROOT = Path(__file__).parent.parent.parent +LOG_INFLUENCE_SCRIPT = ( + _REPO_ROOT / "platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py" +) + + +def run_log_influence(project_dir, payload, *, raw_input=None, evolve_dir=None): + """Invoke log_influence.py with the given payload dict or raw input string.""" + env = {**os.environ} + if evolve_dir: + env["EVOLVE_DIR"] = 
str(evolve_dir) + stdin = raw_input if raw_input is not None else json.dumps(payload) + return subprocess.run( + [sys.executable, str(LOG_INFLUENCE_SCRIPT)], + input=stdin, + capture_output=True, + text=True, + cwd=str(project_dir), + env=env, + check=False, + ) + + +def read_audit(evolve_dir): + path = evolve_dir / "audit.log" + if not path.is_file(): + return [] + return [json.loads(line) for line in path.read_text().splitlines() if line.strip()] + + +class TestLogInfluence: + def test_writes_single_assessment(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence( + temp_project_dir, + { + "session_id": "abc-123", + "assessments": [ + {"entity": "guideline/slug-a", "verdict": "followed", "evidence": "because"}, + ], + }, + evolve_dir=evolve_dir, + ) + assert result.returncode == 0, result.stderr + events = read_audit(evolve_dir) + assert len(events) == 1 + assert events[0] == { + "event": "influence", + "session_id": "abc-123", + "entity": "guideline/slug-a", + "verdict": "followed", + "evidence": "because", + "ts": events[0]["ts"], + } + + def test_writes_under_custom_evolve_dir(self, temp_project_dir): + evolve_dir = temp_project_dir / "custom-evolve-data" + result = run_log_influence( + temp_project_dir, + { + "session_id": "abc-123", + "assessments": [ + {"entity": "guideline/slug-a", "verdict": "followed", "evidence": "because"}, + ], + }, + evolve_dir=evolve_dir, + ) + + assert result.returncode == 0, result.stderr + events = read_audit(evolve_dir) + assert len(events) == 1 + assert events[0]["event"] == "influence" + assert not (temp_project_dir / ".evolve" / "audit.log").exists() + + def test_writes_multiple_assessments(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence( + temp_project_dir, + { + "session_id": "sess-1", + "assessments": [ + {"entity": "guideline/slug-a", "verdict": "followed", "evidence": "e1"}, + {"entity": "guideline/slug-b", "verdict": 
"not_applicable", "evidence": "e2"}, + {"entity": "guideline/slug-c", "verdict": "contradicted", "evidence": "e3"}, + ], + }, + evolve_dir=evolve_dir, + ) + assert result.returncode == 0, result.stderr + events = read_audit(evolve_dir) + assert len(events) == 3 + verdicts = {event["entity"]: event["verdict"] for event in events} + assert verdicts == { + "guideline/slug-a": "followed", + "guideline/slug-b": "not_applicable", + "guideline/slug-c": "contradicted", + } + + def test_skips_duplicate_assessments_on_rerun(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + payload = { + "session_id": "sess-1", + "assessments": [ + {"entity": "guideline/slug-a", "verdict": "followed", "evidence": "e1"}, + {"entity": "guideline/slug-a", "verdict": "contradicted", "evidence": "e2"}, + ], + } + + first = run_log_influence(temp_project_dir, payload, evolve_dir=evolve_dir) + second = run_log_influence(temp_project_dir, payload, evolve_dir=evolve_dir) + + assert first.returncode == 0, first.stderr + assert second.returncode == 0, second.stderr + events = read_audit(evolve_dir) + assert len(events) == 1 + assert events[0]["entity"] == "guideline/slug-a" + assert events[0]["verdict"] == "followed" + + def test_skips_assessments_with_invalid_verdict(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence( + temp_project_dir, + { + "session_id": "sess-1", + "assessments": [ + {"entity": "guideline/slug-a", "verdict": "bogus", "evidence": "no"}, + {"entity": "guideline/slug-b", "verdict": "followed", "evidence": "yes"}, + ], + }, + evolve_dir=evolve_dir, + ) + assert result.returncode == 0, result.stderr + events = read_audit(evolve_dir) + assert len(events) == 1 + assert events[0]["entity"] == "guideline/slug-b" + + def test_skips_assessments_missing_entity(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence( + temp_project_dir, + { + "session_id": "sess-1", + "assessments": [ + 
{"verdict": "followed", "evidence": "no entity"}, + {"entity": "guideline/slug-b", "verdict": "followed", "evidence": "ok"}, + ], + }, + evolve_dir=evolve_dir, + ) + assert result.returncode == 0, result.stderr + events = read_audit(evolve_dir) + assert len(events) == 1 + assert events[0]["entity"] == "guideline/slug-b" + + def test_skips_non_dict_assessment_items(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence( + temp_project_dir, + { + "session_id": "sess-1", + "assessments": [ + "not-a-dict", + 42, + None, + {"entity": "guideline/slug-ok", "verdict": "followed", "evidence": "yes"}, + ], + }, + evolve_dir=evolve_dir, + ) + assert result.returncode == 0, result.stderr + events = read_audit(evolve_dir) + assert len(events) == 1 + assert events[0]["entity"] == "guideline/slug-ok" + + def test_empty_assessments_list_is_ok(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence( + temp_project_dir, + {"session_id": "sess-1", "assessments": []}, + evolve_dir=evolve_dir, + ) + assert result.returncode == 0, result.stderr + assert read_audit(evolve_dir) == [] + + def test_evidence_defaults_to_empty_string(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence( + temp_project_dir, + { + "session_id": "sess-1", + "assessments": [{"entity": "guideline/slug-a", "verdict": "followed"}], + }, + evolve_dir=evolve_dir, + ) + assert result.returncode == 0, result.stderr + events = read_audit(evolve_dir) + assert events[0]["evidence"] == "" + + def test_rejects_non_dict_payload(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence(temp_project_dir, ["not", "a", "dict"], evolve_dir=evolve_dir) + assert result.returncode == 1 + assert "payload" in result.stderr.lower() + assert read_audit(evolve_dir) == [] + + def test_rejects_missing_session_id(self, temp_project_dir): + evolve_dir = temp_project_dir / 
".evolve" + result = run_log_influence( + temp_project_dir, + {"assessments": [{"entity": "guideline/a", "verdict": "followed"}]}, + evolve_dir=evolve_dir, + ) + assert result.returncode == 1 + assert read_audit(evolve_dir) == [] + + def test_rejects_non_list_assessments(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence( + temp_project_dir, + {"session_id": "sess-1", "assessments": "oops"}, + evolve_dir=evolve_dir, + ) + assert result.returncode == 1 + assert read_audit(evolve_dir) == [] + + def test_rejects_invalid_json(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence(temp_project_dir, None, raw_input="{not valid json", evolve_dir=evolve_dir) + assert result.returncode == 1 + assert "json" in result.stderr.lower() + assert read_audit(evolve_dir) == [] diff --git a/tests/platform_integrations/test_plugin_structure.py b/tests/platform_integrations/test_plugin_structure.py index 495e8ca7..8702562a 100644 --- a/tests/platform_integrations/test_plugin_structure.py +++ b/tests/platform_integrations/test_plugin_structure.py @@ -8,6 +8,7 @@ pytestmark = pytest.mark.platform_integrations _PLUGIN_ROOT = Path(__file__).parent.parent.parent / "platform-integrations/claude/plugins/evolve-lite" +_CODEX_PLUGIN_ROOT = Path(__file__).parent.parent.parent / "platform-integrations/codex/plugins/evolve-lite" class TestPluginManifest: @@ -69,12 +70,18 @@ class TestSkillScripts: "skills/evolve-lite/sync/scripts/sync.py", "skills/evolve-lite/recall/scripts/retrieve_entities.py", "skills/evolve-lite/learn/scripts/save_entities.py", + "skills/evolve-lite/provenance/scripts/log_influence.py", ], ) def test_script_exists(self, script_rel): script = _PLUGIN_ROOT / script_rel assert script.exists(), f"Script not found: {script}" + def test_codex_save_trajectory_skill_documents_helper_invocation(self): + skill = _CODEX_PLUGIN_ROOT / "skills/evolve-lite/save-trajectory/SKILL.md" + content = 
skill.read_text() + assert "plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py" in content + class TestLibModules: """Verify that the shared lib modules the scripts depend on exist.""" diff --git a/tests/platform_integrations/test_retrieve.py b/tests/platform_integrations/test_retrieve.py index fb7cf32f..d59418b7 100644 --- a/tests/platform_integrations/test_retrieve.py +++ b/tests/platform_integrations/test_retrieve.py @@ -127,3 +127,80 @@ def test_skips_symlinked_markdown_entities(self, temp_project_dir, retrieve_scri assert result.returncode == 0 assert result.stdout.count("Real content.") == 1 + + @pytest.mark.parametrize(("platform_name", "retrieve_script", "expected_header"), SCRIPT_VARIANTS) + def test_writes_recall_audit_event_with_qualified_entity_ids(self, evolve_dir, retrieve_script, expected_header, platform_name): + result = run_retrieve( + retrieve_script, + evolve_dir=evolve_dir, + stdin_data=json.dumps( + { + "prompt": "How do I write clean code?", + "transcript_path": "/tmp/claude-transcript_session-123.jsonl", + } + ), + ) + + assert result.returncode == 0 + events = [json.loads(line) for line in (evolve_dir / "audit.log").read_text().splitlines() if line.strip()] + assert len(events) == 1 + assert events[0]["event"] == "recall" + assert events[0]["session_id"] == "session-123" + assert events[0]["entities"] == [ + "guideline/guideline", + "subscribed/alice/guideline/alice-guideline", + ] + + @pytest.mark.parametrize(("platform_name", "retrieve_script", "expected_header"), SCRIPT_VARIANTS) + def test_writes_recall_audit_event_with_session_id_fallback(self, evolve_dir, retrieve_script, expected_header, platform_name): + result = run_retrieve( + retrieve_script, + evolve_dir=evolve_dir, + stdin_data=json.dumps( + { + "prompt": "How do I write clean code?", + "session_id": "codex-session-123", + } + ), + ) + + assert result.returncode == 0 + events = [json.loads(line) for line in (evolve_dir / 
"audit.log").read_text().splitlines() if line.strip()] + assert len(events) == 1 + assert events[0]["event"] == "recall" + assert events[0]["session_id"] == "codex-session-123" + + @pytest.mark.parametrize(("platform_name", "retrieve_script", "expected_header"), SCRIPT_VARIANTS) + def test_writes_recall_audit_under_custom_evolve_dir( + self, temp_project_dir, file_assertions, retrieve_script, expected_header, platform_name + ): + custom_evolve_dir = temp_project_dir / "custom-evolve-data" + file_assertions.write_text( + custom_evolve_dir / "entities" / "guideline" / "guideline.md", + "---\ntype: guideline\n---\n\nKeep functions small.\n", + ) + + result = run_retrieve( + retrieve_script, + evolve_dir=custom_evolve_dir, + stdin_data=json.dumps( + { + "prompt": "How do I write clean code?", + "session_id": "custom-session-123", + } + ), + ) + + assert result.returncode == 0 + events = [json.loads(line) for line in (custom_evolve_dir / "audit.log").read_text().splitlines() if line.strip()] + assert len(events) == 1 + assert events[0]["event"] == "recall" + assert events[0]["session_id"] == "custom-session-123" + assert not (temp_project_dir / ".evolve" / "audit.log").exists() + + @pytest.mark.parametrize(("platform_name", "retrieve_script", "expected_header"), SCRIPT_VARIANTS) + def test_does_not_write_recall_audit_without_transcript_path(self, evolve_dir, retrieve_script, expected_header, platform_name): + result = run_retrieve(retrieve_script, evolve_dir=evolve_dir) + + assert result.returncode == 0 + assert not (evolve_dir / "audit.log").exists() diff --git a/tests/platform_integrations/test_skill_directory_names.py b/tests/platform_integrations/test_skill_directory_names.py index b6ff6c0d..bd28cd97 100644 --- a/tests/platform_integrations/test_skill_directory_names.py +++ b/tests/platform_integrations/test_skill_directory_names.py @@ -21,6 +21,9 @@ def test_bob_lite_skill_directories_exist(self, platform_integrations_dir): "evolve-lite-learn", 
"evolve-lite-recall", "evolve-lite-publish", + "evolve-lite-provenance", + "evolve-lite-save", + "evolve-lite-save-trajectory", "evolve-lite-subscribe", "evolve-lite-unsubscribe", "evolve-lite-sync", @@ -102,6 +105,9 @@ def test_bob_lite_installation_succeeds(self, temp_project_dir, install_runner, "evolve-lite-learn", "evolve-lite-recall", "evolve-lite-publish", + "evolve-lite-provenance", + "evolve-lite-save", + "evolve-lite-save-trajectory", "evolve-lite-subscribe", "evolve-lite-unsubscribe", "evolve-lite-sync", diff --git a/tests/smoke_skills.py b/tests/smoke_skills.py index 2ce9fd6c..2fcd6447 100644 --- a/tests/smoke_skills.py +++ b/tests/smoke_skills.py @@ -1024,6 +1024,7 @@ def run_bob(prompt: str, *, cwd: Path, evolve_dir: Path, log_file: Path, label: class PlatformPlan: name: str cli: str # binary on PATH + save_trajectory_cmd: str # slash command text to save the current conversation learn_cmd: str # slash command text to send for learn publish_cmd: str # slash command text to invoke publish recall_prompt: str # full prompt for recall @@ -1033,6 +1034,7 @@ def claude_plan() -> PlatformPlan: return PlatformPlan( name="claude", cli="claude", + save_trajectory_cmd="/evolve-lite:save-trajectory", learn_cmd="/evolve-lite:learn", publish_cmd="/evolve-lite:publish", recall_prompt=( @@ -1054,6 +1056,7 @@ def codex_plan() -> PlatformPlan: return PlatformPlan( name="codex", cli="codex", + save_trajectory_cmd="$evolve-lite:save-trajectory", learn_cmd="$evolve-lite:learn", publish_cmd="$evolve-lite:publish", recall_prompt=( @@ -1069,6 +1072,7 @@ def bob_plan() -> PlatformPlan: return PlatformPlan( name="bob", cli="bob", + save_trajectory_cmd="/evolve-lite-save-trajectory", learn_cmd="/evolve-lite-learn", publish_cmd="/evolve-lite-publish", recall_prompt=( @@ -1201,11 +1205,9 @@ def invoke(prompt: str, label: str) -> tuple[int, str]: # The chain differs by platform — see the module docstring for why: # * claude: seed task alone; Stop hooks auto-fire save-trajectory + 
learn, # and we do an extra explicit /evolve-lite:learn pass afterwards. - # * codex/bob: no Stop hooks for this. Suffix the seed prompt with the - # learn slash command so the same session invokes learn at the end - # (learn is main-context on those platforms — build_plugins.py only - # sets forked_context=True for claude — so it reads the conversation - # directly, no trajectory file needed). + # * codex/bob: no Stop hooks for this. Suffix the seed prompt with + # save-trajectory and learn so the same session saves the conversation + # before extracting entities. baseline_entities = entity_count(evolve_dir) if platform == "claude": t0 = time.time() @@ -1223,19 +1225,28 @@ def invoke(prompt: str, label: str) -> tuple[int, str]: seed_and_learn_prompt = ( f"{SEED_PROMPT}\n\n" f"After completing (or attempting) the task above, your final " - f"action MUST be to run {plan.learn_cmd} so it can extract " - f"learnings from this conversation." + f"actions MUST be to run {plan.save_trajectory_cmd}, then " + f"{plan.learn_cmd}, so learnings are extracted from a saved " + f"trajectory." 
) t0 = time.time() rc, _ = invoke(seed_and_learn_prompt, "seed-and-learn") dt = time.time() - t0 post_learn = entity_count(evolve_dir) + trajectory_count = sum(1 for _ in (evolve_dir / "trajectories").glob("*")) if (evolve_dir / "trajectories").is_dir() else 0 ok = (rc == 0) and (post_learn > baseline_entities) + if platform == "codex": + ok = ok and trajectory_count > 0 if not ok and rc == 0: - detail = f"exit=0 in {dt:.1f}s but entities still {post_learn} (baseline {baseline_entities}); learn extracted nothing" + problems = [] + if post_learn <= baseline_entities: + problems.append(f"entities still {post_learn} (baseline {baseline_entities}); learn extracted nothing") + if platform == "codex" and trajectory_count == 0: + problems.append("no trajectory saved") + detail = f"exit=0 in {dt:.1f}s but " + "; ".join(problems) else: - detail = f"exit={rc} in {dt:.1f}s; entities {baseline_entities}→{post_learn}" + detail = f"exit={rc} in {dt:.1f}s; entities {baseline_entities}→{post_learn}; trajectories={trajectory_count}" record_skill(result, "learn", ok, detail) # ── recall (seed entity, prompt agent to echo it) From 3ee80a4e6d2cbe49862315b5fbb87b5eb32f03a5 Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Thu, 7 May 2026 00:08:23 -0500 Subject: [PATCH 02/13] fix(subscribe): roll back clone and config when audit append fails Addresses review feedback from visahak Restores the all-or-nothing contract: if audit_append raises after a successful clone + config save, the new repo entry and cloned directory are removed before exiting non-zero with a "failed to record subscription" diagnostic on stderr. Previously the command swallowed the failure and reported success, leaving the clone and config mutation in place even though tests and callers expect rollback on partial failure. 
--- .../evolve-lite-subscribe/scripts/subscribe.py | 15 +++++++++------ .../evolve-lite/subscribe/scripts/subscribe.py | 15 +++++++++------ .../evolve-lite/subscribe/scripts/subscribe.py | 15 +++++++++------ .../evolve-lite/subscribe/scripts/subscribe.py | 15 +++++++++------ .../evolve-lite/subscribe/scripts/subscribe.py | 15 +++++++++------ 5 files changed, 45 insertions(+), 30 deletions(-) diff --git a/platform-integrations/bob/evolve-lite/skills/evolve-lite-subscribe/scripts/subscribe.py b/platform-integrations/bob/evolve-lite/skills/evolve-lite-subscribe/scripts/subscribe.py index ef6b0cd0..c195426c 100755 --- a/platform-integrations/bob/evolve-lite/skills/evolve-lite-subscribe/scripts/subscribe.py +++ b/platform-integrations/bob/evolve-lite/skills/evolve-lite-subscribe/scripts/subscribe.py @@ -132,12 +132,15 @@ def main(): remote=args.remote, ) except Exception as exc: - # Audit logging is best-effort: a failed append shouldn't roll back - # an otherwise successful subscribe (the repo is cloned, the config - # has the entry). Warn loudly so the user can fix the audit log - # path without losing the subscription. Originally rolled back on - # main's PR #245 (#244 e2e fix). 
- print(f"Warning: failed to append audit entry for subscribe: {exc}", file=sys.stderr) + repos.pop() + try: + save_config(cfg, project_root) + except Exception: + pass + if dest.exists(): + shutil.rmtree(dest, ignore_errors=True) + print(f"Error: failed to record subscription in audit log: {exc}", file=sys.stderr) + sys.exit(1) print(f"Subscribed to '{args.name}' (scope={args.scope}) from {args.remote}") diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py index ef6b0cd0..c195426c 100755 --- a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py +++ b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py @@ -132,12 +132,15 @@ def main(): remote=args.remote, ) except Exception as exc: - # Audit logging is best-effort: a failed append shouldn't roll back - # an otherwise successful subscribe (the repo is cloned, the config - # has the entry). Warn loudly so the user can fix the audit log - # path without losing the subscription. Originally rolled back on - # main's PR #245 (#244 e2e fix). 
- print(f"Warning: failed to append audit entry for subscribe: {exc}", file=sys.stderr) + repos.pop() + try: + save_config(cfg, project_root) + except Exception: + pass + if dest.exists(): + shutil.rmtree(dest, ignore_errors=True) + print(f"Error: failed to record subscription in audit log: {exc}", file=sys.stderr) + sys.exit(1) print(f"Subscribed to '{args.name}' (scope={args.scope}) from {args.remote}") diff --git a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py index ef6b0cd0..c195426c 100755 --- a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py +++ b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py @@ -132,12 +132,15 @@ def main(): remote=args.remote, ) except Exception as exc: - # Audit logging is best-effort: a failed append shouldn't roll back - # an otherwise successful subscribe (the repo is cloned, the config - # has the entry). Warn loudly so the user can fix the audit log - # path without losing the subscription. Originally rolled back on - # main's PR #245 (#244 e2e fix). 
- print(f"Warning: failed to append audit entry for subscribe: {exc}", file=sys.stderr) + repos.pop() + try: + save_config(cfg, project_root) + except Exception: + pass + if dest.exists(): + shutil.rmtree(dest, ignore_errors=True) + print(f"Error: failed to record subscription in audit log: {exc}", file=sys.stderr) + sys.exit(1) print(f"Subscribed to '{args.name}' (scope={args.scope}) from {args.remote}") diff --git a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py index ef6b0cd0..c195426c 100755 --- a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py +++ b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py @@ -132,12 +132,15 @@ def main(): remote=args.remote, ) except Exception as exc: - # Audit logging is best-effort: a failed append shouldn't roll back - # an otherwise successful subscribe (the repo is cloned, the config - # has the entry). Warn loudly so the user can fix the audit log - # path without losing the subscription. Originally rolled back on - # main's PR #245 (#244 e2e fix). 
- print(f"Warning: failed to append audit entry for subscribe: {exc}", file=sys.stderr) + repos.pop() + try: + save_config(cfg, project_root) + except Exception: + pass + if dest.exists(): + shutil.rmtree(dest, ignore_errors=True) + print(f"Error: failed to record subscription in audit log: {exc}", file=sys.stderr) + sys.exit(1) print(f"Subscribed to '{args.name}' (scope={args.scope}) from {args.remote}") diff --git a/plugin-source/skills/evolve-lite/subscribe/scripts/subscribe.py b/plugin-source/skills/evolve-lite/subscribe/scripts/subscribe.py index ef6b0cd0..c195426c 100755 --- a/plugin-source/skills/evolve-lite/subscribe/scripts/subscribe.py +++ b/plugin-source/skills/evolve-lite/subscribe/scripts/subscribe.py @@ -132,12 +132,15 @@ def main(): remote=args.remote, ) except Exception as exc: - # Audit logging is best-effort: a failed append shouldn't roll back - # an otherwise successful subscribe (the repo is cloned, the config - # has the entry). Warn loudly so the user can fix the audit log - # path without losing the subscription. Originally rolled back on - # main's PR #245 (#244 e2e fix). 
- print(f"Warning: failed to append audit entry for subscribe: {exc}", file=sys.stderr) + repos.pop() + try: + save_config(cfg, project_root) + except Exception: + pass + if dest.exists(): + shutil.rmtree(dest, ignore_errors=True) + print(f"Error: failed to record subscription in audit log: {exc}", file=sys.stderr) + sys.exit(1) print(f"Subscribed to '{args.name}' (scope={args.scope}) from {args.remote}") From 9b53d155f095921160886c4e8cf1951079f1eebb Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Thu, 7 May 2026 00:24:20 -0500 Subject: [PATCH 03/13] fix(subscribe): persist config rollback when audit append fails Addresses review feedback from visahak The previous commit popped the repo entry from the in-memory list but did not call set_repos(cfg, repos), so the subsequent save_config call re-wrote the same state to disk and the config still contained the new subscription. Update the in-memory cfg before the compensating save_config and align the codex-sharing test, which previously expected the old warn-and-succeed behavior, with the all-or-nothing contract asserted by test_rolls_back_clone_if_audit_write_fails. 
--- .../skills/evolve-lite-subscribe/scripts/subscribe.py | 1 + .../skills/evolve-lite/subscribe/scripts/subscribe.py | 1 + .../skills/evolve-lite/subscribe/scripts/subscribe.py | 1 + .../skills/evolve-lite/subscribe/scripts/subscribe.py | 1 + .../skills/evolve-lite/subscribe/scripts/subscribe.py | 1 + tests/platform_integrations/test_codex_sharing.py | 11 ++++++----- 6 files changed, 11 insertions(+), 5 deletions(-) diff --git a/platform-integrations/bob/evolve-lite/skills/evolve-lite-subscribe/scripts/subscribe.py b/platform-integrations/bob/evolve-lite/skills/evolve-lite-subscribe/scripts/subscribe.py index c195426c..732c266b 100755 --- a/platform-integrations/bob/evolve-lite/skills/evolve-lite-subscribe/scripts/subscribe.py +++ b/platform-integrations/bob/evolve-lite/skills/evolve-lite-subscribe/scripts/subscribe.py @@ -133,6 +133,7 @@ def main(): ) except Exception as exc: repos.pop() + set_repos(cfg, repos) try: save_config(cfg, project_root) except Exception: diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py index c195426c..732c266b 100755 --- a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py +++ b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py @@ -133,6 +133,7 @@ def main(): ) except Exception as exc: repos.pop() + set_repos(cfg, repos) try: save_config(cfg, project_root) except Exception: diff --git a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py index c195426c..732c266b 100755 --- a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py +++ 
b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py @@ -133,6 +133,7 @@ def main(): ) except Exception as exc: repos.pop() + set_repos(cfg, repos) try: save_config(cfg, project_root) except Exception: diff --git a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py index c195426c..732c266b 100755 --- a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py +++ b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py @@ -133,6 +133,7 @@ def main(): ) except Exception as exc: repos.pop() + set_repos(cfg, repos) try: save_config(cfg, project_root) except Exception: diff --git a/plugin-source/skills/evolve-lite/subscribe/scripts/subscribe.py b/plugin-source/skills/evolve-lite/subscribe/scripts/subscribe.py index c195426c..732c266b 100755 --- a/plugin-source/skills/evolve-lite/subscribe/scripts/subscribe.py +++ b/plugin-source/skills/evolve-lite/subscribe/scripts/subscribe.py @@ -133,6 +133,7 @@ def main(): ) except Exception as exc: repos.pop() + set_repos(cfg, repos) try: save_config(cfg, project_root) except Exception: diff --git a/tests/platform_integrations/test_codex_sharing.py b/tests/platform_integrations/test_codex_sharing.py index c67b10ca..4e3a303e 100644 --- a/tests/platform_integrations/test_codex_sharing.py +++ b/tests/platform_integrations/test_codex_sharing.py @@ -367,7 +367,7 @@ def test_subscribe_rolls_back_clone_when_config_save_fails(self, temp_project_di assert result.returncode != 0 assert not (evolve_dir / "entities" / "subscribed" / "alice").exists() - def test_subscribe_warns_when_audit_write_fails(self, temp_project_dir, local_repo): + def test_subscribe_rolls_back_when_audit_write_fails(self, temp_project_dir, local_repo): evolve_dir = temp_project_dir / ".evolve" 
(evolve_dir / "audit.log").mkdir(parents=True) @@ -376,13 +376,14 @@ def test_subscribe_warns_when_audit_write_fails(self, temp_project_dir, local_re project_dir=temp_project_dir, args=["--name", "alice", "--remote", str(local_repo["bare"]), "--branch", "main"], evolve_dir=evolve_dir, + expect_success=False, ) - assert result.returncode == 0 - assert "Warning: failed to append audit entry for subscribe" in result.stderr - assert (evolve_dir / "entities" / "subscribed" / "alice").is_dir() + assert result.returncode != 0 + assert "failed to record subscription" in result.stderr + assert not (evolve_dir / "entities" / "subscribed" / "alice").exists() config_text = (temp_project_dir / "evolve.config.yaml").read_text() - assert "name: alice" in config_text + assert "name: alice" not in config_text def test_subscribe_rejects_path_traversal_in_name(self, temp_project_dir, local_repo): result = run_script( From 06c3d578ee61d2ff249dffed4aafedfed7de3e9f Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Thu, 7 May 2026 00:25:02 -0500 Subject: [PATCH 04/13] fix(sync): emit invalid-subscription rejections on stderr Addresses review feedback from visahak Rejected subscription entries (invalid names, missing remotes, unknown scopes) and path-traversal guard trips were being folded into the stdout "Synced N repo(s):" summary, making diagnostics indistinguishable from normal sync output and breaking the platform-integration tests that assert diagnostics appear on stderr. Route each rejection through stderr instead and exclude rejected entries from the stdout summary count. 
--- .../bob/evolve-lite/skills/evolve-lite-sync/scripts/sync.py | 4 ++-- .../evolve-lite/skills/evolve-lite/sync/scripts/sync.py | 4 ++-- .../evolve-lite/skills/evolve-lite/sync/scripts/sync.py | 4 ++-- .../evolve-lite/skills/evolve-lite/sync/scripts/sync.py | 4 ++-- plugin-source/skills/evolve-lite/sync/scripts/sync.py | 4 ++-- 5 files changed, 10 insertions(+), 10 deletions(-) diff --git a/platform-integrations/bob/evolve-lite/skills/evolve-lite-sync/scripts/sync.py b/platform-integrations/bob/evolve-lite/skills/evolve-lite-sync/scripts/sync.py index 33c34716..4fd0f624 100755 --- a/platform-integrations/bob/evolve-lite/skills/evolve-lite-sync/scripts/sync.py +++ b/platform-integrations/bob/evolve-lite/skills/evolve-lite-sync/scripts/sync.py @@ -162,7 +162,7 @@ def main(): raw_name = rejection["raw_name"] reason = rejection["reason"] label = repr(raw_name) if raw_name else "" - summaries.append(f"{label} (skipped - {reason})") + print(f"{label} (skipped - {reason})", file=sys.stderr) for repo in repos: name = repo["name"] @@ -174,7 +174,7 @@ def main(): repo_path = (evolve_dir / "entities" / "subscribed" / name).resolve() if repo_path == subscribed_base or not repo_path.is_relative_to(subscribed_base): - summaries.append(f"{name!r} (skipped - invalid subscription name)") + print(f"{name!r} (skipped - invalid subscription name)", file=sys.stderr) continue if not repo_path.is_dir(): diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/sync/scripts/sync.py b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/sync/scripts/sync.py index 33c34716..4fd0f624 100755 --- a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/sync/scripts/sync.py +++ b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/sync/scripts/sync.py @@ -162,7 +162,7 @@ def main(): raw_name = rejection["raw_name"] reason = rejection["reason"] label = repr(raw_name) if raw_name else "" - summaries.append(f"{label} (skipped - 
{reason})") + print(f"{label} (skipped - {reason})", file=sys.stderr) for repo in repos: name = repo["name"] @@ -174,7 +174,7 @@ def main(): repo_path = (evolve_dir / "entities" / "subscribed" / name).resolve() if repo_path == subscribed_base or not repo_path.is_relative_to(subscribed_base): - summaries.append(f"{name!r} (skipped - invalid subscription name)") + print(f"{name!r} (skipped - invalid subscription name)", file=sys.stderr) continue if not repo_path.is_dir(): diff --git a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/sync/scripts/sync.py b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/sync/scripts/sync.py index 33c34716..4fd0f624 100755 --- a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/sync/scripts/sync.py +++ b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/sync/scripts/sync.py @@ -162,7 +162,7 @@ def main(): raw_name = rejection["raw_name"] reason = rejection["reason"] label = repr(raw_name) if raw_name else "" - summaries.append(f"{label} (skipped - {reason})") + print(f"{label} (skipped - {reason})", file=sys.stderr) for repo in repos: name = repo["name"] @@ -174,7 +174,7 @@ def main(): repo_path = (evolve_dir / "entities" / "subscribed" / name).resolve() if repo_path == subscribed_base or not repo_path.is_relative_to(subscribed_base): - summaries.append(f"{name!r} (skipped - invalid subscription name)") + print(f"{name!r} (skipped - invalid subscription name)", file=sys.stderr) continue if not repo_path.is_dir(): diff --git a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/sync/scripts/sync.py b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/sync/scripts/sync.py index 33c34716..4fd0f624 100755 --- a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/sync/scripts/sync.py +++ b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/sync/scripts/sync.py @@ -162,7 +162,7 @@ def main(): 
raw_name = rejection["raw_name"] reason = rejection["reason"] label = repr(raw_name) if raw_name else "" - summaries.append(f"{label} (skipped - {reason})") + print(f"{label} (skipped - {reason})", file=sys.stderr) for repo in repos: name = repo["name"] @@ -174,7 +174,7 @@ def main(): repo_path = (evolve_dir / "entities" / "subscribed" / name).resolve() if repo_path == subscribed_base or not repo_path.is_relative_to(subscribed_base): - summaries.append(f"{name!r} (skipped - invalid subscription name)") + print(f"{name!r} (skipped - invalid subscription name)", file=sys.stderr) continue if not repo_path.is_dir(): diff --git a/plugin-source/skills/evolve-lite/sync/scripts/sync.py b/plugin-source/skills/evolve-lite/sync/scripts/sync.py index 33c34716..4fd0f624 100755 --- a/plugin-source/skills/evolve-lite/sync/scripts/sync.py +++ b/plugin-source/skills/evolve-lite/sync/scripts/sync.py @@ -162,7 +162,7 @@ def main(): raw_name = rejection["raw_name"] reason = rejection["reason"] label = repr(raw_name) if raw_name else "" - summaries.append(f"{label} (skipped - {reason})") + print(f"{label} (skipped - {reason})", file=sys.stderr) for repo in repos: name = repo["name"] @@ -174,7 +174,7 @@ def main(): repo_path = (evolve_dir / "entities" / "subscribed" / name).resolve() if repo_path == subscribed_base or not repo_path.is_relative_to(subscribed_base): - summaries.append(f"{name!r} (skipped - invalid subscription name)") + print(f"{name!r} (skipped - invalid subscription name)", file=sys.stderr) continue if not repo_path.is_dir(): From 0068873d60e6ddb448f0b58ae838e37a2b483aed Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Thu, 7 May 2026 00:43:26 -0500 Subject: [PATCH 05/13] fix(save-trajectory): embed session_id in trajectory filename for provenance Addresses review feedback from visahak The provenance skill advertised deterministic session-to-trajectory matching via session_id, but save-trajectory wrote files as trajectory_<timestamp>.json with no persisted session identifier, 
so two back-to-back codex sessions produced indistinguishable files and the e2e test had to pass the target session id in through the prompt. Extend the envelope to include session_id (Step 4 of save-trajectory) and thread that id into the output filename as trajectory_<timestamp>_<session_id>.json so provenance can resolve a recall event to exactly one trajectory by inspecting the filename. The filename slice is sanitised to filesystem-safe characters and capped at 64 chars. When no session id is available the filename falls back to the original trajectory_<timestamp>.json form. Provenance SKILL Step 2 now documents a three-step matching strategy (claude transcript filename; session-id suffix on trajectory files; envelope session_id field as a last resort) so agents do not have to guess from content alone. --- .../skills/evolve-lite-provenance/SKILL.md | 9 ++++--- .../evolve-lite-save-trajectory/SKILL.md | 2 ++ .../scripts/save_trajectory.py | 27 +++++++++++++++---- .../skills/evolve-lite/provenance/SKILL.md | 9 ++++--- .../evolve-lite/save-trajectory/SKILL.md | 2 ++ .../scripts/save_trajectory.py | 27 +++++++++++++++---- .../skills/evolve-lite/provenance/SKILL.md | 9 ++++--- .../evolve-lite/save-trajectory/SKILL.md | 2 ++ .../scripts/save_trajectory.py | 27 +++++++++++++++---- .../skills/evolve-lite/provenance/SKILL.md | 9 ++++--- .../evolve-lite/save-trajectory/SKILL.md | 2 ++ .../scripts/save_trajectory.py | 27 +++++++++++++++---- .../skills/evolve-lite/provenance/SKILL.md.j2 | 11 ++++---- .../evolve-lite/save-trajectory/SKILL.md.j2 | 2 ++ .../scripts/save_trajectory.py | 27 +++++++++++++++---- 15 files changed, 146 insertions(+), 46 deletions(-) diff --git a/platform-integrations/bob/evolve-lite/skills/evolve-lite-provenance/SKILL.md b/platform-integrations/bob/evolve-lite/skills/evolve-lite-provenance/SKILL.md index 47e234d6..25ee891a 100644 --- a/platform-integrations/bob/evolve-lite/skills/evolve-lite-provenance/SKILL.md +++ 
b/platform-integrations/bob/evolve-lite/skills/evolve-lite-provenance/SKILL.md @@ -23,11 +23,12 @@ Skip any recall event that already has `influence` entries for the same `session List `.evolve/trajectories/` and match each recall event to a trajectory by `session_id`. -Supported trajectory names: -- `claude-transcript_.jsonl` -- `trajectory_*.json` when its content corresponds to the session being assessed +Matching strategy (in order): +1. `claude-transcript_<session_id>.jsonl` - the stop-hook transcript dump; the session id is in the filename. +2. `trajectory_<timestamp>_<session_id>.json` - written by the evolve-lite:save-trajectory skill when a session id is available. Match on the `<session_id>` slice of the filename. +3. `trajectory_<timestamp>.json` - open the file and match its top-level `session_id` field against the recall event. Only fall back to this step when the filename alone does not identify the session. -If you cannot confidently match a recall event to a trajectory, skip it. +If none of the above yields a confident match for a recall event, skip it. Do not guess. ### Step 3: Read Recalled Entities diff --git a/platform-integrations/bob/evolve-lite/skills/evolve-lite-save-trajectory/SKILL.md b/platform-integrations/bob/evolve-lite/skills/evolve-lite-save-trajectory/SKILL.md index 25b2bcac..6287e65c 100644 --- a/platform-integrations/bob/evolve-lite/skills/evolve-lite-save-trajectory/SKILL.md +++ b/platform-integrations/bob/evolve-lite/skills/evolve-lite-save-trajectory/SKILL.md @@ -99,12 +99,14 @@ Wrap the messages array in a trajectory envelope: { "model": "", "timestamp": "2025-01-15T10:30:00Z", + "session_id": "", "messages": [...] } ``` - **model**: Use the exact model ID from the current session's environment context (e.g., the value after "You are powered by the model named …"). Do not hardcode a default — always read it from the session. - **timestamp**: Current ISO 8601 timestamp +- **session_id**: The current session identifier. 
Read it from the session's environment context (e.g., `CLAUDE_SESSION_ID`, the `session_id` passed into the skill, or the session identifier exposed by the harness). Include it verbatim so offline provenance can match this trajectory to `recall` audit events for the same session. If no session id is available, omit the field. ### Step 5: Save via Helper Script diff --git a/platform-integrations/bob/evolve-lite/skills/evolve-lite-save-trajectory/scripts/save_trajectory.py b/platform-integrations/bob/evolve-lite/skills/evolve-lite-save-trajectory/scripts/save_trajectory.py index f34571eb..a3ee4ac2 100755 --- a/platform-integrations/bob/evolve-lite/skills/evolve-lite-save-trajectory/scripts/save_trajectory.py +++ b/platform-integrations/bob/evolve-lite/skills/evolve-lite-save-trajectory/scripts/save_trajectory.py @@ -9,6 +9,7 @@ import getpass import json import os +import re import sys import tempfile from pathlib import Path @@ -65,15 +66,29 @@ def get_trajectories_dir(): return base.resolve() -def open_trajectory_file(trajectories_dir): +_SAFE_SESSION_ID = re.compile(r"[^A-Za-z0-9._-]") + + +def _sanitize_session_id(session_id): + """Return a filesystem-safe slice of ``session_id`` (empty if unusable).""" + if not isinstance(session_id, str): + return "" + cleaned = _SAFE_SESSION_ID.sub("-", session_id.strip()) + return cleaned[:64] + + +def open_trajectory_file(trajectories_dir, session_id=None): """Atomically claim a timestamped trajectory file. Returns a ``(Path, fd)`` tuple. Uses ``O_CREAT | O_EXCL`` so two saves racing within the same second pick distinct filenames instead of one - overwriting the other. + overwriting the other. When ``session_id`` is provided, it is embedded + in the filename so offline provenance can match this trajectory to + ``recall`` audit events for the same session without content inspection. 
""" now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") - base_name = f"trajectory_{now}" + sid = _sanitize_session_id(session_id) + base_name = f"trajectory_{now}_{sid}" if sid else f"trajectory_{now}" for suffix in range(0, 1000): name = f"{base_name}.json" if suffix == 0 else f"{base_name}_{suffix}.json" @@ -121,9 +136,11 @@ def main(): log(f"Trajectory has {len(messages)} messages") - # Atomically claim a unique output path (handles same-second races) + # Atomically claim a unique output path (handles same-second races). + # Embed session_id in the filename when present so offline provenance + # can match recall events to trajectories deterministically. trajectories_dir = get_trajectories_dir() - output_path, fd = open_trajectory_file(trajectories_dir) + output_path, fd = open_trajectory_file(trajectories_dir, trajectory.get("session_id")) # Write formatted JSON via the already-opened owner-only fd try: diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md index 0d8eaa75..e6ff7825 100644 --- a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md +++ b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md @@ -23,11 +23,12 @@ Skip any recall event that already has `influence` entries for the same `session List `.evolve/trajectories/` and match each recall event to a trajectory by `session_id`. -Supported trajectory names: -- `claude-transcript_.jsonl` -- `trajectory_*.json` when its content corresponds to the session being assessed +Matching strategy (in order): +1. `claude-transcript_.jsonl` - the stop-hook transcript dump; the session id is in the filename. +2. `trajectory__.json` - written by the /evolve-lite:save-trajectory skill when a session id is available. Match on the `` slice of the filename. +3. 
`trajectory_<timestamp>.json` - open the file and match its top-level `session_id` field against the recall event. Only fall back to this step when the filename alone does not identify the session. -If you cannot confidently match a recall event to a trajectory, skip it. +If none of the above yields a confident match for a recall event, skip it. Do not guess. ### Step 3: Read Recalled Entities diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md index 7ce54252..aa0b62fa 100644 --- a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md +++ b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md @@ -100,12 +100,14 @@ Wrap the messages array in a trajectory envelope: { "model": "", "timestamp": "2025-01-15T10:30:00Z", + "session_id": "", "messages": [...] } ``` - **model**: Use the exact model ID from the current session's environment context (e.g., the value after "You are powered by the model named …"). Do not hardcode a default — always read it from the session. - **timestamp**: Current ISO 8601 timestamp +- **session_id**: The current session identifier. Read it from the session's environment context (e.g., `CLAUDE_SESSION_ID`, the `session_id` passed into the skill, or the session identifier exposed by the harness). Include it verbatim so offline provenance can match this trajectory to `recall` audit events for the same session. If no session id is available, omit the field.
### Step 5: Save via Helper Script diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py index f34571eb..a3ee4ac2 100755 --- a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py +++ b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py @@ -9,6 +9,7 @@ import getpass import json import os +import re import sys import tempfile from pathlib import Path @@ -65,15 +66,29 @@ def get_trajectories_dir(): return base.resolve() -def open_trajectory_file(trajectories_dir): +_SAFE_SESSION_ID = re.compile(r"[^A-Za-z0-9._-]") + + +def _sanitize_session_id(session_id): + """Return a filesystem-safe slice of ``session_id`` (empty if unusable).""" + if not isinstance(session_id, str): + return "" + cleaned = _SAFE_SESSION_ID.sub("-", session_id.strip()) + return cleaned[:64] + + +def open_trajectory_file(trajectories_dir, session_id=None): """Atomically claim a timestamped trajectory file. Returns a ``(Path, fd)`` tuple. Uses ``O_CREAT | O_EXCL`` so two saves racing within the same second pick distinct filenames instead of one - overwriting the other. + overwriting the other. When ``session_id`` is provided, it is embedded + in the filename so offline provenance can match this trajectory to + ``recall`` audit events for the same session without content inspection. 
""" now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") - base_name = f"trajectory_{now}" + sid = _sanitize_session_id(session_id) + base_name = f"trajectory_{now}_{sid}" if sid else f"trajectory_{now}" for suffix in range(0, 1000): name = f"{base_name}.json" if suffix == 0 else f"{base_name}_{suffix}.json" @@ -121,9 +136,11 @@ def main(): log(f"Trajectory has {len(messages)} messages") - # Atomically claim a unique output path (handles same-second races) + # Atomically claim a unique output path (handles same-second races). + # Embed session_id in the filename when present so offline provenance + # can match recall events to trajectories deterministically. trajectories_dir = get_trajectories_dir() - output_path, fd = open_trajectory_file(trajectories_dir) + output_path, fd = open_trajectory_file(trajectories_dir, trajectory.get("session_id")) # Write formatted JSON via the already-opened owner-only fd try: diff --git a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md index e885d4ee..de5023bb 100644 --- a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md +++ b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md @@ -23,11 +23,12 @@ Skip any recall event that already has `influence` entries for the same `session List `.evolve/trajectories/` and match each recall event to a trajectory by `session_id`. -Supported trajectory names: -- `claude-transcript_.jsonl` -- `trajectory_*.json` when its content corresponds to the session being assessed +Matching strategy (in order): +1. `claude-transcript_.jsonl` - the stop-hook transcript dump; the session id is in the filename. +2. `trajectory__.json` - written by the /evolve-lite:save-trajectory skill when a session id is available. Match on the `` slice of the filename. +3. 
`trajectory_<timestamp>.json` - open the file and match its top-level `session_id` field against the recall event. Only fall back to this step when the filename alone does not identify the session. -If you cannot confidently match a recall event to a trajectory, skip it. +If none of the above yields a confident match for a recall event, skip it. Do not guess. ### Step 3: Read Recalled Entities diff --git a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md index 77f455b3..1335a3e2 100644 --- a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md +++ b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md @@ -99,12 +99,14 @@ Wrap the messages array in a trajectory envelope: { "model": "", "timestamp": "2025-01-15T10:30:00Z", + "session_id": "", "messages": [...] } ``` - **model**: Use the exact model ID from the current session's environment context (e.g., the value after "You are powered by the model named …"). Do not hardcode a default — always read it from the session. - **timestamp**: Current ISO 8601 timestamp +- **session_id**: The current session identifier. Read it from the session's environment context (e.g., `CLAUDE_SESSION_ID`, the `session_id` passed into the skill, or the session identifier exposed by the harness). Include it verbatim so offline provenance can match this trajectory to `recall` audit events for the same session. If no session id is available, omit the field.
### Step 5: Save via Helper Script diff --git a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py index f34571eb..a3ee4ac2 100755 --- a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py +++ b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py @@ -9,6 +9,7 @@ import getpass import json import os +import re import sys import tempfile from pathlib import Path @@ -65,15 +66,29 @@ def get_trajectories_dir(): return base.resolve() -def open_trajectory_file(trajectories_dir): +_SAFE_SESSION_ID = re.compile(r"[^A-Za-z0-9._-]") + + +def _sanitize_session_id(session_id): + """Return a filesystem-safe slice of ``session_id`` (empty if unusable).""" + if not isinstance(session_id, str): + return "" + cleaned = _SAFE_SESSION_ID.sub("-", session_id.strip()) + return cleaned[:64] + + +def open_trajectory_file(trajectories_dir, session_id=None): """Atomically claim a timestamped trajectory file. Returns a ``(Path, fd)`` tuple. Uses ``O_CREAT | O_EXCL`` so two saves racing within the same second pick distinct filenames instead of one - overwriting the other. + overwriting the other. When ``session_id`` is provided, it is embedded + in the filename so offline provenance can match this trajectory to + ``recall`` audit events for the same session without content inspection. 
""" now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") - base_name = f"trajectory_{now}" + sid = _sanitize_session_id(session_id) + base_name = f"trajectory_{now}_{sid}" if sid else f"trajectory_{now}" for suffix in range(0, 1000): name = f"{base_name}.json" if suffix == 0 else f"{base_name}_{suffix}.json" @@ -121,9 +136,11 @@ def main(): log(f"Trajectory has {len(messages)} messages") - # Atomically claim a unique output path (handles same-second races) + # Atomically claim a unique output path (handles same-second races). + # Embed session_id in the filename when present so offline provenance + # can match recall events to trajectories deterministically. trajectories_dir = get_trajectories_dir() - output_path, fd = open_trajectory_file(trajectories_dir) + output_path, fd = open_trajectory_file(trajectories_dir, trajectory.get("session_id")) # Write formatted JSON via the already-opened owner-only fd try: diff --git a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md index 3e1d8ab2..349ac090 100644 --- a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md +++ b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md @@ -23,11 +23,12 @@ Skip any recall event that already has `influence` entries for the same `session List `.evolve/trajectories/` and match each recall event to a trajectory by `session_id`. -Supported trajectory names: -- `claude-transcript_.jsonl` -- `trajectory_*.json` when its content corresponds to the session being assessed +Matching strategy (in order): +1. `claude-transcript_.jsonl` - the stop-hook transcript dump; the session id is in the filename. +2. `trajectory__.json` - written by the evolve-lite:save-trajectory skill when a session id is available. Match on the `` slice of the filename. +3. 
`trajectory_<timestamp>.json` - open the file and match its top-level `session_id` field against the recall event. Only fall back to this step when the filename alone does not identify the session. -If you cannot confidently match a recall event to a trajectory, skip it. +If none of the above yields a confident match for a recall event, skip it. Do not guess. ### Step 3: Read Recalled Entities diff --git a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md index aa0b1bdb..816a64e2 100644 --- a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md +++ b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md @@ -99,12 +99,14 @@ Wrap the messages array in a trajectory envelope: { "model": "", "timestamp": "2025-01-15T10:30:00Z", + "session_id": "", "messages": [...] } ``` - **model**: Use the exact model ID from the current session's environment context (e.g., the value after "You are powered by the model named …"). Do not hardcode a default — always read it from the session. - **timestamp**: Current ISO 8601 timestamp +- **session_id**: The current session identifier. Read it from the session's environment context (e.g., `CLAUDE_SESSION_ID`, the `session_id` passed into the skill, or the session identifier exposed by the harness). Include it verbatim so offline provenance can match this trajectory to `recall` audit events for the same session. If no session id is available, omit the field.
### Step 5: Save via Helper Script diff --git a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py index f34571eb..a3ee4ac2 100755 --- a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py +++ b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py @@ -9,6 +9,7 @@ import getpass import json import os +import re import sys import tempfile from pathlib import Path @@ -65,15 +66,29 @@ def get_trajectories_dir(): return base.resolve() -def open_trajectory_file(trajectories_dir): +_SAFE_SESSION_ID = re.compile(r"[^A-Za-z0-9._-]") + + +def _sanitize_session_id(session_id): + """Return a filesystem-safe slice of ``session_id`` (empty if unusable).""" + if not isinstance(session_id, str): + return "" + cleaned = _SAFE_SESSION_ID.sub("-", session_id.strip()) + return cleaned[:64] + + +def open_trajectory_file(trajectories_dir, session_id=None): """Atomically claim a timestamped trajectory file. Returns a ``(Path, fd)`` tuple. Uses ``O_CREAT | O_EXCL`` so two saves racing within the same second pick distinct filenames instead of one - overwriting the other. + overwriting the other. When ``session_id`` is provided, it is embedded + in the filename so offline provenance can match this trajectory to + ``recall`` audit events for the same session without content inspection. 
""" now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") - base_name = f"trajectory_{now}" + sid = _sanitize_session_id(session_id) + base_name = f"trajectory_{now}_{sid}" if sid else f"trajectory_{now}" for suffix in range(0, 1000): name = f"{base_name}.json" if suffix == 0 else f"{base_name}_{suffix}.json" @@ -121,9 +136,11 @@ def main(): log(f"Trajectory has {len(messages)} messages") - # Atomically claim a unique output path (handles same-second races) + # Atomically claim a unique output path (handles same-second races). + # Embed session_id in the filename when present so offline provenance + # can match recall events to trajectories deterministically. trajectories_dir = get_trajectories_dir() - output_path, fd = open_trajectory_file(trajectories_dir) + output_path, fd = open_trajectory_file(trajectories_dir, trajectory.get("session_id")) # Write formatted JSON via the already-opened owner-only fd try: diff --git a/plugin-source/skills/evolve-lite/provenance/SKILL.md.j2 b/plugin-source/skills/evolve-lite/provenance/SKILL.md.j2 index 7c72298d..ee704616 100644 --- a/plugin-source/skills/evolve-lite/provenance/SKILL.md.j2 +++ b/plugin-source/skills/evolve-lite/provenance/SKILL.md.j2 @@ -1,4 +1,4 @@ -{%- from "_macros.j2" import invoke with context -%} +{%- from "_macros.j2" import invoke, skill_ref with context -%} --- name: {% if platform == "bob" %}evolve-lite:{% endif %}provenance description: Analyze saved trajectories and recall audit events offline to record whether recalled guidelines influenced completed sessions. @@ -24,11 +24,12 @@ Skip any recall event that already has `influence` entries for the same `session List `.evolve/trajectories/` and match each recall event to a trajectory by `session_id`. -Supported trajectory names: -- `claude-transcript_.jsonl` -- `trajectory_*.json` when its content corresponds to the session being assessed +Matching strategy (in order): +1. 
`claude-transcript_<session_id>.jsonl` - the stop-hook transcript dump; the session id is in the filename. +2. `trajectory_<timestamp>_<session_id>.json` - written by the {{ skill_ref("save-trajectory") }} skill when a session id is available. Match on the `<session_id>` slice of the filename. +3. `trajectory_<timestamp>.json` - open the file and match its top-level `session_id` field against the recall event. Only fall back to this step when the filename alone does not identify the session. -If you cannot confidently match a recall event to a trajectory, skip it. +If none of the above yields a confident match for a recall event, skip it. Do not guess. ### Step 3: Read Recalled Entities diff --git a/plugin-source/skills/evolve-lite/save-trajectory/SKILL.md.j2 b/plugin-source/skills/evolve-lite/save-trajectory/SKILL.md.j2 index 33541a48..4ce8c34c 100644 --- a/plugin-source/skills/evolve-lite/save-trajectory/SKILL.md.j2 +++ b/plugin-source/skills/evolve-lite/save-trajectory/SKILL.md.j2 @@ -102,12 +102,14 @@ Wrap the messages array in a trajectory envelope: { "model": "", "timestamp": "2025-01-15T10:30:00Z", + "session_id": "", "messages": [...] } ``` - **model**: Use the exact model ID from the current session's environment context (e.g., the value after "You are powered by the model named …"). Do not hardcode a default — always read it from the session. - **timestamp**: Current ISO 8601 timestamp +- **session_id**: The current session identifier. Read it from the session's environment context (e.g., `CLAUDE_SESSION_ID`, the `session_id` passed into the skill, or the session identifier exposed by the harness). Include it verbatim so offline provenance can match this trajectory to `recall` audit events for the same session. If no session id is available, omit the field.
### Step 5: Save via Helper Script diff --git a/plugin-source/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py b/plugin-source/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py index f34571eb..a3ee4ac2 100755 --- a/plugin-source/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py +++ b/plugin-source/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py @@ -9,6 +9,7 @@ import getpass import json import os +import re import sys import tempfile from pathlib import Path @@ -65,15 +66,29 @@ def get_trajectories_dir(): return base.resolve() -def open_trajectory_file(trajectories_dir): +_SAFE_SESSION_ID = re.compile(r"[^A-Za-z0-9._-]") + + +def _sanitize_session_id(session_id): + """Return a filesystem-safe slice of ``session_id`` (empty if unusable).""" + if not isinstance(session_id, str): + return "" + cleaned = _SAFE_SESSION_ID.sub("-", session_id.strip()) + return cleaned[:64] + + +def open_trajectory_file(trajectories_dir, session_id=None): """Atomically claim a timestamped trajectory file. Returns a ``(Path, fd)`` tuple. Uses ``O_CREAT | O_EXCL`` so two saves racing within the same second pick distinct filenames instead of one - overwriting the other. + overwriting the other. When ``session_id`` is provided, it is embedded + in the filename so offline provenance can match this trajectory to + ``recall`` audit events for the same session without content inspection. """ now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") - base_name = f"trajectory_{now}" + sid = _sanitize_session_id(session_id) + base_name = f"trajectory_{now}_{sid}" if sid else f"trajectory_{now}" for suffix in range(0, 1000): name = f"{base_name}.json" if suffix == 0 else f"{base_name}_{suffix}.json" @@ -121,9 +136,11 @@ def main(): log(f"Trajectory has {len(messages)} messages") - # Atomically claim a unique output path (handles same-second races) + # Atomically claim a unique output path (handles same-second races). 
+ # Embed session_id in the filename when present so offline provenance + # can match recall events to trajectories deterministically. trajectories_dir = get_trajectories_dir() - output_path, fd = open_trajectory_file(trajectories_dir) + output_path, fd = open_trajectory_file(trajectories_dir, trajectory.get("session_id")) # Write formatted JSON via the already-opened owner-only fd try: From 4e4358b666a0e10d97600b1929437f358c4f86a1 Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Thu, 7 May 2026 01:10:59 -0500 Subject: [PATCH 06/13] fix(subscribe): warn on rollback save_config failure instead of silently swallowing Addresses CodeRabbit review finding: Silent rollback-save failure leaves config and filesystem inconsistent The compensating save_config call in the audit-failure rollback path previously swallowed every exception with a bare `except Exception: pass`, which could leave the on-disk evolve.config.yaml still listing the freshly-added repo even though the clone was removed. Print a clear stderr warning that names the affected project_root, the caught exception, and the subscription entry that may still need manual removal so the user can repair the config themselves. 
--- .../skills/evolve-lite-subscribe/scripts/subscribe.py | 9 +++++++-- .../skills/evolve-lite/subscribe/scripts/subscribe.py | 9 +++++++-- .../skills/evolve-lite/subscribe/scripts/subscribe.py | 9 +++++++-- .../skills/evolve-lite/subscribe/scripts/subscribe.py | 9 +++++++-- .../skills/evolve-lite/subscribe/scripts/subscribe.py | 9 +++++++-- 5 files changed, 35 insertions(+), 10 deletions(-) diff --git a/platform-integrations/bob/evolve-lite/skills/evolve-lite-subscribe/scripts/subscribe.py b/platform-integrations/bob/evolve-lite/skills/evolve-lite-subscribe/scripts/subscribe.py index 732c266b..3f50b3a6 100755 --- a/platform-integrations/bob/evolve-lite/skills/evolve-lite-subscribe/scripts/subscribe.py +++ b/platform-integrations/bob/evolve-lite/skills/evolve-lite-subscribe/scripts/subscribe.py @@ -136,8 +136,13 @@ def main(): set_repos(cfg, repos) try: save_config(cfg, project_root) - except Exception: - pass + except Exception as save_exc: + print( + f"Warning: rollback save_config failed under {project_root!r}: {save_exc}. 
" + f"The clone was removed but evolve.config.yaml may still list '{args.name}' - " + f"please inspect the file and remove the entry manually if present.", + file=sys.stderr, + ) if dest.exists(): shutil.rmtree(dest, ignore_errors=True) print(f"Error: failed to record subscription in audit log: {exc}", file=sys.stderr) diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py index 732c266b..3f50b3a6 100755 --- a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py +++ b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py @@ -136,8 +136,13 @@ def main(): set_repos(cfg, repos) try: save_config(cfg, project_root) - except Exception: - pass + except Exception as save_exc: + print( + f"Warning: rollback save_config failed under {project_root!r}: {save_exc}. 
" + f"The clone was removed but evolve.config.yaml may still list '{args.name}' - " + f"please inspect the file and remove the entry manually if present.", + file=sys.stderr, + ) if dest.exists(): shutil.rmtree(dest, ignore_errors=True) print(f"Error: failed to record subscription in audit log: {exc}", file=sys.stderr) diff --git a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py index 732c266b..3f50b3a6 100755 --- a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py +++ b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py @@ -136,8 +136,13 @@ def main(): set_repos(cfg, repos) try: save_config(cfg, project_root) - except Exception: - pass + except Exception as save_exc: + print( + f"Warning: rollback save_config failed under {project_root!r}: {save_exc}. 
" + f"The clone was removed but evolve.config.yaml may still list '{args.name}' - " + f"please inspect the file and remove the entry manually if present.", + file=sys.stderr, + ) if dest.exists(): shutil.rmtree(dest, ignore_errors=True) print(f"Error: failed to record subscription in audit log: {exc}", file=sys.stderr) diff --git a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py index 732c266b..3f50b3a6 100755 --- a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py +++ b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py @@ -136,8 +136,13 @@ def main(): set_repos(cfg, repos) try: save_config(cfg, project_root) - except Exception: - pass + except Exception as save_exc: + print( + f"Warning: rollback save_config failed under {project_root!r}: {save_exc}. " + f"The clone was removed but evolve.config.yaml may still list '{args.name}' - " + f"please inspect the file and remove the entry manually if present.", + file=sys.stderr, + ) if dest.exists(): shutil.rmtree(dest, ignore_errors=True) print(f"Error: failed to record subscription in audit log: {exc}", file=sys.stderr) diff --git a/plugin-source/skills/evolve-lite/subscribe/scripts/subscribe.py b/plugin-source/skills/evolve-lite/subscribe/scripts/subscribe.py index 732c266b..3f50b3a6 100755 --- a/plugin-source/skills/evolve-lite/subscribe/scripts/subscribe.py +++ b/plugin-source/skills/evolve-lite/subscribe/scripts/subscribe.py @@ -136,8 +136,13 @@ def main(): set_repos(cfg, repos) try: save_config(cfg, project_root) - except Exception: - pass + except Exception as save_exc: + print( + f"Warning: rollback save_config failed under {project_root!r}: {save_exc}. 
" + f"The clone was removed but evolve.config.yaml may still list '{args.name}' - " + f"please inspect the file and remove the entry manually if present.", + file=sys.stderr, + ) if dest.exists(): shutil.rmtree(dest, ignore_errors=True) print(f"Error: failed to record subscription in audit log: {exc}", file=sys.stderr) From df3ac0286575cc9b595dfdd3e419353677eaffb7 Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Thu, 7 May 2026 01:11:29 -0500 Subject: [PATCH 07/13] fix(provenance): harden payload type validation in log_influence Addresses CodeRabbit review finding: Harden type validation before verdict checks and dedupe keying Require session_id to be a non-empty string at the payload level, and require each assessment's entity field to be a non-empty string inside the loop; malformed items are logged and skipped instead of risking a TypeError when the (session_id, entity) dedupe key is built. Coerce evidence to a string for the same reason so the audit schema stays stable even if callers hand us a numeric or null evidence field. 
--- .../evolve-lite-provenance/scripts/log_influence.py | 13 +++++++++---- .../evolve-lite/provenance/scripts/log_influence.py | 13 +++++++++---- .../evolve-lite/provenance/scripts/log_influence.py | 13 +++++++++---- .../evolve-lite/provenance/scripts/log_influence.py | 13 +++++++++---- .../evolve-lite/provenance/scripts/log_influence.py | 13 +++++++++---- 5 files changed, 45 insertions(+), 20 deletions(-) diff --git a/platform-integrations/bob/evolve-lite/skills/evolve-lite-provenance/scripts/log_influence.py b/platform-integrations/bob/evolve-lite/skills/evolve-lite-provenance/scripts/log_influence.py index c22c6870..79bdd28a 100644 --- a/platform-integrations/bob/evolve-lite/skills/evolve-lite-provenance/scripts/log_influence.py +++ b/platform-integrations/bob/evolve-lite/skills/evolve-lite-provenance/scripts/log_influence.py @@ -75,9 +75,9 @@ def main(): session_id = payload.get("session_id") assessments = payload.get("assessments", []) - if not session_id or not isinstance(assessments, list): + if not isinstance(session_id, str) or not session_id or not isinstance(assessments, list): log(f"Bad payload shape: session_id={session_id!r} assessments_type={type(assessments).__name__}") - print("Error: payload must include `session_id` and a list `assessments`.", file=sys.stderr) + print("Error: payload must include a string `session_id` and a list `assessments`.", file=sys.stderr) sys.exit(1) evolve_dir = get_evolve_dir().resolve() @@ -91,9 +91,14 @@ def main(): entity = assessment.get("entity") verdict = assessment.get("verdict") evidence = assessment.get("evidence", "") - if not entity or verdict not in _ALLOWED_VERDICTS: - log(f"Skipping invalid assessment: {assessment}") + if not isinstance(entity, str) or not entity: + log(f"Skipping assessment with non-string entity: {assessment!r}") continue + if verdict not in _ALLOWED_VERDICTS: + log(f"Skipping invalid assessment verdict: {assessment}") + continue + if not isinstance(evidence, str): + evidence = 
str(evidence) key = (session_id, entity) if key in existing_keys: log(f"Skipping duplicate influence assessment: session_id={session_id} entity={entity}") diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py index c22c6870..79bdd28a 100644 --- a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py +++ b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py @@ -75,9 +75,9 @@ def main(): session_id = payload.get("session_id") assessments = payload.get("assessments", []) - if not session_id or not isinstance(assessments, list): + if not isinstance(session_id, str) or not session_id or not isinstance(assessments, list): log(f"Bad payload shape: session_id={session_id!r} assessments_type={type(assessments).__name__}") - print("Error: payload must include `session_id` and a list `assessments`.", file=sys.stderr) + print("Error: payload must include a string `session_id` and a list `assessments`.", file=sys.stderr) sys.exit(1) evolve_dir = get_evolve_dir().resolve() @@ -91,9 +91,14 @@ def main(): entity = assessment.get("entity") verdict = assessment.get("verdict") evidence = assessment.get("evidence", "") - if not entity or verdict not in _ALLOWED_VERDICTS: - log(f"Skipping invalid assessment: {assessment}") + if not isinstance(entity, str) or not entity: + log(f"Skipping assessment with non-string entity: {assessment!r}") continue + if verdict not in _ALLOWED_VERDICTS: + log(f"Skipping invalid assessment verdict: {assessment}") + continue + if not isinstance(evidence, str): + evidence = str(evidence) key = (session_id, entity) if key in existing_keys: log(f"Skipping duplicate influence assessment: session_id={session_id} entity={entity}") diff --git 
a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py index c22c6870..79bdd28a 100644 --- a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py +++ b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py @@ -75,9 +75,9 @@ def main(): session_id = payload.get("session_id") assessments = payload.get("assessments", []) - if not session_id or not isinstance(assessments, list): + if not isinstance(session_id, str) or not session_id or not isinstance(assessments, list): log(f"Bad payload shape: session_id={session_id!r} assessments_type={type(assessments).__name__}") - print("Error: payload must include `session_id` and a list `assessments`.", file=sys.stderr) + print("Error: payload must include a string `session_id` and a list `assessments`.", file=sys.stderr) sys.exit(1) evolve_dir = get_evolve_dir().resolve() @@ -91,9 +91,14 @@ def main(): entity = assessment.get("entity") verdict = assessment.get("verdict") evidence = assessment.get("evidence", "") - if not entity or verdict not in _ALLOWED_VERDICTS: - log(f"Skipping invalid assessment: {assessment}") + if not isinstance(entity, str) or not entity: + log(f"Skipping assessment with non-string entity: {assessment!r}") continue + if verdict not in _ALLOWED_VERDICTS: + log(f"Skipping invalid assessment verdict: {assessment}") + continue + if not isinstance(evidence, str): + evidence = str(evidence) key = (session_id, entity) if key in existing_keys: log(f"Skipping duplicate influence assessment: session_id={session_id} entity={entity}") diff --git a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py 
index c22c6870..79bdd28a 100644 --- a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py +++ b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py @@ -75,9 +75,9 @@ def main(): session_id = payload.get("session_id") assessments = payload.get("assessments", []) - if not session_id or not isinstance(assessments, list): + if not isinstance(session_id, str) or not session_id or not isinstance(assessments, list): log(f"Bad payload shape: session_id={session_id!r} assessments_type={type(assessments).__name__}") - print("Error: payload must include `session_id` and a list `assessments`.", file=sys.stderr) + print("Error: payload must include a string `session_id` and a list `assessments`.", file=sys.stderr) sys.exit(1) evolve_dir = get_evolve_dir().resolve() @@ -91,9 +91,14 @@ def main(): entity = assessment.get("entity") verdict = assessment.get("verdict") evidence = assessment.get("evidence", "") - if not entity or verdict not in _ALLOWED_VERDICTS: - log(f"Skipping invalid assessment: {assessment}") + if not isinstance(entity, str) or not entity: + log(f"Skipping assessment with non-string entity: {assessment!r}") continue + if verdict not in _ALLOWED_VERDICTS: + log(f"Skipping invalid assessment verdict: {assessment}") + continue + if not isinstance(evidence, str): + evidence = str(evidence) key = (session_id, entity) if key in existing_keys: log(f"Skipping duplicate influence assessment: session_id={session_id} entity={entity}") diff --git a/plugin-source/skills/evolve-lite/provenance/scripts/log_influence.py b/plugin-source/skills/evolve-lite/provenance/scripts/log_influence.py index c22c6870..79bdd28a 100644 --- a/plugin-source/skills/evolve-lite/provenance/scripts/log_influence.py +++ b/plugin-source/skills/evolve-lite/provenance/scripts/log_influence.py @@ -75,9 +75,9 @@ def main(): session_id = payload.get("session_id") assessments = payload.get("assessments", 
[]) - if not session_id or not isinstance(assessments, list): + if not isinstance(session_id, str) or not session_id or not isinstance(assessments, list): log(f"Bad payload shape: session_id={session_id!r} assessments_type={type(assessments).__name__}") - print("Error: payload must include `session_id` and a list `assessments`.", file=sys.stderr) + print("Error: payload must include a string `session_id` and a list `assessments`.", file=sys.stderr) sys.exit(1) evolve_dir = get_evolve_dir().resolve() @@ -91,9 +91,14 @@ def main(): entity = assessment.get("entity") verdict = assessment.get("verdict") evidence = assessment.get("evidence", "") - if not entity or verdict not in _ALLOWED_VERDICTS: - log(f"Skipping invalid assessment: {assessment}") + if not isinstance(entity, str) or not entity: + log(f"Skipping assessment with non-string entity: {assessment!r}") continue + if verdict not in _ALLOWED_VERDICTS: + log(f"Skipping invalid assessment verdict: {assessment}") + continue + if not isinstance(evidence, str): + evidence = str(evidence) key = (session_id, entity) if key in existing_keys: log(f"Skipping duplicate influence assessment: session_id={session_id} entity={entity}") From 90b3570642f2fac82a3651dac60ed2f67bfbf868 Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Thu, 7 May 2026 01:11:30 -0500 Subject: [PATCH 08/13] fix(recall): prefer explicit session_id over raw transcript_path stem Addresses CodeRabbit review finding: transcript_path priority silently shadows the explicit session_id for non-Claude platforms removeprefix is a no-op on stems that do not start with claude-transcript_, so on non-Claude platforms that pass both transcript_path and session_id the raw filename stem was winning and the explicit session_id was ignored. Only consume transcript_path when the stem actually carries the Claude prefix; otherwise fall through to input_data["session_id"] so Codex/Bob/Claw Code get the session id their hook actually provided. 
--- .../skills/evolve-lite-recall/scripts/retrieve_entities.py | 6 ++++-- .../skills/evolve-lite/recall/scripts/retrieve_entities.py | 6 ++++-- .../skills/evolve-lite/recall/scripts/retrieve_entities.py | 6 ++++-- .../skills/evolve-lite/recall/scripts/retrieve_entities.py | 6 ++++-- .../skills/evolve-lite/recall/scripts/retrieve_entities.py | 6 ++++-- 5 files changed, 20 insertions(+), 10 deletions(-) diff --git a/platform-integrations/bob/evolve-lite/skills/evolve-lite-recall/scripts/retrieve_entities.py b/platform-integrations/bob/evolve-lite/skills/evolve-lite-recall/scripts/retrieve_entities.py index 2d54e439..9daa7d38 100644 --- a/platform-integrations/bob/evolve-lite/skills/evolve-lite-recall/scripts/retrieve_entities.py +++ b/platform-integrations/bob/evolve-lite/skills/evolve-lite-recall/scripts/retrieve_entities.py @@ -151,8 +151,10 @@ def main(): transcript_path = "" session_id = None if transcript_path: - session_id = Path(transcript_path).stem.removeprefix("claude-transcript_") - elif isinstance(input_data.get("session_id"), str): + stem = Path(transcript_path).stem + if stem.startswith("claude-transcript_"): + session_id = stem.removeprefix("claude-transcript_") + if not session_id and isinstance(input_data, dict) and isinstance(input_data.get("session_id"), str): session_id = input_data["session_id"] entity_ids = sorted({entity["_id"] for entity in entities if entity.get("_id")}) if session_id and entity_ids: diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py index 2d54e439..9daa7d38 100644 --- a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py +++ b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py @@ -151,8 +151,10 @@ def main(): transcript_path = "" session_id = None if 
transcript_path: - session_id = Path(transcript_path).stem.removeprefix("claude-transcript_") - elif isinstance(input_data.get("session_id"), str): + stem = Path(transcript_path).stem + if stem.startswith("claude-transcript_"): + session_id = stem.removeprefix("claude-transcript_") + if not session_id and isinstance(input_data, dict) and isinstance(input_data.get("session_id"), str): session_id = input_data["session_id"] entity_ids = sorted({entity["_id"] for entity in entities if entity.get("_id")}) if session_id and entity_ids: diff --git a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py index 2d54e439..9daa7d38 100644 --- a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py +++ b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py @@ -151,8 +151,10 @@ def main(): transcript_path = "" session_id = None if transcript_path: - session_id = Path(transcript_path).stem.removeprefix("claude-transcript_") - elif isinstance(input_data.get("session_id"), str): + stem = Path(transcript_path).stem + if stem.startswith("claude-transcript_"): + session_id = stem.removeprefix("claude-transcript_") + if not session_id and isinstance(input_data, dict) and isinstance(input_data.get("session_id"), str): session_id = input_data["session_id"] entity_ids = sorted({entity["_id"] for entity in entities if entity.get("_id")}) if session_id and entity_ids: diff --git a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py index 2d54e439..9daa7d38 100644 --- a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py +++ 
b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py @@ -151,8 +151,10 @@ def main(): transcript_path = "" session_id = None if transcript_path: - session_id = Path(transcript_path).stem.removeprefix("claude-transcript_") - elif isinstance(input_data.get("session_id"), str): + stem = Path(transcript_path).stem + if stem.startswith("claude-transcript_"): + session_id = stem.removeprefix("claude-transcript_") + if not session_id and isinstance(input_data, dict) and isinstance(input_data.get("session_id"), str): session_id = input_data["session_id"] entity_ids = sorted({entity["_id"] for entity in entities if entity.get("_id")}) if session_id and entity_ids: diff --git a/plugin-source/skills/evolve-lite/recall/scripts/retrieve_entities.py b/plugin-source/skills/evolve-lite/recall/scripts/retrieve_entities.py index 2d54e439..9daa7d38 100644 --- a/plugin-source/skills/evolve-lite/recall/scripts/retrieve_entities.py +++ b/plugin-source/skills/evolve-lite/recall/scripts/retrieve_entities.py @@ -151,8 +151,10 @@ def main(): transcript_path = "" session_id = None if transcript_path: - session_id = Path(transcript_path).stem.removeprefix("claude-transcript_") - elif isinstance(input_data.get("session_id"), str): + stem = Path(transcript_path).stem + if stem.startswith("claude-transcript_"): + session_id = stem.removeprefix("claude-transcript_") + if not session_id and isinstance(input_data, dict) and isinstance(input_data.get("session_id"), str): session_id = input_data["session_id"] entity_ids = sorted({entity["_id"] for entity in entities if entity.get("_id")}) if session_id and entity_ids: From 9b549282c8ff861c83508684f6aca8670ea60ec6 Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Thu, 7 May 2026 01:15:05 -0500 Subject: [PATCH 09/13] fix(save-trajectory): drop vendor-specific env-var reference from session_id guidance MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit 
Addresses CodeRabbit review finding: CLAUDE_SESSION_ID is a Claude-specific env var referenced in a Codex SKILL.md CLAUDE_SESSION_ID does not exist on Codex, Claw Code, or Bob, so the rendered SKILL.md on those platforms invited agents to chase an identifier that would never resolve. Replace the concrete vendor env var with generic guidance — "whatever the harness exposes" plus the existing fallback behavior — so every platform gets the same instruction and no platform-specific symbol leaks into the others' rendered output. --- .../bob/evolve-lite/skills/evolve-lite-save-trajectory/SKILL.md | 2 +- .../evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md | 2 +- .../evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md | 2 +- .../evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md | 2 +- plugin-source/skills/evolve-lite/save-trajectory/SKILL.md.j2 | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/platform-integrations/bob/evolve-lite/skills/evolve-lite-save-trajectory/SKILL.md b/platform-integrations/bob/evolve-lite/skills/evolve-lite-save-trajectory/SKILL.md index 6287e65c..509c0734 100644 --- a/platform-integrations/bob/evolve-lite/skills/evolve-lite-save-trajectory/SKILL.md +++ b/platform-integrations/bob/evolve-lite/skills/evolve-lite-save-trajectory/SKILL.md @@ -106,7 +106,7 @@ Wrap the messages array in a trajectory envelope: - **model**: Use the exact model ID from the current session's environment context (e.g., the value after "You are powered by the model named …"). Do not hardcode a default — always read it from the session. - **timestamp**: Current ISO 8601 timestamp -- **session_id**: The current session identifier. Read it from the session's environment context (e.g., `CLAUDE_SESSION_ID`, the `session_id` passed into the skill, or the session identifier exposed by the harness). Include it verbatim so offline provenance can match this trajectory to `recall` audit events for the same session. 
If no session id is available, omit the field. +- **session_id**: The current session identifier. Read it from whatever the harness exposes — the `session_id` passed into the skill, the session id surfaced in the session context, or a runtime-provided environment variable. Include it verbatim so offline provenance can match this trajectory to `recall` audit events for the same session. Omit the field only if no session id is truly available in this environment. ### Step 5: Save via Helper Script diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md index aa0b62fa..f657ec27 100644 --- a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md +++ b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md @@ -107,7 +107,7 @@ Wrap the messages array in a trajectory envelope: - **model**: Use the exact model ID from the current session's environment context (e.g., the value after "You are powered by the model named …"). Do not hardcode a default — always read it from the session. - **timestamp**: Current ISO 8601 timestamp -- **session_id**: The current session identifier. Read it from the session's environment context (e.g., `CLAUDE_SESSION_ID`, the `session_id` passed into the skill, or the session identifier exposed by the harness). Include it verbatim so offline provenance can match this trajectory to `recall` audit events for the same session. If no session id is available, omit the field. +- **session_id**: The current session identifier. Read it from whatever the harness exposes — the `session_id` passed into the skill, the session id surfaced in the session context, or a runtime-provided environment variable. Include it verbatim so offline provenance can match this trajectory to `recall` audit events for the same session. 
Omit the field only if no session id is truly available in this environment. ### Step 5: Save via Helper Script diff --git a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md index 1335a3e2..beb924e2 100644 --- a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md +++ b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md @@ -106,7 +106,7 @@ Wrap the messages array in a trajectory envelope: - **model**: Use the exact model ID from the current session's environment context (e.g., the value after "You are powered by the model named …"). Do not hardcode a default — always read it from the session. - **timestamp**: Current ISO 8601 timestamp -- **session_id**: The current session identifier. Read it from the session's environment context (e.g., `CLAUDE_SESSION_ID`, the `session_id` passed into the skill, or the session identifier exposed by the harness). Include it verbatim so offline provenance can match this trajectory to `recall` audit events for the same session. If no session id is available, omit the field. +- **session_id**: The current session identifier. Read it from whatever the harness exposes — the `session_id` passed into the skill, the session id surfaced in the session context, or a runtime-provided environment variable. Include it verbatim so offline provenance can match this trajectory to `recall` audit events for the same session. Omit the field only if no session id is truly available in this environment. 
### Step 5: Save via Helper Script diff --git a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md index 816a64e2..58883ec2 100644 --- a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md +++ b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md @@ -106,7 +106,7 @@ Wrap the messages array in a trajectory envelope: - **model**: Use the exact model ID from the current session's environment context (e.g., the value after "You are powered by the model named …"). Do not hardcode a default — always read it from the session. - **timestamp**: Current ISO 8601 timestamp -- **session_id**: The current session identifier. Read it from the session's environment context (e.g., `CLAUDE_SESSION_ID`, the `session_id` passed into the skill, or the session identifier exposed by the harness). Include it verbatim so offline provenance can match this trajectory to `recall` audit events for the same session. If no session id is available, omit the field. +- **session_id**: The current session identifier. Read it from whatever the harness exposes — the `session_id` passed into the skill, the session id surfaced in the session context, or a runtime-provided environment variable. Include it verbatim so offline provenance can match this trajectory to `recall` audit events for the same session. Omit the field only if no session id is truly available in this environment. 
### Step 5: Save via Helper Script diff --git a/plugin-source/skills/evolve-lite/save-trajectory/SKILL.md.j2 b/plugin-source/skills/evolve-lite/save-trajectory/SKILL.md.j2 index 4ce8c34c..d9380c17 100644 --- a/plugin-source/skills/evolve-lite/save-trajectory/SKILL.md.j2 +++ b/plugin-source/skills/evolve-lite/save-trajectory/SKILL.md.j2 @@ -109,7 +109,7 @@ Wrap the messages array in a trajectory envelope: - **model**: Use the exact model ID from the current session's environment context (e.g., the value after "You are powered by the model named …"). Do not hardcode a default — always read it from the session. - **timestamp**: Current ISO 8601 timestamp -- **session_id**: The current session identifier. Read it from the session's environment context (e.g., `CLAUDE_SESSION_ID`, the `session_id` passed into the skill, or the session identifier exposed by the harness). Include it verbatim so offline provenance can match this trajectory to `recall` audit events for the same session. If no session id is available, omit the field. +- **session_id**: The current session identifier. Read it from whatever the harness exposes — the `session_id` passed into the skill, the session id surfaced in the session context, or a runtime-provided environment variable. Include it verbatim so offline provenance can match this trajectory to `recall` audit events for the same session. Omit the field only if no session id is truly available in this environment. 
### Step 5: Save via Helper Script From 3314c615a69c417115396cb3cf3629e7112889d7 Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Thu, 7 May 2026 01:15:07 -0500 Subject: [PATCH 10/13] test(codex-e2e): scope recall check to task session and relax verdict assertion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses CodeRabbit review findings: Use task-scoped recalled IDs in the learned-ID check; The hard followed requirement is flaky for stochastic e2e model behavior The learned-vs-recalled intersection was computed over the aggregated recalled_ids across every recall event in the log, which could let the assertion pass even when the task session itself never actually recalled a learned id. Intersect task_recalled_ids (just the final recall event) with learned_ids instead so we verify the specific task session recalled what it learned. Separately, "followed" is only one of three valid influence verdicts and the real model can legitimately pick "contradicted" or "not_applicable" on any given run. Relax the hard-followed assertion to "any verdict in the allowed set" — the test now guards the shape of the influence audit rather than pinning a stochastic outcome. --- tests/e2e/test_codex_sandbox_learn_recall.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/tests/e2e/test_codex_sandbox_learn_recall.py b/tests/e2e/test_codex_sandbox_learn_recall.py index 9bd9af69..7bf26615 100644 --- a/tests/e2e/test_codex_sandbox_learn_recall.py +++ b/tests/e2e/test_codex_sandbox_learn_recall.py @@ -244,10 +244,9 @@ def test_codex_learn_then_recall_flow(codex_sandbox_ready, codex_workspace): assert recall_events, f"no recall audit event recorded. 
all events: {events}" task_recall_event = recall_events[-1] task_session_id = task_recall_event["session_id"] - recalled_ids = {entity_id for event in recall_events for entity_id in event.get("entities", [])} task_recalled_ids = set(task_recall_event.get("entities", [])) learned_ids = {str(path.relative_to(entities_dir).with_suffix("")) for path in entity_files} - assert recalled_ids & learned_ids, f"recalled ids {recalled_ids} did not include learned ids {learned_ids}" + assert task_recalled_ids & learned_ids, f"task recall ids {task_recalled_ids} did not include learned ids {learned_ids}" log.info("codex session 3: running offline provenance analysis...") t2 = time.time() @@ -271,9 +270,10 @@ def test_codex_learn_then_recall_flow(codex_sandbox_ready, codex_workspace): assert influence_events, f"no influence audit event recorded. all events: {events}" influenced_ids = {event.get("entity") for event in influence_events} assert influenced_ids & task_recalled_ids, f"influence events {influence_events} did not assess task recall ids {task_recalled_ids}" - assert any(event.get("verdict") == "followed" for event in influence_events), ( - f"no recalled guideline was followed. influence events: {influence_events}" + allowed_verdicts = {"followed", "contradicted", "not_applicable"} + assert any(event.get("verdict") in allowed_verdicts for event in influence_events), ( + f"no recalled guideline was assessed with an allowed verdict. 
influence events: {influence_events}" ) for event in influence_events: - assert event.get("verdict") in {"followed", "contradicted", "not_applicable"} + assert event.get("verdict") in allowed_verdicts assert event.get("evidence"), f"influence event missing evidence: {event}" From 16543df9483b657573a95196602d55212de518a7 Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Thu, 7 May 2026 01:15:38 -0500 Subject: [PATCH 11/13] test(codex-sharing): tolerate missing config after rollback in audit-fail test Addresses CodeRabbit review finding: Guard against FileNotFoundError when rollback deletes the newly-created config Once the rollback path removes the subscription entry and rewrites the config, a future implementation could reasonably end up with the config file absent (e.g., after removing the only repo). Guard the read_text() call with an exists() check so the assertion continues to verify that 'name: alice' is not present regardless of whether the file is empty or gone. --- tests/platform_integrations/test_codex_sharing.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/tests/platform_integrations/test_codex_sharing.py b/tests/platform_integrations/test_codex_sharing.py index 4e3a303e..dcb35a20 100644 --- a/tests/platform_integrations/test_codex_sharing.py +++ b/tests/platform_integrations/test_codex_sharing.py @@ -382,7 +382,8 @@ def test_subscribe_rolls_back_when_audit_write_fails(self, temp_project_dir, loc assert result.returncode != 0 assert "failed to record subscription" in result.stderr assert not (evolve_dir / "entities" / "subscribed" / "alice").exists() - config_text = (temp_project_dir / "evolve.config.yaml").read_text() + config_path = temp_project_dir / "evolve.config.yaml" + config_text = config_path.read_text() if config_path.exists() else "" assert "name: alice" not in config_text def test_subscribe_rejects_path_traversal_in_name(self, temp_project_dir, local_repo): From da73c8ecf79f7bc570e82fae227a7ea16c1258ab Mon Sep 17 
00:00:00 2001 From: Vinod Muthusamy Date: Thu, 7 May 2026 01:15:48 -0500 Subject: [PATCH 12/13] test(provenance): pin utf-8 when reading audit.log and drop fragile order assertion MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Addresses CodeRabbit review findings: read_audit missing encoding=utf-8 may fail in non-UTF-8 locales; Entity list order assertion is fragile — rglob/os.walk order is not guaranteed read_text() falls back to the platform default encoding, which is not utf-8 on every CI host. Pin the decoder to utf-8 for both the recall audit parser in test_retrieve and the read_audit helper in test_log_influence so non-ASCII audit entries decode reliably. The recall test also asserted strict list equality on the entities field, but retrieve_entities orders them via rglob which is not guaranteed across platforms. Switch to a set comparison so we assert on membership rather than traversal order. --- tests/platform_integrations/test_log_influence.py | 2 +- tests/platform_integrations/test_retrieve.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/platform_integrations/test_log_influence.py b/tests/platform_integrations/test_log_influence.py index 91b1f625..9ba0fd66 100644 --- a/tests/platform_integrations/test_log_influence.py +++ b/tests/platform_integrations/test_log_influence.py @@ -37,7 +37,7 @@ def read_audit(evolve_dir): path = evolve_dir / "audit.log" if not path.is_file(): return [] - return [json.loads(line) for line in path.read_text().splitlines() if line.strip()] + return [json.loads(line) for line in path.read_text(encoding="utf-8").splitlines() if line.strip()] class TestLogInfluence: diff --git a/tests/platform_integrations/test_retrieve.py b/tests/platform_integrations/test_retrieve.py index d59418b7..71b6a328 100644 --- a/tests/platform_integrations/test_retrieve.py +++ b/tests/platform_integrations/test_retrieve.py @@ -142,14 +142,14 @@ def 
test_writes_recall_audit_event_with_qualified_entity_ids(self, evolve_dir, r ) assert result.returncode == 0 - events = [json.loads(line) for line in (evolve_dir / "audit.log").read_text().splitlines() if line.strip()] + events = [json.loads(line) for line in (evolve_dir / "audit.log").read_text(encoding="utf-8").splitlines() if line.strip()] assert len(events) == 1 assert events[0]["event"] == "recall" assert events[0]["session_id"] == "session-123" - assert events[0]["entities"] == [ + assert set(events[0]["entities"]) == { "guideline/guideline", "subscribed/alice/guideline/alice-guideline", - ] + } @pytest.mark.parametrize(("platform_name", "retrieve_script", "expected_header"), SCRIPT_VARIANTS) def test_writes_recall_audit_event_with_session_id_fallback(self, evolve_dir, retrieve_script, expected_header, platform_name): From 2052a45e8d5b59072bff06289067322ae7c78727 Mon Sep 17 00:00:00 2001 From: Vinod Muthusamy Date: Thu, 7 May 2026 01:15:58 -0500 Subject: [PATCH 13/13] test(smoke): restrict codex trajectory gate to actual trajectory files Addresses CodeRabbit review finding: Count only trajectory files in the codex learn gate glob("*") on .evolve/trajectories/ counts any directory or stray artifact (e.g., a lock dir from a previous run) the harness happens to leave behind. Restrict the count to files whose name matches trajectory_*.json so the codex branch only passes when the learn flow actually produced a saved trajectory. 
--- tests/smoke_skills.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tests/smoke_skills.py b/tests/smoke_skills.py index 2fcd6447..e9fca881 100644 --- a/tests/smoke_skills.py +++ b/tests/smoke_skills.py @@ -1234,7 +1234,11 @@ def invoke(prompt: str, label: str) -> tuple[int, str]: dt = time.time() - t0 post_learn = entity_count(evolve_dir) - trajectory_count = sum(1 for _ in (evolve_dir / "trajectories").glob("*")) if (evolve_dir / "trajectories").is_dir() else 0 + trajectory_count = ( + sum(1 for path in (evolve_dir / "trajectories").glob("trajectory_*.json") if path.is_file()) + if (evolve_dir / "trajectories").is_dir() + else 0 + ) ok = (rc == 0) and (post_learn > baseline_entities) if platform == "codex": ok = ok and trajectory_count > 0