diff --git a/platform-integrations/bob/evolve-lite/commands/evolve-lite-provenance.md b/platform-integrations/bob/evolve-lite/commands/evolve-lite-provenance.md new file mode 100644 index 00000000..2d9bd0fe --- /dev/null +++ b/platform-integrations/bob/evolve-lite/commands/evolve-lite-provenance.md @@ -0,0 +1,4 @@ +--- +description: Analyze saved trajectories and recall audit events offline to record whether recalled guidelines influenced completed sessions. +--- +Use the `evolve-lite-provenance` skill on the current conversation. Follow the skill's instructions exactly. diff --git a/platform-integrations/bob/evolve-lite/lib/audit.py b/platform-integrations/bob/evolve-lite/lib/audit.py index fd5c535a..fa43846b 100644 --- a/platform-integrations/bob/evolve-lite/lib/audit.py +++ b/platform-integrations/bob/evolve-lite/lib/audit.py @@ -5,14 +5,17 @@ import pathlib -def append(project_root=".", **fields): +def append(project_root=".", evolve_dir=None, **fields): """Append a JSON audit entry to .evolve/audit.log. Args: project_root: Root directory that contains .evolve/. + evolve_dir: Explicit evolve data directory. When set, writes directly + to ``/audit.log`` instead of deriving it from + ``project_root``. **fields: Arbitrary key-value fields to include in the log entry. 
""" - path = pathlib.Path(project_root) / ".evolve" / "audit.log" + path = pathlib.Path(evolve_dir) / "audit.log" if evolve_dir is not None else pathlib.Path(project_root) / ".evolve" / "audit.log" path.parent.mkdir(parents=True, exist_ok=True) entry = {**fields, "ts": datetime.datetime.now(datetime.UTC).isoformat().replace("+00:00", "Z")} with path.open("a", encoding="utf-8") as f: diff --git a/platform-integrations/bob/evolve-lite/skills/evolve-lite-learn/SKILL.md b/platform-integrations/bob/evolve-lite/skills/evolve-lite-learn/SKILL.md index b2f82264..cab3f129 100644 --- a/platform-integrations/bob/evolve-lite/skills/evolve-lite-learn/SKILL.md +++ b/platform-integrations/bob/evolve-lite/skills/evolve-lite-learn/SKILL.md @@ -33,9 +33,15 @@ Unless that artifact happens to be: ## Workflow +### Step 0: Save and Load the Conversation + +First, use the evolve-lite:save-trajectory skill to save the current conversation to `.evolve/trajectories/`. Capture the exact path from its output as `saved_trajectory_path`. You will attach this exact path to each entity's `trajectory` field in Step 6. + +After saving, read `saved_trajectory_path` with the Read tool and analyze that saved trajectory rather than relying only on live context. If the trajectory cannot be saved or read, output zero entities and exit. Do not invent a trajectory path. + ### Step 1: Analyze the Conversation -Identify from your current conversation: +Identify from the saved trajectory loaded in Step 0: - **Task/Request**: What was the user asking for? - **Steps Taken**: What reasoning, actions, and observations occurred? @@ -76,6 +82,11 @@ Prefer one of these artifact forms: - a small script, saved to a stable path in the workspace or plugin, such as `scripts/`, `tools/`, or another obvious helper location. 
- a documented local workflow if code is not appropriate +When turning an ad hoc command or script into a reusable artifact, remove +incidental one-off inputs such as literal file names, IDs, answer values, or +temporary paths. Keep the reusable procedure that was actually exercised in the +session, and do not add capabilities that were not validated by the work. + If you create an artifact, record: - its path - what it does diff --git a/platform-integrations/bob/evolve-lite/skills/evolve-lite-provenance/SKILL.md b/platform-integrations/bob/evolve-lite/skills/evolve-lite-provenance/SKILL.md new file mode 100644 index 00000000..25ee891a --- /dev/null +++ b/platform-integrations/bob/evolve-lite/skills/evolve-lite-provenance/SKILL.md @@ -0,0 +1,64 @@ +--- +name: evolve-lite:provenance +description: Analyze saved trajectories and recall audit events offline to record whether recalled guidelines influenced completed sessions. +--- + +# Provenance Analyzer + +## Overview + +This skill runs after one or more sessions have completed. It reads saved trajectories from `.evolve/trajectories/`, matches them to `recall` events in `.evolve/audit.log`, and records post-hoc `influence` events for recalled guidelines. + +Use this skill when you want to compute usage provenance without coupling the work to the live learn step. + +## Workflow + +### Step 1: Load Recall Events + +Read `.evolve/audit.log` as JSONL. Find entries where `event == "recall"` and `entities` is a non-empty list. + +Skip any recall event that already has `influence` entries for the same `session_id` and entity ids. Do not write duplicate influence records. + +### Step 2: Locate Saved Trajectories + +List `.evolve/trajectories/` and match each recall event to a trajectory by `session_id`. + +Matching strategy (in order): +1. `claude-transcript_.jsonl` - the stop-hook transcript dump; the session id is in the filename. +2. 
`trajectory__.json` - written by the evolve-lite:save-trajectory skill when a session id is available. Match on the `` slice of the filename. +3. `trajectory_.json` - open the file and match its top-level `session_id` field against the recall event. Only fall back to this step when the filename alone does not identify the session. + +If none of the above yields a confident match for a recall event, skip it. Do not guess. + +### Step 3: Read Recalled Entities + +For each recalled entity id, open `.evolve/entities/.md`. The id is a path relative to `.evolve/entities/` without the `.md` suffix, such as `guideline/foo` or `subscribed/alice/guideline/foo`. + +Read the entity content and trigger. Skip ids whose files are missing. + +### Step 4: Assess Influence + +Compare each recalled entity with the matched trajectory. Pick exactly one verdict: + +- `followed` - the agent's actual actions are consistent with the guideline. +- `contradicted` - the guideline applied, but the agent did the opposite or repeated the avoidable dead end. +- `not_applicable` - the guideline was recalled but did not apply to this session. + +Keep `evidence` to one short sentence citing a concrete action, tool call, or absence in the trajectory. + +### Step 5: Write Influence Events + +Pipe one JSON payload per assessed session to the helper: + +```bash +echo '{ + "session_id": "", + "assessments": [ + {"entity": "guideline/", "verdict": "followed", "evidence": "Agent used the saved parser before trying shell fallbacks."} + ] +}' | python3 .bob/skills/evolve-lite-provenance/scripts/log_influence.py +``` + +The `entity` value must match exactly what appeared in the recall event, including any `subscribed//` prefix. + +It is valid to emit an empty `assessments` list when recall events exist but no recalled guideline can be assessed. 
diff --git a/platform-integrations/bob/evolve-lite/skills/evolve-lite-provenance/scripts/log_influence.py b/platform-integrations/bob/evolve-lite/skills/evolve-lite-provenance/scripts/log_influence.py new file mode 100644 index 00000000..79bdd28a --- /dev/null +++ b/platform-integrations/bob/evolve-lite/skills/evolve-lite-provenance/scripts/log_influence.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 +"""Append post-hoc influence assessments to .evolve/audit.log. + +Reads JSON from stdin of the form: + { + "session_id": "", + "assessments": [ + {"entity": "", "verdict": "followed|contradicted|not_applicable", + "evidence": ""}, + ... + ] + } +""" + +import json +import sys +from pathlib import Path + +# Walk up from the script location to find the installed plugin lib directory. +# claude/claw-code/codex/bob all ship a sibling lib/ next to skills/; bob's +# installer copies it to .bob/evolve-lib/, hence both names are checked. +_script = Path(__file__).resolve() +_lib = None +for _ancestor in _script.parents: + for _candidate in (_ancestor / "lib", _ancestor / "evolve-lib"): + if (_candidate / "entity_io.py").is_file(): + _lib = _candidate + break + if _lib is not None: + break +if _lib is None: + raise ImportError(f"Cannot find plugin lib directory above {_script}") +sys.path.insert(0, str(_lib)) +from entity_io import get_evolve_dir, log as _log # noqa: E402 +import audit # noqa: E402 + + +_ALLOWED_VERDICTS = {"followed", "contradicted", "not_applicable"} + + +def log(message): + _log("influence", message) + + +def existing_influence_keys(evolve_dir): + audit_log = Path(evolve_dir) / "audit.log" + if not audit_log.is_file(): + return set() + + keys = set() + for line in audit_log.read_text(encoding="utf-8").splitlines(): + if not line.strip(): + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + continue + if event.get("event") == "influence" and event.get("session_id") and event.get("entity"): + keys.add((event["session_id"], 
event["entity"])) + return keys + + +def main(): + try: + payload = json.load(sys.stdin) + except json.JSONDecodeError as exc: + log(f"Invalid JSON input: {exc}") + print(f"Error: invalid JSON input - {exc}", file=sys.stderr) + sys.exit(1) + + if not isinstance(payload, dict): + log(f"Bad payload type: {type(payload).__name__}") + print("Error: payload must be a JSON object.", file=sys.stderr) + sys.exit(1) + + session_id = payload.get("session_id") + assessments = payload.get("assessments", []) + if not isinstance(session_id, str) or not session_id or not isinstance(assessments, list): + log(f"Bad payload shape: session_id={session_id!r} assessments_type={type(assessments).__name__}") + print("Error: payload must include a string `session_id` and a list `assessments`.", file=sys.stderr) + sys.exit(1) + + evolve_dir = get_evolve_dir().resolve() + existing_keys = existing_influence_keys(evolve_dir) + + written = 0 + for assessment in assessments: + if not isinstance(assessment, dict): + log(f"Skipping non-dict assessment item: {assessment!r}") + continue + entity = assessment.get("entity") + verdict = assessment.get("verdict") + evidence = assessment.get("evidence", "") + if not isinstance(entity, str) or not entity: + log(f"Skipping assessment with non-string entity: {assessment!r}") + continue + if verdict not in _ALLOWED_VERDICTS: + log(f"Skipping invalid assessment verdict: {assessment}") + continue + if not isinstance(evidence, str): + evidence = str(evidence) + key = (session_id, entity) + if key in existing_keys: + log(f"Skipping duplicate influence assessment: session_id={session_id} entity={entity}") + continue + audit.append( + evolve_dir=str(evolve_dir), + event="influence", + session_id=session_id, + entity=entity, + verdict=verdict, + evidence=evidence, + ) + existing_keys.add(key) + written += 1 + + log(f"Wrote {written} influence record(s) for session {session_id}") + print(f"Recorded {written} influence assessment(s).") + + +if __name__ == 
"__main__": + main() diff --git a/platform-integrations/bob/evolve-lite/skills/evolve-lite-recall/scripts/retrieve_entities.py b/platform-integrations/bob/evolve-lite/skills/evolve-lite-recall/scripts/retrieve_entities.py index ade892fe..9daa7d38 100644 --- a/platform-integrations/bob/evolve-lite/skills/evolve-lite-recall/scripts/retrieve_entities.py +++ b/platform-integrations/bob/evolve-lite/skills/evolve-lite-recall/scripts/retrieve_entities.py @@ -21,7 +21,8 @@ if _lib is None: raise ImportError(f"Cannot find plugin lib directory above {_script}") sys.path.insert(0, str(_lib)) -from entity_io import find_entities_dir, markdown_to_entity, log as _log # noqa: E402 +from entity_io import find_entities_dir, get_evolve_dir, markdown_to_entity, log as _log # noqa: E402 +import audit # noqa: E402 def log(message): @@ -81,6 +82,7 @@ def load_entities_with_source(entities_dir): continue entity.pop("_source", None) + entity["_id"] = str(md.relative_to(entities_dir).with_suffix("")) parts = md.relative_to(entities_dir).parts if parts and parts[0] == "subscribed" and len(parts) > 1: entity["_source"] = parts[1] @@ -139,6 +141,33 @@ def main(): print(output) log(f"Output {len(output)} chars to stdout") + # Audit which entity ids were served to this session. Logging is + # intentionally best-effort so recall never fails because provenance + # recording could not append to audit.log. 
+ try: + if isinstance(input_data, dict): + transcript_path = input_data.get("transcript_path", "") + else: + transcript_path = "" + session_id = None + if transcript_path: + stem = Path(transcript_path).stem + if stem.startswith("claude-transcript_"): + session_id = stem.removeprefix("claude-transcript_") + if not session_id and isinstance(input_data, dict) and isinstance(input_data.get("session_id"), str): + session_id = input_data["session_id"] + entity_ids = sorted({entity["_id"] for entity in entities if entity.get("_id")}) + if session_id and entity_ids: + audit.append( + evolve_dir=str(get_evolve_dir().resolve()), + event="recall", + session_id=session_id, + entities=entity_ids, + ) + log(f"Audit: recall session_id={session_id} entities={len(entity_ids)}") + except Exception as exc: + log(f"Audit append failed (non-fatal): {exc}") + if __name__ == "__main__": main() diff --git a/platform-integrations/bob/evolve-lite/skills/evolve-lite-save-trajectory/SKILL.md b/platform-integrations/bob/evolve-lite/skills/evolve-lite-save-trajectory/SKILL.md index 25b2bcac..509c0734 100644 --- a/platform-integrations/bob/evolve-lite/skills/evolve-lite-save-trajectory/SKILL.md +++ b/platform-integrations/bob/evolve-lite/skills/evolve-lite-save-trajectory/SKILL.md @@ -99,12 +99,14 @@ Wrap the messages array in a trajectory envelope: { "model": "", "timestamp": "2025-01-15T10:30:00Z", + "session_id": "", "messages": [...] } ``` - **model**: Use the exact model ID from the current session's environment context (e.g., the value after "You are powered by the model named …"). Do not hardcode a default — always read it from the session. - **timestamp**: Current ISO 8601 timestamp +- **session_id**: The current session identifier. Read it from whatever the harness exposes — the `session_id` passed into the skill, the session id surfaced in the session context, or a runtime-provided environment variable. 
Include it verbatim so offline provenance can match this trajectory to `recall` audit events for the same session. Omit the field only if no session id is truly available in this environment. ### Step 5: Save via Helper Script diff --git a/platform-integrations/bob/evolve-lite/skills/evolve-lite-save-trajectory/scripts/save_trajectory.py b/platform-integrations/bob/evolve-lite/skills/evolve-lite-save-trajectory/scripts/save_trajectory.py index f34571eb..a3ee4ac2 100755 --- a/platform-integrations/bob/evolve-lite/skills/evolve-lite-save-trajectory/scripts/save_trajectory.py +++ b/platform-integrations/bob/evolve-lite/skills/evolve-lite-save-trajectory/scripts/save_trajectory.py @@ -9,6 +9,7 @@ import getpass import json import os +import re import sys import tempfile from pathlib import Path @@ -65,15 +66,29 @@ def get_trajectories_dir(): return base.resolve() -def open_trajectory_file(trajectories_dir): +_SAFE_SESSION_ID = re.compile(r"[^A-Za-z0-9._-]") + + +def _sanitize_session_id(session_id): + """Return a filesystem-safe slice of ``session_id`` (empty if unusable).""" + if not isinstance(session_id, str): + return "" + cleaned = _SAFE_SESSION_ID.sub("-", session_id.strip()) + return cleaned[:64] + + +def open_trajectory_file(trajectories_dir, session_id=None): """Atomically claim a timestamped trajectory file. Returns a ``(Path, fd)`` tuple. Uses ``O_CREAT | O_EXCL`` so two saves racing within the same second pick distinct filenames instead of one - overwriting the other. + overwriting the other. When ``session_id`` is provided, it is embedded + in the filename so offline provenance can match this trajectory to + ``recall`` audit events for the same session without content inspection. 
""" now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") - base_name = f"trajectory_{now}" + sid = _sanitize_session_id(session_id) + base_name = f"trajectory_{now}_{sid}" if sid else f"trajectory_{now}" for suffix in range(0, 1000): name = f"{base_name}.json" if suffix == 0 else f"{base_name}_{suffix}.json" @@ -121,9 +136,11 @@ def main(): log(f"Trajectory has {len(messages)} messages") - # Atomically claim a unique output path (handles same-second races) + # Atomically claim a unique output path (handles same-second races). + # Embed session_id in the filename when present so offline provenance + # can match recall events to trajectories deterministically. trajectories_dir = get_trajectories_dir() - output_path, fd = open_trajectory_file(trajectories_dir) + output_path, fd = open_trajectory_file(trajectories_dir, trajectory.get("session_id")) # Write formatted JSON via the already-opened owner-only fd try: diff --git a/platform-integrations/bob/evolve-lite/skills/evolve-lite-subscribe/scripts/subscribe.py b/platform-integrations/bob/evolve-lite/skills/evolve-lite-subscribe/scripts/subscribe.py index ef6b0cd0..3f50b3a6 100755 --- a/platform-integrations/bob/evolve-lite/skills/evolve-lite-subscribe/scripts/subscribe.py +++ b/platform-integrations/bob/evolve-lite/skills/evolve-lite-subscribe/scripts/subscribe.py @@ -132,12 +132,21 @@ def main(): remote=args.remote, ) except Exception as exc: - # Audit logging is best-effort: a failed append shouldn't roll back - # an otherwise successful subscribe (the repo is cloned, the config - # has the entry). Warn loudly so the user can fix the audit log - # path without losing the subscription. Originally rolled back on - # main's PR #245 (#244 e2e fix). 
- print(f"Warning: failed to append audit entry for subscribe: {exc}", file=sys.stderr) + repos.pop() + set_repos(cfg, repos) + try: + save_config(cfg, project_root) + except Exception as save_exc: + print( + f"Warning: rollback save_config failed under {project_root!r}: {save_exc}. " + f"The clone was removed but evolve.config.yaml may still list '{args.name}' - " + f"please inspect the file and remove the entry manually if present.", + file=sys.stderr, + ) + if dest.exists(): + shutil.rmtree(dest, ignore_errors=True) + print(f"Error: failed to record subscription in audit log: {exc}", file=sys.stderr) + sys.exit(1) print(f"Subscribed to '{args.name}' (scope={args.scope}) from {args.remote}") diff --git a/platform-integrations/bob/evolve-lite/skills/evolve-lite-sync/scripts/sync.py b/platform-integrations/bob/evolve-lite/skills/evolve-lite-sync/scripts/sync.py index 33c34716..4fd0f624 100755 --- a/platform-integrations/bob/evolve-lite/skills/evolve-lite-sync/scripts/sync.py +++ b/platform-integrations/bob/evolve-lite/skills/evolve-lite-sync/scripts/sync.py @@ -162,7 +162,7 @@ def main(): raw_name = rejection["raw_name"] reason = rejection["reason"] label = repr(raw_name) if raw_name else "" - summaries.append(f"{label} (skipped - {reason})") + print(f"{label} (skipped - {reason})", file=sys.stderr) for repo in repos: name = repo["name"] @@ -174,7 +174,7 @@ def main(): repo_path = (evolve_dir / "entities" / "subscribed" / name).resolve() if repo_path == subscribed_base or not repo_path.is_relative_to(subscribed_base): - summaries.append(f"{name!r} (skipped - invalid subscription name)") + print(f"{name!r} (skipped - invalid subscription name)", file=sys.stderr) continue if not repo_path.is_dir(): diff --git a/platform-integrations/claude/plugins/evolve-lite/lib/audit.py b/platform-integrations/claude/plugins/evolve-lite/lib/audit.py index fd5c535a..fa43846b 100644 --- a/platform-integrations/claude/plugins/evolve-lite/lib/audit.py +++ 
b/platform-integrations/claude/plugins/evolve-lite/lib/audit.py @@ -5,14 +5,17 @@ import pathlib -def append(project_root=".", **fields): +def append(project_root=".", evolve_dir=None, **fields): """Append a JSON audit entry to .evolve/audit.log. Args: project_root: Root directory that contains .evolve/. + evolve_dir: Explicit evolve data directory. When set, writes directly + to ``/audit.log`` instead of deriving it from + ``project_root``. **fields: Arbitrary key-value fields to include in the log entry. """ - path = pathlib.Path(project_root) / ".evolve" / "audit.log" + path = pathlib.Path(evolve_dir) / "audit.log" if evolve_dir is not None else pathlib.Path(project_root) / ".evolve" / "audit.log" path.parent.mkdir(parents=True, exist_ok=True) entry = {**fields, "ts": datetime.datetime.now(datetime.UTC).isoformat().replace("+00:00", "Z")} with path.open("a", encoding="utf-8") as f: diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/learn/SKILL.md b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/learn/SKILL.md index 1de6b643..5e33e376 100644 --- a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/learn/SKILL.md +++ b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/learn/SKILL.md @@ -48,7 +48,7 @@ The transcript is JSONL: each line is a separate JSON object. Filter for `"type" ### Step 1: Analyze the Conversation -Identify from your current conversation (loaded from the transcript): +Identify from the saved trajectory loaded in Step 0: - **Task/Request**: What was the user asking for? - **Steps Taken**: What reasoning, actions, and observations occurred? @@ -89,6 +89,11 @@ Prefer one of these artifact forms: - a small script, saved to a stable path in the workspace or plugin, such as `scripts/`, `tools/`, or another obvious helper location. 
- a documented local workflow if code is not appropriate +When turning an ad hoc command or script into a reusable artifact, remove +incidental one-off inputs such as literal file names, IDs, answer values, or +temporary paths. Keep the reusable procedure that was actually exercised in the +session, and do not add capabilities that were not validated by the work. + If you create an artifact, record: - its path - what it does diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md new file mode 100644 index 00000000..e6ff7825 --- /dev/null +++ b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md @@ -0,0 +1,64 @@ +--- +name: provenance +description: Analyze saved trajectories and recall audit events offline to record whether recalled guidelines influenced completed sessions. +--- + +# Provenance Analyzer + +## Overview + +This skill runs after one or more sessions have completed. It reads saved trajectories from `.evolve/trajectories/`, matches them to `recall` events in `.evolve/audit.log`, and records post-hoc `influence` events for recalled guidelines. + +Use this skill when you want to compute usage provenance without coupling the work to the live learn step. + +## Workflow + +### Step 1: Load Recall Events + +Read `.evolve/audit.log` as JSONL. Find entries where `event == "recall"` and `entities` is a non-empty list. + +Skip any recall event that already has `influence` entries for the same `session_id` and entity ids. Do not write duplicate influence records. + +### Step 2: Locate Saved Trajectories + +List `.evolve/trajectories/` and match each recall event to a trajectory by `session_id`. + +Matching strategy (in order): +1. `claude-transcript_.jsonl` - the stop-hook transcript dump; the session id is in the filename. +2. 
`trajectory__.json` - written by the /evolve-lite:save-trajectory skill when a session id is available. Match on the `` slice of the filename. +3. `trajectory_.json` - open the file and match its top-level `session_id` field against the recall event. Only fall back to this step when the filename alone does not identify the session. + +If none of the above yields a confident match for a recall event, skip it. Do not guess. + +### Step 3: Read Recalled Entities + +For each recalled entity id, open `.evolve/entities/.md`. The id is a path relative to `.evolve/entities/` without the `.md` suffix, such as `guideline/foo` or `subscribed/alice/guideline/foo`. + +Read the entity content and trigger. Skip ids whose files are missing. + +### Step 4: Assess Influence + +Compare each recalled entity with the matched trajectory. Pick exactly one verdict: + +- `followed` - the agent's actual actions are consistent with the guideline. +- `contradicted` - the guideline applied, but the agent did the opposite or repeated the avoidable dead end. +- `not_applicable` - the guideline was recalled but did not apply to this session. + +Keep `evidence` to one short sentence citing a concrete action, tool call, or absence in the trajectory. + +### Step 5: Write Influence Events + +Pipe one JSON payload per assessed session to the helper: + +```bash +echo '{ + "session_id": "", + "assessments": [ + {"entity": "guideline/", "verdict": "followed", "evidence": "Agent used the saved parser before trying shell fallbacks."} + ] +}' | python3 ${CLAUDE_PLUGIN_ROOT}/skills/evolve-lite/provenance/scripts/log_influence.py +``` + +The `entity` value must match exactly what appeared in the recall event, including any `subscribed//` prefix. + +It is valid to emit an empty `assessments` list when recall events exist but no recalled guideline can be assessed. 
diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py new file mode 100644 index 00000000..79bdd28a --- /dev/null +++ b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 +"""Append post-hoc influence assessments to .evolve/audit.log. + +Reads JSON from stdin of the form: + { + "session_id": "", + "assessments": [ + {"entity": "", "verdict": "followed|contradicted|not_applicable", + "evidence": ""}, + ... + ] + } +""" + +import json +import sys +from pathlib import Path + +# Walk up from the script location to find the installed plugin lib directory. +# claude/claw-code/codex/bob all ship a sibling lib/ next to skills/; bob's +# installer copies it to .bob/evolve-lib/, hence both names are checked. +_script = Path(__file__).resolve() +_lib = None +for _ancestor in _script.parents: + for _candidate in (_ancestor / "lib", _ancestor / "evolve-lib"): + if (_candidate / "entity_io.py").is_file(): + _lib = _candidate + break + if _lib is not None: + break +if _lib is None: + raise ImportError(f"Cannot find plugin lib directory above {_script}") +sys.path.insert(0, str(_lib)) +from entity_io import get_evolve_dir, log as _log # noqa: E402 +import audit # noqa: E402 + + +_ALLOWED_VERDICTS = {"followed", "contradicted", "not_applicable"} + + +def log(message): + _log("influence", message) + + +def existing_influence_keys(evolve_dir): + audit_log = Path(evolve_dir) / "audit.log" + if not audit_log.is_file(): + return set() + + keys = set() + for line in audit_log.read_text(encoding="utf-8").splitlines(): + if not line.strip(): + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + continue + if event.get("event") == "influence" and event.get("session_id") and event.get("entity"): + 
keys.add((event["session_id"], event["entity"])) + return keys + + +def main(): + try: + payload = json.load(sys.stdin) + except json.JSONDecodeError as exc: + log(f"Invalid JSON input: {exc}") + print(f"Error: invalid JSON input - {exc}", file=sys.stderr) + sys.exit(1) + + if not isinstance(payload, dict): + log(f"Bad payload type: {type(payload).__name__}") + print("Error: payload must be a JSON object.", file=sys.stderr) + sys.exit(1) + + session_id = payload.get("session_id") + assessments = payload.get("assessments", []) + if not isinstance(session_id, str) or not session_id or not isinstance(assessments, list): + log(f"Bad payload shape: session_id={session_id!r} assessments_type={type(assessments).__name__}") + print("Error: payload must include a string `session_id` and a list `assessments`.", file=sys.stderr) + sys.exit(1) + + evolve_dir = get_evolve_dir().resolve() + existing_keys = existing_influence_keys(evolve_dir) + + written = 0 + for assessment in assessments: + if not isinstance(assessment, dict): + log(f"Skipping non-dict assessment item: {assessment!r}") + continue + entity = assessment.get("entity") + verdict = assessment.get("verdict") + evidence = assessment.get("evidence", "") + if not isinstance(entity, str) or not entity: + log(f"Skipping assessment with non-string entity: {assessment!r}") + continue + if verdict not in _ALLOWED_VERDICTS: + log(f"Skipping invalid assessment verdict: {assessment}") + continue + if not isinstance(evidence, str): + evidence = str(evidence) + key = (session_id, entity) + if key in existing_keys: + log(f"Skipping duplicate influence assessment: session_id={session_id} entity={entity}") + continue + audit.append( + evolve_dir=str(evolve_dir), + event="influence", + session_id=session_id, + entity=entity, + verdict=verdict, + evidence=evidence, + ) + existing_keys.add(key) + written += 1 + + log(f"Wrote {written} influence record(s) for session {session_id}") + print(f"Recorded {written} influence assessment(s).") 
+ + +if __name__ == "__main__": + main() diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py index ade892fe..9daa7d38 100644 --- a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py +++ b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py @@ -21,7 +21,8 @@ if _lib is None: raise ImportError(f"Cannot find plugin lib directory above {_script}") sys.path.insert(0, str(_lib)) -from entity_io import find_entities_dir, markdown_to_entity, log as _log # noqa: E402 +from entity_io import find_entities_dir, get_evolve_dir, markdown_to_entity, log as _log # noqa: E402 +import audit # noqa: E402 def log(message): @@ -81,6 +82,7 @@ def load_entities_with_source(entities_dir): continue entity.pop("_source", None) + entity["_id"] = str(md.relative_to(entities_dir).with_suffix("")) parts = md.relative_to(entities_dir).parts if parts and parts[0] == "subscribed" and len(parts) > 1: entity["_source"] = parts[1] @@ -139,6 +141,33 @@ def main(): print(output) log(f"Output {len(output)} chars to stdout") + # Audit which entity ids were served to this session. Logging is + # intentionally best-effort so recall never fails because provenance + # recording could not append to audit.log. 
+ try: + if isinstance(input_data, dict): + transcript_path = input_data.get("transcript_path", "") + else: + transcript_path = "" + session_id = None + if transcript_path: + stem = Path(transcript_path).stem + if stem.startswith("claude-transcript_"): + session_id = stem.removeprefix("claude-transcript_") + if not session_id and isinstance(input_data, dict) and isinstance(input_data.get("session_id"), str): + session_id = input_data["session_id"] + entity_ids = sorted({entity["_id"] for entity in entities if entity.get("_id")}) + if session_id and entity_ids: + audit.append( + evolve_dir=str(get_evolve_dir().resolve()), + event="recall", + session_id=session_id, + entities=entity_ids, + ) + log(f"Audit: recall session_id={session_id} entities={len(entity_ids)}") + except Exception as exc: + log(f"Audit append failed (non-fatal): {exc}") + if __name__ == "__main__": main() diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md index 0c518694..f657ec27 100644 --- a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md +++ b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md @@ -100,12 +100,14 @@ Wrap the messages array in a trajectory envelope: { "model": "", "timestamp": "2025-01-15T10:30:00Z", + "session_id": "", "messages": [...] } ``` - **model**: Use the exact model ID from the current session's environment context (e.g., the value after "You are powered by the model named …"). Do not hardcode a default — always read it from the session. - **timestamp**: Current ISO 8601 timestamp +- **session_id**: The current session identifier. Read it from whatever the harness exposes — the `session_id` passed into the skill, the session id surfaced in the session context, or a runtime-provided environment variable. 
Include it verbatim so offline provenance can match this trajectory to `recall` audit events for the same session. Omit the field only if no session id is truly available in this environment. ### Step 5: Save via Helper Script @@ -115,7 +117,7 @@ Write the trajectory JSON to a temporary file using the **Write** tool, then pas 2. Run the helper script with the file path as an argument: ```bash -tmp=.evolve/tmp/trajectory_input.json; mkdir -p .evolve/tmp; trap 'rm -f "$tmp"' EXIT; python3 "${CLAUDE_PLUGIN_ROOT}/skills/save-trajectory/scripts/save_trajectory.py" "$tmp" +tmp=.evolve/tmp/trajectory_input.json; mkdir -p .evolve/tmp; trap 'rm -f "$tmp"' EXIT; python3 "${CLAUDE_PLUGIN_ROOT}/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py" "$tmp" ``` **Important**: Do NOT use inline Python scripts, heredocs, or stdin piping to pass the trajectory JSON. Always use the Write tool to create a temp file first. This avoids escaping issues with backslashes, quotes, and newlines in conversation content. 
diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py index f34571eb..a3ee4ac2 100755 --- a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py +++ b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py @@ -9,6 +9,7 @@ import getpass import json import os +import re import sys import tempfile from pathlib import Path @@ -65,15 +66,29 @@ def get_trajectories_dir(): return base.resolve() -def open_trajectory_file(trajectories_dir): +_SAFE_SESSION_ID = re.compile(r"[^A-Za-z0-9._-]") + + +def _sanitize_session_id(session_id): + """Return a filesystem-safe slice of ``session_id`` (empty if unusable).""" + if not isinstance(session_id, str): + return "" + cleaned = _SAFE_SESSION_ID.sub("-", session_id.strip()) + return cleaned[:64] + + +def open_trajectory_file(trajectories_dir, session_id=None): """Atomically claim a timestamped trajectory file. Returns a ``(Path, fd)`` tuple. Uses ``O_CREAT | O_EXCL`` so two saves racing within the same second pick distinct filenames instead of one - overwriting the other. + overwriting the other. When ``session_id`` is provided, it is embedded + in the filename so offline provenance can match this trajectory to + ``recall`` audit events for the same session without content inspection. 
""" now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") - base_name = f"trajectory_{now}" + sid = _sanitize_session_id(session_id) + base_name = f"trajectory_{now}_{sid}" if sid else f"trajectory_{now}" for suffix in range(0, 1000): name = f"{base_name}.json" if suffix == 0 else f"{base_name}_{suffix}.json" @@ -121,9 +136,11 @@ def main(): log(f"Trajectory has {len(messages)} messages") - # Atomically claim a unique output path (handles same-second races) + # Atomically claim a unique output path (handles same-second races). + # Embed session_id in the filename when present so offline provenance + # can match recall events to trajectories deterministically. trajectories_dir = get_trajectories_dir() - output_path, fd = open_trajectory_file(trajectories_dir) + output_path, fd = open_trajectory_file(trajectories_dir, trajectory.get("session_id")) # Write formatted JSON via the already-opened owner-only fd try: diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py index ef6b0cd0..3f50b3a6 100755 --- a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py +++ b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py @@ -132,12 +132,21 @@ def main(): remote=args.remote, ) except Exception as exc: - # Audit logging is best-effort: a failed append shouldn't roll back - # an otherwise successful subscribe (the repo is cloned, the config - # has the entry). Warn loudly so the user can fix the audit log - # path without losing the subscription. Originally rolled back on - # main's PR #245 (#244 e2e fix). 
- print(f"Warning: failed to append audit entry for subscribe: {exc}", file=sys.stderr) + repos.pop() + set_repos(cfg, repos) + try: + save_config(cfg, project_root) + except Exception as save_exc: + print( + f"Warning: rollback save_config failed under {project_root!r}: {save_exc}. " + f"The clone was removed but evolve.config.yaml may still list '{args.name}' - " + f"please inspect the file and remove the entry manually if present.", + file=sys.stderr, + ) + if dest.exists(): + shutil.rmtree(dest, ignore_errors=True) + print(f"Error: failed to record subscription in audit log: {exc}", file=sys.stderr) + sys.exit(1) print(f"Subscribed to '{args.name}' (scope={args.scope}) from {args.remote}") diff --git a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/sync/scripts/sync.py b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/sync/scripts/sync.py index 33c34716..4fd0f624 100755 --- a/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/sync/scripts/sync.py +++ b/platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/sync/scripts/sync.py @@ -162,7 +162,7 @@ def main(): raw_name = rejection["raw_name"] reason = rejection["reason"] label = repr(raw_name) if raw_name else "" - summaries.append(f"{label} (skipped - {reason})") + print(f"{label} (skipped - {reason})", file=sys.stderr) for repo in repos: name = repo["name"] @@ -174,7 +174,7 @@ def main(): repo_path = (evolve_dir / "entities" / "subscribed" / name).resolve() if repo_path == subscribed_base or not repo_path.is_relative_to(subscribed_base): - summaries.append(f"{name!r} (skipped - invalid subscription name)") + print(f"{name!r} (skipped - invalid subscription name)", file=sys.stderr) continue if not repo_path.is_dir(): diff --git a/platform-integrations/claw-code/plugins/evolve-lite/lib/audit.py b/platform-integrations/claw-code/plugins/evolve-lite/lib/audit.py index fd5c535a..fa43846b 100644 --- 
a/platform-integrations/claw-code/plugins/evolve-lite/lib/audit.py +++ b/platform-integrations/claw-code/plugins/evolve-lite/lib/audit.py @@ -5,14 +5,17 @@ import pathlib -def append(project_root=".", **fields): +def append(project_root=".", evolve_dir=None, **fields): """Append a JSON audit entry to .evolve/audit.log. Args: project_root: Root directory that contains .evolve/. + evolve_dir: Explicit evolve data directory. When set, writes directly + to ``/audit.log`` instead of deriving it from + ``project_root``. **fields: Arbitrary key-value fields to include in the log entry. """ - path = pathlib.Path(project_root) / ".evolve" / "audit.log" + path = pathlib.Path(evolve_dir) / "audit.log" if evolve_dir is not None else pathlib.Path(project_root) / ".evolve" / "audit.log" path.parent.mkdir(parents=True, exist_ok=True) entry = {**fields, "ts": datetime.datetime.now(datetime.UTC).isoformat().replace("+00:00", "Z")} with path.open("a", encoding="utf-8") as f: diff --git a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/learn/SKILL.md b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/learn/SKILL.md index ad0fef58..e6174e3a 100644 --- a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/learn/SKILL.md +++ b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/learn/SKILL.md @@ -33,9 +33,15 @@ Unless that artifact happens to be: ## Workflow +### Step 0: Save and Load the Conversation + +First, use the /evolve-lite:save-trajectory skill to save the current conversation to `.evolve/trajectories/`. Capture the exact path from its output as `saved_trajectory_path`. You will attach this exact path to each entity's `trajectory` field in Step 6. + +After saving, read `saved_trajectory_path` with the Read tool and analyze that saved trajectory rather than relying only on live context. If the trajectory cannot be saved or read, output zero entities and exit. Do not invent a trajectory path. 
+ ### Step 1: Analyze the Conversation -Identify from your current conversation: +Identify from the saved trajectory loaded in Step 0: - **Task/Request**: What was the user asking for? - **Steps Taken**: What reasoning, actions, and observations occurred? @@ -76,6 +82,11 @@ Prefer one of these artifact forms: - a small script, saved to a stable path in the workspace or plugin, such as `scripts/`, `tools/`, or another obvious helper location. - a documented local workflow if code is not appropriate +When turning an ad hoc command or script into a reusable artifact, remove +incidental one-off inputs such as literal file names, IDs, answer values, or +temporary paths. Keep the reusable procedure that was actually exercised in the +session, and do not add capabilities that were not validated by the work. + If you create an artifact, record: - its path - what it does diff --git a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md new file mode 100644 index 00000000..de5023bb --- /dev/null +++ b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md @@ -0,0 +1,64 @@ +--- +name: provenance +description: Analyze saved trajectories and recall audit events offline to record whether recalled guidelines influenced completed sessions. +--- + +# Provenance Analyzer + +## Overview + +This skill runs after one or more sessions have completed. It reads saved trajectories from `.evolve/trajectories/`, matches them to `recall` events in `.evolve/audit.log`, and records post-hoc `influence` events for recalled guidelines. + +Use this skill when you want to compute usage provenance without coupling the work to the live learn step. + +## Workflow + +### Step 1: Load Recall Events + +Read `.evolve/audit.log` as JSONL. Find entries where `event == "recall"` and `entities` is a non-empty list. 
+ +Skip any recall event that already has `influence` entries for the same `session_id` and entity ids. Do not write duplicate influence records. + +### Step 2: Locate Saved Trajectories + +List `.evolve/trajectories/` and match each recall event to a trajectory by `session_id`. + +Matching strategy (in order): +1. `claude-transcript_<session_id>.jsonl` - the stop-hook transcript dump; the session id is in the filename. +2. `trajectory_<timestamp>_<session_id>.json` - written by the /evolve-lite:save-trajectory skill when a session id is available. Match on the `<session_id>` slice of the filename. +3. `trajectory_<timestamp>.json` - open the file and match its top-level `session_id` field against the recall event. Only fall back to this step when the filename alone does not identify the session. + +If none of the above yields a confident match for a recall event, skip it. Do not guess. + +### Step 3: Read Recalled Entities + +For each recalled entity id, open `.evolve/entities/<id>.md`. The id is a path relative to `.evolve/entities/` without the `.md` suffix, such as `guideline/foo` or `subscribed/alice/guideline/foo`. + +Read the entity content and trigger. Skip ids whose files are missing. + +### Step 4: Assess Influence + +Compare each recalled entity with the matched trajectory. Pick exactly one verdict: + +- `followed` - the agent's actual actions are consistent with the guideline. +- `contradicted` - the guideline applied, but the agent did the opposite or repeated the avoidable dead end. +- `not_applicable` - the guideline was recalled but did not apply to this session. + +Keep `evidence` to one short sentence citing a concrete action, tool call, or absence in the trajectory.
+ +### Step 5: Write Influence Events + +Pipe one JSON payload per assessed session to the helper: + +```bash +echo '{ + "session_id": "<session-id>", + "assessments": [ + {"entity": "guideline/<name>", "verdict": "followed", "evidence": "Agent used the saved parser before trying shell fallbacks."} + ] +}' | sh -lc 'real_home="$(python3 -c "import os,pwd; print(pwd.getpwuid(os.getuid()).pw_dir)")"; config_home="${CLAW_CONFIG_HOME:-$real_home/.claw}"; script=".claw/skills/evolve-lite:provenance/scripts/log_influence.py"; [ -f "$script" ] || script="$config_home/skills/evolve-lite:provenance/scripts/log_influence.py"; python3 "$script"' +``` + +The `entity` value must match exactly what appeared in the recall event, including any `subscribed/<name>/` prefix. + +It is valid to emit an empty `assessments` list when recall events exist but no recalled guideline can be assessed. diff --git a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py new file mode 100644 index 00000000..79bdd28a --- /dev/null +++ b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 +"""Append post-hoc influence assessments to .evolve/audit.log. + +Reads JSON from stdin of the form: + { + "session_id": "<session-id>", + "assessments": [ + {"entity": "<entity-id>", "verdict": "followed|contradicted|not_applicable", + "evidence": "<one short sentence>"}, + ... + ] + } +""" + +import json +import sys +from pathlib import Path + +# Walk up from the script location to find the installed plugin lib directory. +# claude/claw-code/codex/bob all ship a sibling lib/ next to skills/; bob's +# installer copies it to .bob/evolve-lib/, hence both names are checked.
+_script = Path(__file__).resolve() +_lib = None +for _ancestor in _script.parents: + for _candidate in (_ancestor / "lib", _ancestor / "evolve-lib"): + if (_candidate / "entity_io.py").is_file(): + _lib = _candidate + break + if _lib is not None: + break +if _lib is None: + raise ImportError(f"Cannot find plugin lib directory above {_script}") +sys.path.insert(0, str(_lib)) +from entity_io import get_evolve_dir, log as _log # noqa: E402 +import audit # noqa: E402 + + +_ALLOWED_VERDICTS = {"followed", "contradicted", "not_applicable"} + + +def log(message): + _log("influence", message) + + +def existing_influence_keys(evolve_dir): + audit_log = Path(evolve_dir) / "audit.log" + if not audit_log.is_file(): + return set() + + keys = set() + for line in audit_log.read_text(encoding="utf-8").splitlines(): + if not line.strip(): + continue + try: + event = json.loads(line) + except json.JSONDecodeError: + continue + if event.get("event") == "influence" and event.get("session_id") and event.get("entity"): + keys.add((event["session_id"], event["entity"])) + return keys + + +def main(): + try: + payload = json.load(sys.stdin) + except json.JSONDecodeError as exc: + log(f"Invalid JSON input: {exc}") + print(f"Error: invalid JSON input - {exc}", file=sys.stderr) + sys.exit(1) + + if not isinstance(payload, dict): + log(f"Bad payload type: {type(payload).__name__}") + print("Error: payload must be a JSON object.", file=sys.stderr) + sys.exit(1) + + session_id = payload.get("session_id") + assessments = payload.get("assessments", []) + if not isinstance(session_id, str) or not session_id or not isinstance(assessments, list): + log(f"Bad payload shape: session_id={session_id!r} assessments_type={type(assessments).__name__}") + print("Error: payload must include a string `session_id` and a list `assessments`.", file=sys.stderr) + sys.exit(1) + + evolve_dir = get_evolve_dir().resolve() + existing_keys = existing_influence_keys(evolve_dir) + + written = 0 + for assessment in 
assessments: + if not isinstance(assessment, dict): + log(f"Skipping non-dict assessment item: {assessment!r}") + continue + entity = assessment.get("entity") + verdict = assessment.get("verdict") + evidence = assessment.get("evidence", "") + if not isinstance(entity, str) or not entity: + log(f"Skipping assessment with non-string entity: {assessment!r}") + continue + if verdict not in _ALLOWED_VERDICTS: + log(f"Skipping invalid assessment verdict: {assessment}") + continue + if not isinstance(evidence, str): + evidence = str(evidence) + key = (session_id, entity) + if key in existing_keys: + log(f"Skipping duplicate influence assessment: session_id={session_id} entity={entity}") + continue + audit.append( + evolve_dir=str(evolve_dir), + event="influence", + session_id=session_id, + entity=entity, + verdict=verdict, + evidence=evidence, + ) + existing_keys.add(key) + written += 1 + + log(f"Wrote {written} influence record(s) for session {session_id}") + print(f"Recorded {written} influence assessment(s).") + + +if __name__ == "__main__": + main() diff --git a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py index ade892fe..9daa7d38 100644 --- a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py +++ b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/recall/scripts/retrieve_entities.py @@ -21,7 +21,8 @@ if _lib is None: raise ImportError(f"Cannot find plugin lib directory above {_script}") sys.path.insert(0, str(_lib)) -from entity_io import find_entities_dir, markdown_to_entity, log as _log # noqa: E402 +from entity_io import find_entities_dir, get_evolve_dir, markdown_to_entity, log as _log # noqa: E402 +import audit # noqa: E402 def log(message): @@ -81,6 +82,7 @@ def load_entities_with_source(entities_dir): continue 
entity.pop("_source", None) + entity["_id"] = str(md.relative_to(entities_dir).with_suffix("")) parts = md.relative_to(entities_dir).parts if parts and parts[0] == "subscribed" and len(parts) > 1: entity["_source"] = parts[1] @@ -139,6 +141,33 @@ def main(): print(output) log(f"Output {len(output)} chars to stdout") + # Audit which entity ids were served to this session. Logging is + # intentionally best-effort so recall never fails because provenance + # recording could not append to audit.log. + try: + if isinstance(input_data, dict): + transcript_path = input_data.get("transcript_path", "") + else: + transcript_path = "" + session_id = None + if transcript_path: + stem = Path(transcript_path).stem + if stem.startswith("claude-transcript_"): + session_id = stem.removeprefix("claude-transcript_") + if not session_id and isinstance(input_data, dict) and isinstance(input_data.get("session_id"), str): + session_id = input_data["session_id"] + entity_ids = sorted({entity["_id"] for entity in entities if entity.get("_id")}) + if session_id and entity_ids: + audit.append( + evolve_dir=str(get_evolve_dir().resolve()), + event="recall", + session_id=session_id, + entities=entity_ids, + ) + log(f"Audit: recall session_id={session_id} entities={len(entity_ids)}") + except Exception as exc: + log(f"Audit append failed (non-fatal): {exc}") + if __name__ == "__main__": main() diff --git a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md index 77f455b3..beb924e2 100644 --- a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md +++ b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md @@ -99,12 +99,14 @@ Wrap the messages array in a trajectory envelope: { "model": "", "timestamp": "2025-01-15T10:30:00Z", + "session_id": "", "messages": [...] 
} ``` - **model**: Use the exact model ID from the current session's environment context (e.g., the value after "You are powered by the model named …"). Do not hardcode a default — always read it from the session. - **timestamp**: Current ISO 8601 timestamp +- **session_id**: The current session identifier. Read it from whatever the harness exposes — the `session_id` passed into the skill, the session id surfaced in the session context, or a runtime-provided environment variable. Include it verbatim so offline provenance can match this trajectory to `recall` audit events for the same session. Omit the field only if no session id is truly available in this environment. ### Step 5: Save via Helper Script diff --git a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py index f34571eb..a3ee4ac2 100755 --- a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py +++ b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py @@ -9,6 +9,7 @@ import getpass import json import os +import re import sys import tempfile from pathlib import Path @@ -65,15 +66,29 @@ def get_trajectories_dir(): return base.resolve() -def open_trajectory_file(trajectories_dir): +_SAFE_SESSION_ID = re.compile(r"[^A-Za-z0-9._-]") + + +def _sanitize_session_id(session_id): + """Return a filesystem-safe slice of ``session_id`` (empty if unusable).""" + if not isinstance(session_id, str): + return "" + cleaned = _SAFE_SESSION_ID.sub("-", session_id.strip()) + return cleaned[:64] + + +def open_trajectory_file(trajectories_dir, session_id=None): """Atomically claim a timestamped trajectory file. Returns a ``(Path, fd)`` tuple. 
Uses ``O_CREAT | O_EXCL`` so two saves racing within the same second pick distinct filenames instead of one - overwriting the other. + overwriting the other. When ``session_id`` is provided, it is embedded + in the filename so offline provenance can match this trajectory to + ``recall`` audit events for the same session without content inspection. """ now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") - base_name = f"trajectory_{now}" + sid = _sanitize_session_id(session_id) + base_name = f"trajectory_{now}_{sid}" if sid else f"trajectory_{now}" for suffix in range(0, 1000): name = f"{base_name}.json" if suffix == 0 else f"{base_name}_{suffix}.json" @@ -121,9 +136,11 @@ def main(): log(f"Trajectory has {len(messages)} messages") - # Atomically claim a unique output path (handles same-second races) + # Atomically claim a unique output path (handles same-second races). + # Embed session_id in the filename when present so offline provenance + # can match recall events to trajectories deterministically. trajectories_dir = get_trajectories_dir() - output_path, fd = open_trajectory_file(trajectories_dir) + output_path, fd = open_trajectory_file(trajectories_dir, trajectory.get("session_id")) # Write formatted JSON via the already-opened owner-only fd try: diff --git a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py index ef6b0cd0..3f50b3a6 100755 --- a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py +++ b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py @@ -132,12 +132,21 @@ def main(): remote=args.remote, ) except Exception as exc: - # Audit logging is best-effort: a failed append shouldn't roll back - # an otherwise successful subscribe (the repo is cloned, the config - # has the entry). 
Warn loudly so the user can fix the audit log - # path without losing the subscription. Originally rolled back on - # main's PR #245 (#244 e2e fix). - print(f"Warning: failed to append audit entry for subscribe: {exc}", file=sys.stderr) + repos.pop() + set_repos(cfg, repos) + try: + save_config(cfg, project_root) + except Exception as save_exc: + print( + f"Warning: rollback save_config failed under {project_root!r}: {save_exc}. " + f"The clone was removed but evolve.config.yaml may still list '{args.name}' - " + f"please inspect the file and remove the entry manually if present.", + file=sys.stderr, + ) + if dest.exists(): + shutil.rmtree(dest, ignore_errors=True) + print(f"Error: failed to record subscription in audit log: {exc}", file=sys.stderr) + sys.exit(1) print(f"Subscribed to '{args.name}' (scope={args.scope}) from {args.remote}") diff --git a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/sync/scripts/sync.py b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/sync/scripts/sync.py index 33c34716..4fd0f624 100755 --- a/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/sync/scripts/sync.py +++ b/platform-integrations/claw-code/plugins/evolve-lite/skills/evolve-lite/sync/scripts/sync.py @@ -162,7 +162,7 @@ def main(): raw_name = rejection["raw_name"] reason = rejection["reason"] label = repr(raw_name) if raw_name else "" - summaries.append(f"{label} (skipped - {reason})") + print(f"{label} (skipped - {reason})", file=sys.stderr) for repo in repos: name = repo["name"] @@ -174,7 +174,7 @@ def main(): repo_path = (evolve_dir / "entities" / "subscribed" / name).resolve() if repo_path == subscribed_base or not repo_path.is_relative_to(subscribed_base): - summaries.append(f"{name!r} (skipped - invalid subscription name)") + print(f"{name!r} (skipped - invalid subscription name)", file=sys.stderr) continue if not repo_path.is_dir(): diff --git 
a/platform-integrations/codex/plugins/evolve-lite/.codex-plugin/plugin.json b/platform-integrations/codex/plugins/evolve-lite/.codex-plugin/plugin.json index bf5ab1dd..0632a6d5 100644 --- a/platform-integrations/codex/plugins/evolve-lite/.codex-plugin/plugin.json +++ b/platform-integrations/codex/plugins/evolve-lite/.codex-plugin/plugin.json @@ -28,6 +28,7 @@ "defaultPrompt": [ "Recall Evolve entities for this task.", "Save new Evolve learnings from this session.", + "Analyze saved trajectories for Evolve guideline provenance.", "Show me the entities stored for this repo.", "Publish one of my Evolve guidelines.", "Subscribe to a teammate's Evolve guidelines repo." diff --git a/platform-integrations/codex/plugins/evolve-lite/lib/audit.py b/platform-integrations/codex/plugins/evolve-lite/lib/audit.py index fd5c535a..fa43846b 100644 --- a/platform-integrations/codex/plugins/evolve-lite/lib/audit.py +++ b/platform-integrations/codex/plugins/evolve-lite/lib/audit.py @@ -5,14 +5,17 @@ import pathlib -def append(project_root=".", **fields): +def append(project_root=".", evolve_dir=None, **fields): """Append a JSON audit entry to .evolve/audit.log. Args: project_root: Root directory that contains .evolve/. + evolve_dir: Explicit evolve data directory. When set, writes directly + to ``/audit.log`` instead of deriving it from + ``project_root``. **fields: Arbitrary key-value fields to include in the log entry. 
""" - path = pathlib.Path(project_root) / ".evolve" / "audit.log" + path = pathlib.Path(evolve_dir) / "audit.log" if evolve_dir is not None else pathlib.Path(project_root) / ".evolve" / "audit.log" path.parent.mkdir(parents=True, exist_ok=True) entry = {**fields, "ts": datetime.datetime.now(datetime.UTC).isoformat().replace("+00:00", "Z")} with path.open("a", encoding="utf-8") as f: diff --git a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/learn/SKILL.md b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/learn/SKILL.md index 086cf355..13d436e4 100644 --- a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/learn/SKILL.md +++ b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/learn/SKILL.md @@ -33,9 +33,15 @@ Unless that artifact happens to be: ## Workflow +### Step 0: Save and Load the Conversation + +First, use the evolve-lite:save-trajectory skill to save the current conversation to `.evolve/trajectories/`. Capture the exact path from its output as `saved_trajectory_path`. You will attach this exact path to each entity's `trajectory` field in Step 6. + +After saving, read `saved_trajectory_path` with the Read tool and analyze that saved trajectory rather than relying only on live context. If the trajectory cannot be saved or read, output zero entities and exit. Do not invent a trajectory path. + ### Step 1: Analyze the Conversation -Identify from your current conversation: +Identify from the saved trajectory loaded in Step 0: - **Task/Request**: What was the user asking for? - **Steps Taken**: What reasoning, actions, and observations occurred? @@ -76,6 +82,11 @@ Prefer one of these artifact forms: - a small script, saved to a stable path in the workspace or plugin, such as `scripts/`, `tools/`, or another obvious helper location. 
- a documented local workflow if code is not appropriate +When turning an ad hoc command or script into a reusable artifact, remove +incidental one-off inputs such as literal file names, IDs, answer values, or +temporary paths. Keep the reusable procedure that was actually exercised in the +session, and do not add capabilities that were not validated by the work. + If you create an artifact, record: - its path - what it does diff --git a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md new file mode 100644 index 00000000..349ac090 --- /dev/null +++ b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/provenance/SKILL.md @@ -0,0 +1,64 @@ +--- +name: provenance +description: Analyze saved trajectories and recall audit events offline to record whether recalled guidelines influenced completed sessions. +--- + +# Provenance Analyzer + +## Overview + +This skill runs after one or more sessions have completed. It reads saved trajectories from `.evolve/trajectories/`, matches them to `recall` events in `.evolve/audit.log`, and records post-hoc `influence` events for recalled guidelines. + +Use this skill when you want to compute usage provenance without coupling the work to the live learn step. + +## Workflow + +### Step 1: Load Recall Events + +Read `.evolve/audit.log` as JSONL. Find entries where `event == "recall"` and `entities` is a non-empty list. + +Skip any recall event that already has `influence` entries for the same `session_id` and entity ids. Do not write duplicate influence records. + +### Step 2: Locate Saved Trajectories + +List `.evolve/trajectories/` and match each recall event to a trajectory by `session_id`. + +Matching strategy (in order): +1. `claude-transcript_.jsonl` - the stop-hook transcript dump; the session id is in the filename. +2. 
`trajectory_<timestamp>_<session_id>.json` - written by the evolve-lite:save-trajectory skill when a session id is available. Match on the `<session_id>` slice of the filename. +3. `trajectory_<timestamp>.json` - open the file and match its top-level `session_id` field against the recall event. Only fall back to this step when the filename alone does not identify the session. + +If none of the above yields a confident match for a recall event, skip it. Do not guess. + +### Step 3: Read Recalled Entities + +For each recalled entity id, open `.evolve/entities/<id>.md`. The id is a path relative to `.evolve/entities/` without the `.md` suffix, such as `guideline/foo` or `subscribed/alice/guideline/foo`. + +Read the entity content and trigger. Skip ids whose files are missing. + +### Step 4: Assess Influence + +Compare each recalled entity with the matched trajectory. Pick exactly one verdict: + +- `followed` - the agent's actual actions are consistent with the guideline. +- `contradicted` - the guideline applied, but the agent did the opposite or repeated the avoidable dead end. +- `not_applicable` - the guideline was recalled but did not apply to this session. + +Keep `evidence` to one short sentence citing a concrete action, tool call, or absence in the trajectory. + +### Step 5: Write Influence Events + +Pipe one JSON payload per assessed session to the helper: + +```bash +echo '{ + "session_id": "<session-id>", + "assessments": [ + {"entity": "guideline/<name>", "verdict": "followed", "evidence": "Agent used the saved parser before trying shell fallbacks."} + ] +}' | python3 "$(git rev-parse --show-toplevel 2>/dev/null || pwd)/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py" +``` + +The `entity` value must match exactly what appeared in the recall event, including any `subscribed/<name>/` prefix. + +It is valid to emit an empty `assessments` list when recall events exist but no recalled guideline can be assessed.
#!/usr/bin/env python3
"""Append post-hoc influence assessments to .evolve/audit.log.

Reads JSON from stdin of the form:

    {
      "session_id": "<session-id>",
      "assessments": [
        {"entity": "<entity-id>",
         "verdict": "followed|contradicted|not_applicable",
         "evidence": "<one short sentence>"},
        ...
      ]
    }
"""

import json
import sys
from pathlib import Path

# Walk up from the script location to find the installed plugin lib directory.
# claude/claw-code/codex/bob all ship a sibling lib/ next to skills/; bob's
# installer copies it to .bob/evolve-lib/, hence both directory names are
# probed at every ancestor level.
_script = Path(__file__).resolve()


def _locate_lib(script_path):
    """Return the first ancestor lib/evolve-lib dir holding entity_io.py, else None."""
    for ancestor in script_path.parents:
        for candidate in (ancestor / "lib", ancestor / "evolve-lib"):
            if (candidate / "entity_io.py").is_file():
                return candidate
    return None


_lib = _locate_lib(_script)
if _lib is None:
    raise ImportError(f"Cannot find plugin lib directory above {_script}")
sys.path.insert(0, str(_lib))
from entity_io import get_evolve_dir, log as _log  # noqa: E402
import audit  # noqa: E402


# The only verdicts the provenance skill is allowed to record.
_ALLOWED_VERDICTS = {"followed", "contradicted", "not_applicable"}


def log(message):
    """Route a skill-local log line through the shared entity_io logger."""
    _log("influence", message)


def existing_influence_keys(evolve_dir):
    """Collect (session_id, entity) pairs already recorded as influence events.

    Blank or malformed JSONL lines in audit.log are skipped so a single bad
    record cannot block deduplication for the rest of the log.
    """
    audit_log = Path(evolve_dir) / "audit.log"
    if not audit_log.is_file():
        return set()

    seen = set()
    for raw_line in audit_log.read_text(encoding="utf-8").splitlines():
        if not raw_line.strip():
            continue
        try:
            record = json.loads(raw_line)
        except json.JSONDecodeError:
            continue
        if record.get("event") != "influence":
            continue
        sid = record.get("session_id")
        ent = record.get("entity")
        if sid and ent:
            seen.add((sid, ent))
    return seen


def main():
    """Validate the stdin payload and append one influence event per assessment.

    Exits 1 on malformed input; individual bad assessments are skipped (and
    logged) rather than aborting the whole batch.
    """
    try:
        payload = json.load(sys.stdin)
    except json.JSONDecodeError as exc:
        log(f"Invalid JSON input: {exc}")
        print(f"Error: invalid JSON input - {exc}", file=sys.stderr)
        sys.exit(1)

    if not isinstance(payload, dict):
        log(f"Bad payload type: {type(payload).__name__}")
        print("Error: payload must be a JSON object.", file=sys.stderr)
        sys.exit(1)

    session_id = payload.get("session_id")
    assessments = payload.get("assessments", [])
    well_formed = isinstance(session_id, str) and session_id and isinstance(assessments, list)
    if not well_formed:
        log(f"Bad payload shape: session_id={session_id!r} assessments_type={type(assessments).__name__}")
        print("Error: payload must include a string `session_id` and a list `assessments`.", file=sys.stderr)
        sys.exit(1)

    evolve_dir = get_evolve_dir().resolve()
    recorded = existing_influence_keys(evolve_dir)

    written = 0
    for item in assessments:
        if not isinstance(item, dict):
            log(f"Skipping non-dict assessment item: {item!r}")
            continue
        entity = item.get("entity")
        verdict = item.get("verdict")
        evidence = item.get("evidence", "")
        if not isinstance(entity, str) or not entity:
            log(f"Skipping assessment with non-string entity: {item!r}")
            continue
        if verdict not in _ALLOWED_VERDICTS:
            log(f"Skipping invalid assessment verdict: {item}")
            continue
        if not isinstance(evidence, str):
            # Coerce odd-but-salvageable evidence values instead of dropping.
            evidence = str(evidence)
        key = (session_id, entity)
        if key in recorded:
            log(f"Skipping duplicate influence assessment: session_id={session_id} entity={entity}")
            continue
        audit.append(
            evolve_dir=str(evolve_dir),
            event="influence",
            session_id=session_id,
            entity=entity,
            verdict=verdict,
            evidence=evidence,
        )
        # Track the freshly written key so the same stdin batch also dedups.
        recorded.add(key)
        written += 1

    log(f"Wrote {written} influence record(s) for session {session_id}")
    print(f"Recorded {written} influence assessment(s).")


if __name__ == "__main__":
    main()
+ try: + if isinstance(input_data, dict): + transcript_path = input_data.get("transcript_path", "") + else: + transcript_path = "" + session_id = None + if transcript_path: + stem = Path(transcript_path).stem + if stem.startswith("claude-transcript_"): + session_id = stem.removeprefix("claude-transcript_") + if not session_id and isinstance(input_data, dict) and isinstance(input_data.get("session_id"), str): + session_id = input_data["session_id"] + entity_ids = sorted({entity["_id"] for entity in entities if entity.get("_id")}) + if session_id and entity_ids: + audit.append( + evolve_dir=str(get_evolve_dir().resolve()), + event="recall", + session_id=session_id, + entities=entity_ids, + ) + log(f"Audit: recall session_id={session_id} entities={len(entity_ids)}") + except Exception as exc: + log(f"Audit append failed (non-fatal): {exc}") + if __name__ == "__main__": main() diff --git a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md index ad37821b..58883ec2 100644 --- a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md +++ b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/save-trajectory/SKILL.md @@ -99,12 +99,14 @@ Wrap the messages array in a trajectory envelope: { "model": "", "timestamp": "2025-01-15T10:30:00Z", + "session_id": "", "messages": [...] } ``` - **model**: Use the exact model ID from the current session's environment context (e.g., the value after "You are powered by the model named …"). Do not hardcode a default — always read it from the session. - **timestamp**: Current ISO 8601 timestamp +- **session_id**: The current session identifier. Read it from whatever the harness exposes — the `session_id` passed into the skill, the session id surfaced in the session context, or a runtime-provided environment variable. 
Include it verbatim so offline provenance can match this trajectory to `recall` audit events for the same session. Omit the field only if no session id is truly available in this environment. ### Step 5: Save via Helper Script @@ -114,7 +116,7 @@ Write the trajectory JSON to a temporary file using the **Write** tool, then pas 2. Run the helper script with the file path as an argument: ```bash - +tmp=.evolve/tmp/trajectory_input.json; mkdir -p .evolve/tmp; trap 'rm -f "$tmp"' EXIT; python3 "$(git rev-parse --show-toplevel 2>/dev/null || pwd)/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py" "$tmp" ``` **Important**: Do NOT use inline Python scripts, heredocs, or stdin piping to pass the trajectory JSON. Always use the Write tool to create a temp file first. This avoids escaping issues with backslashes, quotes, and newlines in conversation content. diff --git a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py index f34571eb..a3ee4ac2 100755 --- a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py +++ b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py @@ -9,6 +9,7 @@ import getpass import json import os +import re import sys import tempfile from pathlib import Path @@ -65,15 +66,29 @@ def get_trajectories_dir(): return base.resolve() -def open_trajectory_file(trajectories_dir): +_SAFE_SESSION_ID = re.compile(r"[^A-Za-z0-9._-]") + + +def _sanitize_session_id(session_id): + """Return a filesystem-safe slice of ``session_id`` (empty if unusable).""" + if not isinstance(session_id, str): + return "" + cleaned = _SAFE_SESSION_ID.sub("-", session_id.strip()) + return cleaned[:64] + + +def open_trajectory_file(trajectories_dir, session_id=None): 
"""Atomically claim a timestamped trajectory file. Returns a ``(Path, fd)`` tuple. Uses ``O_CREAT | O_EXCL`` so two saves racing within the same second pick distinct filenames instead of one - overwriting the other. + overwriting the other. When ``session_id`` is provided, it is embedded + in the filename so offline provenance can match this trajectory to + ``recall`` audit events for the same session without content inspection. """ now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") - base_name = f"trajectory_{now}" + sid = _sanitize_session_id(session_id) + base_name = f"trajectory_{now}_{sid}" if sid else f"trajectory_{now}" for suffix in range(0, 1000): name = f"{base_name}.json" if suffix == 0 else f"{base_name}_{suffix}.json" @@ -121,9 +136,11 @@ def main(): log(f"Trajectory has {len(messages)} messages") - # Atomically claim a unique output path (handles same-second races) + # Atomically claim a unique output path (handles same-second races). + # Embed session_id in the filename when present so offline provenance + # can match recall events to trajectories deterministically. 
trajectories_dir = get_trajectories_dir() - output_path, fd = open_trajectory_file(trajectories_dir) + output_path, fd = open_trajectory_file(trajectories_dir, trajectory.get("session_id")) # Write formatted JSON via the already-opened owner-only fd try: diff --git a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py index ef6b0cd0..3f50b3a6 100755 --- a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py +++ b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/subscribe/scripts/subscribe.py @@ -132,12 +132,21 @@ def main(): remote=args.remote, ) except Exception as exc: - # Audit logging is best-effort: a failed append shouldn't roll back - # an otherwise successful subscribe (the repo is cloned, the config - # has the entry). Warn loudly so the user can fix the audit log - # path without losing the subscription. Originally rolled back on - # main's PR #245 (#244 e2e fix). - print(f"Warning: failed to append audit entry for subscribe: {exc}", file=sys.stderr) + repos.pop() + set_repos(cfg, repos) + try: + save_config(cfg, project_root) + except Exception as save_exc: + print( + f"Warning: rollback save_config failed under {project_root!r}: {save_exc}. 
" + f"The clone was removed but evolve.config.yaml may still list '{args.name}' - " + f"please inspect the file and remove the entry manually if present.", + file=sys.stderr, + ) + if dest.exists(): + shutil.rmtree(dest, ignore_errors=True) + print(f"Error: failed to record subscription in audit log: {exc}", file=sys.stderr) + sys.exit(1) print(f"Subscribed to '{args.name}' (scope={args.scope}) from {args.remote}") diff --git a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/sync/scripts/sync.py b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/sync/scripts/sync.py index 33c34716..4fd0f624 100755 --- a/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/sync/scripts/sync.py +++ b/platform-integrations/codex/plugins/evolve-lite/skills/evolve-lite/sync/scripts/sync.py @@ -162,7 +162,7 @@ def main(): raw_name = rejection["raw_name"] reason = rejection["reason"] label = repr(raw_name) if raw_name else "" - summaries.append(f"{label} (skipped - {reason})") + print(f"{label} (skipped - {reason})", file=sys.stderr) for repo in repos: name = repo["name"] @@ -174,7 +174,7 @@ def main(): repo_path = (evolve_dir / "entities" / "subscribed" / name).resolve() if repo_path == subscribed_base or not repo_path.is_relative_to(subscribed_base): - summaries.append(f"{name!r} (skipped - invalid subscription name)") + print(f"{name!r} (skipped - invalid subscription name)", file=sys.stderr) continue if not repo_path.is_dir(): diff --git a/plugin-source/lib/audit.py b/plugin-source/lib/audit.py index fd5c535a..fa43846b 100644 --- a/plugin-source/lib/audit.py +++ b/plugin-source/lib/audit.py @@ -5,14 +5,17 @@ import pathlib -def append(project_root=".", **fields): +def append(project_root=".", evolve_dir=None, **fields): """Append a JSON audit entry to .evolve/audit.log. Args: project_root: Root directory that contains .evolve/. + evolve_dir: Explicit evolve data directory. 
When set, writes directly + to ``/audit.log`` instead of deriving it from + ``project_root``. **fields: Arbitrary key-value fields to include in the log entry. """ - path = pathlib.Path(project_root) / ".evolve" / "audit.log" + path = pathlib.Path(evolve_dir) / "audit.log" if evolve_dir is not None else pathlib.Path(project_root) / ".evolve" / "audit.log" path.parent.mkdir(parents=True, exist_ok=True) entry = {**fields, "ts": datetime.datetime.now(datetime.UTC).isoformat().replace("+00:00", "Z")} with path.open("a", encoding="utf-8") as f: diff --git a/plugin-source/plugin.toml b/plugin-source/plugin.toml index aa920e48..35881f22 100644 --- a/plugin-source/plugin.toml +++ b/plugin-source/plugin.toml @@ -51,6 +51,7 @@ brand_color = "#2563EB" default_prompt = [ "Recall Evolve entities for this task.", "Save new Evolve learnings from this session.", + "Analyze saved trajectories for Evolve guideline provenance.", "Show me the entities stored for this repo.", "Publish one of my Evolve guidelines.", "Subscribe to a teammate's Evolve guidelines repo.", diff --git a/plugin-source/skills/evolve-lite/learn/SKILL.md.j2 b/plugin-source/skills/evolve-lite/learn/SKILL.md.j2 index 8cfaa975..ee7d6c51 100644 --- a/plugin-source/skills/evolve-lite/learn/SKILL.md.j2 +++ b/plugin-source/skills/evolve-lite/learn/SKILL.md.j2 @@ -50,10 +50,17 @@ If the saved trajectory file does not exist (e.g., the save-trajectory hook did The transcript is JSONL: each line is a separate JSON object. Filter for `"type": "assistant"` and `"type": "human"` lines, then reconstruct the flow from `message.content`. Look for tool calls, errors in tool results, and user corrections. +{% else -%} +### Step 0: Save and Load the Conversation + +First, use the {{ skill_ref("save-trajectory") }} skill to save the current conversation to `.evolve/trajectories/`. Capture the exact path from its output as `saved_trajectory_path`. You will attach this exact path to each entity's `trajectory` field in Step 6. 
+ +After saving, read `saved_trajectory_path` with the Read tool and analyze that saved trajectory rather than relying only on live context. If the trajectory cannot be saved or read, output zero entities and exit. Do not invent a trajectory path. + {% endif -%} ### Step 1: Analyze the Conversation -Identify from your current conversation{% if forked_context | default(false) %} (loaded from the transcript){% endif %}: +Identify from the saved trajectory loaded in Step 0: - **Task/Request**: What was the user asking for? - **Steps Taken**: What reasoning, actions, and observations occurred? @@ -94,6 +101,11 @@ Prefer one of these artifact forms: - a small script, saved to a stable path in the workspace or plugin, such as `scripts/`, `tools/`, or another obvious helper location. - a documented local workflow if code is not appropriate +When turning an ad hoc command or script into a reusable artifact, remove +incidental one-off inputs such as literal file names, IDs, answer values, or +temporary paths. Keep the reusable procedure that was actually exercised in the +session, and do not add capabilities that were not validated by the work. + If you create an artifact, record: - its path - what it does diff --git a/plugin-source/skills/evolve-lite/provenance/SKILL.md.j2 b/plugin-source/skills/evolve-lite/provenance/SKILL.md.j2 new file mode 100644 index 00000000..ee704616 --- /dev/null +++ b/plugin-source/skills/evolve-lite/provenance/SKILL.md.j2 @@ -0,0 +1,65 @@ +{%- from "_macros.j2" import invoke, skill_ref with context -%} +--- +name: {% if platform == "bob" %}evolve-lite:{% endif %}provenance +description: Analyze saved trajectories and recall audit events offline to record whether recalled guidelines influenced completed sessions. +--- + +# Provenance Analyzer + +## Overview + +This skill runs after one or more sessions have completed. 
It reads saved trajectories from `.evolve/trajectories/`, matches them to `recall` events in `.evolve/audit.log`, and records post-hoc `influence` events for recalled guidelines. + +Use this skill when you want to compute usage provenance without coupling the work to the live learn step. + +## Workflow + +### Step 1: Load Recall Events + +Read `.evolve/audit.log` as JSONL. Find entries where `event == "recall"` and `entities` is a non-empty list. + +Skip any recall event that already has `influence` entries for the same `session_id` and entity ids. Do not write duplicate influence records. + +### Step 2: Locate Saved Trajectories + +List `.evolve/trajectories/` and match each recall event to a trajectory by `session_id`. + +Matching strategy (in order): +1. `claude-transcript_.jsonl` - the stop-hook transcript dump; the session id is in the filename. +2. `trajectory__.json` - written by the {{ skill_ref("save-trajectory") }} skill when a session id is available. Match on the `` slice of the filename. +3. `trajectory_.json` - open the file and match its top-level `session_id` field against the recall event. Only fall back to this step when the filename alone does not identify the session. + +If none of the above yields a confident match for a recall event, skip it. Do not guess. + +### Step 3: Read Recalled Entities + +For each recalled entity id, open `.evolve/entities/.md`. The id is a path relative to `.evolve/entities/` without the `.md` suffix, such as `guideline/foo` or `subscribed/alice/guideline/foo`. + +Read the entity content and trigger. Skip ids whose files are missing. + +### Step 4: Assess Influence + +Compare each recalled entity with the matched trajectory. Pick exactly one verdict: + +- `followed` - the agent's actual actions are consistent with the guideline. +- `contradicted` - the guideline applied, but the agent did the opposite or repeated the avoidable dead end. 
+- `not_applicable` - the guideline was recalled but did not apply to this session. + +Keep `evidence` to one short sentence citing a concrete action, tool call, or absence in the trajectory. + +### Step 5: Write Influence Events + +Pipe one JSON payload per assessed session to the helper: + +```bash +echo '{ + "session_id": "", + "assessments": [ + {"entity": "guideline/", "verdict": "followed", "evidence": "Agent used the saved parser before trying shell fallbacks."} + ] +}' | {{ invoke("provenance", "log_influence.py") }} +``` + +The `entity` value must match exactly what appeared in the recall event, including any `subscribed//` prefix. + +It is valid to emit an empty `assessments` list when recall events exist but no recalled guideline can be assessed. diff --git a/plugin-source/skills/evolve-lite/provenance/scripts/log_influence.py b/plugin-source/skills/evolve-lite/provenance/scripts/log_influence.py new file mode 100644 index 00000000..79bdd28a --- /dev/null +++ b/plugin-source/skills/evolve-lite/provenance/scripts/log_influence.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python3 +"""Append post-hoc influence assessments to .evolve/audit.log. + +Reads JSON from stdin of the form: + { + "session_id": "", + "assessments": [ + {"entity": "", "verdict": "followed|contradicted|not_applicable", + "evidence": ""}, + ... + ] + } +""" + +import json +import sys +from pathlib import Path + +# Walk up from the script location to find the installed plugin lib directory. +# claude/claw-code/codex/bob all ship a sibling lib/ next to skills/; bob's +# installer copies it to .bob/evolve-lib/, hence both names are checked. 
_script = Path(__file__).resolve()


def _locate_lib(script_path):
    """Return the first ancestor lib/evolve-lib dir holding entity_io.py, else None."""
    for ancestor in script_path.parents:
        for candidate in (ancestor / "lib", ancestor / "evolve-lib"):
            if (candidate / "entity_io.py").is_file():
                return candidate
    return None


_lib = _locate_lib(_script)
if _lib is None:
    raise ImportError(f"Cannot find plugin lib directory above {_script}")
sys.path.insert(0, str(_lib))
from entity_io import get_evolve_dir, log as _log  # noqa: E402
import audit  # noqa: E402


# The only verdicts the provenance skill is allowed to record.
_ALLOWED_VERDICTS = {"followed", "contradicted", "not_applicable"}


def log(message):
    """Route a skill-local log line through the shared entity_io logger."""
    _log("influence", message)


def existing_influence_keys(evolve_dir):
    """Collect (session_id, entity) pairs already recorded as influence events.

    Blank or malformed JSONL lines in audit.log are skipped so a single bad
    record cannot block deduplication for the rest of the log.
    """
    audit_log = Path(evolve_dir) / "audit.log"
    if not audit_log.is_file():
        return set()

    seen = set()
    for raw_line in audit_log.read_text(encoding="utf-8").splitlines():
        if not raw_line.strip():
            continue
        try:
            record = json.loads(raw_line)
        except json.JSONDecodeError:
            continue
        if record.get("event") != "influence":
            continue
        sid = record.get("session_id")
        ent = record.get("entity")
        if sid and ent:
            seen.add((sid, ent))
    return seen


def main():
    """Validate the stdin payload and append one influence event per assessment.

    Exits 1 on malformed input; individual bad assessments are skipped (and
    logged) rather than aborting the whole batch.
    """
    try:
        payload = json.load(sys.stdin)
    except json.JSONDecodeError as exc:
        log(f"Invalid JSON input: {exc}")
        print(f"Error: invalid JSON input - {exc}", file=sys.stderr)
        sys.exit(1)

    if not isinstance(payload, dict):
        log(f"Bad payload type: {type(payload).__name__}")
        print("Error: payload must be a JSON object.", file=sys.stderr)
        sys.exit(1)

    session_id = payload.get("session_id")
    assessments = payload.get("assessments", [])
    well_formed = isinstance(session_id, str) and session_id and isinstance(assessments, list)
    if not well_formed:
        log(f"Bad payload shape: session_id={session_id!r} assessments_type={type(assessments).__name__}")
        print("Error: payload must include a string `session_id` and a list `assessments`.", file=sys.stderr)
        sys.exit(1)

    evolve_dir = get_evolve_dir().resolve()
    recorded = existing_influence_keys(evolve_dir)

    written = 0
    for item in assessments:
        if not isinstance(item, dict):
            log(f"Skipping non-dict assessment item: {item!r}")
            continue
        entity = item.get("entity")
        verdict = item.get("verdict")
        evidence = item.get("evidence", "")
        if not isinstance(entity, str) or not entity:
            log(f"Skipping assessment with non-string entity: {item!r}")
            continue
        if verdict not in _ALLOWED_VERDICTS:
            log(f"Skipping invalid assessment verdict: {item}")
            continue
        if not isinstance(evidence, str):
            # Coerce odd-but-salvageable evidence values instead of dropping.
            evidence = str(evidence)
        key = (session_id, entity)
        if key in recorded:
            log(f"Skipping duplicate influence assessment: session_id={session_id} entity={entity}")
            continue
        audit.append(
            evolve_dir=str(evolve_dir),
            event="influence",
            session_id=session_id,
            entity=entity,
            verdict=verdict,
            evidence=evidence,
        )
        # Track the freshly written key so the same stdin batch also dedups.
        recorded.add(key)
        written += 1

    log(f"Wrote {written} influence record(s) for session {session_id}")
    print(f"Recorded {written} influence assessment(s).")


if __name__ == "__main__":
    main()
parts[0] == "subscribed" and len(parts) > 1: entity["_source"] = parts[1] @@ -139,6 +141,33 @@ def main(): print(output) log(f"Output {len(output)} chars to stdout") + # Audit which entity ids were served to this session. Logging is + # intentionally best-effort so recall never fails because provenance + # recording could not append to audit.log. + try: + if isinstance(input_data, dict): + transcript_path = input_data.get("transcript_path", "") + else: + transcript_path = "" + session_id = None + if transcript_path: + stem = Path(transcript_path).stem + if stem.startswith("claude-transcript_"): + session_id = stem.removeprefix("claude-transcript_") + if not session_id and isinstance(input_data, dict) and isinstance(input_data.get("session_id"), str): + session_id = input_data["session_id"] + entity_ids = sorted({entity["_id"] for entity in entities if entity.get("_id")}) + if session_id and entity_ids: + audit.append( + evolve_dir=str(get_evolve_dir().resolve()), + event="recall", + session_id=session_id, + entities=entity_ids, + ) + log(f"Audit: recall session_id={session_id} entities={len(entity_ids)}") + except Exception as exc: + log(f"Audit append failed (non-fatal): {exc}") + if __name__ == "__main__": main() diff --git a/plugin-source/skills/evolve-lite/save-trajectory/SKILL.md.j2 b/plugin-source/skills/evolve-lite/save-trajectory/SKILL.md.j2 index 5ce79420..d9380c17 100644 --- a/plugin-source/skills/evolve-lite/save-trajectory/SKILL.md.j2 +++ b/plugin-source/skills/evolve-lite/save-trajectory/SKILL.md.j2 @@ -102,12 +102,14 @@ Wrap the messages array in a trajectory envelope: { "model": "", "timestamp": "2025-01-15T10:30:00Z", + "session_id": "", "messages": [...] } ``` - **model**: Use the exact model ID from the current session's environment context (e.g., the value after "You are powered by the model named …"). Do not hardcode a default — always read it from the session. 
- **timestamp**: Current ISO 8601 timestamp +- **session_id**: The current session identifier. Read it from whatever the harness exposes — the `session_id` passed into the skill, the session id surfaced in the session context, or a runtime-provided environment variable. Include it verbatim so offline provenance can match this trajectory to `recall` audit events for the same session. Omit the field only if no session id is truly available in this environment. ### Step 5: Save via Helper Script @@ -118,9 +120,11 @@ Write the trajectory JSON to a temporary file using the **Write** tool, then pas ```bash {% if platform == "claude" -%} -tmp=.evolve/tmp/trajectory_input.json; mkdir -p .evolve/tmp; trap 'rm -f "$tmp"' EXIT; python3 "${CLAUDE_PLUGIN_ROOT}/skills/save-trajectory/scripts/save_trajectory.py" "$tmp" +tmp=.evolve/tmp/trajectory_input.json; mkdir -p .evolve/tmp; trap 'rm -f "$tmp"' EXIT; python3 "${CLAUDE_PLUGIN_ROOT}/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py" "$tmp" {%- elif platform == "claw-code" -%} tmp=.evolve/tmp/trajectory_input.json; mkdir -p .evolve/tmp; trap 'rm -f "$tmp"' EXIT; real_home="$(python3 -c "import os,pwd; print(pwd.getpwuid(os.getuid()).pw_dir)")"; config_home="${CLAW_CONFIG_HOME:-$real_home/.claw}"; script=".claw/skills/evolve-lite:save-trajectory/scripts/save_trajectory.py"; [ -f "$script" ] || script="$config_home/skills/evolve-lite:save-trajectory/scripts/save_trajectory.py"; python3 "$script" "$tmp" +{%- elif platform == "codex" -%} +tmp=.evolve/tmp/trajectory_input.json; mkdir -p .evolve/tmp; trap 'rm -f "$tmp"' EXIT; python3 "$(git rev-parse --show-toplevel 2>/dev/null || pwd)/plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py" "$tmp" {%- elif platform == "bob" -%} tmp=.evolve/tmp/trajectory_input.json; mkdir -p .evolve/tmp; trap 'rm -f "$tmp"' EXIT; python3 .bob/skills/evolve-lite-save-trajectory/scripts/save_trajectory.py "$tmp" {%- endif %} diff --git 
a/plugin-source/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py b/plugin-source/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py index f34571eb..a3ee4ac2 100755 --- a/plugin-source/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py +++ b/plugin-source/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py @@ -9,6 +9,7 @@ import getpass import json import os +import re import sys import tempfile from pathlib import Path @@ -65,15 +66,29 @@ def get_trajectories_dir(): return base.resolve() -def open_trajectory_file(trajectories_dir): +_SAFE_SESSION_ID = re.compile(r"[^A-Za-z0-9._-]") + + +def _sanitize_session_id(session_id): + """Return a filesystem-safe slice of ``session_id`` (empty if unusable).""" + if not isinstance(session_id, str): + return "" + cleaned = _SAFE_SESSION_ID.sub("-", session_id.strip()) + return cleaned[:64] + + +def open_trajectory_file(trajectories_dir, session_id=None): """Atomically claim a timestamped trajectory file. Returns a ``(Path, fd)`` tuple. Uses ``O_CREAT | O_EXCL`` so two saves racing within the same second pick distinct filenames instead of one - overwriting the other. + overwriting the other. When ``session_id`` is provided, it is embedded + in the filename so offline provenance can match this trajectory to + ``recall`` audit events for the same session without content inspection. """ now = datetime.datetime.now().strftime("%Y-%m-%dT%H-%M-%S") - base_name = f"trajectory_{now}" + sid = _sanitize_session_id(session_id) + base_name = f"trajectory_{now}_{sid}" if sid else f"trajectory_{now}" for suffix in range(0, 1000): name = f"{base_name}.json" if suffix == 0 else f"{base_name}_{suffix}.json" @@ -121,9 +136,11 @@ def main(): log(f"Trajectory has {len(messages)} messages") - # Atomically claim a unique output path (handles same-second races) + # Atomically claim a unique output path (handles same-second races). 
+ # Embed session_id in the filename when present so offline provenance + # can match recall events to trajectories deterministically. trajectories_dir = get_trajectories_dir() - output_path, fd = open_trajectory_file(trajectories_dir) + output_path, fd = open_trajectory_file(trajectories_dir, trajectory.get("session_id")) # Write formatted JSON via the already-opened owner-only fd try: diff --git a/plugin-source/skills/evolve-lite/subscribe/scripts/subscribe.py b/plugin-source/skills/evolve-lite/subscribe/scripts/subscribe.py index ef6b0cd0..3f50b3a6 100755 --- a/plugin-source/skills/evolve-lite/subscribe/scripts/subscribe.py +++ b/plugin-source/skills/evolve-lite/subscribe/scripts/subscribe.py @@ -132,12 +132,21 @@ def main(): remote=args.remote, ) except Exception as exc: - # Audit logging is best-effort: a failed append shouldn't roll back - # an otherwise successful subscribe (the repo is cloned, the config - # has the entry). Warn loudly so the user can fix the audit log - # path without losing the subscription. Originally rolled back on - # main's PR #245 (#244 e2e fix). - print(f"Warning: failed to append audit entry for subscribe: {exc}", file=sys.stderr) + repos.pop() + set_repos(cfg, repos) + try: + save_config(cfg, project_root) + except Exception as save_exc: + print( + f"Warning: rollback save_config failed under {project_root!r}: {save_exc}. 
" + f"The clone was removed but evolve.config.yaml may still list '{args.name}' - " + f"please inspect the file and remove the entry manually if present.", + file=sys.stderr, + ) + if dest.exists(): + shutil.rmtree(dest, ignore_errors=True) + print(f"Error: failed to record subscription in audit log: {exc}", file=sys.stderr) + sys.exit(1) print(f"Subscribed to '{args.name}' (scope={args.scope}) from {args.remote}") diff --git a/plugin-source/skills/evolve-lite/sync/scripts/sync.py b/plugin-source/skills/evolve-lite/sync/scripts/sync.py index 33c34716..4fd0f624 100755 --- a/plugin-source/skills/evolve-lite/sync/scripts/sync.py +++ b/plugin-source/skills/evolve-lite/sync/scripts/sync.py @@ -162,7 +162,7 @@ def main(): raw_name = rejection["raw_name"] reason = rejection["reason"] label = repr(raw_name) if raw_name else "" - summaries.append(f"{label} (skipped - {reason})") + print(f"{label} (skipped - {reason})", file=sys.stderr) for repo in repos: name = repo["name"] @@ -174,7 +174,7 @@ def main(): repo_path = (evolve_dir / "entities" / "subscribed" / name).resolve() if repo_path == subscribed_base or not repo_path.is_relative_to(subscribed_base): - summaries.append(f"{name!r} (skipped - invalid subscription name)") + print(f"{name!r} (skipped - invalid subscription name)", file=sys.stderr) continue if not repo_path.is_dir(): diff --git a/sandbox/README.md b/sandbox/README.md index 8ca8151d..877cebc9 100644 --- a/sandbox/README.md +++ b/sandbox/README.md @@ -1,6 +1,7 @@ -# Claude Code Sandbox +# Claude Code / Codex Sandbox -A Docker image for running Claude Code in a sandboxed Debian environment with Python and common Linux tools. +Docker images for running Claude Code or Codex in a sandboxed Debian +environment with Python and common Linux tools. 
## Build @@ -32,7 +33,7 @@ docker run --rm --env-file sandbox/myenv claude-sandbox claude -p "who are you" ## Automated E2E Test -`tests/e2e/test_sandbox_learn_recall.py` exercises the full evolve-lite +`tests/e2e/test_claude_sandbox_learn_recall.py` exercises the full evolve-lite learn + recall loop end-to-end inside this sandbox. It runs two Claude sessions: @@ -78,11 +79,11 @@ CLAUDE_CODE_SKIP_BEDROCK_AUTH=1 ```bash # If creds live in an env file: dotenv -e path/to/your.env -- \ - uv run pytest tests/e2e/test_sandbox_learn_recall.py \ + uv run pytest tests/e2e/test_claude_sandbox_learn_recall.py \ --run-e2e -m e2e -v --log-cli-level=INFO # Or, with vars already exported: -uv run pytest tests/e2e/test_sandbox_learn_recall.py \ +uv run pytest tests/e2e/test_claude_sandbox_learn_recall.py \ --run-e2e -m e2e -v --log-cli-level=INFO ``` @@ -90,3 +91,22 @@ The `--log-cli-level=INFO` flag streams per-session progress lines live (~4 minutes total). The test skips if Docker, the sandbox image, or credentials are missing. +## Codex Automated E2E Test + +`tests/e2e/test_codex_sandbox_learn_recall.py` runs the same learn + recall +flow against the Dockerized Codex sandbox. Build the image, then load the +Codex sandbox env file with `dotenv`: + +```bash +just sandbox-build codex + +dotenv -e ~/data/creds/codex-sandbox.env -- \ + uv run pytest tests/e2e/test_codex_sandbox_learn_recall.py \ + --run-e2e -m e2e -v --log-cli-level=INFO +``` + +The env file should export the provider credential and Codex provider settings +as environment variables, for example `CODEX_MODEL_PROVIDER`, +`CODEX_MODEL_PROVIDER_BASE_URL`, `CODEX_MODEL_PROVIDER_ENV_KEY`, and +`CODEX_MODEL_PROVIDER_WIRE_API`. The test forwards only environment variable +values into Docker; it does not mount host credential or Codex config files. 
diff --git a/tests/e2e/test_sandbox_learn_recall.py b/tests/e2e/test_claude_sandbox_learn_recall.py similarity index 67% rename from tests/e2e/test_sandbox_learn_recall.py rename to tests/e2e/test_claude_sandbox_learn_recall.py index d39293a0..b7368605 100644 --- a/tests/e2e/test_sandbox_learn_recall.py +++ b/tests/e2e/test_claude_sandbox_learn_recall.py @@ -6,6 +6,8 @@ transcript and extracts a guideline. 2. Ask about focal length — UserPromptSubmit recall hook injects the guideline from session 1, so Claude should skip the dead ends. + 3. Run the offline provenance skill to record whether the recalled + guideline influenced session 2. Assertions: - Session 1 produces a guideline file under .evolve/entities/. @@ -19,6 +21,7 @@ import logging import os import re +import shlex import shutil import subprocess import time @@ -76,6 +79,7 @@ def sandbox_workspace(tmp_path): def _run_sandbox_prompt(workspace: Path, prompt: str) -> subprocess.CompletedProcess: plugins = REPO_ROOT / "platform-integrations" / "claude" / "plugins" + command = "claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions -p " + shlex.quote(prompt) cmd = ["docker", "run", "--rm"] for var in FORWARDED_ENV_VARS: if os.environ.get(var): @@ -90,7 +94,7 @@ def _run_sandbox_prompt(workspace: Path, prompt: str) -> subprocess.CompletedPro SANDBOX_IMAGE, "bash", "-c", - f'claude --plugin-dir /plugins/evolve-lite/ --dangerously-skip-permissions -p "{prompt}"', + command, ] return subprocess.run(cmd, capture_output=True, text=True, timeout=SESSION_TIMEOUT_SECONDS) @@ -116,8 +120,8 @@ def _bash_commands(transcript_path: Path) -> list[str]: @pytest.mark.e2e -def test_learn_then_recall_flow(sandbox_ready, sandbox_workspace): - """Session 1 extracts a guideline; session 2 benefits from recall.""" +def test_claude_learn_then_recall_flow(sandbox_ready, sandbox_workspace): + """Session 1 learns, session 2 recalls, session 3 records influence.""" del sandbox_ready # only used for its skip side effect 
# --- Session 1: location query — expected dead ends then recovery --- @@ -164,3 +168,51 @@ def test_learn_then_recall_flow(sandbox_ready, sandbox_workspace): # pip-installed). Other libraries (PIL, piexif, exifread) may appear in a # valid guideline as "install via pip and use", so we don't ban them. assert not re.search(r"\bexiftool\b", joined), "session 2 invoked exiftool despite recall guideline:\n" + "\n".join(commands) + + # --- Usage provenance: audit.log should record recall --- + audit_log = sandbox_workspace / ".evolve" / "audit.log" + assert audit_log.is_file(), f"{audit_log} was not created — recall did not append audit events" + + events = [] + for line in audit_log.read_text().splitlines(): + line = line.strip() + if line: + events.append(json.loads(line)) + + session2_id = session2_transcript.stem.removeprefix("claude-transcript_") + session1_ids = {str(path.relative_to(entities_dir).with_suffix("")) for path in entity_files} + + recall_events = [event for event in events if event.get("event") == "recall" and event.get("session_id") == session2_id] + assert recall_events, f"no recall audit event for session 2 ({session2_id}). all events: {events}" + recalled_ids = {entity_id for event in recall_events for entity_id in event.get("entities", [])} + assert recalled_ids & session1_ids, f"recall event entities {recalled_ids} did not include any id from session 1 ({session1_ids})" + log.info(f"session 2: audit recorded recall of {recalled_ids}") + + # --- Offline provenance: audit.log should record usefulness verdicts --- + log.info("session 3: running offline provenance analysis...") + t2 = time.time() + result3 = _run_sandbox_prompt( + sandbox_workspace, + ( + "Run /evolve-lite:provenance now. Analyze the saved trajectories and " + "the recall events in .evolve/audit.log. Record influence verdicts " + "for any recalled guideline that can be matched to the focal-length " + "photo session. Do not modify source files." 
+ ), + ) + log.info(f"session 3: exited {result3.returncode} after {time.time() - t2:.0f}s") + assert result3.returncode == 0, f"session 3 exited {result3.returncode}\nstderr:\n{result3.stderr[-2000:]}" + + events = [] + for line in audit_log.read_text().splitlines(): + line = line.strip() + if line: + events.append(json.loads(line)) + + influence_events = [event for event in events if event.get("event") == "influence"] + assert influence_events, f"no influence audit event recorded. all events: {events}" + influenced_ids = {event.get("entity") for event in influence_events} + assert influenced_ids & recalled_ids, f"influence events {influence_events} did not assess any recalled ids {recalled_ids}" + for event in influence_events: + assert event.get("verdict") in {"followed", "contradicted", "not_applicable"} + assert event.get("evidence"), f"influence event missing evidence: {event}" diff --git a/tests/e2e/test_codex_sandbox_learn_recall.py b/tests/e2e/test_codex_sandbox_learn_recall.py new file mode 100644 index 00000000..7bf26615 --- /dev/null +++ b/tests/e2e/test_codex_sandbox_learn_recall.py @@ -0,0 +1,279 @@ +"""End-to-end test of the evolve-lite learn + recall flow in the Codex sandbox. + +Runs two sequential Codex sessions against the Dockerized Codex sandbox: + 1. Session 1 performs an EXIF task, then explicitly runs save-trajectory + and learn so a trajectory and guideline are saved. + 2. Session 2 asks a related EXIF question. The Codex UserPromptSubmit hook + should inject recalled guidance before the prompt is handled. + 3. Session 3 runs the offline provenance skill so the recall audit gets + follow-up influence verdicts. + +Requires Docker, the `evolve-codex-sandbox` image built, and Codex credentials +exported in the environment. 
+""" + +import json +import logging +import os +import shutil +import subprocess +import time +from pathlib import Path +from typing import Iterable + +import pytest + + +log = logging.getLogger(__name__) + + +SANDBOX_IMAGE = "evolve-codex-sandbox" +REPO_ROOT = Path(__file__).resolve().parents[2] +SESSION_TIMEOUT_SECONDS = 600 +FORWARDED_ENV_VARS = ( + "OPENAI_API_KEY", + "OPENAI_BASE_URL", + "OPENAI_ORG_ID", + "OPENAI_PROJECT_ID", + "CODEX_MODEL", +) +CODEX_PROVIDER_ENV_KEY_VAR = "CODEX_MODEL_PROVIDER_ENV_KEY" + + +@pytest.fixture(scope="session") +def codex_sandbox_ready(): + """Skip if Docker, the Codex sandbox image, or credentials aren't available.""" + if shutil.which("docker") is None: + pytest.skip("docker not installed") + + if subprocess.run(["docker", "info"], capture_output=True).returncode != 0: + pytest.skip("docker daemon not running") + + image_check = subprocess.run( + ["docker", "image", "inspect", SANDBOX_IMAGE], + capture_output=True, + ) + if image_check.returncode != 0: + pytest.skip(f"sandbox image {SANDBOX_IMAGE!r} not built - run `just sandbox-build codex`") + + credential_env_var = os.environ.get(CODEX_PROVIDER_ENV_KEY_VAR, "OPENAI_API_KEY") + if not os.environ.get(credential_env_var): + pytest.skip(f"{credential_env_var} not set in environment") + + return True + + +@pytest.fixture +def codex_workspace(tmp_path): + """Copy demo/workspace and install the Codex plugin into it.""" + src = REPO_ROOT / "demo" / "workspace" + workspace = tmp_path / "workspace" + shutil.copytree(src, workspace, ignore=shutil.ignore_patterns(".evolve", "backup", "sandbox-backup")) + + install_script = REPO_ROOT / "platform-integrations" / "install.sh" + result = subprocess.run( + ["bash", str(install_script), "install", "--platform", "codex", "--dir", str(workspace)], + capture_output=True, + text=True, + check=False, + ) + assert result.returncode == 0, f"codex install failed\nstdout:\n{result.stdout}\nstderr:\n{result.stderr}" + + 
_register_codex_plugin_for_container(workspace) + return workspace + + +def _toml_str(value: str) -> str: + return json.dumps(value) + + +def _codex_config_lines() -> list[str]: + lines: list[str] = [] + if model := os.environ.get("CODEX_MODEL"): + lines.append(f"model = {_toml_str(model)}") + + provider = os.environ.get("CODEX_MODEL_PROVIDER") + if provider: + lines.append(f"model_provider = {_toml_str(provider)}") + + base_url = os.environ.get("CODEX_MODEL_PROVIDER_BASE_URL") or os.environ.get("OPENAI_BASE_URL") + if provider and base_url: + provider_name = os.environ.get("CODEX_MODEL_PROVIDER_NAME", provider) + provider_env_key = os.environ.get(CODEX_PROVIDER_ENV_KEY_VAR, "OPENAI_API_KEY") + lines.extend( + [ + "", + f"[model_providers.{_toml_str(provider)}]", + f"name = {_toml_str(provider_name)}", + f"base_url = {_toml_str(base_url)}", + f"env_key = {_toml_str(provider_env_key)}", + ] + ) + if wire_api := os.environ.get("CODEX_MODEL_PROVIDER_WIRE_API"): + lines.append(f"wire_api = {_toml_str(wire_api)}") + + if lines: + lines.append("") + return lines + + +def _register_codex_plugin_for_container(workspace: Path) -> None: + """Pre-populate /codex-home with a local marketplace plugin cache. + + This mirrors the headless registration used by tests/smoke_skills.py, but + writes paths as the container sees them: workspace is mounted at /workspace + and CODEX_HOME is mounted at /codex-home. 
+ """ + codex_home = workspace / ".codex-home" + codex_home.mkdir(parents=True, exist_ok=True) + + plugin_src = workspace / "plugins" / "evolve-lite" + plugin_json = plugin_src / ".codex-plugin" / "plugin.json" + version = json.loads(plugin_json.read_text(encoding="utf-8")).get("version", "0.0.0") + cache_dir = codex_home / "plugins" / "cache" / "evolve-local" / "evolve-lite" / version + cache_dir.mkdir(parents=True, exist_ok=True) + + shutil.copytree(plugin_src / ".codex-plugin", cache_dir / ".codex-plugin", dirs_exist_ok=True) + shutil.copytree(plugin_src / "lib", cache_dir / "lib", dirs_exist_ok=True) + shutil.copytree(plugin_src / "skills" / "evolve-lite", cache_dir / "skills", dirs_exist_ok=True) + + config = "\n".join(_codex_config_lines()) + config += """[marketplaces.evolve-local] +source = "/workspace" + +[plugins."evolve-lite@evolve-local"] +enabled = true +""" + (codex_home / "config.toml").write_text(config, encoding="utf-8") + + +def _forwarded_env_vars() -> Iterable[str]: + yield from FORWARDED_ENV_VARS + provider_env_key = os.environ.get(CODEX_PROVIDER_ENV_KEY_VAR) + if provider_env_key: + yield provider_env_key + + +def _run_codex_prompt(workspace: Path, prompt: str, *, enable_hooks: bool = True) -> subprocess.CompletedProcess: + codex_home = workspace / ".codex-home" + cmd = ["docker", "run", "--rm"] + for var in _forwarded_env_vars(): + if os.environ.get(var): + cmd += ["-e", var] + cmd += [ + "-e", + "EVOLVE_DEBUG=1", + "-e", + "TMPDIR=/workspace/.evolve/tmp", + "-v", + f"{workspace}:/workspace", + "-v", + f"{codex_home}:/codex-home", + SANDBOX_IMAGE, + "codex", + "exec", + "--skip-git-repo-check", + "--ephemeral", + "--dangerously-bypass-approvals-and-sandbox", + "-c", + f"features.codex_hooks={str(enable_hooks).lower()}", + "-C", + "/workspace", + prompt, + ] + return subprocess.run(cmd, capture_output=True, text=True, timeout=SESSION_TIMEOUT_SECONDS) + + +def _audit_events(evolve_dir: Path) -> list[dict]: + audit_log = evolve_dir / "audit.log" 
+ if not audit_log.is_file(): + return [] + return [json.loads(line) for line in audit_log.read_text().splitlines() if line.strip()] + + +@pytest.mark.e2e +def test_codex_learn_then_recall_flow(codex_sandbox_ready, codex_workspace): + """Session 1 learns, session 2 recalls, session 3 records influence.""" + del codex_sandbox_ready + + evolve_dir = codex_workspace / ".evolve" + + log.info("codex session 1: running seed task with save-trajectory + learn...") + t0 = time.time() + result1 = _run_codex_prompt( + codex_workspace, + ( + "Where was the photo @sample.jpg taken? Use EXIF metadata. " + "When done, invoke the evolve-lite save-trajectory skill, then invoke the evolve-lite learn skill. " + "Do not skip either evolve-lite skill." + ), + ) + log.info(f"codex session 1: exited {result1.returncode} after {time.time() - t0:.0f}s") + assert result1.returncode == 0, ( + f"session 1 exited {result1.returncode}\nstdout:\n{result1.stdout[-2000:]}\nstderr:\n{result1.stderr[-2000:]}" + ) + + trajectories_dir = evolve_dir / "trajectories" + entities_dir = evolve_dir / "entities" + assert trajectories_dir.is_dir(), f"{trajectories_dir} was not created" + trajectories = list(trajectories_dir.glob("*.json")) + assert trajectories, f"no Codex trajectory JSON files found in {trajectories_dir}" + assert entities_dir.is_dir(), f"{entities_dir} was not created" + entity_files = list(entities_dir.rglob("*.md")) + assert entity_files, f"no guideline files found in {entities_dir}" + + log.info("codex session 2: running related task to exercise recall hook...") + t1 = time.time() + result2 = _run_codex_prompt( + codex_workspace, + ( + "What focal length was used to take the photo @sample.jpg? Use EXIF metadata. " + "When done, invoke the evolve-lite save-trajectory skill. Do not invoke the learn skill." 
+ ), + ) + log.info(f"codex session 2: exited {result2.returncode} after {time.time() - t1:.0f}s") + assert result2.returncode == 0, ( + f"session 2 exited {result2.returncode}\nstdout:\n{result2.stdout[-2000:]}\nstderr:\n{result2.stderr[-2000:]}" + ) + + session2_trajectories = {path for path in trajectories_dir.glob("*.json")} - set(trajectories) + assert session2_trajectories, f"no Codex trajectory saved for session 2 in {trajectories_dir}" + + events = _audit_events(evolve_dir) + recall_events = [event for event in events if event.get("event") == "recall"] + assert recall_events, f"no recall audit event recorded. all events: {events}" + task_recall_event = recall_events[-1] + task_session_id = task_recall_event["session_id"] + task_recalled_ids = set(task_recall_event.get("entities", [])) + learned_ids = {str(path.relative_to(entities_dir).with_suffix("")) for path in entity_files} + assert task_recalled_ids & learned_ids, f"task recall ids {task_recalled_ids} did not include learned ids {learned_ids}" + + log.info("codex session 3: running offline provenance analysis...") + t2 = time.time() + result3 = _run_codex_prompt( + codex_workspace, + ( + "Run the evolve-lite provenance skill now. Analyze the saved trajectories and " + "the recall events in .evolve/audit.log. Record influence verdicts " + f"for recalled guidelines in session {task_session_id}, the focal-length " + "photo session. Do not modify source files." + ), + enable_hooks=False, + ) + log.info(f"codex session 3: exited {result3.returncode} after {time.time() - t2:.0f}s") + assert result3.returncode == 0, ( + f"session 3 exited {result3.returncode}\nstdout:\n{result3.stdout[-2000:]}\nstderr:\n{result3.stderr[-2000:]}" + ) + + events = _audit_events(evolve_dir) + influence_events = [event for event in events if event.get("event") == "influence" and event.get("session_id") == task_session_id] + assert influence_events, f"no influence audit event recorded. 
all events: {events}" + influenced_ids = {event.get("entity") for event in influence_events} + assert influenced_ids & task_recalled_ids, f"influence events {influence_events} did not assess task recall ids {task_recalled_ids}" + allowed_verdicts = {"followed", "contradicted", "not_applicable"} + assert any(event.get("verdict") in allowed_verdicts for event in influence_events), ( + f"no recalled guideline was assessed with an allowed verdict. influence events: {influence_events}" + ) + for event in influence_events: + assert event.get("verdict") in allowed_verdicts + assert event.get("evidence"), f"influence event missing evidence: {event}" diff --git a/tests/platform_integrations/test_codex.py b/tests/platform_integrations/test_codex.py index 1bbc6d8d..03a5dd59 100644 --- a/tests/platform_integrations/test_codex.py +++ b/tests/platform_integrations/test_codex.py @@ -62,12 +62,16 @@ def test_install_creates_expected_files(self, temp_project_dir, install_runner, file_assertions.assert_dir_exists(plugin_dir / "skills" / "evolve-lite" / "learn") file_assertions.assert_dir_exists(plugin_dir / "skills" / "evolve-lite" / "recall") file_assertions.assert_dir_exists(plugin_dir / "skills" / "evolve-lite" / "publish") + file_assertions.assert_dir_exists(plugin_dir / "skills" / "evolve-lite" / "provenance") + file_assertions.assert_dir_exists(plugin_dir / "skills" / "evolve-lite" / "save-trajectory") file_assertions.assert_dir_exists(plugin_dir / "skills" / "evolve-lite" / "subscribe") file_assertions.assert_dir_exists(plugin_dir / "skills" / "evolve-lite" / "unsubscribe") file_assertions.assert_dir_exists(plugin_dir / "skills" / "evolve-lite" / "sync") file_assertions.assert_file_exists(plugin_dir / "skills" / "evolve-lite" / "learn" / "scripts" / "save_entities.py") file_assertions.assert_file_exists(plugin_dir / "skills" / "evolve-lite" / "recall" / "scripts" / "retrieve_entities.py") file_assertions.assert_file_exists(plugin_dir / "skills" / "evolve-lite" / "publish" / 
"scripts" / "publish.py") + file_assertions.assert_file_exists(plugin_dir / "skills" / "evolve-lite" / "provenance" / "scripts" / "log_influence.py") + file_assertions.assert_file_exists(plugin_dir / "skills" / "evolve-lite" / "save-trajectory" / "scripts" / "save_trajectory.py") file_assertions.assert_file_exists(plugin_dir / "skills" / "evolve-lite" / "subscribe" / "scripts" / "subscribe.py") file_assertions.assert_file_exists(plugin_dir / "skills" / "evolve-lite" / "unsubscribe" / "scripts" / "unsubscribe.py") file_assertions.assert_file_exists(plugin_dir / "skills" / "evolve-lite" / "sync" / "scripts" / "sync.py") diff --git a/tests/platform_integrations/test_codex_sharing.py b/tests/platform_integrations/test_codex_sharing.py index c67b10ca..dcb35a20 100644 --- a/tests/platform_integrations/test_codex_sharing.py +++ b/tests/platform_integrations/test_codex_sharing.py @@ -367,7 +367,7 @@ def test_subscribe_rolls_back_clone_when_config_save_fails(self, temp_project_di assert result.returncode != 0 assert not (evolve_dir / "entities" / "subscribed" / "alice").exists() - def test_subscribe_warns_when_audit_write_fails(self, temp_project_dir, local_repo): + def test_subscribe_rolls_back_when_audit_write_fails(self, temp_project_dir, local_repo): evolve_dir = temp_project_dir / ".evolve" (evolve_dir / "audit.log").mkdir(parents=True) @@ -376,13 +376,15 @@ def test_subscribe_warns_when_audit_write_fails(self, temp_project_dir, local_re project_dir=temp_project_dir, args=["--name", "alice", "--remote", str(local_repo["bare"]), "--branch", "main"], evolve_dir=evolve_dir, + expect_success=False, ) - assert result.returncode == 0 - assert "Warning: failed to append audit entry for subscribe" in result.stderr - assert (evolve_dir / "entities" / "subscribed" / "alice").is_dir() - config_text = (temp_project_dir / "evolve.config.yaml").read_text() - assert "name: alice" in config_text + assert result.returncode != 0 + assert "failed to record subscription" in result.stderr 
+ assert not (evolve_dir / "entities" / "subscribed" / "alice").exists() + config_path = temp_project_dir / "evolve.config.yaml" + config_text = config_path.read_text() if config_path.exists() else "" + assert "name: alice" not in config_text def test_subscribe_rejects_path_traversal_in_name(self, temp_project_dir, local_repo): result = run_script( diff --git a/tests/platform_integrations/test_log_influence.py b/tests/platform_integrations/test_log_influence.py new file mode 100644 index 00000000..9ba0fd66 --- /dev/null +++ b/tests/platform_integrations/test_log_influence.py @@ -0,0 +1,243 @@ +"""Tests for skills/evolve-lite/provenance/scripts/log_influence.py.""" + +import json +import os +import subprocess +import sys +from pathlib import Path + +import pytest + +pytestmark = [pytest.mark.platform_integrations, pytest.mark.e2e] + +_REPO_ROOT = Path(__file__).parent.parent.parent +LOG_INFLUENCE_SCRIPT = ( + _REPO_ROOT / "platform-integrations/claude/plugins/evolve-lite/skills/evolve-lite/provenance/scripts/log_influence.py" +) + + +def run_log_influence(project_dir, payload, *, raw_input=None, evolve_dir=None): + """Invoke log_influence.py with the given payload dict or raw input string.""" + env = {**os.environ} + if evolve_dir: + env["EVOLVE_DIR"] = str(evolve_dir) + stdin = raw_input if raw_input is not None else json.dumps(payload) + return subprocess.run( + [sys.executable, str(LOG_INFLUENCE_SCRIPT)], + input=stdin, + capture_output=True, + text=True, + cwd=str(project_dir), + env=env, + check=False, + ) + + +def read_audit(evolve_dir): + path = evolve_dir / "audit.log" + if not path.is_file(): + return [] + return [json.loads(line) for line in path.read_text(encoding="utf-8").splitlines() if line.strip()] + + +class TestLogInfluence: + def test_writes_single_assessment(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence( + temp_project_dir, + { + "session_id": "abc-123", + "assessments": [ + {"entity": 
"guideline/slug-a", "verdict": "followed", "evidence": "because"}, + ], + }, + evolve_dir=evolve_dir, + ) + assert result.returncode == 0, result.stderr + events = read_audit(evolve_dir) + assert len(events) == 1 + assert events[0] == { + "event": "influence", + "session_id": "abc-123", + "entity": "guideline/slug-a", + "verdict": "followed", + "evidence": "because", + "ts": events[0]["ts"], + } + + def test_writes_under_custom_evolve_dir(self, temp_project_dir): + evolve_dir = temp_project_dir / "custom-evolve-data" + result = run_log_influence( + temp_project_dir, + { + "session_id": "abc-123", + "assessments": [ + {"entity": "guideline/slug-a", "verdict": "followed", "evidence": "because"}, + ], + }, + evolve_dir=evolve_dir, + ) + + assert result.returncode == 0, result.stderr + events = read_audit(evolve_dir) + assert len(events) == 1 + assert events[0]["event"] == "influence" + assert not (temp_project_dir / ".evolve" / "audit.log").exists() + + def test_writes_multiple_assessments(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence( + temp_project_dir, + { + "session_id": "sess-1", + "assessments": [ + {"entity": "guideline/slug-a", "verdict": "followed", "evidence": "e1"}, + {"entity": "guideline/slug-b", "verdict": "not_applicable", "evidence": "e2"}, + {"entity": "guideline/slug-c", "verdict": "contradicted", "evidence": "e3"}, + ], + }, + evolve_dir=evolve_dir, + ) + assert result.returncode == 0, result.stderr + events = read_audit(evolve_dir) + assert len(events) == 3 + verdicts = {event["entity"]: event["verdict"] for event in events} + assert verdicts == { + "guideline/slug-a": "followed", + "guideline/slug-b": "not_applicable", + "guideline/slug-c": "contradicted", + } + + def test_skips_duplicate_assessments_on_rerun(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + payload = { + "session_id": "sess-1", + "assessments": [ + {"entity": "guideline/slug-a", "verdict": "followed", 
"evidence": "e1"}, + {"entity": "guideline/slug-a", "verdict": "contradicted", "evidence": "e2"}, + ], + } + + first = run_log_influence(temp_project_dir, payload, evolve_dir=evolve_dir) + second = run_log_influence(temp_project_dir, payload, evolve_dir=evolve_dir) + + assert first.returncode == 0, first.stderr + assert second.returncode == 0, second.stderr + events = read_audit(evolve_dir) + assert len(events) == 1 + assert events[0]["entity"] == "guideline/slug-a" + assert events[0]["verdict"] == "followed" + + def test_skips_assessments_with_invalid_verdict(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence( + temp_project_dir, + { + "session_id": "sess-1", + "assessments": [ + {"entity": "guideline/slug-a", "verdict": "bogus", "evidence": "no"}, + {"entity": "guideline/slug-b", "verdict": "followed", "evidence": "yes"}, + ], + }, + evolve_dir=evolve_dir, + ) + assert result.returncode == 0, result.stderr + events = read_audit(evolve_dir) + assert len(events) == 1 + assert events[0]["entity"] == "guideline/slug-b" + + def test_skips_assessments_missing_entity(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence( + temp_project_dir, + { + "session_id": "sess-1", + "assessments": [ + {"verdict": "followed", "evidence": "no entity"}, + {"entity": "guideline/slug-b", "verdict": "followed", "evidence": "ok"}, + ], + }, + evolve_dir=evolve_dir, + ) + assert result.returncode == 0, result.stderr + events = read_audit(evolve_dir) + assert len(events) == 1 + assert events[0]["entity"] == "guideline/slug-b" + + def test_skips_non_dict_assessment_items(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence( + temp_project_dir, + { + "session_id": "sess-1", + "assessments": [ + "not-a-dict", + 42, + None, + {"entity": "guideline/slug-ok", "verdict": "followed", "evidence": "yes"}, + ], + }, + evolve_dir=evolve_dir, + ) + assert 
result.returncode == 0, result.stderr + events = read_audit(evolve_dir) + assert len(events) == 1 + assert events[0]["entity"] == "guideline/slug-ok" + + def test_empty_assessments_list_is_ok(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence( + temp_project_dir, + {"session_id": "sess-1", "assessments": []}, + evolve_dir=evolve_dir, + ) + assert result.returncode == 0, result.stderr + assert read_audit(evolve_dir) == [] + + def test_evidence_defaults_to_empty_string(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence( + temp_project_dir, + { + "session_id": "sess-1", + "assessments": [{"entity": "guideline/slug-a", "verdict": "followed"}], + }, + evolve_dir=evolve_dir, + ) + assert result.returncode == 0, result.stderr + events = read_audit(evolve_dir) + assert events[0]["evidence"] == "" + + def test_rejects_non_dict_payload(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence(temp_project_dir, ["not", "a", "dict"], evolve_dir=evolve_dir) + assert result.returncode == 1 + assert "payload" in result.stderr.lower() + assert read_audit(evolve_dir) == [] + + def test_rejects_missing_session_id(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence( + temp_project_dir, + {"assessments": [{"entity": "guideline/a", "verdict": "followed"}]}, + evolve_dir=evolve_dir, + ) + assert result.returncode == 1 + assert read_audit(evolve_dir) == [] + + def test_rejects_non_list_assessments(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence( + temp_project_dir, + {"session_id": "sess-1", "assessments": "oops"}, + evolve_dir=evolve_dir, + ) + assert result.returncode == 1 + assert read_audit(evolve_dir) == [] + + def test_rejects_invalid_json(self, temp_project_dir): + evolve_dir = temp_project_dir / ".evolve" + result = run_log_influence(temp_project_dir, 
None, raw_input="{not valid json", evolve_dir=evolve_dir) + assert result.returncode == 1 + assert "json" in result.stderr.lower() + assert read_audit(evolve_dir) == [] diff --git a/tests/platform_integrations/test_plugin_structure.py b/tests/platform_integrations/test_plugin_structure.py index 495e8ca7..8702562a 100644 --- a/tests/platform_integrations/test_plugin_structure.py +++ b/tests/platform_integrations/test_plugin_structure.py @@ -8,6 +8,7 @@ pytestmark = pytest.mark.platform_integrations _PLUGIN_ROOT = Path(__file__).parent.parent.parent / "platform-integrations/claude/plugins/evolve-lite" +_CODEX_PLUGIN_ROOT = Path(__file__).parent.parent.parent / "platform-integrations/codex/plugins/evolve-lite" class TestPluginManifest: @@ -69,12 +70,18 @@ class TestSkillScripts: "skills/evolve-lite/sync/scripts/sync.py", "skills/evolve-lite/recall/scripts/retrieve_entities.py", "skills/evolve-lite/learn/scripts/save_entities.py", + "skills/evolve-lite/provenance/scripts/log_influence.py", ], ) def test_script_exists(self, script_rel): script = _PLUGIN_ROOT / script_rel assert script.exists(), f"Script not found: {script}" + def test_codex_save_trajectory_skill_documents_helper_invocation(self): + skill = _CODEX_PLUGIN_ROOT / "skills/evolve-lite/save-trajectory/SKILL.md" + content = skill.read_text() + assert "plugins/evolve-lite/skills/evolve-lite/save-trajectory/scripts/save_trajectory.py" in content + class TestLibModules: """Verify that the shared lib modules the scripts depend on exist.""" diff --git a/tests/platform_integrations/test_retrieve.py b/tests/platform_integrations/test_retrieve.py index fb7cf32f..71b6a328 100644 --- a/tests/platform_integrations/test_retrieve.py +++ b/tests/platform_integrations/test_retrieve.py @@ -127,3 +127,80 @@ def test_skips_symlinked_markdown_entities(self, temp_project_dir, retrieve_scri assert result.returncode == 0 assert result.stdout.count("Real content.") == 1 + + @pytest.mark.parametrize(("platform_name", 
"retrieve_script", "expected_header"), SCRIPT_VARIANTS) + def test_writes_recall_audit_event_with_qualified_entity_ids(self, evolve_dir, retrieve_script, expected_header, platform_name): + result = run_retrieve( + retrieve_script, + evolve_dir=evolve_dir, + stdin_data=json.dumps( + { + "prompt": "How do I write clean code?", + "transcript_path": "/tmp/claude-transcript_session-123.jsonl", + } + ), + ) + + assert result.returncode == 0 + events = [json.loads(line) for line in (evolve_dir / "audit.log").read_text(encoding="utf-8").splitlines() if line.strip()] + assert len(events) == 1 + assert events[0]["event"] == "recall" + assert events[0]["session_id"] == "session-123" + assert set(events[0]["entities"]) == { + "guideline/guideline", + "subscribed/alice/guideline/alice-guideline", + } + + @pytest.mark.parametrize(("platform_name", "retrieve_script", "expected_header"), SCRIPT_VARIANTS) + def test_writes_recall_audit_event_with_session_id_fallback(self, evolve_dir, retrieve_script, expected_header, platform_name): + result = run_retrieve( + retrieve_script, + evolve_dir=evolve_dir, + stdin_data=json.dumps( + { + "prompt": "How do I write clean code?", + "session_id": "codex-session-123", + } + ), + ) + + assert result.returncode == 0 + events = [json.loads(line) for line in (evolve_dir / "audit.log").read_text().splitlines() if line.strip()] + assert len(events) == 1 + assert events[0]["event"] == "recall" + assert events[0]["session_id"] == "codex-session-123" + + @pytest.mark.parametrize(("platform_name", "retrieve_script", "expected_header"), SCRIPT_VARIANTS) + def test_writes_recall_audit_under_custom_evolve_dir( + self, temp_project_dir, file_assertions, retrieve_script, expected_header, platform_name + ): + custom_evolve_dir = temp_project_dir / "custom-evolve-data" + file_assertions.write_text( + custom_evolve_dir / "entities" / "guideline" / "guideline.md", + "---\ntype: guideline\n---\n\nKeep functions small.\n", + ) + + result = run_retrieve( + 
retrieve_script, + evolve_dir=custom_evolve_dir, + stdin_data=json.dumps( + { + "prompt": "How do I write clean code?", + "session_id": "custom-session-123", + } + ), + ) + + assert result.returncode == 0 + events = [json.loads(line) for line in (custom_evolve_dir / "audit.log").read_text().splitlines() if line.strip()] + assert len(events) == 1 + assert events[0]["event"] == "recall" + assert events[0]["session_id"] == "custom-session-123" + assert not (temp_project_dir / ".evolve" / "audit.log").exists() + + @pytest.mark.parametrize(("platform_name", "retrieve_script", "expected_header"), SCRIPT_VARIANTS) + def test_does_not_write_recall_audit_without_transcript_path(self, evolve_dir, retrieve_script, expected_header, platform_name): + result = run_retrieve(retrieve_script, evolve_dir=evolve_dir) + + assert result.returncode == 0 + assert not (evolve_dir / "audit.log").exists() diff --git a/tests/platform_integrations/test_skill_directory_names.py b/tests/platform_integrations/test_skill_directory_names.py index b6ff6c0d..bd28cd97 100644 --- a/tests/platform_integrations/test_skill_directory_names.py +++ b/tests/platform_integrations/test_skill_directory_names.py @@ -21,6 +21,9 @@ def test_bob_lite_skill_directories_exist(self, platform_integrations_dir): "evolve-lite-learn", "evolve-lite-recall", "evolve-lite-publish", + "evolve-lite-provenance", + "evolve-lite-save", + "evolve-lite-save-trajectory", "evolve-lite-subscribe", "evolve-lite-unsubscribe", "evolve-lite-sync", @@ -102,6 +105,9 @@ def test_bob_lite_installation_succeeds(self, temp_project_dir, install_runner, "evolve-lite-learn", "evolve-lite-recall", "evolve-lite-publish", + "evolve-lite-provenance", + "evolve-lite-save", + "evolve-lite-save-trajectory", "evolve-lite-subscribe", "evolve-lite-unsubscribe", "evolve-lite-sync", diff --git a/tests/smoke_skills.py b/tests/smoke_skills.py index 2ce9fd6c..e9fca881 100644 --- a/tests/smoke_skills.py +++ b/tests/smoke_skills.py @@ -1024,6 +1024,7 @@ def 
run_bob(prompt: str, *, cwd: Path, evolve_dir: Path, log_file: Path, label: class PlatformPlan: name: str cli: str # binary on PATH + save_trajectory_cmd: str # slash command text to save the current conversation learn_cmd: str # slash command text to send for learn publish_cmd: str # slash command text to invoke publish recall_prompt: str # full prompt for recall @@ -1033,6 +1034,7 @@ def claude_plan() -> PlatformPlan: return PlatformPlan( name="claude", cli="claude", + save_trajectory_cmd="/evolve-lite:save-trajectory", learn_cmd="/evolve-lite:learn", publish_cmd="/evolve-lite:publish", recall_prompt=( @@ -1054,6 +1056,7 @@ def codex_plan() -> PlatformPlan: return PlatformPlan( name="codex", cli="codex", + save_trajectory_cmd="$evolve-lite:save-trajectory", learn_cmd="$evolve-lite:learn", publish_cmd="$evolve-lite:publish", recall_prompt=( @@ -1069,6 +1072,7 @@ def bob_plan() -> PlatformPlan: return PlatformPlan( name="bob", cli="bob", + save_trajectory_cmd="/evolve-lite-save-trajectory", learn_cmd="/evolve-lite-learn", publish_cmd="/evolve-lite-publish", recall_prompt=( @@ -1201,11 +1205,9 @@ def invoke(prompt: str, label: str) -> tuple[int, str]: # The chain differs by platform — see the module docstring for why: # * claude: seed task alone; Stop hooks auto-fire save-trajectory + learn, # and we do an extra explicit /evolve-lite:learn pass afterwards. - # * codex/bob: no Stop hooks for this. Suffix the seed prompt with the - # learn slash command so the same session invokes learn at the end - # (learn is main-context on those platforms — build_plugins.py only - # sets forked_context=True for claude — so it reads the conversation - # directly, no trajectory file needed). + # * codex/bob: no Stop hooks for this. Suffix the seed prompt with + # save-trajectory and learn so the same session saves the conversation + # before extracting entities. 
baseline_entities = entity_count(evolve_dir) if platform == "claude": t0 = time.time() @@ -1223,19 +1225,32 @@ def invoke(prompt: str, label: str) -> tuple[int, str]: seed_and_learn_prompt = ( f"{SEED_PROMPT}\n\n" f"After completing (or attempting) the task above, your final " - f"action MUST be to run {plan.learn_cmd} so it can extract " - f"learnings from this conversation." + f"actions MUST be to run {plan.save_trajectory_cmd}, then " + f"{plan.learn_cmd}, so learnings are extracted from a saved " + f"trajectory." ) t0 = time.time() rc, _ = invoke(seed_and_learn_prompt, "seed-and-learn") dt = time.time() - t0 post_learn = entity_count(evolve_dir) + trajectory_count = ( + sum(1 for path in (evolve_dir / "trajectories").glob("trajectory_*.json") if path.is_file()) + if (evolve_dir / "trajectories").is_dir() + else 0 + ) ok = (rc == 0) and (post_learn > baseline_entities) + if platform == "codex": + ok = ok and trajectory_count > 0 if not ok and rc == 0: - detail = f"exit=0 in {dt:.1f}s but entities still {post_learn} (baseline {baseline_entities}); learn extracted nothing" + problems = [] + if post_learn <= baseline_entities: + problems.append(f"entities still {post_learn} (baseline {baseline_entities}); learn extracted nothing") + if platform == "codex" and trajectory_count == 0: + problems.append("no trajectory saved") + detail = f"exit=0 in {dt:.1f}s but " + "; ".join(problems) else: - detail = f"exit={rc} in {dt:.1f}s; entities {baseline_entities}→{post_learn}" + detail = f"exit={rc} in {dt:.1f}s; entities {baseline_entities}→{post_learn}; trajectories={trajectory_count}" record_skill(result, "learn", ok, detail) # ── recall (seed entity, prompt agent to echo it)