diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index da5041a..6671b4c 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "ooda-loop", "displayName": "OODA-loop", - "version": "1.8.0", + "version": "1.8.1", "description": "An autonomous operations layer for your live side project. It watches, re-orients from which PRs you merge and reject, and opens small revertible PRs — bounded by a HALT file, protected paths, and a hard cost cap. Built on Boyd's OODA loop. You stay in command.", "author": { "name": "Taeil Ma", diff --git a/CHANGELOG.md b/CHANGELOG.md index 1deb187..ccc7bda 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,30 @@ independently. Bump there signals migration work for downstream projects. --- +## [v1.8.1] — 2026-06-19 + +### Validated + guidance — the gameplay_metrics path works end-to-end + +The f1 probe exercised v1.8.0's per-dimension capture: it authored a +`gameplay_metrics` harness (drives the real pure physics headlessly) for the two +**frozen** experiential axes (`driving_feel`, `fun_challenge` = 45% of the rubric +a screenshot can't judge). + +- **Honest measurement first dropped artifact_quality 0.533 → 0.490** — the + screenshot had been *over*-scoring feel/fun (0.51/0.38 → measured 0.41/0.29). + Confirms the v1.8.0 thesis: measurement was the bottleneck *and* inflating. +- Two leaps the unlock enabled: `fun_challenge` 0.29 → **0.81** (distinct AI + racing lines + tamed DRS slingshot) and `driving_feel` 0.41 → **0.78** (steering + inertia + power oversteer + weight transfer). **artifact_quality crossed the bar + for the first time (0.687 ≥ 0.65) → an HONEST grade A** (the loop's original A + was a lie; this one is earned). 
+- **Guidance (config doc):** a `gameplay_metrics` harness must MEASURE BEHAVIOUR (drive the artifact, read the numbers), never assert an implementation fact — the probe's first harness hardcoded a feature flag and couldn't credit the fix, which would trigger a spurious thrashing-HALT. The harness was rewritten to measure behaviour instead. + +--- + ## [v1.8.0] — 2026-06-19 ### Changed — drive quality to "good", not "passable" (config schema 1.4.0) diff --git a/config.example.json b/config.example.json index 56df709..995f7b0 100644 --- a/config.example.json +++ b/config.example.json @@ -270,7 +270,7 @@ "plateau_window": 4, "plateau_eps": 0.05, "locked": true, - "__dimensions_doc__": "v1.8.0: each dimension may override capture_method so the critic gets the evidence it actually needs. 'screenshot' axes share one capture; EXPERIENTIAL axes (feel/fun/responsiveness) a screenshot cannot judge use 'gameplay_metrics' — a HUMAN-AUTHORED harness that exercises the artifact and emits metrics JSON. The harness MUST be in safety.protected_paths AND match gameplay_metrics_hash (independence gate, same invariant as the rubric hash); else the dimension scores null (capture_failure) rather than faking a score. Without per-dimension capture, experiential axes freeze at their initial score and silently cap artifact_quality.", + "__dimensions_doc__": "v1.8.0: each dimension may override capture_method so the critic gets the evidence it actually needs. 'screenshot' axes share one capture; EXPERIENTIAL axes (feel/fun/responsiveness) a screenshot cannot judge use 'gameplay_metrics' — a HUMAN-AUTHORED harness that exercises the artifact and emits metrics JSON. The harness MUST be in safety.protected_paths AND match gameplay_metrics_hash (independence gate, same invariant as the rubric hash); else the dimension scores null (capture_failure) rather than faking a score. Without per-dimension capture, experiential axes freeze at their initial score and silently cap artifact_quality. 
v1.8.1 rule (validated by the f1 probe): the harness must MEASURE BEHAVIOUR (e.g. drive the real physics and read the resulting numbers), NOT assert an implementation fact — a hardcoded flag like {feature: false} reports the same value no matter what the artifact does, so it cannot credit a real fix and would trigger a spurious thrashing-HALT. Drive the artifact and report what it actually does.", "dimensions": [], "__example_dimension__": { "name": "driving_feel",