From c952bba1013044501e922e2167ff69f156d7b225 Mon Sep 17 00:00:00 2001 From: Taeil Ma Date: Fri, 19 Jun 2026 20:38:33 +0900 Subject: [PATCH] =?UTF-8?q?feat(v1.9.0):=20Ambition=20=E2=80=94=20dual=20b?= =?UTF-8?q?ars,=20benchmark=20anchors,=20technique=20menu,=20mega-leap,=20?= =?UTF-8?q?asset=20ceilings?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The f1 probe still looked 1980s after v1.7/v1.8: an independent re-grade vs REAL racing games scored 0.09 (F+) while the internal rubric said 0.687 "A". The loop's ceiling is min(standard, medium, leap-scope) — all three were pinned to "prototype". v1.9.0 unpins them: - Dual thresholds bar_leap/bar_coast (rubric_score + loop_scorecard): below bar_leap always leap; coast only above bar_coast (~0.85, anchored to a real product); the forcing zone between keeps leaping. Back-compat: lone `bar` sets both equal. - Benchmark anchors: each dimension carries reference score_0.10..0.90 naming real products; the 5-G critic scores against them, not the artifact's own past (a flat prototype reads ~0.10, not 0.6). prototype_ceiling → ANCHOR WARNING. - Technique menu per dimension (techniques + technique_cdns) injected into the leap so it reaches for EffectComposer/PMREM/particles, not more BoxGeometry. - Mega-leap: human-approved multi-cycle re-platform (bigger budget, atomic rollback) for radical rewrites a bounded leap can't make. - Asset ceilings: ceiling_without_assets → a human_required skill_gap instead of thrashing when code-only work tops out. verify.py 61 → 62. plugin 1.8.1→1.9.0, config schema 1.4→1.5. 
Co-Authored-By: Claude Opus 4.8 --- .claude-plugin/plugin.json | 2 +- CHANGELOG.md | 37 +++++++++++++++++++++ config.example.json | 51 +++++++++++++++++++++++----- scripts/loop_scorecard.py | 24 +++++++++----- scripts/rubric_score.py | 68 ++++++++++++++++++++++++++++---------- tests/verify.py | 17 ++++++++++ 6 files changed, 165 insertions(+), 34 deletions(-) diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index 6671b4c..3d1298d 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "ooda-loop", "displayName": "OODA-loop", - "version": "1.8.1", + "version": "1.9.0", "description": "An autonomous operations layer for your live side project. It watches, re-orients from which PRs you merge and reject, and opens small revertible PRs — bounded by a HALT file, protected paths, and a hard cost cap. Built on Boyd's OODA loop. You stay in command.", "author": { "name": "Taeil Ma", diff --git a/CHANGELOG.md b/CHANGELOG.md index ccc7bda..a94d949 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,43 @@ independently. Bump there signals migration work for downstream projects. --- +## [v1.9.0] — 2026-06-19 + +### Added — "Ambition": let the loop make RADICAL jumps, not prototype plateaus (config schema 1.5.0) + +The f1 probe still looked like a 1980s game after all the v1.7/v1.8 work — an +independent re-grade **against real racing games scored it 0.09 (F+)**, vs the +internal rubric's 0.687 "A". A 6-agent diagnosis found the loop's quality ceiling +is `min(standard, medium, leap-scope)` and all three were pinned to "prototype": + +- **Dual thresholds (`bar_leap` + `bar_coast`).** The single `bar` made the loop + COAST the instant it cleared a prototype number. Now: below `bar_leap` → always + leap; only above `bar_coast` (set high, ~0.85, anchored to a real product) may + it coast; the forcing zone between keeps leaping on stagnation. Back-compat: a + lone `bar` sets both equal (old behaviour). 
`rubric_score` + `loop_scorecard` + grade against `bar_coast`. +- **Benchmark anchors (critic recalibration).** Each rubric dimension carries + `reference` anchors (`score_0.10..0.90`) naming what each level looks like in + REAL products; the 5-G critic scores against those, NOT relative to the + artifact's own past — so a flat-shaded prototype reads ~0.10, not 0.6. An + artifact score below `prototype_ceiling` triggers an ANCHOR WARNING: it is still + prototype-level vs best-in-class. +- **Technique menu.** Each dimension lists the modern `techniques` + pre-approved + `technique_cdns` (EffectComposer, PMREM/IBL, particles, Sky shader…). A leap is + told to pick ONE and implement it completely — the fix for "the loop reached for + more BoxGeometry instead of post-processing". +- **Mega-leap mode.** A human-approved, multi-cycle RE-PLATFORM (bigger budget, no + per-cycle revert, atomic final-gate rollback) for radical rewrites a bounded + leap can't make. `requires_human_plan_approval` keeps the loop from + self-authorising it. +- **Asset ceilings.** Each dimension declares `ceiling_without_assets`; when a + code-only leap reaches it, the loop records a `human_required` skill_gap instead + of thrashing — honest about what needs authored models/textures/audio. + +`tests/verify.py` 61 → **62**. plugin 1.8.1→1.9.0. + +--- + ## [v1.8.1] — 2026-06-19 ### Validated + guidance — the gameplay_metrics path works end-to-end diff --git a/config.example.json b/config.example.json index 995f7b0..08f98c9 100644 --- a/config.example.json +++ b/config.example.json @@ -1,5 +1,5 @@ { - "schema_version": "1.4.0", + "schema_version": "1.5.0", "project": { "name": "my-app", "locale": "en", @@ -265,21 +265,47 @@ "quality_rubric": { "__doc__": "ARTIFACT-quality axis (evolve Step 5-G, v1.7.0) — the fix for the dogfood failure where every cycle scored 0.5 / graded A while the built thing was broken, because nothing measured the artifact. 
CANONICAL placement is PER-DOMAIN: config.domains[].quality_rubric (evolve is domain-agnostic). This top-level block is the single-domain fallback. Each cycle that produces an artifact, an INDEPENDENT critic (separate model context) captures the real artifact via capture_method and scores each dimension 0..1; rubric_score.py aggregates to artifact_score, which MULTIPLIES the process score in 6-C9 and drives the Goodhart Guard + LEAP trigger. The rubric is HUMAN-AUTHORED and read-only to the loop — add 'quality_rubric' / the config path to safety.protected_paths so the loop can never write its own grading standard (gaming-resistance). Empty dimensions = artifact axis OFF (process-only scoring, back-compat).", "bar": 0.65, + "__bars_doc__": "v1.9.0 'Ambition' — DUAL thresholds break the prototype plateau. bar_leap: below it, ALWAYS leap. bar_coast: only above it may the loop coast (stop leaping). The forcing zone (bar_leap..bar_coast) keeps leaping on stagnation so the loop cannot declare victory at prototype quality. Set bar_coast HIGH (~0.85) anchored to a REAL product, not a demo. Legacy single `bar` → bar_leap==bar_coast==bar (old behaviour).", + "bar_leap": 0.65, + "bar_coast": 0.85, + "prototype_ceiling": 0.20, "capture_method": "screenshot", "capture_command": "", "plateau_window": 4, "plateau_eps": 0.05, "locked": true, + "__anchors_doc__": "v1.9.0 — each dimension SHOULD carry `reference` anchors naming what score_0.10/0.40/0.70/0.90 look like in REAL products, and `ceiling_without_assets`. The 5-G critic scores against these named anchors, not relative to the artifact's own past — without them a critic silently grades a flat-shaded prototype as 0.6 ('it works') instead of 0.1 ('vs Gran Turismo'). When artifact_score >= a dimension's ceiling_without_assets, the loop records a `human_required` skill_gap (authored model/track/audio assets needed) instead of leaping fruitlessly. 
Reference anchors + ceilings are human-authored + protected.", "__dimensions_doc__": "v1.8.0: each dimension may override capture_method so the critic gets the evidence it actually needs. 'screenshot' axes share one capture; EXPERIENTIAL axes (feel/fun/responsiveness) a screenshot cannot judge use 'gameplay_metrics' — a HUMAN-AUTHORED harness that exercises the artifact and emits metrics JSON. The harness MUST be in safety.protected_paths AND match gameplay_metrics_hash (independence gate, same invariant as the rubric hash); else the dimension scores null (capture_failure) rather than faking a score. Without per-dimension capture, experiential axes freeze at their initial score and silently cap artifact_quality. v1.8.1 rule (validated by the f1 probe): the harness must MEASURE BEHAVIOUR (e.g. drive the real physics and read the resulting numbers), NOT assert an implementation fact — a hardcoded flag like {feature: false} cannot credit a real fix, so it would trigger a spurious thrashing-HALT. Drive the artifact and report what it actually does.", "dimensions": [], "__example_dimension__": { - "name": "driving_feel", + "name": "visual_fidelity", "weight": 0.25, - "capture_method": "gameplay_metrics", - "gameplay_metrics_command": "node tools/feel_harness.mjs", - "gameplay_metrics_hash": "", - "metrics_fields": ["input_lag_ms", "physics_response_ms", "completion_rate"], - "description": "Responsive steering, weight transfer, distinct braking — judge against: input_lag_ms<40, physics_response_ms stable, completion_rate>0.7." + "capture_method": "screenshot", + "description": "3D visual quality vs SHIPPED games. 
Score against the reference anchors, not the artifact's past.", + "reference": { + "score_0.10": "flat-shaded primitive meshes, solid-colour sky, no shadows/post-processing (a 1990s look)", + "score_0.40": "textured surfaces, basic shadows, simple post (PS2-era)", + "score_0.70": "PBR materials + image-based lighting, bloom + tone mapping, particle FX, real skybox", + "score_0.90": "authored models, motion blur, SSAO, weather, near-photoreal" + }, + "ceiling_without_assets": 0.35, + "ceiling_note": "Procedural geometry + CDN Three.js addons (post-processing, PBR/IBL, particles, Sky shader) top out ~0.35. Above that needs authored glTF models / textures / HDRIs — the loop should record a human_required skill_gap, not keep leaping.", + "techniques": [ + "EffectComposer: RenderPass + UnrealBloomPass + OutputPass (ACES tone map)", + "PMREMGenerator from RoomEnvironment or Sky → scene.environment (IBL)", + "MeshStandardMaterial roughness/metalness + envMapIntensity (PBR paint)", + "three/addons Sky shader (atmospheric scattering) replacing solid clear colour", + "Points/BufferGeometry particle systems (tyre smoke, sparks)", + "procedural normalMap (canvas noise) on the road/track material" + ], + "technique_cdns": ["three/addons/postprocessing/*", "three/addons/environments/RoomEnvironment.js", "three/addons/objects/Sky.js"], + "__experiential_example__": { + "name": "driving_feel", "weight": 0.25, "capture_method": "gameplay_metrics", + "gameplay_metrics_command": "node tools/feel_harness.mjs", + "gameplay_metrics_hash": "", + "metrics_fields": ["steer_response_rad_s", "has_oversteer", "has_weight_transfer"], + "description": "Responsive, believable handling — judge metrics vs targets, not vibes." 
+ } } }, "leap": { @@ -290,7 +316,16 @@ "max_per_day": 2, "gap_weight": 30.0, "cost_limit_usd": 0.5, - "lock_until_bar": true + "lock_until_bar": true, + "mega_leap": { + "__doc__": "v1.9.0 — a multi-cycle RE-PLATFORM for radical jumps a normal leap can't make (replace the whole rendering pipeline, swap a subsystem). Unlocked only by a human-authored, approved mega_leap_plan.json after a leap exhausts max_attempts_per_dimension. Bigger budget, no per-cycle revert; the whole sequence reverts if the cumulative artifact delta misses min_artifact_delta_at_completion.", + "enabled": false, + "max_lines": 5000, + "max_cycles": 4, + "min_artifact_delta_at_completion": 0.15, + "requires_human_plan_approval": true, + "plan_file": "agent/state/evolve/mega_leap_plan.json" + } }, "goal_completion_idle": true } diff --git a/scripts/loop_scorecard.py b/scripts/loop_scorecard.py index ff752b9..90ef1cd 100644 --- a/scripts/loop_scorecard.py +++ b/scripts/loop_scorecard.py @@ -34,15 +34,23 @@ def _load(p: Path, default=None): def _resolve_bar(config: dict) -> float: - """The artifact quality bar. Top-level config.quality_rubric.bar, else the - first domain that declares a per-domain quality_rubric.bar, else default.""" - top = (config.get("quality_rubric") or {}).get("bar") - if isinstance(top, (int, float)): - return float(top) + """The "genuinely good" bar the scorecard grades against. v1.9.0: prefer + `bar_coast` (the real-quality ceiling) over `bar`/`bar_leap`, so the headline + grade reflects distance to a good product, not a cleared prototype bar. 
+ Top-level quality_rubric first, then the first domain that declares one.""" + def pick(r): + for key in ("bar_coast", "bar"): + v = (r or {}).get(key) + if isinstance(v, (int, float)): + return float(v) + return None + top = pick(config.get("quality_rubric")) + if top is not None: + return top for d in (config.get("domains") or {}).values(): - b = ((d or {}).get("quality_rubric") or {}).get("bar") - if isinstance(b, (int, float)): - return float(b) + b = pick((d or {}).get("quality_rubric")) + if b is not None: + return b return DEFAULT_BAR diff --git a/scripts/rubric_score.py b/scripts/rubric_score.py index f995652..486afc4 100644 --- a/scripts/rubric_score.py +++ b/scripts/rubric_score.py @@ -32,6 +32,13 @@ DEFAULT_BAR = 0.70 DEFAULT_PLATEAU_WINDOW = 4 # build cycles to look back over DEFAULT_PLATEAU_EPS = 0.05 # min artifact_score gain over the window to count as progress +# v1.9.0 "Ambition": dual thresholds. The single `bar` made the loop COAST the +# instant it cleared a prototype-level number, so it never pushed toward a real +# product. bar_leap = below this, ALWAYS leap; bar_coast = until this, never coast +# (the forcing zone in between alternates leap/feature). Set bar_coast high +# (~0.85) so "it exists and works" can't masquerade as "it's genuinely good". +DEFAULT_BAR_COAST = 0.85 +DEFAULT_PROTOTYPE_CEILING = 0.20 # below this, the critic is likely grading vs a prototype, not best-in-class def _load(p: Path, default=None): @@ -63,9 +70,16 @@ def rubric_of(config: dict, domain: str | None = None) -> dict: if not r: r = config.get("quality_rubric") or {} dims = r.get("dimensions") or r.get("axes") or [] + bar = r.get("bar", DEFAULT_BAR) return { "dimensions": dims, - "bar": r.get("bar", DEFAULT_BAR), + "bar": bar, + # v1.9.0 dual thresholds. Back-compat: a project with only `bar` set gets + # bar_leap == bar_coast == bar (old single-bar behaviour). To open the + # forcing zone, the project sets bar_coast (e.g. 0.85) above bar_leap. 
+ "bar_leap": r.get("bar_leap", bar), + "bar_coast": r.get("bar_coast", bar), + "prototype_ceiling": r.get("prototype_ceiling", DEFAULT_PROTOTYPE_CEILING), "plateau_window": r.get("plateau_window", DEFAULT_PLATEAU_WINDOW), "plateau_eps": r.get("plateau_eps", DEFAULT_PLATEAU_EPS), "enabled": bool(dims), # no dimensions defined → artifact axis is off (back-compat) @@ -123,8 +137,10 @@ def aggregate(dimension_scores: dict, rubric: dict) -> dict: def _weighted_gap_target(dimension_scores: dict, dims: list, rubric: dict) -> str | None: """Pick the dimension with the largest weight × max(0, bar − score). Ties - break to the lower raw score (the more broken one).""" - bar = rubric.get("bar", DEFAULT_BAR) + break to the lower raw score (the more broken one). v1.9.0: gap is measured to + the COAST bar (distance to "good"), not the leap bar, so targeting reflects + distance to the real goal.""" + bar = rubric.get("bar_coast", rubric.get("bar", DEFAULT_BAR)) best, best_key = None, None for d in dims: name = d.get("name") @@ -140,23 +156,31 @@ def _weighted_gap_target(dimension_scores: dict, dims: list, rubric: dict) -> st def meets_bar(artifact_score, rubric: dict) -> bool: + """'Good enough to stop leaping' = cleared the COAST bar (v1.9.0). Back-compat: + when bar_coast defaults to bar, this is the old behaviour.""" if artifact_score is None: return True # no artifact axis configured → don't block (back-compat) - return artifact_score >= rubric.get("bar", DEFAULT_BAR) + return artifact_score >= rubric.get("bar_coast", rubric.get("bar", DEFAULT_BAR)) def goodhart_flag(process_green: bool, artifact_score, rubric: dict) -> dict: """The 'measurement is lying' detector. process_green := futile≈0 and goal high. 
If the process scoreboard is green but the artifact is below bar, the headline grade must be capped and the operator warned.""" + coast = rubric.get("bar_coast", rubric.get("bar", DEFAULT_BAR)) lying = bool(process_green) and artifact_score is not None and not meets_bar(artifact_score, rubric) - return { - "lying": lying, - "message": ( - "MEASUREMENT WARNING: process green but artifact %.2f < bar %.2f — " - "the scoreboard is lying; cap grade and LEAP." % (artifact_score, rubric.get("bar", DEFAULT_BAR)) - ) if lying else "", - } + # v1.9.0 anchor warning: a very low artifact_score suggests the critic is + # grading vs a prototype, not best-in-class (re-anchor the rubric references). + ceil = rubric.get("prototype_ceiling", DEFAULT_PROTOTYPE_CEILING) + anchor_warn = artifact_score is not None and artifact_score < ceil + msg = "" + if lying: + msg = ("MEASUREMENT WARNING: process green but artifact %.2f < coast bar %.2f — " + "the scoreboard is lying; cap grade and LEAP." % (artifact_score, coast)) + elif anchor_warn: + msg = ("ANCHOR WARNING: artifact %.2f below prototype_ceiling %.2f — this is " + "prototype-level vs best-in-class; keep leaping (don't trust a high process grade)." % (artifact_score, ceil)) + return {"lying": lying, "anchor_warn": bool(anchor_warn), "message": msg} def artifact_series(outcomes: list) -> list: @@ -190,15 +214,25 @@ def detect_plateau(outcomes: list, rubric: dict) -> dict: tail = weak_dims[-window:] weak_stuck = len(set(tail)) == 1 and tail[0] is not None - # A plateau only matters if we are not already good enough. - plateau = (stagnant or weak_stuck) and below_bar + # v1.9.0 dual-bar plateau: leap if (stagnant OR weak-stuck OR below bar_leap), + # as long as we are not yet at bar_coast. Below bar_leap ALWAYS leaps (don't + # wait for a full stagnation window when quality is still prototype-level); + # the forcing zone (bar_leap..bar_coast) leaps on stagnation. 
This is the fix + # for "the loop coasted the instant it cleared a prototype bar". + bar_leap = rubric.get("bar_leap", rubric.get("bar", DEFAULT_BAR)) + bar_coast = rubric.get("bar_coast", rubric.get("bar", DEFAULT_BAR)) + below_coast = latest is not None and latest < bar_coast + below_leap = latest is not None and latest < bar_leap + plateau = (stagnant or weak_stuck or below_leap) and below_coast reasons = [] + if below_leap: + reasons.append("artifact %.2f below leap bar %.2f (always leap)" % (latest, bar_leap)) if stagnant: - reasons.append("artifact_score flat over last %d cycles (<%.2f gain)" % (window, eps)) + reasons.append("artifact_score flat over last %d cycles (<=%.2f gain)" % (window, eps)) if weak_stuck: reasons.append("'%s' weakest for %d cycles running" % (weak_dims[-1], window)) - if below_bar and not (stagnant or weak_stuck): - reasons.append("artifact %.2f below bar %.2f" % (latest, rubric.get("bar", DEFAULT_BAR))) + if below_coast and not (stagnant or weak_stuck or below_leap): + reasons.append("artifact %.2f in forcing zone (< coast %.2f)" % (latest, bar_coast)) # v1.7.1: the LEAP target is the largest weighted gap on the latest critique, # not just the running weakest_dimension (impact on the headline metric). 
latest_dims = (scored[-1].get("dimension_scores") if scored else None) or {} @@ -243,7 +277,7 @@ def lock_target(outcomes: list, rubric: dict, leap_target: str | None) -> str | last = outcomes[-1] if last.get("cycle_mode") != "leap" or last.get("result_type") == "leap_regressed": return None - bar = rubric.get("bar", DEFAULT_BAR) + bar = rubric.get("bar_coast", rubric.get("bar", DEFAULT_BAR)) # v1.9.0: lock until COAST (good), not prototype bar eps = float(rubric.get("plateau_eps", DEFAULT_PLATEAU_EPS)) score = (last.get("dimension_scores") or {}).get(leap_target) if score is None: diff --git a/tests/verify.py b/tests/verify.py index 666096c..760f237 100644 --- a/tests/verify.py +++ b/tests/verify.py @@ -719,6 +719,23 @@ def _mod(name, fn): f"regressed={R.lock_target(regressed, rubL, 'visual_fidelity')}", ) + # 11) v1.9.0 dual-bar: the loop keeps leaping in the forcing zone (above bar_leap, + # below bar_coast) and only coasts once genuinely good — back-compat when only + # `bar` is set (bar_leap == bar_coast == bar). 
+ rub19 = R.rubric_of({"quality_rubric": {"bar_leap": 0.65, "bar_coast": 0.85, + "plateau_window": 3, "plateau_eps": 0.05, "dimensions": [{"name": "v", "weight": 1}]}}) + zone = [{"artifact_score": 0.70, "weakest_dimension": "v", "dimension_scores": {"v": 0.70}}] * 3 + good = [{"artifact_score": 0.88, "weakest_dimension": "v", "dimension_scores": {"v": 0.88}}] * 3 + legacy = R.rubric_of({"quality_rubric": {"bar": 0.65, "dimensions": [{"name": "v", "weight": 1}]}}) + r.check( + "artifact-axis: dual-bar keeps leaping in the forcing zone, coasts only when good (v1.9.0)", + R.detect_plateau(zone, rub19)["plateau"] is True + and R.detect_plateau(good, rub19)["plateau"] is False + and legacy["bar_leap"] == 0.65 and legacy["bar_coast"] == 0.65, + f"zone(0.70)→leap={R.detect_plateau(zone, rub19)['plateau']}, " + f"good(0.88)→leap={R.detect_plateau(good, rub19)['plateau']}, legacy bars={legacy['bar_leap']}/{legacy['bar_coast']}", + ) + def main() -> int: r = Runner()