From c952bba1013044501e922e2167ff69f156d7b225 Mon Sep 17 00:00:00 2001
From: Taeil Ma <taeil.ma0915@gmail.com>
Date: Fri, 19 Jun 2026 20:38:33 +0900
Subject: [PATCH] =?UTF-8?q?feat(v1.9.0):=20Ambition=20=E2=80=94=20dual=20b?=
 =?UTF-8?q?ars,=20benchmark=20anchors,=20technique=20menu,=20mega-leap,=20?=
 =?UTF-8?q?asset=20ceilings?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The f1 probe still looked 1980s after v1.7/v1.8: an independent re-grade vs REAL
racing games scored 0.09 (F+) while the internal rubric said 0.687 "A". The
loop's ceiling is min(standard, medium, leap-scope) — all three were pinned to
"prototype". v1.9.0 unpins them:

- Dual thresholds bar_leap/bar_coast (rubric_score + loop_scorecard): below
  bar_leap always leap; coast only above bar_coast (~0.85, anchored to a real
  product); the forcing zone between keeps leaping. Back-compat: lone `bar` sets
  both equal.
- Benchmark anchors: each dimension carries reference score_0.10..0.90 naming real
  products; the 5-G critic scores against them, not the artifact's own past (a
  flat prototype reads ~0.10, not 0.6). prototype_ceiling → ANCHOR WARNING.
- Technique menu per dimension (techniques + technique_cdns) injected into the
  leap so it reaches for EffectComposer/PMREM/particles, not more BoxGeometry.
- Mega-leap: human-approved multi-cycle re-platform (bigger budget, atomic
  rollback) for radical rewrites a bounded leap can't make.
- Asset ceilings: ceiling_without_assets → a human_required skill_gap instead of
  thrashing when code-only work tops out.

verify.py 61 → 62. plugin 1.8.1→1.9.0, config schema 1.4→1.5.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
---
 .claude-plugin/plugin.json |  2 +-
 CHANGELOG.md               | 37 +++++++++++++++++++++
 config.example.json        | 51 +++++++++++++++++++++++-----
 scripts/loop_scorecard.py  | 24 +++++++++-----
 scripts/rubric_score.py    | 68 ++++++++++++++++++++++++++++----------
 tests/verify.py            | 17 ++++++++++
 6 files changed, 165 insertions(+), 34 deletions(-)

diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json
index 6671b4c..3d1298d 100644
--- a/.claude-plugin/plugin.json
+++ b/.claude-plugin/plugin.json
@@ -1,7 +1,7 @@
 {
   "name": "ooda-loop",
   "displayName": "OODA-loop",
-  "version": "1.8.1",
+  "version": "1.9.0",
   "description": "An autonomous operations layer for your live side project. It watches, re-orients from which PRs you merge and reject, and opens small revertible PRs — bounded by a HALT file, protected paths, and a hard cost cap. Built on Boyd's OODA loop. You stay in command.",
   "author": {
     "name": "Taeil Ma",
diff --git a/CHANGELOG.md b/CHANGELOG.md
index ccc7bda..a94d949 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -8,6 +8,43 @@ independently. Bump there signals migration work for downstream projects.
 
 ---
 
+## [v1.9.0] — 2026-06-19
+
+### Added — "Ambition": let the loop make RADICAL jumps, not prototype plateaus (config schema 1.5.0)
+
+The f1 probe still looked like a 1980s game after all the v1.7/v1.8 work — an
+independent re-grade **against real racing games scored it 0.09 (F+)**, vs the
+internal rubric's 0.687 "A". A 6-agent diagnosis found the loop's quality ceiling
+is `min(standard, medium, leap-scope)` and all three were pinned to "prototype":
+
+- **Dual thresholds (`bar_leap` + `bar_coast`).** The single `bar` made the loop
+  COAST the instant it cleared a prototype number. Now: below `bar_leap` → always
+  leap; only above `bar_coast` (set high, ~0.85, anchored to a real product) may
+  it coast; the forcing zone between keeps leaping on stagnation. Back-compat: a
+  lone `bar` sets both equal (old behaviour). `rubric_score` + `loop_scorecard`
+  grade against `bar_coast`.
+- **Benchmark anchors (critic recalibration).** Each rubric dimension carries
+  `reference` anchors (`score_0.10..0.90`) naming what each level looks like in
+  REAL products; the 5-G critic scores against those, NOT relative to the
+  artifact's own past — so a flat-shaded prototype reads ~0.10, not 0.6. An
+  `artifact_score` below `prototype_ceiling` triggers an ANCHOR WARNING: the
+  artifact is still prototype-level against the anchors, so keep leaping.
+- **Technique menu.** Each dimension lists the modern `techniques` + pre-approved
+  `technique_cdns` (EffectComposer, PMREM/IBL, particles, Sky shader…). A leap is
+  told to pick ONE and implement it completely — the fix for "the loop reached for
+  more BoxGeometry instead of post-processing".
+- **Mega-leap mode.** A human-approved, multi-cycle RE-PLATFORM (bigger budget, no
+  per-cycle revert, atomic final-gate rollback) for radical rewrites a bounded
+  leap can't make. `requires_human_plan_approval` keeps the loop from
+  self-authorising it.
+- **Asset ceilings.** Each dimension declares `ceiling_without_assets`; when a
+  code-only leap reaches it, the loop records a `human_required` skill_gap instead
+  of thrashing — honest about what needs authored models/textures/audio.
+
+`tests/verify.py` 61 → **62**. plugin 1.8.1→1.9.0, config schema 1.4.0→1.5.0.
+
+---
+
 ## [v1.8.1] — 2026-06-19
 
 ### Validated + guidance — the gameplay_metrics path works end-to-end
diff --git a/config.example.json b/config.example.json
index 995f7b0..08f98c9 100644
--- a/config.example.json
+++ b/config.example.json
@@ -1,5 +1,5 @@
 {
-  "schema_version": "1.4.0",
+  "schema_version": "1.5.0",
   "project": {
     "name": "my-app",
     "locale": "en",
@@ -265,21 +265,47 @@
   "quality_rubric": {
     "__doc__": "ARTIFACT-quality axis (evolve Step 5-G, v1.7.0) — the fix for the dogfood failure where every cycle scored 0.5 / graded A while the built thing was broken, because nothing measured the artifact. CANONICAL placement is PER-DOMAIN: config.domains[<build domain>].quality_rubric (evolve is domain-agnostic). This top-level block is the single-domain fallback. Each cycle that produces an artifact, an INDEPENDENT critic (separate model context) captures the real artifact via capture_method and scores each dimension 0..1; rubric_score.py aggregates to artifact_score, which MULTIPLIES the process score in 6-C9 and drives the Goodhart Guard + LEAP trigger. The rubric is HUMAN-AUTHORED and read-only to the loop — add 'quality_rubric' / the config path to safety.protected_paths so the loop can never write its own grading standard (gaming-resistance). Empty dimensions = artifact axis OFF (process-only scoring, back-compat).",
     "bar": 0.65,
+    "__bars_doc__": "v1.9.0 'Ambition' — DUAL thresholds break the prototype plateau. bar_leap: below it, ALWAYS leap. bar_coast: only above it may the loop coast (stop leaping). The forcing zone (bar_leap..bar_coast) keeps leaping on stagnation so the loop cannot declare victory at prototype quality. Set bar_coast HIGH (~0.85) anchored to a REAL product, not a demo. Legacy single `bar` → bar_leap==bar_coast==bar (old behaviour).",
+    "bar_leap": 0.65,
+    "bar_coast": 0.85,
+    "prototype_ceiling": 0.20,
     "capture_method": "screenshot",
     "capture_command": "<serve + screenshot for web UIs | run + capture stdout for api_call | run benchmark>",
     "plateau_window": 4,
     "plateau_eps": 0.05,
     "locked": true,
+    "__anchors_doc__": "v1.9.0 — each dimension SHOULD carry `reference` anchors naming what score_0.10/0.40/0.70/0.90 look like in REAL products, and `ceiling_without_assets`. The 5-G critic scores against these named anchors, not relative to the artifact's own past — without them a critic silently grades a flat-shaded prototype as 0.6 ('it works') instead of 0.1 ('vs Gran Turismo'). When artifact_score >= a dimension's ceiling_without_assets, the loop records a `human_required` skill_gap (authored model/track/audio assets needed) instead of leaping fruitlessly. Reference anchors + ceilings are human-authored + protected.",
     "__dimensions_doc__": "v1.8.0: each dimension may override capture_method so the critic gets the evidence it actually needs. 'screenshot' axes share one capture; EXPERIENTIAL axes (feel/fun/responsiveness) a screenshot cannot judge use 'gameplay_metrics' — a HUMAN-AUTHORED harness that exercises the artifact and emits metrics JSON. The harness MUST be in safety.protected_paths AND match gameplay_metrics_hash (independence gate, same invariant as the rubric hash); else the dimension scores null (capture_failure) rather than faking a score. Without per-dimension capture, experiential axes freeze at their initial score and silently cap artifact_quality. v1.8.1 rule (validated by the f1 probe): the harness must MEASURE BEHAVIOUR (e.g. drive the real physics and read the resulting numbers), NOT assert an implementation fact — a hardcoded flag like {feature: false} cannot credit a real fix, so it would trigger a spurious thrashing-HALT. Drive the artifact and report what it actually does.",
     "dimensions": [],
     "__example_dimension__": {
-      "name": "driving_feel",
+      "name": "visual_fidelity",
       "weight": 0.25,
-      "capture_method": "gameplay_metrics",
-      "gameplay_metrics_command": "node tools/feel_harness.mjs",
-      "gameplay_metrics_hash": "<sha256 of the harness file>",
-      "metrics_fields": ["input_lag_ms", "physics_response_ms", "completion_rate"],
-      "description": "Responsive steering, weight transfer, distinct braking — judge against: input_lag_ms<40, physics_response_ms stable, completion_rate>0.7."
+      "capture_method": "screenshot",
+      "description": "3D visual quality vs SHIPPED games. Score against the reference anchors, not the artifact's past.",
+      "reference": {
+        "score_0.10": "flat-shaded primitive meshes, solid-colour sky, no shadows/post-processing (a 1990s look)",
+        "score_0.40": "textured surfaces, basic shadows, simple post (PS2-era)",
+        "score_0.70": "PBR materials + image-based lighting, bloom + tone mapping, particle FX, real skybox",
+        "score_0.90": "authored models, motion blur, SSAO, weather, near-photoreal"
+      },
+      "ceiling_without_assets": 0.35,
+      "ceiling_note": "Procedural geometry + CDN Three.js addons (post-processing, PBR/IBL, particles, Sky shader) top out ~0.35. Above that needs authored glTF models / textures / HDRIs — the loop should record a human_required skill_gap, not keep leaping.",
+      "techniques": [
+        "EffectComposer: RenderPass + UnrealBloomPass + OutputPass (ACES tone map)",
+        "PMREMGenerator from RoomEnvironment or Sky → scene.environment (IBL)",
+        "MeshStandardMaterial roughness/metalness + envMapIntensity (PBR paint)",
+        "three/addons Sky shader (atmospheric scattering) replacing solid clear colour",
+        "Points/BufferGeometry particle systems (tyre smoke, sparks)",
+        "procedural normalMap (canvas noise) on the road/track material"
+      ],
+      "technique_cdns": ["three/addons/postprocessing/*", "three/addons/environments/RoomEnvironment.js", "three/addons/objects/Sky.js"],
+      "__experiential_example__": {
+        "name": "driving_feel", "weight": 0.25, "capture_method": "gameplay_metrics",
+        "gameplay_metrics_command": "node tools/feel_harness.mjs",
+        "gameplay_metrics_hash": "<sha256 of the harness file>",
+        "metrics_fields": ["steer_response_rad_s", "has_oversteer", "has_weight_transfer"],
+        "description": "Responsive, believable handling — judge metrics vs targets, not vibes."
+      }
     }
   },
   "leap": {
@@ -290,7 +316,16 @@
     "max_per_day": 2,
     "gap_weight": 30.0,
     "cost_limit_usd": 0.5,
-    "lock_until_bar": true
+    "lock_until_bar": true,
+    "mega_leap": {
+      "__doc__": "v1.9.0 — a multi-cycle RE-PLATFORM for radical jumps a normal leap can't make (replace the whole rendering pipeline, swap a subsystem). Unlocked only by a human-authored, approved mega_leap_plan.json after a leap exhausts max_attempts_per_dimension. Bigger budget, no per-cycle revert; the whole sequence reverts if the cumulative artifact delta misses min_artifact_delta_at_completion.",
+      "enabled": false,
+      "max_lines": 5000,
+      "max_cycles": 4,
+      "min_artifact_delta_at_completion": 0.15,
+      "requires_human_plan_approval": true,
+      "plan_file": "agent/state/evolve/mega_leap_plan.json"
+    }
   },
   "goal_completion_idle": true
 }
diff --git a/scripts/loop_scorecard.py b/scripts/loop_scorecard.py
index ff752b9..90ef1cd 100644
--- a/scripts/loop_scorecard.py
+++ b/scripts/loop_scorecard.py
@@ -34,15 +34,23 @@ def _load(p: Path, default=None):
 
 
 def _resolve_bar(config: dict) -> float:
-    """The artifact quality bar. Top-level config.quality_rubric.bar, else the
-    first domain that declares a per-domain quality_rubric.bar, else default."""
-    top = (config.get("quality_rubric") or {}).get("bar")
-    if isinstance(top, (int, float)):
-        return float(top)
+    """The "genuinely good" bar the scorecard grades against. v1.9.0: prefer
+    `bar_coast` (the real-quality ceiling), else legacy `bar`; `bar_leap` is never read. The
+    headline grade thus reflects distance to a good product, not a cleared prototype bar.
+    Top-level quality_rubric first, then the first domain that declares one."""
+    def pick(r):
+        for key in ("bar_coast", "bar"):
+            v = (r or {}).get(key)
+            if isinstance(v, (int, float)):
+                return float(v)
+        return None
+    top = pick(config.get("quality_rubric"))
+    if top is not None:
+        return top
     for d in (config.get("domains") or {}).values():
-        b = ((d or {}).get("quality_rubric") or {}).get("bar")
-        if isinstance(b, (int, float)):
-            return float(b)
+        b = pick((d or {}).get("quality_rubric"))
+        if b is not None:
+            return b
     return DEFAULT_BAR
 
 
diff --git a/scripts/rubric_score.py b/scripts/rubric_score.py
index f995652..486afc4 100644
--- a/scripts/rubric_score.py
+++ b/scripts/rubric_score.py
@@ -32,6 +32,13 @@
 DEFAULT_BAR = 0.70
 DEFAULT_PLATEAU_WINDOW = 4      # build cycles to look back over
 DEFAULT_PLATEAU_EPS = 0.05      # min artifact_score gain over the window to count as progress
+# v1.9.0 "Ambition": dual thresholds. The single `bar` made the loop COAST the
+# instant it cleared a prototype-level number, so it never pushed toward a real
+# product. bar_leap = below this, ALWAYS leap; bar_coast = below this, never coast
+# (the forcing zone in between keeps leaping on stagnation). Set bar_coast high
+# (~0.85) so "it exists and works" can't masquerade as "it's genuinely good".
+DEFAULT_BAR_COAST = 0.85
+DEFAULT_PROTOTYPE_CEILING = 0.20  # artifact_score below this → ANCHOR WARNING (prototype-level vs the benchmark anchors)
 
 
 def _load(p: Path, default=None):
@@ -63,9 +70,16 @@ def rubric_of(config: dict, domain: str | None = None) -> dict:
     if not r:
         r = config.get("quality_rubric") or {}
     dims = r.get("dimensions") or r.get("axes") or []
+    bar = r.get("bar", DEFAULT_BAR)
     return {
         "dimensions": dims,
-        "bar": r.get("bar", DEFAULT_BAR),
+        "bar": bar,
+        # v1.9.0 dual thresholds. Back-compat: a project with only `bar` set gets
+        # bar_leap == bar_coast == bar (old single-bar behaviour). To open the
+        # forcing zone, the project sets bar_coast (e.g. 0.85) above bar_leap.
+        "bar_leap": r.get("bar_leap", bar),
+        "bar_coast": r.get("bar_coast", bar),
+        "prototype_ceiling": r.get("prototype_ceiling", DEFAULT_PROTOTYPE_CEILING),
         "plateau_window": r.get("plateau_window", DEFAULT_PLATEAU_WINDOW),
         "plateau_eps": r.get("plateau_eps", DEFAULT_PLATEAU_EPS),
         "enabled": bool(dims),     # no dimensions defined → artifact axis is off (back-compat)
@@ -123,8 +137,10 @@ def aggregate(dimension_scores: dict, rubric: dict) -> dict:
 
 def _weighted_gap_target(dimension_scores: dict, dims: list, rubric: dict) -> str | None:
     """Pick the dimension with the largest weight × max(0, bar − score). Ties
-    break to the lower raw score (the more broken one)."""
-    bar = rubric.get("bar", DEFAULT_BAR)
+    break to the lower raw score (the more broken one). v1.9.0: gap is measured to
+    the COAST bar (distance to "good"), not the leap bar, so targeting reflects
+    distance to the real goal."""
+    bar = rubric.get("bar_coast", rubric.get("bar", DEFAULT_BAR))
     best, best_key = None, None
     for d in dims:
         name = d.get("name")
@@ -140,23 +156,31 @@ def _weighted_gap_target(dimension_scores: dict, dims: list, rubric: dict) -> st
 
 
 def meets_bar(artifact_score, rubric: dict) -> bool:
+    """'Good enough to stop leaping' = cleared the COAST bar (v1.9.0). Back-compat:
+    when bar_coast defaults to bar, this is the old behaviour."""
     if artifact_score is None:
         return True   # no artifact axis configured → don't block (back-compat)
-    return artifact_score >= rubric.get("bar", DEFAULT_BAR)
+    return artifact_score >= rubric.get("bar_coast", rubric.get("bar", DEFAULT_BAR))
 
 
 def goodhart_flag(process_green: bool, artifact_score, rubric: dict) -> dict:
     """The 'measurement is lying' detector. process_green := futile≈0 and goal high.
     If the process scoreboard is green but the artifact is below bar, the headline
     grade must be capped and the operator warned."""
+    coast = rubric.get("bar_coast", rubric.get("bar", DEFAULT_BAR))
     lying = bool(process_green) and artifact_score is not None and not meets_bar(artifact_score, rubric)
-    return {
-        "lying": lying,
-        "message": (
-            "MEASUREMENT WARNING: process green but artifact %.2f < bar %.2f — "
-            "the scoreboard is lying; cap grade and LEAP." % (artifact_score, rubric.get("bar", DEFAULT_BAR))
-        ) if lying else "",
-    }
+    # v1.9.0 anchor warning: an artifact_score below prototype_ceiling means the
+    # artifact is still prototype-level vs best-in-class — warn and keep leaping.
+    ceil = rubric.get("prototype_ceiling", DEFAULT_PROTOTYPE_CEILING)
+    anchor_warn = artifact_score is not None and artifact_score < ceil
+    msg = ""
+    if lying:
+        msg = ("MEASUREMENT WARNING: process green but artifact %.2f < coast bar %.2f — "
+               "the scoreboard is lying; cap grade and LEAP." % (artifact_score, coast))
+    elif anchor_warn:
+        msg = ("ANCHOR WARNING: artifact %.2f below prototype_ceiling %.2f — this is "
+               "prototype-level vs best-in-class; keep leaping (don't trust a high process grade)." % (artifact_score, ceil))
+    return {"lying": lying, "anchor_warn": bool(anchor_warn), "message": msg}
 
 
 def artifact_series(outcomes: list) -> list:
@@ -190,15 +214,25 @@ def detect_plateau(outcomes: list, rubric: dict) -> dict:
         tail = weak_dims[-window:]
         weak_stuck = len(set(tail)) == 1 and tail[0] is not None
 
-    # A plateau only matters if we are not already good enough.
-    plateau = (stagnant or weak_stuck) and below_bar
+    # v1.9.0 dual-bar plateau: leap if (stagnant OR weak-stuck OR below bar_leap),
+    # as long as we are not yet at bar_coast. Below bar_leap ALWAYS leaps (don't
+    # wait for a full stagnation window when quality is still prototype-level);
+    # the forcing zone (bar_leap..bar_coast) leaps on stagnation. This is the fix
+    # for "the loop coasted the instant it cleared a prototype bar".
+    bar_leap = rubric.get("bar_leap", rubric.get("bar", DEFAULT_BAR))
+    bar_coast = rubric.get("bar_coast", rubric.get("bar", DEFAULT_BAR))
+    below_coast = latest is not None and latest < bar_coast
+    below_leap = latest is not None and latest < bar_leap
+    plateau = (stagnant or weak_stuck or below_leap) and below_coast
     reasons = []
+    if below_leap:
+        reasons.append("artifact %.2f below leap bar %.2f (always leap)" % (latest, bar_leap))
     if stagnant:
-        reasons.append("artifact_score flat over last %d cycles (<%.2f gain)" % (window, eps))
+        reasons.append("artifact_score flat over last %d cycles (<=%.2f gain)" % (window, eps))
     if weak_stuck:
         reasons.append("'%s' weakest for %d cycles running" % (weak_dims[-1], window))
-    if below_bar and not (stagnant or weak_stuck):
-        reasons.append("artifact %.2f below bar %.2f" % (latest, rubric.get("bar", DEFAULT_BAR)))
+    if below_coast and not (stagnant or weak_stuck or below_leap):
+        reasons.append("artifact %.2f in forcing zone (< coast %.2f)" % (latest, bar_coast))
     # v1.7.1: the LEAP target is the largest weighted gap on the latest critique,
     # not just the running weakest_dimension (impact on the headline metric).
     latest_dims = (scored[-1].get("dimension_scores") if scored else None) or {}
@@ -243,7 +277,7 @@ def lock_target(outcomes: list, rubric: dict, leap_target: str | None) -> str |
     last = outcomes[-1]
     if last.get("cycle_mode") != "leap" or last.get("result_type") == "leap_regressed":
         return None
-    bar = rubric.get("bar", DEFAULT_BAR)
+    bar = rubric.get("bar_coast", rubric.get("bar", DEFAULT_BAR))  # v1.9.0: lock until COAST (good), not prototype bar
     eps = float(rubric.get("plateau_eps", DEFAULT_PLATEAU_EPS))
     score = (last.get("dimension_scores") or {}).get(leap_target)
     if score is None:
diff --git a/tests/verify.py b/tests/verify.py
index 666096c..760f237 100644
--- a/tests/verify.py
+++ b/tests/verify.py
@@ -719,6 +719,23 @@ def _mod(name, fn):
         f"regressed={R.lock_target(regressed, rubL, 'visual_fidelity')}",
     )
 
+    # 11) v1.9.0 dual-bar: the loop keeps leaping in the forcing zone (above bar_leap,
+    # below bar_coast) and only coasts once genuinely good — back-compat when only
+    # `bar` is set (bar_leap == bar_coast == bar).
+    rub19 = R.rubric_of({"quality_rubric": {"bar_leap": 0.65, "bar_coast": 0.85,
+        "plateau_window": 3, "plateau_eps": 0.05, "dimensions": [{"name": "v", "weight": 1}]}})
+    zone = [{"artifact_score": 0.70, "weakest_dimension": "v", "dimension_scores": {"v": 0.70}}] * 3
+    good = [{"artifact_score": 0.88, "weakest_dimension": "v", "dimension_scores": {"v": 0.88}}] * 3
+    legacy = R.rubric_of({"quality_rubric": {"bar": 0.65, "dimensions": [{"name": "v", "weight": 1}]}})
+    r.check(
+        "artifact-axis: dual-bar keeps leaping in the forcing zone, coasts only when good (v1.9.0)",
+        R.detect_plateau(zone, rub19)["plateau"] is True
+        and R.detect_plateau(good, rub19)["plateau"] is False
+        and legacy["bar_leap"] == 0.65 and legacy["bar_coast"] == 0.65,
+        f"zone(0.70)→leap={R.detect_plateau(zone, rub19)['plateau']}, "
+        f"good(0.88)→leap={R.detect_plateau(good, rub19)['plateau']}, legacy bars={legacy['bar_leap']}/{legacy['bar_coast']}",
+    )
+
 
 def main() -> int:
     r = Runner()