diff --git a/.claude-plugin/plugin.json b/.claude-plugin/plugin.json index 95921d5..de55683 100644 --- a/.claude-plugin/plugin.json +++ b/.claude-plugin/plugin.json @@ -1,7 +1,7 @@ { "name": "ooda-loop", "displayName": "OODA-loop", - "version": "1.10.1", + "version": "1.11.0", "description": "An autonomous operations layer for your live side project. It watches, re-orients from which PRs you merge and reject, and opens small revertible PRs — bounded by a HALT file, protected paths, and a hard cost cap. Built on Boyd's OODA loop. You stay in command.", "author": { "name": "Taeil Ma", diff --git a/CHANGELOG.md b/CHANGELOG.md index a97a33b..1f86558 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -8,6 +8,38 @@ independently. Bump there signals migration work for downstream projects. --- +## [v1.11.0] — 2026-06-21 + +### Added — Research-Grounded OODA (the anti-maze methodology) + +The f1 probe hit the classic failure: it **iterated without improving** (a +local-optimum "maze") because generation was anchored to model priors, not +external ground truth. A heavy external-research pass (graphics/physics/art + +the methodology literature) produced a cited playbook AND this 5-part fix, each +grounded in published work: + +- **Pre-generation research grounding (dev-cycle Step 3-PRE).** Before any + quality leap, resolve a reference (config.references / a researched playbook), + WebFetch the concrete block, derive acceptance criteria, THEN generate — and + record `grounded_in`. (AlphaCodium arXiv:2401.08500: a structured pre-stage + raised pass@5 19%→44%. AutoCodeRover; "concrete examples beat abstract specs".) +- **Reference targets (config.references + config.research).** Named real-product + levels + reference-implementation URLs + a research playbook path; mirror into + principles.json as permanent memory. 
+- **Reference comparison in the 5-G critic.** The critic names the ONE concrete + attribute of the reference the artifact lacks per axis (`per_axis_gap`) — an + actionable Observe signal, not "looks worse". +- **Stall → REWRITE escalation (evolve 2-G + `recommend_rewrite()`).** When the + incremental LEAPS themselves stall (not just the artifact), escalate from patch + to a from-scratch REWRITE carrying a Reflexion (arXiv:2303.11366) negative- + example memo, BEFORE giving up to a HALT — the fix for the `sky.visible=false` + class (symptom patched on a wrong architecture). +- **Diagnose/fix isolation on regression** (MASAI arXiv:2406.11638) noted for the + regression path. + +New deterministic helper `rubric_score.recommend_rewrite()` + test. verify.py +63 → 64. plugin 1.10.1→1.11.0. + ## [v1.10.1] — 2026-06-20 ### Fixed/clarified — gate integrity (f1 probe, overnight) diff --git a/config.example.json b/config.example.json index 37e3c98..0cafde9 100644 --- a/config.example.json +++ b/config.example.json @@ -310,8 +310,21 @@ } } }, + "references": { + "__doc__": "v1.11.0 (anti-maze) — REFERENCE TARGETS that ground the loop in external ground truth instead of model priors. dev-cycle Step 3-PRE resolves the reference for a technique, WebFetches the concrete block, derives acceptance criteria, THEN generates (AlphaCodium pattern). The 5-G critic scores AGAINST these (not the artifact's own past). Permanent: also mirror into agent/state/evolve/principles.json so they survive episode rollover. 
Populate per project with named real-product levels and reference-implementation URLs.", + "visual": "https://threejs.org/examples/webgl_materials_car.html", + "physics": "https://github.com/spacejack/carphysics2d/blob/master/public/js/Car.js", + "camera": "https://github.com/mrdoob/Starter-Kit-Racing/blob/main/js/Camera.js", + "quality_floor": 0.7 + }, + "research": { + "__doc__": "v1.11.0 — a researched, CITED knowledge base the loop reads before leaping (the missing external-knowledge input that breaks the 'iterate without improving' maze). Built by a heavy web-research pass (graphics/physics/art/methodology) distilled into an actionable, sourced playbook. dev-cycle Step 3-PRE prefers a concrete playbook move (technique + parameters + source URL) over the model's first idea.", + "playbook_path": "agent/state/research/playbook.md", + "refresh_when_stalled": true + }, "leap": { - "__doc__": "Quantum-leap cycles (evolve Steps 2-G/3-K, v1.7.0) — the fix for monotonic incrementalism (RICE structurally forbids overhauls). When the artifact plateaus BELOW bar, the next cycle is forced into LEAP mode: it overhauls the weakest dimension (step-change, not a new feature), bypassing pure RICE via a gap-to-bar bonus, with a larger size budget and an ARTIFACT-improvement gate instead of the unit-test gate. Safety: min_dimension_delta must be cleared or the leap is reverted; max_attempts_per_dimension failures escalate to HALT; cost/day caps bound spend. v1.8.0: lock_until_bar keeps leaping the SAME dimension until it clears bar (drive-to-good, not detect-and-nudge).", + "__doc__": "Quantum-leap cycles (evolve Steps 2-G/3-K, v1.7.0) — the fix for monotonic incrementalism (RICE structurally forbids overhauls). 
When the artifact plateaus BELOW bar, the next cycle is forced into LEAP mode: it overhauls the weakest dimension (step-change, not a new feature), bypassing pure RICE via a gap-to-bar bonus, with a larger size budget and an ARTIFACT-improvement gate instead of the unit-test gate. Safety: min_dimension_delta must be cleared or the leap is reverted; max_attempts_per_dimension failures escalate to HALT; cost/day caps bound spend. v1.8.0: lock_until_bar keeps leaping the SAME dimension until it clears bar (drive-to-good, not detect-and-nudge). v1.11.0: rewrite_on_stall — when recommend_rewrite() fires (incremental leaps THEMSELVES stalled), escalate from patch to a from-scratch REWRITE carrying a Reflexion negative-example memo, instead of thrashing to HALT.", + "rewrite_on_stall": true, "max_lines": 1500, "min_dimension_delta": 0.05, "max_attempts_per_dimension": 2, diff --git a/scripts/rubric_score.py b/scripts/rubric_score.py index bdec07c..516fe57 100644 --- a/scripts/rubric_score.py +++ b/scripts/rubric_score.py @@ -287,6 +287,35 @@ def asset_ceiling_hit(dimension: dict, score) -> bool: return ceil is not None and score is not None and score >= ceil +def recommend_rewrite(outcomes: list, dimension: str, rubric: dict, min_failed: int = 2) -> dict: + """v1.11.0 stall → REWRITE escalation (the anti-maze fix). + + `detect_plateau` says "the artifact stalled → do a LEAP". This says something + sharper: "the incremental LEAPS THEMSELVES have stalled — stop patching the + same approach, start over." True when the dimension is plateaued AND + >= `min_failed` incremental leaps on it already failed to clear the plateau + epsilon. The Reflect step then queues a from-scratch REWRITE (not another + same-architecture patch) carrying a negative-example memo of the stalled + approach — Reflexion (arXiv:2303.11366) verbal episodic memory so the next + attempt explicitly avoids what stalled. 
This is the fix for the + `sky.visible=false` class: a symptom patched cycle after cycle while the root + cause (wrong IBL source) is preserved — exactly the f1 'iterate without + improving' maze.""" + pl = detect_plateau(outcomes, rubric) + if not pl.get("plateau"): + return {"rewrite": False, "failed_leaps": 0, "reason": "not plateaued"} + eps = rubric.get("plateau_eps", DEFAULT_PLATEAU_EPS) + fails = failed_leaps(outcomes, dimension, eps) + do = fails >= min_failed + return { + "rewrite": do, + "failed_leaps": fails, + "reason": (f"{fails} incremental leaps stalled on '{dimension}' (>= {min_failed}) " + f"→ rewrite, don't patch") if do + else f"incremental still viable ({fails} failed < {min_failed})", + } + + def lock_target(outcomes: list, rubric: dict, leap_target: str | None) -> str | None: """v1.8.0 dimension-lock: after a SUCCESSFUL leap whose target is still below (bar − eps), return that target so evolve 2-G keeps the plateau active on it diff --git a/skills/dev-cycle/SKILL.md b/skills/dev-cycle/SKILL.md index 4015c8e..3a5f5e1 100644 --- a/skills/dev-cycle/SKILL.md +++ b/skills/dev-cycle/SKILL.md @@ -196,8 +196,40 @@ Read context files before writing any code: 3. Read `selected.source_domain` report (if referenced) to understand the motivation behind this action. -Analyze what needs to change based on the action title and source report, then -implement the changes (write and/or edit files). +### Step 3-PRE: Research grounding (v1.11.0 — the anti-maze step) + +> **Why this exists.** The f1 dogfood proved the loop can "iterate without +> improving" — a maze/local-optimum — when generation is anchored to the model's +> own priors instead of to external ground truth. 
The fix (AlphaCodium +> arXiv:2401.08500, which raised pass@5 19%→44% with a structured pre-generation +> stage; AutoCodeRover; Simon Willison's "concrete examples beat abstract +> requirements"): **ground every non-trivial change in an external reference +> BEFORE writing code.** This is Boyd's Observe extended to the world's knowledge, +> not just local state. + +For any leap / quality-improving / "make it better" action (skip for a trivial +mechanical edit), BEFORE writing code: + +1. **Resolve a reference.** Read `config.references` and the research playbook at + `config.research.playbook_path` (default `agent/state/research/playbook.md`; also any other `agent/state/research/*` notes, if present — a researched, cited playbook). Pick the reference target for this + technique/domain (e.g. a named real-product level, a reference implementation + URL, or a specific playbook move with its concrete API/parameters). +2. **Fetch the concrete block.** WebFetch / curl the *specific* reference snippet + (the 30–50 lines that matter — the exact API calls, parameter values, order of + operations), not the whole repo. If a research playbook already contains the + cited concrete spec, use that. +3. **Derive acceptance criteria** from the reference: "the implementation MUST + (a) call X with params Y, (b) produce effect Z visible at camera/probe C, + (c) not break the gate." Record them in the cycle's outcome as + `reference_block` + `acceptance_criteria`. +4. **Only then generate** — implement the cited technique, adapting names to the + real code. The PR/outcome records WHICH reference grounded it (`grounded_in`). + +A leap with no `grounded_in` reference is a red flag for the maze: prefer +researching a concrete approach over reaching for the model's first idea. + +Analyze what needs to change based on the action title, source report, **and the +resolved reference block**, then implement the changes (write and/or edit files). 
**Protected paths enforcement:** diff --git a/skills/evolve/SKILL.md b/skills/evolve/SKILL.md index 9b25848..03acfbb 100644 --- a/skills/evolve/SKILL.md +++ b/skills/evolve/SKILL.md @@ -742,13 +742,38 @@ for the implementation/build domain that declares config.domains[d].quality_rubr if a.dimension == p.leap_target AND a.delta_score < config.leap.min_dimension_delta) if fails >= config.leap.max_attempts_per_dimension: - record skill_gap { name: "leap_stuck_{p.leap_target}", type:"quality_gap", - detail:"Leap on {p.leap_target} failed {fails}× without clearing min delta — - likely UNMEASURABLE by the current capture_method (see 5-G)." } - Create HALT: "Leap on '{p.leap_target}' failed {fails}× — human review needed: - supply a richer capture_method/metrics harness for this - dimension, or reweight the rubric. The loop cannot self-fix it." - set plateau_leap_blocked = true + -- STALL → REWRITE escalation (v1.11.0, the anti-maze fix). Before giving up + -- to a HALT, try ONE from-scratch REWRITE: repeated incremental leaps that + -- stall are the signature of the maze (a symptom patched on a wrong + -- architecture — the f1 `sky.visible=false` class). A patch can't escape; + -- a rewrite can. rubric_score.recommend_rewrite() confirms the stall. + rw = rubric_score.recommend_rewrite(outcomes.entries, p.leap_target, rubric, + min_failed = config.leap.max_attempts_per_dimension) + already_rewrote = any(e.cycle_mode == "rewrite" AND e.leap_target == p.leap_target + for e in last config.leap.max_attempts_per_dimension outcomes) + if config.leap.rewrite_on_stall AND rw.rewrite AND not already_rewrote: + -- Reflexion (arXiv:2303.11366): carry a NEGATIVE-EXAMPLE memo so the + -- rewrite explicitly avoids the stalled approach, and re-ground it + -- (dev-cycle Step 3-PRE) in config.references / the research playbook — + -- the rewrite must implement a CITED reference technique, not re-guess. 
+ write memo { type:"stall_detected", dimension:p.leap_target, + stalled_approach: summary of the last {fails} leaps' diffs, + instruction:"Start from scratch on {p.leap_target}. The incremental + approach stalled. Do NOT reuse it. Ground the rewrite in + config.references[{domain}] / the research playbook (Step 3-PRE)." } + set orient.plateau = { active:true, leap_target:p.leap_target, mode:"rewrite", + weakest_dimension:p.weakest_dimension, artifact_score:p.latest, + reason:"incremental stalled → grounded REWRITE" } + Print "[Orient] ♻️ STALL→REWRITE: incremental leaps on '{p.leap_target}' stalled {fails}×. Next cycle REWRITES from a cited reference (not another patch)." + else: + record skill_gap { name: "leap_stuck_{p.leap_target}", type:"quality_gap", + detail:"Leap+rewrite on {p.leap_target} failed without clearing min delta — + likely UNMEASURABLE by the current capture_method (see 5-G)." } + Create HALT: "Leap+rewrite on '{p.leap_target}' stalled — human review needed: + supply a richer capture_method/metrics harness or authored + assets (asset_sources) for this dimension, or reweight the + rubric. The loop cannot self-fix it." + set plateau_leap_blocked = true else: set orient.plateau = { active:true, leap_target:p.leap_target, weakest_dimension:p.weakest_dimension, @@ -1907,9 +1932,17 @@ verdict = critic( 'it exists and works' result is ~0.10 (score_0.10), NOT 0.5+. Worse than score_0.10 → below 0.10. Do not grade on a curve where 'a decent prototype' = good. --- - Score null only if the evidence is null. Output - {dimension_scores:{axis:score|null}, weakest_dimension, critique(<=30 words)}.", - input: { mission, rubric.dimensions (with reference anchors), evidence: dim_artifact } + --- REFERENCE COMPARISON (v1.11.0): you are also given `references` (the + reference target for this domain — a named real product and/or a + reference-implementation screenshot/spec). 
For each axis state the ONE + concrete attribute of the reference the artifact most lacks (name a + specific technique/parameter, e.g. 'no clearcoat on paint', 'camera has + no speed lead', 'tyres show no slip'). That gap-naming is the primary + Observe signal that drives the next leap — vague 'looks worse' is not + actionable; 'lacks X that reference has' is. --- + Score null only if the evidence is null. Output {dimension_scores:{axis: + score|null}, weakest_dimension, per_axis_gap:{axis:'lacks X'}, critique(<=30 words)}.", + input: { mission, rubric.dimensions (with reference anchors), config.references, evidence: dim_artifact } - ) -> { dimension_scores, weakest_dimension, critique } + ) -> { dimension_scores, weakest_dimension, per_axis_gap, critique } -- ASSET CEILING + HAND-OFF (v1.9.0 → v1.10.0). For the targeted dimension, after diff --git a/tests/verify.py b/tests/verify.py index 9752881..5b9832f 100644 --- a/tests/verify.py +++ b/tests/verify.py @@ -752,6 +752,24 @@ def _mod(name, fn): f"with-assets ceiling={R.asset_ceiling(with_assets)} hit@0.36={R.asset_ceiling_hit(with_assets,0.36)}", ) + # 13) v1.11.0 stall→REWRITE: when the artifact is plateaued AND the incremental + # LEAPS themselves keep failing, escalate from patch to rewrite (the anti-maze + # fix — the f1 'iterate without improving' loop). Plateaued-but-leaps-working + # stays on incremental (leap, don't rewrite). 
+ rubRW = R.rubric_of({"quality_rubric": {"bar": 0.65, "plateau_window": 3, + "plateau_eps": 0.05, "dimensions": [{"name": "v", "weight": 1}]}}) + stalled = [{"artifact_score": 0.50, "weakest_dimension": "v", "dimension_scores": {"v": 0.50}, + "cycle_mode": "leap", "leap_attempts": [{"dimension": "v", "delta_score": 0.01}]}] * 3 + viable = [{"artifact_score": 0.50, "weakest_dimension": "v", "dimension_scores": {"v": 0.50}, + "cycle_mode": "leap", "leap_attempts": [{"dimension": "v", "delta_score": 0.10}]}] * 3 + rw_s = R.recommend_rewrite(stalled, "v", rubRW) + rw_v = R.recommend_rewrite(viable, "v", rubRW) + r.check( + "artifact-axis: stalled incremental leaps escalate to REWRITE; working leaps stay incremental (v1.11.0)", + rw_s["rewrite"] is True and rw_s["failed_leaps"] >= 2 and rw_v["rewrite"] is False, + f"stalled→{rw_s}; viable→{rw_v}", + ) + def main() -> int: r = Runner()