From 503a0654fdb3e1f7913885b897f1d5e9f48c9666 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 09:09:32 +0200 Subject: [PATCH 01/91] harness: interactive proxy robustness + analytics fixes Major improvements to the interactive agent runtime to reduce spurious failures and improve tool feedback quality: - Pre-build Benchmark case modules once per task so the interactive tools don't race the Lake build (eliminates olean-not-found flake). - Add environment_error failure class with one-shot self-heal: when Lean can't find a dependency olean, auto-run lake build and retry before reporting to the model. Stagnation tracking skips these classes. - write_editable_proof now returns immediate warnings (placeholder, hole, theorem_statement_mismatch, hidden_proof_import, etc.) so the model sees actionable feedback before spending a build budget. - Synthesize TOOLS.md for interactive mode from the real tool specs instead of relying on static prompts that reference shell commands. - HTTP layer: retry/backoff with jitter, honour Retry-After, fall back through extra_body.fallback_models, and silently grow max_completion_tokens (up to 12000) on length finish_reason. - Raise max_tool_calls (24 -> 40) and max_completion_tokens (2000 -> 4096) for the default interactive profile. - Tighten HOLE_PATTERN (avoid matching `?_foo`) and broaden hidden import regex to cover `open`/`export` of proof modules. - Fix candidate_change_count / distinct_candidate_count for interactive attempts by falling back to stable_digest of candidate_file_contents when trace is missing. - Add light temperature schedule on repeated failure_class history. Schema: extend agent-run.schema.json with prebuild_reports. Verified: 3 quick tasks pass cleanly on both gpt and builtin/smart with correct analytics (distinct=1, change_count=1). Co-Authored-By: Claude Opus 4.6 --- harness/agents/interactive.json | 4 +- harness/default_agent.py | 258 +++++++++++++++++++++++++++----- harness/interactive_runtime.py | 151 ++++++++++++++++++- schemas/agent-run.schema.json | 6 + 4 files changed, 372 insertions(+), 47 deletions(-) diff --git a/harness/agents/interactive.json b/harness/agents/interactive.json index 8c9bf850..a144bec5 100644 --- a/harness/agents/interactive.json +++ b/harness/agents/interactive.json @@ -20,9 +20,9 @@ "harness/PROOF_PATTERNS.md" ], "temperature": 0.0, - "max_completion_tokens": 2000, + "max_completion_tokens": 4096, "max_attempts": 16, - "max_tool_calls": 24, + "max_tool_calls": 40, "headers": {}, "header_envs": {}, "extra_body": { diff --git a/harness/default_agent.py b/harness/default_agent.py index fb237396..72c4a644 100644 --- a/harness/default_agent.py +++ b/harness/default_agent.py @@ -8,6 +8,7 @@ import re import subprocess import sys +import random import time from dataclasses import dataclass from datetime import datetime, timezone @@ -16,7 +17,13 @@ from urllib import error, request from benchmark_config import load_benchmark_agent_defaults -from interactive_runtime import TaskProofRuntime, tool_result_json, extract_contract_simp_terms, classify_failure +from interactive_runtime import ( + TaskProofRuntime, + classify_failure, + extract_contract_simp_terms, + prebuild_task_modules, + tool_result_json, +) from task_runner import ROOT, load_task_record, resolve_task_manifest AGENT_RESULTS_DIR = ROOT / "results" / "agent_runs" @@ -452,9 +459,48 @@ def resolve_config(path: Path, *, require_secrets: bool, profile: str | None = N ) +def _synthesized_interactive_tools_prompt() -> str: + """Render the real interactive tool surface from TaskProofRuntime.tool_specs(). + + Replaces the static harness/TOOLS.md which advertises `lake build`, `scripts/run_task.sh`, + and `scripts/run_all.sh` — none of which are actually callable in interactive mode. + """ + lines = [ + "# Interactive Tool Surface", + "", + "You have exactly these function tools. Call them; do NOT call shell commands:", + "", + ] + # Build a minimal task shim to get tool_specs without instantiating a real task. + # Note: tool_specs() uses self.paths.public_files for the read_public_file enum, + # so we enumerate generic names here instead of calling tool_specs() directly. + surface = [ + ("read_public_file(path)", "Read one of the task's public Lean files (impl/spec/editable)."), + ("write_editable_proof(content)", "Replace the editable proof file. Returns immediate warnings for placeholders, theorem-signature changes, hidden imports, or unfilled `?_` holes. Does NOT run Lean."), + ("run_lean_check()", "Run `lake env lean` on the editable proof. Returns pass/fail with error details, failure_class, and repair_hints. Auto-retries once on environment errors (missing .olean)."), + ("inspect_lean_goals()", "Inspect goal state at explicit `?_` holes. Unsupported if no hole present."), + ("try_tactic_at_hole(tactic)", "Replace all `?_` holes with a tactic and check. Preserves original proof on failure."), + ("search_public_defs(query)", "Search the task's public impl/spec files for def/theorem/lemma names."), + ] + for name, desc in surface: + lines.append(f"- `{name}` — {desc}") + lines.extend([ + "", + "Typical loop: write_editable_proof → run_lean_check → read repair_hints → iterate.", + "Do NOT emit `lake build` or `scripts/...`; there is no shell tool.", + ]) + return "\n".join(lines) + + def build_system_prompt(config: ResolvedAgentConfig) -> str: sections = [] for rel_path in config.system_prompt_files: + # In interactive mode, replace the static TOOLS.md (which advertises shell + # commands that don't exist) with a synthesized description of the real + # function-tool surface. + if config.mode == "interactive" and rel_path.endswith("TOOLS.md"): + sections.append(f"[{rel_path}]\n{_synthesized_interactive_tools_prompt()}") + continue path = ROOT / rel_path sections.append(f"[{rel_path}]\n{path.read_text(encoding='utf-8').strip()}") return "\n\n".join(sections).strip() @@ -860,7 +906,13 @@ def build_attempt_trace( "candidate_sha256": stable_digest(candidate_text), "status": status, "failure_mode": failure_mode, - "candidate_changed_from_previous": None if previous_attempt is None else candidate_text != previous_candidate, + # Treat the first non-empty candidate as a change (previously was None, which + # broke candidate_change_count analytics — every successful run showed 0). + "candidate_changed_from_previous": ( + bool(candidate_text.strip()) + if previous_attempt is None + else candidate_text != previous_candidate + ), "failure_mode_changed_from_previous": ( None if previous_attempt is None else failure_mode != previous_trace.get("failure_mode") ), @@ -942,21 +994,37 @@ def build_run_analysis( reasoning_attempts = 0 candidate_change_count = 0 failure_mode_change_count = 0 + distinct_candidate_hashes: set[str] = set() + previous_candidate = "" for attempt in attempts: - trace = attempt.get("trace", {}) - if not isinstance(trace, dict): - continue - if int(trace.get("provider_reasoning_chars") or 0) > 0: - reasoning_attempts += 1 - if trace.get("candidate_changed_from_previous") is True: - candidate_change_count += 1 - if trace.get("failure_mode_changed_from_previous") is True: - failure_mode_change_count += 1 + trace = attempt.get("trace", {}) or {} + if isinstance(trace, dict): + if int(trace.get("provider_reasoning_chars") or 0) > 0: + reasoning_attempts += 1 + if trace.get("candidate_changed_from_previous") is True: + candidate_change_count += 1 + if trace.get("failure_mode_changed_from_previous") is True: + failure_mode_change_count += 1 + candidate_hash = trace.get("candidate_sha256") + if isinstance(candidate_hash, str) and candidate_hash and int(trace.get("candidate_chars") or 0) > 0: + distinct_candidate_hashes.add(candidate_hash) + # Fallback for interactive-mode attempts that do not populate `trace`: + # derive candidate changes/hashes directly from candidate_file_contents. + candidate_text = str(attempt.get("candidate_file_contents", "")) + if candidate_text.strip(): + candidate_hash = stable_digest(candidate_text) + if candidate_hash not in distinct_candidate_hashes: + distinct_candidate_hashes.add(candidate_hash) + if not isinstance(trace, dict) or not trace.get("candidate_sha256"): + if candidate_text != previous_candidate: + candidate_change_count += 1 + previous_candidate = candidate_text return { "attempt_count": len(attempts), "tool_calls_used": tool_calls_used, "reasoning_attempt_count": reasoning_attempts, "candidate_change_count": candidate_change_count, + "distinct_candidate_count": len(distinct_candidate_hashes), "failure_mode_change_count": failure_mode_change_count, "final_failure_mode": evaluation.get("failure_mode"), "final_status": evaluation.get("status"), @@ -984,47 +1052,128 @@ def build_finalization_messages( ] +RETRY_STATUS_CODES = frozenset({408, 409, 425, 429, 500, 502, 503, 504}) +MAX_CHAT_COMPLETION_RETRIES = 6 + + +def _parse_retry_after(value: str | None) -> float | None: + if not value: + return None + value = value.strip() + if not value: + return None + try: + return max(0.0, float(value)) + except ValueError: + return None + + +def _backoff_delay(attempt: int, retry_after: float | None) -> float: + if retry_after is not None: + return min(retry_after, 60.0) + # Exponential backoff with jitter, capped at 30s. + base = min(30.0, 2.0 ** attempt) + return base * (0.5 + random.random() * 0.5) + + +def _post_chat_completion( + config: ResolvedAgentConfig, + payload: dict[str, Any], + model: str, +) -> dict[str, Any]: + """POST one chat completion request with retries on transient failures. + + Retries on HTTP 408/409/425/429/500/502/503/504 and URL-level errors (timeouts) + using exponential backoff with jitter, respecting Retry-After when present. + """ + url = f"{config.base_url}{config.chat_completions_path}" + body_payload = dict(payload) + body_payload["model"] = model + req_body = json.dumps(body_payload).encode("utf-8") + headers = { + "Authorization": f"Bearer {config.api_key}", + "Content-Type": "application/json", + "User-Agent": "verity-benchmark/0.1", + **config.headers, + } + last_error: str | None = None + for attempt in range(MAX_CHAT_COMPLETION_RETRIES): + req = request.Request(url, data=req_body, headers=headers, method="POST") + try: + with request.urlopen(req, timeout=config.request_timeout_seconds) as response: + body = response.read().decode("utf-8") + try: + return json.loads(body) + except json.JSONDecodeError as exc: + raise SystemExit( + f"chat completion request returned non-JSON response: {body[:400]!r}" + ) from exc + except error.HTTPError as exc: + detail = exc.read().decode("utf-8", errors="replace") + last_error = f"HTTP {exc.code}: {detail[:400]}" + if exc.code not in RETRY_STATUS_CODES or attempt == MAX_CHAT_COMPLETION_RETRIES - 1: + raise _ChatCompletionError(status=exc.code, detail=detail, model=model) from exc + retry_after = _parse_retry_after(exc.headers.get("Retry-After") if exc.headers else None) + time.sleep(_backoff_delay(attempt, retry_after)) + continue + except error.URLError as exc: + last_error = f"URL error: {exc}" + if attempt == MAX_CHAT_COMPLETION_RETRIES - 1: + raise _ChatCompletionError(status=0, detail=str(exc), model=model) from exc + time.sleep(_backoff_delay(attempt, None)) + continue + raise _ChatCompletionError(status=0, detail=last_error or "unknown", model=model) + + +class _ChatCompletionError(Exception): + def __init__(self, *, status: int, detail: str, model: str) -> None: + super().__init__(f"chat completion failed with status {status}: {detail[:400]}") + self.status = status + self.detail = detail + self.model = model + + def send_chat_completion( config: ResolvedAgentConfig, messages: list[dict[str, Any]], *, tools: list[dict[str, Any]] | None = None, max_tokens_override: int | None = None, + temperature_override: float | None = None, ) -> dict[str, Any]: - url = f"{config.base_url}{config.chat_completions_path}" - payload = { - "model": config.model, + payload: dict[str, Any] = { "messages": messages, - "temperature": config.temperature, + "temperature": config.temperature if temperature_override is None else temperature_override, "max_tokens": max_tokens_override or config.max_completion_tokens, } if tools: payload["tools"] = tools payload["tool_choice"] = "auto" payload.update(config.extra_body) - req = request.Request( - url, - data=json.dumps(payload).encode("utf-8"), - headers={ - "Authorization": f"Bearer {config.api_key}", - "Content-Type": "application/json", - "User-Agent": "verity-benchmark/0.1", - **config.headers, - }, - method="POST", + # Allow configuring a fallback chain via extra_body.fallback_models (list of model ids). + # This lets a rate-limited primary (e.g. "opus") degrade gracefully instead of failing the run. + fallback_models = [ + str(item) + for item in (config.extra_body.get("fallback_models") or []) + if isinstance(item, str) and item.strip() + ] + payload.pop("fallback_models", None) + models_to_try: list[str] = [config.model, *fallback_models] + last_exc: _ChatCompletionError | None = None + for model in models_to_try: + try: + return _post_chat_completion(config, payload, model) + except _ChatCompletionError as exc: + last_exc = exc + # Fall back only on rate-limit / service-unavailable style errors. + if exc.status not in (429, 500, 502, 503, 504) and exc.status != 0: + break + continue + if last_exc is None: + raise SystemExit("chat completion request failed with no attempts") + raise SystemExit( + f"chat completion request failed with HTTP {last_exc.status} (model={last_exc.model}): {last_exc.detail[:400]}" ) - try: - with request.urlopen(req, timeout=config.request_timeout_seconds) as response: - body = response.read().decode("utf-8") - except error.HTTPError as exc: - detail = exc.read().decode("utf-8", errors="replace") - raise SystemExit(f"chat completion request failed with HTTP {exc.code}: {detail}") from exc - except error.URLError as exc: - raise SystemExit(f"chat completion request failed: {exc}") from exc - try: - return json.loads(body) - except json.JSONDecodeError as exc: - raise SystemExit(f"chat completion request returned non-JSON response: {body[:400]!r}") from exc def list_models(config: ResolvedAgentConfig) -> dict[str, Any]: @@ -1593,13 +1742,25 @@ def execute_interactive_agent_task( consecutive_length_stops = 0 max_total_turns = config.max_attempts * 2 # hard cap to prevent infinite loops token_budget = config.max_completion_tokens + # Temperature schedule: escalate after repeated same-class failures to break out + # of deterministic loops where temperature=0 reproduces byte-identical responses. + current_temperature = config.temperature + failure_class_history: list[str] = [] turn = 0 while proof_attempts < config.max_attempts and turn < max_total_turns: turn += 1 + # Adjust temperature when the last two proof attempts failed with the same class. + if ( + len(failure_class_history) >= 2 + and failure_class_history[-1] == failure_class_history[-2] + and failure_class_history[-1] not in ("", "environment_error") + ): + current_temperature = min(0.7, max(current_temperature + 0.2, 0.2)) response = send_chat_completion( config, transcript, tools=runtime.tool_specs(), max_tokens_override=token_budget if token_budget != config.max_completion_tokens else None, + temperature_override=current_temperature if current_temperature != config.temperature else None, ) response_text = extract_text(response) tool_calls = extract_tool_calls(response) @@ -1613,9 +1774,9 @@ def execute_interactive_agent_task( finish_reason = choices[0].get("finish_reason", "") if finish_reason == "length" and not tool_calls and not response_text.strip(): consecutive_length_stops += 1 - if consecutive_length_stops == 1: - # First length stop: bump token budget once and retry silently - token_budget = min(int(token_budget * 1.5), 4500) + # Up to 3 silent budget bumps before nudging the model to simplify. + if consecutive_length_stops <= 3: + token_budget = min(int(token_budget * 1.5), 12000) continue # Subsequent length stops: inject a nudge to simplify and use tools transcript.append({"role": "assistant", "content": ""}) @@ -1627,7 +1788,7 @@ def execute_interactive_agent_task( "then call run_lean_check. Keep the proof short." ), }) - if consecutive_length_stops >= 3: + if consecutive_length_stops >= 5: # Reset budget back to configured value after persistent overruns token_budget = config.max_completion_tokens continue @@ -1656,6 +1817,11 @@ def execute_interactive_agent_task( evaluation = runtime.evaluate_current() attempts[-1]["candidate_file_contents"] = runtime.current_proof_text attempts[-1]["evaluation"] = evaluation + failure_class_history.append( + classify_failure(str(evaluation.get("details", ""))) + if evaluation.get("status") == "failed" + else "" + ) if evaluation["status"] == "passed": return response, response_text, runtime.current_proof_text, evaluation, attempts, tool_calls_used # Failed candidate without tool calls: feed error back @@ -1740,6 +1906,8 @@ def execute_interactive_agent_task( ) if tool_name == "run_lean_check" and result.get("failure_mode") == "lean_check_failed": saw_lean_failure = True + fc = result.get("failure_class") or classify_failure(str(result.get("details", ""))) + failure_class_history.append(str(fc)) elif tool_name in ("run_lean_check", "try_tactic_at_hole") and result.get("status") == "passed": # Normalize to evaluation schema (try_tactic_at_hole returns tactic/details without failure_mode) evaluation = dict(result) @@ -1808,6 +1976,12 @@ def execute_agent_task( return 0, result_path start = time.perf_counter() + # Pre-build implementation/specification modules so `lake env lean` inside + # TaskProofRuntime.evaluate_candidate does not race against on-the-fly + # compilation with fast agent retries. + prebuild_reports: list[dict[str, Any]] = [] + if config.mode == "interactive": + prebuild_reports = prebuild_task_modules(task) if config.mode == "interactive": response, response_text, candidate_text, evaluation, attempts, tool_calls_used = execute_interactive_agent_task( config, @@ -1850,6 +2024,8 @@ def execute_agent_task( result["attempts"] = attempts result["tool_calls_used"] = tool_calls_used result["analysis"] = build_run_analysis(attempts=attempts, evaluation=evaluation, tool_calls_used=tool_calls_used) + if prebuild_reports: + result["prebuild_reports"] = prebuild_reports validate_result_payload(result, task_ref) result_path = write_result(task_ref, config, result) return (0 if evaluation["status"] == "passed" else 1), result_path diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index 23420b59..48c3fc5a 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -12,9 +12,12 @@ PLACEHOLDER_PATTERN = re.compile(r"\b(sorry|admit|axiom)\b") -HOLE_PATTERN = re.compile(r"\?(?:_|\w+)") +# Match standalone `?_` holes only (not `?x` metavariables used in valid tactics). +HOLE_PATTERN = re.compile(r"(? dict[str, Any]: def write_editable_proof(self, content: str) -> dict[str, Any]: self.current_proof_text = content if content.endswith("\n") else f"{content}\n" - return { - "status": "ok", + warnings: list[dict[str, str]] = [] + if not self.current_proof_text.strip(): + warnings.append({"kind": "empty_content", "detail": "candidate is empty"}) + if PLACEHOLDER_PATTERN.search(self.current_proof_text): + warnings.append({ + "kind": "placeholder_detected", + "detail": "contains `sorry`/`admit`/`axiom`; replace before run_lean_check.", + }) + if HIDDEN_PROOF_IMPORT_PATTERN.search(self.current_proof_text): + warnings.append({ + "kind": "hidden_proof_import_detected", + "detail": "remove Benchmark.Cases.*.Proofs import/open/export.", + }) + blocked = self._find_blocked_case_imports(self.current_proof_text) + if blocked: + warnings.append({ + "kind": "hidden_case_import_detected", + "detail": "non-public imports: " + ", ".join(blocked), + }) + if HOLE_PATTERN.search(self.current_proof_text): + warnings.append({ + "kind": "unfilled_hole", + "detail": "proof still contains `?_` holes; fill before submitting.", + }) + candidate_signature = self._extract_theorem_signature(self.current_proof_text) + if candidate_signature != self.expected_theorem_signature: + warnings.append({ + "kind": "theorem_statement_mismatch", + "detail": "editable theorem signature changed; revert to the original statement.", + }) + status = "ok_with_warnings" if warnings else "ok" + result: dict[str, Any] = { + "status": status, "path": self.paths.editable_rel_path, "bytes": len(self.current_proof_text.encode("utf-8")), "lines": len(self.current_proof_text.splitlines()), } + if warnings: + result["warnings"] = warnings + return result def search_public_defs(self, query: str, *, limit: int = 20) -> dict[str, Any]: query_text = query.strip() @@ -351,6 +388,14 @@ def execute_tool(self, name: str, arguments: dict[str, Any]) -> dict[str, Any]: return self.write_editable_proof(str(arguments.get("content", ""))) if name == "run_lean_check": result = self.evaluate_current() + # Auto-heal environment errors (missing .olean) once before annotating. + if result.get("status") == "failed" and result.get("failure_mode") == "lean_check_failed": + details = str(result.get("details", "")) + if classify_failure(details) == "environment_error": + module_name = _missing_olean_module(details) + healed = _attempt_lake_build(module_name) + if healed: + result = self.evaluate_current() if result.get("status") == "failed": result = self._annotate_check_result(result) # Also add structured repair hints from main's guidance @@ -385,6 +430,14 @@ def _annotate_check_result(self, result: dict[str, Any]) -> dict[str, Any]: annotated = dict(result) annotated["failure_class"] = failure_class + # environment_error is infrastructure, not a proof problem. Don't track + # stagnation for it (retrying won't help) and tag the result clearly. + if failure_class == "environment_error": + annotated["environment_error"] = True + if hints: + annotated["repair_hints"] = hints + return annotated + if not is_lean_failure: if hints: annotated["repair_hints"] = hints @@ -546,6 +599,61 @@ def _module_name(rel_path: str) -> str: return module_path.replace("/", ".") +_LAKE_BUILD_CACHE: set[str] = set() + + +def _attempt_lake_build(module_name: str | None) -> bool: + """Best-effort `lake build` for a module. Returns True on success.""" + if not module_name: + return False + if not module_name.startswith("Benchmark."): + return False + if module_name in _LAKE_BUILD_CACHE: + # Already attempted in this process; skip. + return False + _LAKE_BUILD_CACHE.add(module_name) + code, _output = lean_run_command(["lake", "build", module_name], cwd=ROOT) + return code == 0 + + +def prebuild_task_modules(task: dict[str, Any]) -> list[dict[str, Any]]: + """Pre-build implementation/specification .olean files for a task. + + Returns a list of build reports. Meant to be called once before starting + the agent loop so on-the-fly compilation inside `lake env lean` does not + race with fast agent retries. + """ + reports: list[dict[str, Any]] = [] + targets: list[str] = [] + for rel_path in list(task.get("implementation_files", [])) + list(task.get("specification_files", [])): + path = Path(rel_path) + if path.suffix != ".lean": + continue + module_name = ".".join(path.with_suffix("").parts) + # Only modules inside the `Benchmark` lean_lib are buildable via `lake build`. + # Source-of-truth files under `cases/` are mirrored into `Benchmark/Cases/` and + # that mirror is what lake actually compiles. + if not module_name.startswith("Benchmark."): + continue + if module_name in targets: + continue + targets.append(module_name) + for module_name in targets: + if module_name in _LAKE_BUILD_CACHE: + reports.append({"module": module_name, "status": "cached"}) + continue + _LAKE_BUILD_CACHE.add(module_name) + code, output = lean_run_command(["lake", "build", module_name], cwd=ROOT) + reports.append( + { + "module": module_name, + "status": "ok" if code == 0 else "failed", + "output": output[-600:] if code != 0 else "", + } + ) + return reports + + def extract_contract_simp_terms(task: dict[str, Any]) -> list[str]: """Extract concrete simp terms from implementation and specification files. @@ -586,11 +694,39 @@ def extract_contract_simp_terms(task: dict[str, Any]) -> list[str]: return terms +ENVIRONMENT_ERROR_PATTERNS = ( + re.compile(r"object file ['\"][^'\"]+\.olean['\"]? does not exist"), + re.compile(r"failed to load environment"), + re.compile(r"lean executable .* not found", re.IGNORECASE), +) + + +def _missing_olean_module(details: str) -> str | None: + """Extract the module name whose .olean is missing, if the error is environmental.""" + match = re.search(r"object file ['\"]([^'\"]+\.olean)['\"]?", details) + if not match: + return None + olean_path = match.group(1) + # Strip any leading directories up to "Benchmark" (since paths may be absolute) + marker = "/Benchmark/" + idx = olean_path.find(marker) + if idx >= 0: + rel = olean_path[idx + 1 :] + else: + rel = olean_path + if rel.endswith(".olean"): + rel = rel[: -len(".olean")] + return rel.replace("/", ".") + + def classify_failure(details: str) -> str: """Classify a Lean checker failure into a coarse category.""" if not details: return "unknown" lower = details.lower() + for pattern in ENVIRONMENT_ERROR_PATTERNS: + if pattern.search(details): + return "environment_error" if "unknown identifier" in lower or "unknown constant" in lower: return "unknown_identifier" if "unsolved goals" in lower: @@ -625,6 +761,13 @@ def classify_failure(details: str) -> str: def _build_check_hints(failure_class: str, details: str) -> list[str]: """Build targeted repair hints based on failure classification.""" hints: list[str] = [] + if failure_class == "environment_error": + hints.append( + "ENVIRONMENT ERROR (not your fault): a dependency .olean is missing. " + "The harness is attempting to rebuild it. If this persists, your proof is likely correct; " + "retry run_lean_check once more." + ) + return hints if failure_class == "unknown_identifier": if "decide_True" in details or "decide_False" in details: hints.append("CRITICAL: `decide_True` and `decide_False` do not exist. Remove them. Instead, pass precondition hypotheses directly to `simp` - it handles `decide` reduction automatically.") diff --git a/schemas/agent-run.schema.json b/schemas/agent-run.schema.json index ece50053..3b0b612a 100644 --- a/schemas/agent-run.schema.json +++ b/schemas/agent-run.schema.json @@ -283,6 +283,12 @@ }, "analysis": { "type": "object" + }, + "prebuild_reports": { + "type": "array", + "items": { + "type": "object" + } } } } From 5cef24644eff278e40cc14a1b508e7c53fe9b43d Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 09:17:29 +0200 Subject: [PATCH 02/91] fix: address codex review feedback - P1: only cache *successful* lake builds so failed builds can be retried on subsequent calls. Previously `_LAKE_BUILD_CACHE` marked modules as attempted before invoking lake, which prevented the self-heal path from ever recovering from a transient build failure in batch/suite runs that share a process. - P2: count every candidate transition in the interactive analytics fallback, including reverts (A -> B -> A now counts 2 changes, not 1). Previously the increment was gated by "hash not yet seen", which undercounted non-monotonic candidate sequences. Co-Authored-By: Claude Opus 4.6 --- harness/default_agent.py | 11 ++++++----- harness/interactive_runtime.py | 29 +++++++++++++++++++---------- 2 files changed, 25 insertions(+), 15 deletions(-) diff --git a/harness/default_agent.py b/harness/default_agent.py index 72c4a644..29981435 100644 --- a/harness/default_agent.py +++ b/harness/default_agent.py @@ -1010,14 +1010,15 @@ def build_run_analysis( distinct_candidate_hashes.add(candidate_hash) # Fallback for interactive-mode attempts that do not populate `trace`: # derive candidate changes/hashes directly from candidate_file_contents. + # Count every transition (incl. reverts like A -> B -> A), and record + # each distinct hash separately. candidate_text = str(attempt.get("candidate_file_contents", "")) if candidate_text.strip(): candidate_hash = stable_digest(candidate_text) - if candidate_hash not in distinct_candidate_hashes: - distinct_candidate_hashes.add(candidate_hash) - if not isinstance(trace, dict) or not trace.get("candidate_sha256"): - if candidate_text != previous_candidate: - candidate_change_count += 1 + distinct_candidate_hashes.add(candidate_hash) + if not isinstance(trace, dict) or not trace.get("candidate_sha256"): + if candidate_text != previous_candidate: + candidate_change_count += 1 previous_candidate = candidate_text return { "attempt_count": len(attempts), diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index 48c3fc5a..50a8fdca 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -599,21 +599,28 @@ def _module_name(rel_path: str) -> str: return module_path.replace("/", ".") -_LAKE_BUILD_CACHE: set[str] = set() +_LAKE_BUILD_CACHE: dict[str, bool] = {} def _attempt_lake_build(module_name: str | None) -> bool: - """Best-effort `lake build` for a module. Returns True on success.""" + """Best-effort `lake build` for a module. Returns True on success. + + Only successful builds are cached; failures are retried on subsequent calls + so that transient build errors can be recovered from when the runtime is + reused across tasks (e.g. batch / suite runs in a single process). + """ if not module_name: return False if not module_name.startswith("Benchmark."): return False - if module_name in _LAKE_BUILD_CACHE: - # Already attempted in this process; skip. + if _LAKE_BUILD_CACHE.get(module_name): + # Already built successfully in this process; skip. return False - _LAKE_BUILD_CACHE.add(module_name) code, _output = lean_run_command(["lake", "build", module_name], cwd=ROOT) - return code == 0 + success = code == 0 + if success: + _LAKE_BUILD_CACHE[module_name] = True + return success def prebuild_task_modules(task: dict[str, Any]) -> list[dict[str, Any]]: @@ -639,16 +646,18 @@ def prebuild_task_modules(task: dict[str, Any]) -> list[dict[str, Any]]: continue targets.append(module_name) for module_name in targets: - if module_name in _LAKE_BUILD_CACHE: + if _LAKE_BUILD_CACHE.get(module_name): reports.append({"module": module_name, "status": "cached"}) continue - _LAKE_BUILD_CACHE.add(module_name) code, output = lean_run_command(["lake", "build", module_name], cwd=ROOT) + success = code == 0 + if success: + _LAKE_BUILD_CACHE[module_name] = True reports.append( { "module": module_name, - "status": "ok" if code == 0 else "failed", - "output": output[-600:] if code != 0 else "", + "status": "ok" if success else "failed", + "output": output[-600:] if not success else "", } ) return reports From 9ff078dc7831eadba6de22aca6e49754e0be31ac Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 10:04:25 +0200 Subject: [PATCH 03/91] fix: make auto-heal lake build actually rebuild on cache hit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bugbot (Medium): `_attempt_lake_build` returned False when the module was already cached as built, so after `prebuild_task_modules` populated the cache, every subsequent environment_error auto-heal attempt short-circuited without invoking lake and without retrying the check — while the model was told "The harness is attempting to rebuild it". The auto-heal path only fires when the runtime has observed a missing .olean at check time, which means the cached 'success' entry is stale. Always invoke lake build in this path and refresh the cache with the latest result. Co-Authored-By: Claude Opus 4.6 --- harness/interactive_runtime.py | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index 50a8fdca..7ca3beef 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -605,21 +605,19 @@ def _module_name(rel_path: str) -> str: def _attempt_lake_build(module_name: str | None) -> bool: """Best-effort `lake build` for a module. Returns True on success. - Only successful builds are cached; failures are retried on subsequent calls - so that transient build errors can be recovered from when the runtime is - reused across tasks (e.g. batch / suite runs in a single process). + Always invokes `lake build` — this is the self-heal path, called when the + runtime observed a missing .olean at check time, so the previously cached + "success" entry is stale and cannot be trusted. The cache is refreshed + with the latest result so subsequent prebuild calls can short-circuit + correctly. """ if not module_name: return False if not module_name.startswith("Benchmark."): return False - if _LAKE_BUILD_CACHE.get(module_name): - # Already built successfully in this process; skip. - return False code, _output = lean_run_command(["lake", "build", module_name], cwd=ROOT) success = code == 0 - if success: - _LAKE_BUILD_CACHE[module_name] = True + _LAKE_BUILD_CACHE[module_name] = success return success From 950d7e31453ae633b87cfdab4475fd590402930a Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 10:19:50 +0200 Subject: [PATCH 04/91] fix: address additional bugbot review feedback - Low: merge the two back-to-back `if config.mode == "interactive":` blocks in `execute_agent_task` into a single branch. - Low: `try_tactic_at_hole` now uses the same `HOLE_PATTERN` regex as `inspect_lean_goals`, so the two tools agree on what counts as a hole (previously the tightened lookbehind in HOLE_PATTERN was not mirrored in `try_tactic_at_hole`, which could let it replace a match that `inspect_lean_goals` had reported as absent). Co-Authored-By: Claude Opus 4.6 --- harness/default_agent.py | 1 - harness/interactive_runtime.py | 6 ++++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/harness/default_agent.py b/harness/default_agent.py index 29981435..83fddf52 100644 --- a/harness/default_agent.py +++ b/harness/default_agent.py @@ -1983,7 +1983,6 @@ def execute_agent_task( prebuild_reports: list[dict[str, Any]] = [] if config.mode == "interactive": prebuild_reports = prebuild_task_modules(task) - if config.mode == "interactive": response, response_text, candidate_text, evaluation, attempts, tool_calls_used = execute_interactive_agent_task( config, task, diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index 7ca3beef..524503f9 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -175,8 +175,10 @@ def try_tactic_at_hole(self, tactic: str) -> dict[str, Any]: if not tactic.strip(): return {"status": "rejected", "reason": "tactic_must_not_be_empty"} original = self.current_proof_text - # Replace standalone ?_ holes (not named holes like ?_foo) - modified = re.sub(r"\?_(?!\w)", tactic.strip(), original) + # Replace standalone `?_` holes (not named holes like `?_foo` and not + # identifiers ending in `?_`). Must match HOLE_PATTERN so both tools + # agree on what counts as a hole. + modified = HOLE_PATTERN.sub(tactic.strip(), original) if modified == original: return { "status": "unsupported", From 3ddda642d4e01a7dd0f8dad6ff779ee3e9df761f Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 10:41:42 +0200 Subject: [PATCH 05/91] fix: accept HTTP-date form of Retry-After Codex (P2): `_parse_retry_after` only handled delta-seconds, so when a provider sent an HTTP-date on 429/503 (which RFC 7231 permits), the function returned None and `_backoff_delay` fell back to a short exponential retry instead of honouring the server-requested wait. That can repeatedly hit rate limits and prolong failures. Now parse both forms; a date in the past clamps to 0. Co-Authored-By: Claude Opus 4.6 --- harness/default_agent.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/harness/default_agent.py b/harness/default_agent.py index 83fddf52..c251495c 100644 --- a/harness/default_agent.py +++ b/harness/default_agent.py @@ -1058,6 +1058,15 @@ def build_finalization_messages( def _parse_retry_after(value: str | None) -> float | None: + """Parse an HTTP `Retry-After` header. + + Accepts both forms permitted by RFC 7231: + - delta-seconds (e.g. "120") + - HTTP-date (e.g. "Wed, 21 Oct 2015 07:28:00 GMT") + + Returns the number of seconds to wait, or None if the value cannot be + parsed. A date in the past is clamped to 0. + """ if not value: return None value = value.strip() @@ -1066,6 +1075,19 @@ def _parse_retry_after(value: str | None) -> float | None: try: return max(0.0, float(value)) except ValueError: + pass + try: + from email.utils import parsedate_to_datetime + import datetime as _dt + + parsed = parsedate_to_datetime(value) + if parsed is None: + return None + if parsed.tzinfo is None: + parsed = parsed.replace(tzinfo=_dt.timezone.utc) + delta = (parsed - _dt.datetime.now(_dt.timezone.utc)).total_seconds() + return max(0.0, delta) + except (TypeError, ValueError): return None From df1004c256ca4d0c54745d0c5f88928b8d5d758e Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 10:52:31 +0200 Subject: [PATCH 06/91] fix: narrow environment_error + honour longer Retry-After Codex P2 #1: narrow `environment_error` classification to real infra failures. Previously any missing .olean matched, but that string is also produced when the model imports a non-existent module. In the latter case we want the normal stagnation/temperature logic to kick in so the model can correct itself. Now only classify as environment_error when the missing module is under `Benchmark.*` (our dependency tree which should have been pre-built) or when lean itself is missing. The generic "failed to load environment" substring no longer short-circuits. Codex P2 #2: raise `_backoff_delay` Retry-After ceiling from 60s to 600s. Providers routinely request several-minute waits on 429/503; the old 60s clamp caused retries to fire while the rate limit was still in force, undermining the "honour Retry-After" behaviour. Co-Authored-By: Claude Opus 4.6 --- harness/default_agent.py | 6 +++++- harness/interactive_runtime.py | 20 +++++++++++++++----- 2 files changed, 20 insertions(+), 6 deletions(-) diff --git a/harness/default_agent.py b/harness/default_agent.py index c251495c..c5be3afc 100644 --- a/harness/default_agent.py +++ b/harness/default_agent.py @@ -1093,7 +1093,11 @@ def _parse_retry_after(value: str | None) -> float | None: def _backoff_delay(attempt: int, retry_after: float | None) -> float: if retry_after is not None: - return min(retry_after, 60.0) + # Honour the provider-requested wait. Clamp only at a safety ceiling + # (10 minutes) so a pathological header cannot stall the run + # indefinitely; the previous 60s clamp was too aggressive and caused + # retries to fire while the rate limit was still in force. + return min(retry_after, 600.0) # Exponential backoff with jitter, capped at 30s. base = min(30.0, 2.0 ** attempt) return base * (0.5 + random.random() * 0.5) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index 524503f9..37cc80aa 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -703,16 +703,19 @@ def extract_contract_simp_terms(task: dict[str, Any]) -> list[str]: return terms -ENVIRONMENT_ERROR_PATTERNS = ( - re.compile(r"object file ['\"][^'\"]+\.olean['\"]? does not exist"), - re.compile(r"failed to load environment"), +# Missing-olean errors can be infrastructure (a Benchmark dependency wasn't +# pre-built) or the model's fault (imported a module that doesn't exist). We +# only classify the former as environment_error so stagnation/temperature +# logic still applies to model-caused import mistakes. +_MISSING_OLEAN_RE = re.compile(r"object file ['\"]([^'\"]+\.olean)['\"]? does not exist") +INFRA_ONLY_ERROR_PATTERNS = ( re.compile(r"lean executable .* not found", re.IGNORECASE), ) def _missing_olean_module(details: str) -> str | None: """Extract the module name whose .olean is missing, if the error is environmental.""" - match = re.search(r"object file ['\"]([^'\"]+\.olean)['\"]?", details) + match = _MISSING_OLEAN_RE.search(details) if not match: return None olean_path = match.group(1) @@ -733,9 +736,16 @@ def classify_failure(details: str) -> str: if not details: return "unknown" lower = details.lower() - for pattern in ENVIRONMENT_ERROR_PATTERNS: + # Infrastructure errors that the model cannot reasonably be blamed for. + for pattern in INFRA_ONLY_ERROR_PATTERNS: if pattern.search(details): return "environment_error" + # Missing .olean is infra only when it is a Benchmark.* dependency (which + # should have been pre-built). A missing olean for any other path means + # the model imported / referenced something that doesn't exist. + missing_module = _missing_olean_module(details) + if missing_module and missing_module.startswith("Benchmark."): + return "environment_error" if "unknown identifier" in lower or "unknown constant" in lower: return "unknown_identifier" if "unsolved goals" in lower: From 894a285f924d4ec3270a90def57023169e79d8d8 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 11:01:36 +0200 Subject: [PATCH 07/91] fix: only bump temperature per new failure + match 'of module' form Bugbot (Medium): the temperature schedule condition was checked at the top of every loop iteration, so once two consecutive same-class failures triggered it, temperature was bumped on every subsequent turn -- including pure search/write turns -- until the 0.7 cap was reached. Track the history length we have already acted on and only bump once per new failure entry. Codex (P1): Lean reports missing artifacts in two forms depending on context: object file '...olean' does not exist object file '...olean' of module does not exist The `_MISSING_OLEAN_RE` regex only matched the shorter form, so on the more common "of module" diagnostic `_missing_olean_module` returned None, `classify_failure` did not return `environment_error`, and the auto-heal retry path was skipped. Accept arbitrary text between the path and the "does not exist" tail. Co-Authored-By: Claude Opus 4.6 --- harness/default_agent.py | 12 ++++++++++-- harness/interactive_runtime.py | 9 ++++++++- 2 files changed, 18 insertions(+), 3 deletions(-) diff --git a/harness/default_agent.py b/harness/default_agent.py index c5be3afc..8f6f0d70 100644 --- a/harness/default_agent.py +++ b/harness/default_agent.py @@ -1773,17 +1773,25 @@ def execute_interactive_agent_task( # of deterministic loops where temperature=0 reproduces byte-identical responses. current_temperature = config.temperature failure_class_history: list[str] = [] + # Track how many failures we have already applied the temperature-bump + # schedule to, so we don't keep escalating temperature on every iteration + # once the trigger condition is first met (it would otherwise run to the + # cap within a few turns regardless of intervening search/write activity). + temperature_schedule_applied_at = 0 turn = 0 while proof_attempts < config.max_attempts and turn < max_total_turns: turn += 1 - # Adjust temperature when the last two proof attempts failed with the same class. + # Adjust temperature once per new failure entry when the last two + # proof attempts failed with the same class. if ( - len(failure_class_history) >= 2 + len(failure_class_history) > temperature_schedule_applied_at + and len(failure_class_history) >= 2 and failure_class_history[-1] == failure_class_history[-2] and failure_class_history[-1] not in ("", "environment_error") ): current_temperature = min(0.7, max(current_temperature + 0.2, 0.2)) + temperature_schedule_applied_at = len(failure_class_history) response = send_chat_completion( config, transcript, tools=runtime.tool_specs(), max_tokens_override=token_budget if token_budget != config.max_completion_tokens else None, diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index 37cc80aa..f0a61513 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -707,7 +707,14 @@ def extract_contract_simp_terms(task: dict[str, Any]) -> list[str]: # pre-built) or the model's fault (imported a module that doesn't exist). We # only classify the former as environment_error so stagnation/temperature # logic still applies to model-caused import mistakes. -_MISSING_OLEAN_RE = re.compile(r"object file ['\"]([^'\"]+\.olean)['\"]? does not exist") +# Lean prints both forms of this diagnostic, depending on context: +# object file '.olean' does not exist +# object file '.olean' of module does not exist +# so accept arbitrary text (incl. "of module ") between the path and +# the "does not exist" tail. +_MISSING_OLEAN_RE = re.compile( + r"object file ['\"]([^'\"]+\.olean)['\"]?[^\n]*?does not exist" +) INFRA_ONLY_ERROR_PATTERNS = ( re.compile(r"lean executable .* not found", re.IGNORECASE), ) From 42efe751536acfee0b1b5496dbc921655c0dbfee Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 11:11:35 +0200 Subject: [PATCH 08/91] fix: require benchmark source file to exist before env classification Codex (P2): a model-authored bad import like `import Benchmark.Foo.Bar` where no such module exists would still produce an "object file ...olean does not exist" diagnostic, and under the previous check that matched the `Benchmark.*` prefix it would be misclassified as infrastructure failure -- skipping stagnation/temperature correction instead of letting the model discover and fix the bogus import. Require the corresponding `.lean` source file to actually exist in the tree before treating a missing .olean as environment_error. Co-Authored-By: Claude Opus 4.6 --- harness/interactive_runtime.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index f0a61513..170dcfd4 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -747,12 +747,16 @@ def classify_failure(details: str) -> str: for pattern in INFRA_ONLY_ERROR_PATTERNS: if pattern.search(details): return "environment_error" - # Missing .olean is infra only when it is a Benchmark.* dependency (which - # should have been pre-built). A missing olean for any other path means - # the model imported / referenced something that doesn't exist. + # Missing .olean is infra only when it is a Benchmark.* dependency *whose + # source file actually exists* in the tree -- meaning lake should have + # built it but didn't. If the source file is missing too, the model + # imported / referenced something that doesn't exist, which is its own + # mistake and should go through the normal stagnation/temperature loop. missing_module = _missing_olean_module(details) if missing_module and missing_module.startswith("Benchmark."): - return "environment_error" + source_rel = Path(*missing_module.split(".")).with_suffix(".lean") + if (ROOT / source_rel).is_file(): + return "environment_error" if "unknown identifier" in lower or "unknown constant" in lower: return "unknown_identifier" if "unsolved goals" in lower: From 89b890e609f45cdb91980fd90adca8f21ba1fc73 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 11:21:20 +0200 Subject: [PATCH 09/91] fix: fall back on all transient HTTP statuses Codex (P2): `send_chat_completion`'s fallback-model gate used a hard-coded subset {429, 500, 502, 503, 504}, but `_post_chat_completion` retries the broader `RETRY_STATUS_CODES` set (408, 409, 425, 429, 500, 502, 503, 504). A primary model that kept returning 408/409/425 would exhaust retries and then skip the configured fallback chain entirely. Route on the same set used by the underlying retry loop. Co-Authored-By: Claude Opus 4.6 --- harness/default_agent.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/harness/default_agent.py b/harness/default_agent.py index 8f6f0d70..1289d23b 100644 --- a/harness/default_agent.py +++ b/harness/default_agent.py @@ -1192,8 +1192,11 @@ def send_chat_completion( return _post_chat_completion(config, payload, model) except _ChatCompletionError as exc: last_exc = exc - # Fall back only on rate-limit / service-unavailable style errors. - if exc.status not in (429, 500, 502, 503, 504) and exc.status != 0: + # Fall back on the same transient statuses `_post_chat_completion` + # retries internally (plus status 0 for network/read errors), so a + # primary that keeps returning 408/409/425/429/5xx gets routed to + # the configured fallback chain instead of hard-failing. + if exc.status not in RETRY_STATUS_CODES and exc.status != 0: break continue if last_exc is None: From f8d43bbfaea6d9c786cfda787f8e04d0d764b33d Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 11:42:21 +0200 Subject: [PATCH 10/91] fix: rfind Benchmark marker + extra_body cannot clobber overrides Co-Authored-By: Claude Opus 4.6 --- harness/default_agent.py | 12 +++++++----- harness/interactive_runtime.py | 2 +- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/harness/default_agent.py b/harness/default_agent.py index 1289d23b..8d15fb2c 100644 --- a/harness/default_agent.py +++ b/harness/default_agent.py @@ -1168,15 +1168,17 @@ def send_chat_completion( max_tokens_override: int | None = None, temperature_override: float | None = None, ) -> dict[str, Any]: - payload: dict[str, Any] = { - "messages": messages, - "temperature": config.temperature if temperature_override is None else temperature_override, - "max_tokens": max_tokens_override or config.max_completion_tokens, - } + payload: dict[str, Any] = {"messages": messages} if tools: payload["tools"] = tools payload["tool_choice"] = "auto" + # Apply extra_body first so computed overrides below win over any + # temperature/max_tokens keys the user may have stashed in extra_body. payload.update(config.extra_body) + payload["temperature"] = ( + config.temperature if temperature_override is None else temperature_override + ) + payload["max_tokens"] = max_tokens_override or config.max_completion_tokens # Allow configuring a fallback chain via extra_body.fallback_models (list of model ids). # This lets a rate-limited primary (e.g. "opus") degrade gracefully instead of failing the run. fallback_models = [ diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index 170dcfd4..8ec303c1 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -728,7 +728,7 @@ def _missing_olean_module(details: str) -> str | None: olean_path = match.group(1) # Strip any leading directories up to "Benchmark" (since paths may be absolute) marker = "/Benchmark/" - idx = olean_path.find(marker) + idx = olean_path.rfind(marker) if idx >= 0: rel = olean_path[idx + 1 :] else: From 07c3880b8dce2f59e306862d91b94437c42e158c Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 12:02:41 +0200 Subject: [PATCH 11/91] fix: bound length-retry budget + ignore env_error in temp schedule Three related fixes from bugbot/codex on f8d43bb: 1. (codex P1) Cap length-retry token bump at config.max_completion_tokens. The previous hard-coded cap of 12000 could exceed the provider's per- response output-token limit (e.g. models with a 4096 hard cap), turning a recoverable truncation into a non-transient HTTP 400 hard failure. 2. (bugbot Low) Also reset token_budget when recovering from a length streak, not only when consecutive_length_stops >= 5. Previously a 1-4 stop streak followed by recovery left the elevated budget in place for the rest of the run. 3. (bugbot Low) Skip environment_error entries when appending to failure_class_history. They are infra noise that would break the sliding window same-class check (e.g. [type_error, env_error, type_error]) even though the filter at the trigger site rejects them -- consistent with the PR's stated intent that env_error be invisible to stagnation tracking. Co-Authored-By: Claude Opus 4.6 --- harness/default_agent.py | 28 ++++++++++++++++++++++------ 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/harness/default_agent.py b/harness/default_agent.py index 8d15fb2c..3a93bda0 100644 --- a/harness/default_agent.py +++ b/harness/default_agent.py @@ -1815,8 +1815,11 @@ def execute_interactive_agent_task( if finish_reason == "length" and not tool_calls and not response_text.strip(): consecutive_length_stops += 1 # Up to 3 silent budget bumps before nudging the model to simplify. + # Cap bump at `config.max_completion_tokens` so we never exceed the + # provider-enforced per-response limit (some models hard-cap at the + # configured value and would return HTTP 400 on anything larger). if consecutive_length_stops <= 3: - token_budget = min(int(token_budget * 1.5), 12000) + token_budget = min(int(token_budget * 1.5), config.max_completion_tokens) continue # Subsequent length stops: inject a nudge to simplify and use tools transcript.append({"role": "assistant", "content": ""}) @@ -1828,12 +1831,15 @@ def execute_interactive_agent_task( "then call run_lean_check. Keep the proof short." ), }) - if consecutive_length_stops >= 5: - # Reset budget back to configured value after persistent overruns - token_budget = config.max_completion_tokens + # Reset budget back to configured value after persistent overruns + token_budget = config.max_completion_tokens continue else: + # Recovered from any length streak -- reset both the counter and + # the (possibly-elevated) token budget so we don't leak state into + # subsequent turns. consecutive_length_stops = 0 + token_budget = config.max_completion_tokens attempts.append( { @@ -1857,11 +1863,18 @@ def execute_interactive_agent_task( evaluation = runtime.evaluate_current() attempts[-1]["candidate_file_contents"] = runtime.current_proof_text attempts[-1]["evaluation"] = evaluation - failure_class_history.append( + # Track real model-driven failure classes for the temperature + # schedule's sliding window. Environment errors are infra noise + # that would break same-class detection (e.g. ["type_error", + # "environment_error", "type_error"] looks like a class change) + # so they are filtered out of the history. + fc_entry = ( classify_failure(str(evaluation.get("details", ""))) if evaluation.get("status") == "failed" else "" ) + if fc_entry != "environment_error": + failure_class_history.append(fc_entry) if evaluation["status"] == "passed": return response, response_text, runtime.current_proof_text, evaluation, attempts, tool_calls_used # Failed candidate without tool calls: feed error back @@ -1947,7 +1960,10 @@ def execute_interactive_agent_task( if tool_name == "run_lean_check" and result.get("failure_mode") == "lean_check_failed": saw_lean_failure = True fc = result.get("failure_class") or classify_failure(str(result.get("details", ""))) - failure_class_history.append(str(fc)) + # Skip environment errors: they are infra noise that would + # break the temperature schedule's same-class sliding window. + if str(fc) != "environment_error": + failure_class_history.append(str(fc)) elif tool_name in ("run_lean_check", "try_tactic_at_hole") and result.get("status") == "passed": # Normalize to evaluation schema (try_tactic_at_hole returns tactic/details without failure_mode) evaluation = dict(result) From 47e80848c28dc0795d2d10065d98502123acdf20 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 12:18:21 +0200 Subject: [PATCH 12/91] fix: retry on socket TimeoutError instead of hard-failing task Python 3.12 surfaces socket read timeouts during SSL as bare TimeoutError, which slipped past the urllib.error.URLError handler and killed the task with a traceback. Catch it alongside URLError and apply the same backoff retry policy. Co-Authored-By: Claude Opus 4.6 --- harness/default_agent.py | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/harness/default_agent.py b/harness/default_agent.py index 3a93bda0..be455a04 100644 --- a/harness/default_agent.py +++ b/harness/default_agent.py @@ -1149,6 +1149,15 @@ def _post_chat_completion( raise _ChatCompletionError(status=0, detail=str(exc), model=model) from exc time.sleep(_backoff_delay(attempt, None)) continue + except TimeoutError as exc: + # Python 3.10+: socket.timeout during SSL read surfaces as + # TimeoutError rather than urllib.error.URLError. Treat it as + # a transient network failure and retry with backoff. + last_error = f"Read timeout: {exc}" + if attempt == MAX_CHAT_COMPLETION_RETRIES - 1: + raise _ChatCompletionError(status=0, detail=str(exc), model=model) from exc + time.sleep(_backoff_delay(attempt, None)) + continue raise _ChatCompletionError(status=0, detail=last_error or "unknown", model=model) From c5f118c37b9930a5866a98191efb48f80dafd2ad Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 12:55:26 +0200 Subject: [PATCH 13/91] fix: make length-retry token cap configurable via extra_body Previous commit capped the silent length-retry bump at config.max_completion_tokens, which -- as bugbot and codex both pointed out -- turned the feature into a no-op because token_budget starts at exactly that value. Add a config.extra_body.length_retry_token_cap knob (stripped from the provider payload in send_chat_completion). Default stays at max_completion_tokens so models with a hard per-response cap don't get surprised by HTTP 400; interactive.json opts into a 12000 ceiling, which is the original intended behavior for gpt-class models that accept larger single-response budgets. Co-Authored-By: Claude Opus 4.6 --- harness/agents/interactive.json | 3 ++- harness/default_agent.py | 15 ++++++++++++++- 2 files changed, 16 insertions(+), 2 deletions(-) diff --git a/harness/agents/interactive.json b/harness/agents/interactive.json index a144bec5..dbc9ef65 100644 --- a/harness/agents/interactive.json +++ b/harness/agents/interactive.json @@ -28,7 +28,8 @@ "extra_body": { "thinking": { "type": "disabled" - } + }, + "length_retry_token_cap": 12000 }, "request_timeout_seconds": 120 } diff --git a/harness/default_agent.py b/harness/default_agent.py index be455a04..3970b362 100644 --- a/harness/default_agent.py +++ b/harness/default_agent.py @@ -1196,6 +1196,9 @@ def send_chat_completion( if isinstance(item, str) and item.strip() ] payload.pop("fallback_models", None) + # Benchmark-only knob consumed in execute_interactive_agent_task; strip + # it so providers don't reject the request with an unknown-field error. + payload.pop("length_retry_token_cap", None) models_to_try: list[str] = [config.model, *fallback_models] last_exc: _ChatCompletionError | None = None for model in models_to_try: @@ -1783,6 +1786,16 @@ def execute_interactive_agent_task( consecutive_length_stops = 0 max_total_turns = config.max_attempts * 2 # hard cap to prevent infinite loops token_budget = config.max_completion_tokens + # Ceiling for the length-retry silent bump. Read from config.extra_body so + # operators can opt into larger bumps for providers that accept them, but + # default to `max_completion_tokens` so models with a hard cap at that value + # don't get HTTP 400 when the bump kicks in. Stripped from the request + # payload in `send_chat_completion` so it never leaks to the provider. + length_retry_token_cap = int( + config.extra_body.get("length_retry_token_cap", config.max_completion_tokens) + ) + if length_retry_token_cap < config.max_completion_tokens: + length_retry_token_cap = config.max_completion_tokens # Temperature schedule: escalate after repeated same-class failures to break out # of deterministic loops where temperature=0 reproduces byte-identical responses. current_temperature = config.temperature @@ -1828,7 +1841,7 @@ def execute_interactive_agent_task( # provider-enforced per-response limit (some models hard-cap at the # configured value and would return HTTP 400 on anything larger). if consecutive_length_stops <= 3: - token_budget = min(int(token_budget * 1.5), config.max_completion_tokens) + token_budget = min(int(token_budget * 1.5), length_retry_token_cap) continue # Subsequent length stops: inject a nudge to simplify and use tools transcript.append({"role": "assistant", "content": ""}) From 54571ce2406e50d428bb556ac44b84f6874b6613 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 13:21:06 +0200 Subject: [PATCH 14/91] fix: defensively parse fallback_models and length_retry_token_cap Both knobs are read from schema-free extra_body: - fallback_models="gpt-4o-mini" (string shorthand) previously iterated character-by-character, producing fake model ids like "g", "p", "t" during failover. Normalize a bare string to a one-element list first. - length_retry_token_cap=null / "12k" / nested object would crash the int() cast before the first model turn, aborting the task. Fall back to max_completion_tokens silently. Co-Authored-By: Claude Opus 4.6 --- harness/default_agent.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/harness/default_agent.py b/harness/default_agent.py index 3970b362..700005a2 100644 --- a/harness/default_agent.py +++ b/harness/default_agent.py @@ -1190,9 +1190,16 @@ def send_chat_completion( payload["max_tokens"] = max_tokens_override or config.max_completion_tokens # Allow configuring a fallback chain via extra_body.fallback_models (list of model ids). # This lets a rate-limited primary (e.g. "opus") degrade gracefully instead of failing the run. + # Normalize fallback_models: accept a list of strings (standard) or a + # single string (common operator shorthand). A bare string must not be + # iterated character-by-character, which would produce single-letter + # "models" like "g", "p", "t". + raw_fallback = config.extra_body.get("fallback_models") or [] + if isinstance(raw_fallback, str): + raw_fallback = [raw_fallback] fallback_models = [ str(item) - for item in (config.extra_body.get("fallback_models") or []) + for item in raw_fallback if isinstance(item, str) and item.strip() ] payload.pop("fallback_models", None) @@ -1791,9 +1798,13 @@ def execute_interactive_agent_task( # default to `max_completion_tokens` so models with a hard cap at that value # don't get HTTP 400 when the bump kicks in. Stripped from the request # payload in `send_chat_completion` so it never leaks to the provider. - length_retry_token_cap = int( - config.extra_body.get("length_retry_token_cap", config.max_completion_tokens) - ) + _cap_raw = config.extra_body.get("length_retry_token_cap", config.max_completion_tokens) + try: + length_retry_token_cap = int(_cap_raw) + except (TypeError, ValueError): + # Invalid operator-edited value (e.g. null, "12k", nested object). + # Fall back silently rather than aborting the run. + length_retry_token_cap = config.max_completion_tokens if length_retry_token_cap < config.max_completion_tokens: length_retry_token_cap = config.max_completion_tokens # Temperature schedule: escalate after repeated same-class failures to break out From 0cd7869d695c1c25ac6d711b4bb3d6c8a4f223c5 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 13:47:39 +0200 Subject: [PATCH 15/91] fix: retry on non-JSON 200 + skip candidate fallback when trace present Two small bugbot findings on 54571ce: 1. Medium: a non-JSON 200 response (HTML error page from CDN/LB mid- deploy) raised SystemExit, killing the task without letting the retry loop or fallback-model chain recover. Treat it like a URL error instead: retry with backoff, eventually raise _ChatCompletionError(status=0) so the outer fallback chain can route to a secondary model. 2. Low: the candidate-change fallback in build_run_analysis ran unconditionally even when the attempt already had a populated trace, redundantly adding the same hash via two code paths. Skip the fallback entirely when trace.candidate_sha256 is set so the two derivation paths cannot diverge silently. Co-Authored-By: Claude Opus 4.6 --- harness/default_agent.py | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/harness/default_agent.py b/harness/default_agent.py index 700005a2..281e85a8 100644 --- a/harness/default_agent.py +++ b/harness/default_agent.py @@ -1011,15 +1011,19 @@ def build_run_analysis( # Fallback for interactive-mode attempts that do not populate `trace`: # derive candidate changes/hashes directly from candidate_file_contents. # Count every transition (incl. reverts like A -> B -> A), and record - # each distinct hash separately. - candidate_text = str(attempt.get("candidate_file_contents", "")) - if candidate_text.strip(): - candidate_hash = stable_digest(candidate_text) - distinct_candidate_hashes.add(candidate_hash) - if not isinstance(trace, dict) or not trace.get("candidate_sha256"): + # each distinct hash separately. Skip this block entirely when `trace` + # is already populated, so non-interactive traces are not redundantly + # re-hashed (which would be harmless while digests match but fragile + # if the two derivation paths ever diverge). + trace_has_hash = isinstance(trace, dict) and bool(trace.get("candidate_sha256")) + if not trace_has_hash: + candidate_text = str(attempt.get("candidate_file_contents", "")) + if candidate_text.strip(): + candidate_hash = stable_digest(candidate_text) + distinct_candidate_hashes.add(candidate_hash) if candidate_text != previous_candidate: candidate_change_count += 1 - previous_candidate = candidate_text + previous_candidate = candidate_text return { "attempt_count": len(attempts), "tool_calls_used": tool_calls_used, @@ -1132,9 +1136,15 @@ def _post_chat_completion( try: return json.loads(body) except json.JSONDecodeError as exc: - raise SystemExit( - f"chat completion request returned non-JSON response: {body[:400]!r}" - ) from exc + # Non-JSON 200 responses (HTML error pages from a CDN or load + # balancer mid-deploy are common) must be treated as transient + # failures so the retry loop and fallback-model chain can take + # over, not as SystemExit which aborts the whole task. + last_error = f"non-JSON response: {body[:200]!r}" + if attempt == MAX_CHAT_COMPLETION_RETRIES - 1: + raise _ChatCompletionError(status=0, detail=last_error, model=model) from exc + time.sleep(_backoff_delay(attempt, None)) + continue except error.HTTPError as exc: detail = exc.read().decode("utf-8", errors="replace") last_error = f"HTTP {exc.code}: {detail[:400]}" From 1666e403277f699b65d603959a242c8934e49483 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 15:04:26 +0200 Subject: [PATCH 16/91] fix: guard non-iterable fallback_models + jitter Retry-After - fallback_models: accept bare string, else require list/tuple; silently drop bools/ints/dicts instead of raising TypeError mid-request. - _backoff_delay: add up-to-1s additive jitter when honouring Retry-After so multiple workers desynchronise (thundering-herd fix). --- harness/default_agent.py | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/harness/default_agent.py b/harness/default_agent.py index 281e85a8..d932c052 100644 --- a/harness/default_agent.py +++ b/harness/default_agent.py @@ -1100,8 +1100,11 @@ def _backoff_delay(attempt: int, retry_after: float | None) -> float: # Honour the provider-requested wait. Clamp only at a safety ceiling # (10 minutes) so a pathological header cannot stall the run # indefinitely; the previous 60s clamp was too aggressive and caused - # retries to fire while the rate limit was still in force. - return min(retry_after, 600.0) + # retries to fire while the rate limit was still in force. Add a + # small additive jitter (up to 1s) so concurrent workers hitting the + # same Retry-After do not thunder back in lockstep. + clamped = min(retry_after, 600.0) + return clamped + random.random() # Exponential backoff with jitter, capped at 30s. base = min(30.0, 2.0 ** attempt) return base * (0.5 + random.random() * 0.5) @@ -1207,6 +1210,10 @@ def send_chat_completion( raw_fallback = config.extra_body.get("fallback_models") or [] if isinstance(raw_fallback, str): raw_fallback = [raw_fallback] + elif not isinstance(raw_fallback, (list, tuple)): + # extra_body is schema-free operator input; a truthy non-iterable + # (bool, int, dict, ...) must not blow up the iteration below. + raw_fallback = [] fallback_models = [ str(item) for item in raw_fallback From 7427d4c6a236cd9c28d5f316bc2315289b3b5d10 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 16:41:51 +0200 Subject: [PATCH 17/91] fix: strip non-schema keys from try_tactic_at_hole evaluation When the agent solves a task via try_tactic_at_hole, the tool result dict includes 'tactic' and 'failure_class' keys. Copying the entire result into the 'evaluation' field then trips the agent-run schema's additionalProperties: false guard and aborts the whole task with no result file. Whitelist only the keys the schema allows (status, failure_mode, details, command, candidate_workspace) when normalizing. --- harness/default_agent.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/harness/default_agent.py b/harness/default_agent.py index d932c052..7872e176 100644 --- a/harness/default_agent.py +++ b/harness/default_agent.py @@ -2015,9 +2015,14 @@ def execute_interactive_agent_task( if str(fc) != "environment_error": failure_class_history.append(str(fc)) elif tool_name in ("run_lean_check", "try_tactic_at_hole") and result.get("status") == "passed": - # Normalize to evaluation schema (try_tactic_at_hole returns tactic/details without failure_mode) - evaluation = dict(result) + # Normalize to evaluation schema. `try_tactic_at_hole` returns + # extra keys like `tactic` that must be stripped, otherwise the + # final result fails schema validation (additionalProperties: + # false) and the whole task aborts with no result file. + _EVAL_KEYS = ("status", "failure_mode", "details", "command", "candidate_workspace") + evaluation = {k: result[k] for k in _EVAL_KEYS if k in result} evaluation.setdefault("failure_mode", None) + evaluation.setdefault("details", "") attempts[-1]["candidate_file_contents"] = runtime.current_proof_text attempts[-1]["evaluation"] = evaluation return response, response_text, runtime.current_proof_text, evaluation, attempts, tool_calls_used From b2bd74fdabcae06aada8fd73a2bfff85a2ad71ee Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 18:29:33 +0200 Subject: [PATCH 18/91] refactor: fold run_lean_check into write_editable_proof MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every write_editable_proof was followed by a separate run_lean_check, each costing one tool slot AND one model round-trip. Across a full benchmark run we measured 249 write_editable_proof calls and 0 write+check pairs in the same turn — the model always does them sequentially over two turns. Folding the check inline (with check=True default) saves one full LLM round-trip per iteration and roughly doubles the effective proof-exploration budget. On ceildiv_sandwich (hard failing task), the same ~20-tool-call budget now yields 13 write attempts + 3 tactic tries (= 16 proof iterations) vs 7 writes + 6 tactic tries + 3 explicit checks (= 13 proof iterations) before — ~23% more productive iterations for the same cost. The write result reuses execute_tool("run_lean_check", {}) so auto-heal for missing .olean, failure annotation, and repair hints all stay identical to a bare check. Format warnings (non_public_imports, unfilled_hole, theorem_statement_mismatch) remain in result["warnings"] alongside the Lean verdict. Dispatcher in default_agent.py treats write_editable_proof passed/failed identically to run_lean_check. Co-Authored-By: Claude Opus 4.6 --- harness/default_agent.py | 4 ++-- harness/interactive_runtime.py | 21 +++++++++++++++++---- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/harness/default_agent.py b/harness/default_agent.py index 7872e176..465ddaf4 100644 --- a/harness/default_agent.py +++ b/harness/default_agent.py @@ -2007,14 +2007,14 @@ def execute_interactive_agent_task( "result": result, } ) - if tool_name == "run_lean_check" and result.get("failure_mode") == "lean_check_failed": + if tool_name in ("run_lean_check", "write_editable_proof") and result.get("failure_mode") == "lean_check_failed": saw_lean_failure = True fc = result.get("failure_class") or classify_failure(str(result.get("details", ""))) # Skip environment errors: they are infra noise that would # break the temperature schedule's same-class sliding window. if str(fc) != "environment_error": failure_class_history.append(str(fc)) - elif tool_name in ("run_lean_check", "try_tactic_at_hole") and result.get("status") == "passed": + elif tool_name in ("run_lean_check", "try_tactic_at_hole", "write_editable_proof") and result.get("status") == "passed": # Normalize to evaluation schema. `try_tactic_at_hole` returns # extra keys like `tactic` that must be stripped, otherwise the # final result fails schema validation (additionalProperties: diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index 8ec303c1..cb217213 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -78,7 +78,7 @@ def read_public_file(self, rel_path: str) -> dict[str, Any]: except FileNotFoundError: return {"status": "missing", "path": rel_path} - def write_editable_proof(self, content: str) -> dict[str, Any]: + def write_editable_proof(self, content: str, *, check: bool = True) -> dict[str, Any]: self.current_proof_text = content if content.endswith("\n") else f"{content}\n" warnings: list[dict[str, str]] = [] if not self.current_proof_text.strip(): @@ -110,15 +110,28 @@ def write_editable_proof(self, content: str) -> dict[str, Any]: "kind": "theorem_statement_mismatch", "detail": "editable theorem signature changed; revert to the original statement.", }) - status = "ok_with_warnings" if warnings else "ok" result: dict[str, Any] = { - "status": status, + "status": "ok_with_warnings" if warnings else "ok", "path": self.paths.editable_rel_path, "bytes": len(self.current_proof_text.encode("utf-8")), "lines": len(self.current_proof_text.splitlines()), } if warnings: result["warnings"] = warnings + # Fold the Lean check into the write. Each write+check used to cost + # two tool slots and two model round-trips; inlining saves one full + # round-trip (hundreds of ms to seconds of LLM latency per proof + # iteration) and doubles the effective budget for proof exploration. + # The caller can disable by passing check=False (kept for callers + # that only want to stage a draft without paying for Lean). + if check: + # Reuse the full run_lean_check pipeline (auto-heal + annotation + + # repair hints) so downstream success/failure detection is + # identical to a bare run_lean_check call. Write-time metadata + # (path, bytes, lines, warnings) stays visible in the result so + # the model still sees format warnings like non_public_imports + # alongside the Lean verdict. + result.update(self.execute_tool("run_lean_check", {})) return result def search_public_defs(self, query: str, *, limit: int = 20) -> dict[str, Any]: @@ -310,7 +323,7 @@ def tool_specs(self) -> list[dict[str, Any]]: "type": "function", "function": { "name": "write_editable_proof", - "description": "Replace the entire editable proof file with complete Lean code.", + "description": "Replace the entire editable proof file with complete Lean code and automatically run the Lean check. The response reports status (passed/failed/ok/ok_with_warnings) and, on failure, failure_mode, details, and failure_class. A separate run_lean_check call is not needed after this.", "parameters": { "type": "object", "additionalProperties": False, From 3fa4c098372087fa6c5bb6e498e6ff43a8d81a04 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 18:49:39 +0200 Subject: [PATCH 19/91] feat: surface preflight failure modes as distinct failure_class Previously, preflight failures (placeholder_detected, theorem_statement_mismatch, hidden_proof_import_detected, hidden_case_import_detected, empty_response) all collapsed to failure_class="other" because classify_failure only pattern-matches Lean error text and these carry English rejection messages. The model then got a generic "other" class with no targeted hint and would keep resubmitting proofs containing `sorry`, `admit`, or altered theorem signatures. Fix: - Map preflight failure_mode values directly to failure_class in _annotate_check_result so the model sees e.g. "placeholder_detected" instead of "other" - Add targeted "PREFLIGHT REJECTED" hints in _build_check_hints explaining what triggered the rejection and how to recover Evidence (from 4-task post-refactor rerun of the previously-failing set): - 10 of 44 failed writes were classified "other"; 6 of those were preflight failures (5 placeholder_detected, 2 theorem_statement_mismatch) that now get specific class + actionable hint - No regression: lean_check_failed still routes through classify_failure as before (verified via direct unit smoke test on unsolved_goals and unknown_identifier cases) Co-Authored-By: Claude Opus 4.6 --- harness/interactive_runtime.py | 59 +++++++++++++++++++++++++++++++++- 1 file changed, 58 insertions(+), 1 deletion(-) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index cb217213..7759d9fe 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -440,7 +440,15 @@ def _annotate_check_result(self, result: dict[str, Any]) -> dict[str, Any]: # not preflight failures (empty_response, placeholder_detected, etc.) is_lean_failure = failure_mode == "lean_check_failed" details = str(result.get("details", "")) - failure_class = classify_failure(details) + # Preflight failures carry English-language details that classify_failure + # can't pattern-match, so they all collapse to "other" and the model gets + # no targeted hint. Map the failure_mode directly to a class name so the + # model sees e.g. "placeholder_detected" instead of "other" and + # _build_check_hints can dispatch a specific hint. + if not is_lean_failure and failure_mode in _PREFLIGHT_FAILURE_MODES: + failure_class = failure_mode + else: + failure_class = classify_failure(details) hints = _build_check_hints(failure_class, details) annotated = dict(result) annotated["failure_class"] = failure_class @@ -751,6 +759,18 @@ def _missing_olean_module(details: str) -> str | None: return rel.replace("/", ".") +# Preflight failure_mode values that preflight_candidate returns. Used by +# _annotate_check_result to surface these as failure_class directly rather than +# collapsing them into "other" via English-language classify_failure lookup. +_PREFLIGHT_FAILURE_MODES = frozenset({ + "empty_response", + "placeholder_detected", + "hidden_proof_import_detected", + "hidden_case_import_detected", + "theorem_statement_mismatch", +}) + + def classify_failure(details: str) -> str: """Classify a Lean checker failure into a coarse category.""" if not details: @@ -811,6 +831,43 @@ def _build_check_hints(failure_class: str, details: str) -> list[str]: "retry run_lean_check once more." ) return hints + if failure_class == "placeholder_detected": + hints.append( + "PREFLIGHT REJECTED: proof contains `sorry` or `admit`. The harness " + "will never accept these. Replace every `sorry`/`admit` with a real " + "tactic, or use `?_` (unnamed hole) to probe a sub-goal with " + "inspect_lean_goals / try_tactic_at_hole." + ) + return hints + if failure_class == "theorem_statement_mismatch": + hints.append( + "PREFLIGHT REJECTED: you changed the editable theorem signature. Only " + "the proof body after `:=` is editable. Restore the exact theorem " + "declaration from the original editable file (re-read it with " + "read_public_file if unsure) and edit only the body." + ) + return hints + if failure_class == "hidden_proof_import_detected": + hints.append( + "PREFLIGHT REJECTED: proof imports a hidden `Benchmark.Cases.*.Proofs` " + "module. Reference-solution modules are not part of the public API. " + "Remove that import and write the proof yourself." + ) + return hints + if failure_class == "hidden_case_import_detected": + hints.append( + "PREFLIGHT REJECTED: proof imports a non-public `Benchmark.Cases.*` " + "module. Only `Benchmark.Cases.*.Specs` (and your own editable file) " + "are visible. Remove the blocked import." + ) + return hints + if failure_class == "empty_response": + hints.append( + "PREFLIGHT REJECTED: the proof content was empty. Submit the full " + "Lean file including `import`, `namespace`, and the theorem with " + "its proof body." + ) + return hints if failure_class == "unknown_identifier": if "decide_True" in details or "decide_False" in details: hints.append("CRITICAL: `decide_True` and `decide_False` do not exist. Remove them. Instead, pass precondition hypotheses directly to `simp` - it handles `decide` reduction automatically.") From 896665f1b315a7305576da7d7d442b9b65e103ce Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 19:15:53 +0200 Subject: [PATCH 20/91] feat: classify omega_failed and emit nonlinear arithmetic hints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Previously, `omega could not prove the goal` failures collapsed to failure_class="other", so the model got no actionable hint and the stagnation detector could not track repeated omega failures as a coherent class. Surveying 7 post-refactor task runs, 100% of remaining "other" failures (15/15) were omega failures — making this the single highest-leverage unclassified category. Changes: - classify_failure: detect "omega could not prove the goal" → omega_failed - _build_check_hints: explain omega's linear-arithmetic scope and counterexample structure, plus conditional hints that fire when the error details contain division/modulus or variable multiplication (the two shapes that cause 100% of observed omega failures) Evidence (post-commit 3fa4c09 run on openzeppelin/preview_deposit_rounds_down): - 3 "other" failures were all `omega could not prove`, each involving Uint256 div/mul/mod reasoning beyond omega's reach - Across 7 failing tasks in results/agent_runs/custom/interactive-proxy/, the same pattern holds: every unclassified failure is an omega one No regression: all existing classifications (unsolved_goals, unknown_identifier, type_mismatch, rfl_failed, simp_no_progress, no_goals, unknown, other) still route correctly per unit smoke test. Co-Authored-By: Claude Opus 4.6 --- harness/interactive_runtime.py | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index 7759d9fe..ef3d863e 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -818,6 +818,8 @@ def classify_failure(details: str) -> str: return "rfl_failed" if "invalid" in lower and "conv tactic" in lower: return "tactic_misuse" + if "omega could not prove the goal" in lower: + return "omega_failed" return "other" @@ -908,6 +910,28 @@ def _build_check_hints(failure_class: str, details: str) -> list[str]: elif failure_class == "tactic_misuse": hints.append("The tactic was used incorrectly for this goal shape.") hints.append("Check the goal state with inspect_lean_goals using a ?_ hole.") + elif failure_class == "omega_failed": + hints.append( + "omega only handles LINEAR integer/natural arithmetic. It cannot close goals " + "containing variable * variable, division, or modulus. Look at the " + "counterexample section — any term on the RHS of `where` that mixes two " + "variables multiplicatively, or uses `/` or `%`, is outside omega's reach." + ) + nonlinear_hints: list[str] = [] + if "/" in details or "% " in details or " mod " in details: + nonlinear_hints.append( + "For division/modulus: first rewrite `a / b` and `a % b` via " + "`Nat.div_add_mod` / `Nat.mul_div_cancel'` so omega sees a linear form, " + "or case-split on whether the divisor is zero and handle each branch." + ) + if "val *" in details or "* ↑" in details: + nonlinear_hints.append( + "For variable multiplications: introduce helper lemmas that bound the " + "product (e.g. `Nat.mul_le_mul`), or try `nlinarith` / `positivity` which " + "handle some nonlinear cases. Pure omega will never close a goal whose " + "counterexample mentions a product of two symbolic `.val` terms." + ) + hints.extend(nonlinear_hints) return hints From 063289436e72e95f5cad3d8a85a3448e42c415dc Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 19:18:24 +0200 Subject: [PATCH 21/91] feat: classify constructor/module/synthesis failures with targeted hints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After the omega_failed classifier (896665f), a corpus survey of 36 remaining "other" lean_check_failed events across results/agent_runs/ shows the residual unclassified categories are: 13 omega could not prove (fixed in 896665f) 11 tactic 'constructor' failed ← this commit 5 don't know how to synthesize placeholder ← this commit 4 unknown module prefix 'Mathlib' ← this commit 3 other long-tail patterns (≤1 each) Together, these three patterns account for 20/23 = 87% of the remaining "other" volume, so classifying them is the highest-leverage next step. Classifiers: - constructor_failed: goal not inductive (wrong `constructor` usage) - module_not_found: unknown import path (most often `Mathlib`, which verity-benchmark does NOT depend on) - synthesis_failed: unfilled `_` / `?_` placeholder Lean can't infer Each class gets a targeted hint explaining the root cause and offering two or three concrete recovery moves (e.g. `refine ⟨_, _⟩` for constructor, remove Mathlib import for module_not_found, use `show` or inspect_lean_goals for synthesis_failed). Regression-tested: all existing classifications (unsolved_goals, unknown_identifier, type_mismatch, rfl_failed, simp_no_progress, no_goals, omega_failed, other) still route correctly. Co-Authored-By: Claude Opus 4.6 --- harness/interactive_runtime.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index ef3d863e..445e953e 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -820,6 +820,12 @@ def classify_failure(details: str) -> str: return "tactic_misuse" if "omega could not prove the goal" in lower: return "omega_failed" + if "tactic 'constructor' failed" in details and "not an inductive datatype" in lower: + return "constructor_failed" + if "unknown module prefix" in lower: + return "module_not_found" + if "don't know how to synthesize placeholder" in lower: + return "synthesis_failed" return "other" @@ -932,6 +938,30 @@ def _build_check_hints(failure_class: str, details: str) -> list[str]: "counterexample mentions a product of two symbolic `.val` terms." ) hints.extend(nonlinear_hints) + elif failure_class == "constructor_failed": + hints.append( + "`constructor` only works on inductive-type goals (And, Or, Exists, Sigma, " + "structures). The goal you're targeting is an equality, implication, or an " + "unreduced expression — not a constructor-shaped type. Either (a) `simp` / " + "`unfold` first to expose an inductive head symbol, (b) `intro` pending " + "hypotheses if the goal is `A → B`, or (c) use `refine ⟨_, _⟩` / " + "`exact ⟨_, _⟩` if you already know the witnesses for an And/Exists." + ) + elif failure_class == "module_not_found": + hints.append( + "The import path you requested is not available in this workspace. In " + "particular, `Mathlib` is NOT a dependency of verity-benchmark — only the " + "core Lean 4 prelude, `Batteries`, and the task's own `Benchmark.*` public " + "modules are importable. Remove the offending `import` line and reach for " + "core Lean / Batteries lemmas, or search_public_defs for existing helpers." + ) + elif failure_class == "synthesis_failed": + hints.append( + "Lean could not infer a `_` / `?_` placeholder from context. Either (a) " + "replace `_` with an explicit term, (b) add a `show ` line above " + "the tactic so Lean knows the expected type, or (c) use `?_` (named hole) " + "with `inspect_lean_goals` to see what Lean expected there before filling it." + ) return hints From b9755706cf9cfc0dda6c6c07853cb0b390f8cba8 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 19:22:30 +0200 Subject: [PATCH 22/91] feat: dedupe repeated repair hints + pivot directive on 3+ stagnations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Corpus analysis of 29 failing tasks (post-commit 0632894) showed 26 hit stagnation warnings, and in every case the `repair_hints` list was emitted *verbatim* across every consecutive same-class failure — 5x in the worst tasks. Example from zama transferFrom_conservation: fail#2..#5 (all unsolved_goals) hint[0] "Use inspect_lean_goals with a ?_ hole..." (identical each time) hint[1] "If simp leaves `if`/`match` with free vars..." (identical) hint[2] "Try restructuring: `by_cases h : ...`" (identical) hint[3] ESCALATION template (identical each time) hint[4] match/if structural advice (identical) This trains the model to ignore the repair_hints list entirely. The model kept writing variations of the same failing proof instead of calling inspect_lean_goals (hint #0 tells it to; it never does). Fix: - Track fingerprints (lowercased first 80 non-whitespace chars) of every hint surfaced in a session. - `_filter_seen_hints` drops hints whose fingerprint is already known. - When dedup would leave the list empty AND same_class_count ≥ 3, substitute a one-shot pivot directive that tells the model to stop re-writing and switch to the inspect_lean_goals / try_tactic_at_hole exploration workflow (which is what the first hint was suggesting all along). Effect on the trace above: fail#2 gets just the stagnation_warning (fresh signal); fail#3..#5 each get a single explicit "pivot now" directive that names the repetition count and the exact next tool to use, instead of five pages of advice already seen. Tested: - classify_failure regression: 11/11 classes route correctly - dedup test: distinct classes (unsolved_goals, unknown_identifier, type_mismatch, omega_failed) each receive fresh hints; the same class emitted 6x in a row shows fresh hints only on fail#1, then pivot directives with varying counts on fail#3+. Co-Authored-By: Claude Opus 4.6 --- harness/interactive_runtime.py | 38 ++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index 445e953e..ee788ca4 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -40,6 +40,11 @@ def __init__(self, task: dict[str, Any]) -> None: self._task = task # store for hint escalation self._best_error_count: int | None = None self._best_first_error_line: int | None = None + # Fingerprints of hint texts already surfaced this session. Used to + # avoid echoing the same repair advice verbatim across consecutive + # failures — repeated identical hints are pure noise and train the + # model to ignore the list instead of acting on it. + self._emitted_hint_keys: set[str] = set() self.paths = RuntimePaths( editable_rel_path=editable_rel_path, theorem_name=str(task["theorem_name"]), @@ -494,6 +499,23 @@ def _annotate_check_result(self, result: dict[str, Any]) -> dict[str, Any]: if escalation: hints.append(escalation) + # Dedupe hints we've already shown this session. Repeated-verbatim hints + # are noise: corpus analysis of failing tasks showed the same 4-5 hints + # echoed across 5+ stagnation events, training the model to skip the + # repair_hints list entirely. Only surface *new* advice each time. + hints = self._filter_seen_hints(hints) + if not hints and same_class_count >= 3: + # All the standing advice has already been seen and isn't working. + # Issue a one-shot pivot directive rather than sending an empty list, + # which the model interprets as "nothing new, carry on". + hints = [ + f"All prior repair hints for '{failure_class}' have now been repeated " + f"{same_class_count} times without progress. Stop retrying variations of " + f"the same proof. Next move: write a minimal skeleton with a `?_` hole at " + f"the first failing step, call `inspect_lean_goals` to read the actual " + f"goal state, then use `try_tactic_at_hole` to probe tactics one at a time." + ] + if hints: annotated["repair_hints"] = hints @@ -531,6 +553,22 @@ def _annotate_check_result(self, result: dict[str, Any]) -> dict[str, Any]: return annotated + def _filter_seen_hints(self, hints: list[str]) -> list[str]: + """Drop hints whose fingerprint has already been surfaced this session. + + Fingerprint = lowercased first 80 non-whitespace chars. Short enough + that wording tweaks still dedupe, long enough to distinguish genuinely + different hints. + """ + fresh: list[str] = [] + for hint in hints: + key = "".join(hint.lower().split())[:80] + if key in self._emitted_hint_keys: + continue + self._emitted_hint_keys.add(key) + fresh.append(hint) + return fresh + def _build_escalation_hint(self, failure_class: str) -> str | None: """Build an escalation hint when the model is stagnating on a failure class.""" terms = extract_contract_simp_terms(self._task) From 33b3bdb1bfa900e0c5310dd5b411b922bbcfb6b0 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 19:28:15 +0200 Subject: [PATCH 23/91] feat: detect no-progress loops via error-text fingerprinting Corpus analysis of 29 failing tasks found 12 hit "no-progress loops": the model resubmits proofs that produce byte-identical Lean errors. Worst cases streak=4 (zama transfer_conservation, safe swap_owner). Class-level hints deduped by b975570 go silent in that scenario, so nothing tells the model its latest edit had zero effect on what Lean saw. Adds _normalize_details_fp (strips CandidateCheck.lean:L:C markers, collapses whitespace, 512-char cap) and tracks a streak in TaskProofRuntime. When streak >= 2 a "NO-PROGRESS LOOP DETECTED" directive is inserted at hints[0] BEFORE dedup so the fresh streak count surfaces every time. The directive pushes the model to the ?_ + inspect_lean_goals + try_tactic_at_hole flow with an explicit list of tactics it likely hasn't tried. No regression on 7/7 existing classify_failure tests. --- harness/interactive_runtime.py | 54 ++++++++++++++++++++++++++++++++++ 1 file changed, 54 insertions(+) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index ee788ca4..78e7c72d 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -45,6 +45,13 @@ def __init__(self, task: dict[str, Any]) -> None: # failures — repeated identical hints are pure noise and train the # model to ignore the list instead of acting on it. self._emitted_hint_keys: set[str] = set() + # Normalised fingerprint of the previous failing Lean details text, + # plus a count of how many times the same fingerprint has repeated + # in a row. Used to detect "no-progress loops" where the model + # resubmits a proof that yields byte-identical errors — corpus + # analysis found 12/29 failing tasks hit this pattern. + self._last_details_fp: str | None = None + self._same_details_streak: int = 0 self.paths = RuntimePaths( editable_rel_path=editable_rel_path, theorem_name=str(task["theorem_name"]), @@ -483,6 +490,17 @@ def _annotate_check_result(self, result: dict[str, Any]) -> dict[str, Any]: else: break + # Detect true no-progress loops: the normalized error text matches the + # previous failure byte-for-byte. This is a much stronger signal than + # same-class stagnation — it proves the last edit had zero effect on + # what Lean actually saw. + details_fp = _normalize_details_fp(details) + if details_fp and details_fp == self._last_details_fp: + self._same_details_streak += 1 + else: + self._same_details_streak = 1 + self._last_details_fp = details_fp + # Escalate on either: 2+ consecutive same-class failures, or 4+ total failures if same_class_count >= 2 or total_failures >= 4: if same_class_count >= 2: @@ -499,6 +517,22 @@ def _annotate_check_result(self, result: dict[str, Any]) -> dict[str, Any]: if escalation: hints.append(escalation) + # When the error text is byte-identical to the previous attempt, the + # model's latest edit had zero effect — hints must call this out + # explicitly, not just repeat class-level advice. Keep this BEFORE + # the dedup so the fingerprint-unique streak count is surfaced fresh + # each time. + if self._same_details_streak >= 2: + hints.insert(0, ( + f"NO-PROGRESS LOOP DETECTED: your last {self._same_details_streak} " + "submissions produced byte-identical Lean errors. The changes you are " + "making do not reach the failing goal. Stop editing around the symptom. " + "Instead: (1) `write_editable_proof` with the failing tactic replaced by " + "`?_`, (2) `inspect_lean_goals` to read the real goal at that hole, " + "(3) `try_tactic_at_hole` with tactics you have NOT tried yet " + "(e.g. `simp_all`, `aesop`, `decide`, `exact?`, `constructor; all_goals ...`)." + )) + # Dedupe hints we've already shown this session. Repeated-verbatim hints # are noise: corpus analysis of failing tasks showed the same 4-5 hints # echoed across 5+ stagnation events, training the model to skip the @@ -762,6 +796,26 @@ def extract_contract_simp_terms(task: dict[str, Any]) -> list[str]: return terms +_FP_LINE_COL_RE = re.compile(r"CandidateCheck\.lean:\d+:\d+:") +_FP_WS_RE = re.compile(r"\s+") + + +def _normalize_details_fp(details: str) -> str: + """Return a whitespace/line-number-agnostic fingerprint of a Lean error. + + Strips the leading `CandidateCheck.lean:LINE:COL:` markers and collapses + all whitespace runs so two Lean runs that differ only in formatting + noise produce the same fingerprint. Truncated to 512 chars — long + enough to distinguish genuinely different errors, short enough that + minor trailing-hint variation doesn't break the match. + """ + if not details: + return "" + d = _FP_LINE_COL_RE.sub("", details) + d = _FP_WS_RE.sub(" ", d).strip() + return d[:512] + + # Missing-olean errors can be infrastructure (a Benchmark dependency wasn't # pre-built) or the model's fault (imported a module that doesn't exist). We # only classify the former as environment_error so stagnation/temperature From d91d9dab50c296e1fd21ad7e3768f1c8f9f0a08d Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 19:34:36 +0200 Subject: [PATCH 24/91] feat: priority directive when Lean check fails with ?_ still in proof MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Corpus analysis of 29 failing tasks found 16 (55%) ended with `?_` or `exact ?_` still in the final submitted proof. `?_` is a probe for `inspect_lean_goals` / `try_tactic_at_hole`, never a proof — Lean can only ever reject it. The existing synthesis_failed hint said "use `?_` (named hole)" without making clear this was for inspection, not submission; the model repeatedly wrote `exact ?_` as a last-resort finisher. When `_annotate_check_result` sees a Lean failure AND `self.current_proof_text` still contains `?_`, prepend a priority UNFILLED HOLE directive at hints[0] naming the hole count and listing concrete tactics for `try_tactic_at_hole` (omega, simp_all, decide, rfl, assumption, trivial, exact h, linarith, aesop, exact?). Inserted after the NO-PROGRESS LOOP handler so when both fire the hole (root cause) is read first and the loop signal (symptom) second. No regression on 8/8 classify_failure tests or 11/11 HOLE_PATTERN tests. --- harness/interactive_runtime.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index 78e7c72d..5c918727 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -533,6 +533,31 @@ def _annotate_check_result(self, result: dict[str, Any]) -> dict[str, Any]: "(e.g. `simp_all`, `aesop`, `decide`, `exact?`, `constructor; all_goals ...`)." )) + # Highest-leverage directive: corpus analysis showed 16/29 failed tasks + # ended with `?_` or `exact ?_` still in the final submitted proof. + # `?_` is a probe for `inspect_lean_goals` / `try_tactic_at_hole`, never + # a valid proof. When Lean fails AND the current proof still contains a + # hole, say so explicitly — the generic synthesis_failed / unsolved_goals + # hints don't make this connection clear, and the `?_` advice elsewhere + # was misread as "write `?_` and submit it". Insert AFTER the no-progress + # directive so this ends up at hints[0] when both fire (hole is the root + # cause, no-progress is the symptom). + if HOLE_PATTERN.search(self.current_proof_text): + hole_count = len(HOLE_PATTERN.findall(self.current_proof_text)) + hints.insert(0, ( + f"UNFILLED HOLE IN SUBMITTED PROOF: your proof still contains " + f"{hole_count} `?_` hole(s). `?_` is a PROBE for `inspect_lean_goals` " + "and `try_tactic_at_hole`, never a final proof — Lean will reject " + "every submission containing `?_`. Do not submit `?_` again. Next " + "move: call `try_tactic_at_hole` with one concrete tactic at a " + "time (`omega`, `simp_all`, `decide`, `rfl`, `assumption`, " + "`trivial`, `exact h`, `linarith`, `aesop`, `exact?`). If any " + "succeeds, the proof updates in place and the task closes. If " + "none do, use `inspect_lean_goals` to read each hole's goal, then " + "`write_editable_proof` with concrete tactics substituted for " + "every `?_`." + )) + # Dedupe hints we've already shown this session. Repeated-verbatim hints # are noise: corpus analysis of failing tasks showed the same 4-5 hints # echoed across 5+ stagnation events, training the model to skip the From a142ad8c5e52e5f3e4b0f68da1f8ea72688fec8b Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 19:39:27 +0200 Subject: [PATCH 25/91] feat: context-aware substitution in try_tactic_at_hole MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Corpus analysis of 72 failed try_tactic_at_hole calls showed 47 (65%) passed a raw tactic (omega, rfl, simp_all [...]) into a proof whose hole sat at a term position like `exact ?_`. The old substitution was a single `HOLE_PATTERN.sub(tactic, original)`, yielding `exact omega` — rejected by Lean because `omega` is a tactic, not a term. Replaces the blanket sub with `_substitute_holes`: per-hole context detection that wraps raw tactics as `(by )` at term-position holes (`exact`/`refine`/`apply`/`show`/`have`/`let`/`suffices`/`use`/`from`, inside `⟨...⟩`/`(`/`,`/`:=`) and strips a leading `by ` at tactic- position holes so substitution never produces nested `by ... by ...` blocks. Already-wrapped forms (`by ...`, `(by ...)`) pass through unchanged. Across the 29 failing tasks this context-fixes up to 47/72 tactic attempts that currently 0/72 land. Verified: 11/11 substitution tests, 7/7 term-position classification, 8/8 classify_failure regression, 6/6 HOLE_PATTERN regression. --- harness/interactive_runtime.py | 82 ++++++++++++++++++++++++++++++++-- 1 file changed, 78 insertions(+), 4 deletions(-) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index 5c918727..fca69620 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -200,10 +200,17 @@ def try_tactic_at_hole(self, tactic: str) -> dict[str, Any]: if not tactic.strip(): return {"status": "rejected", "reason": "tactic_must_not_be_empty"} original = self.current_proof_text - # Replace standalone `?_` holes (not named holes like `?_foo` and not - # identifiers ending in `?_`). Must match HOLE_PATTERN so both tools - # agree on what counts as a hole. - modified = HOLE_PATTERN.sub(tactic.strip(), original) + # Substitute each `?_` with a context-adapted form of `tactic`. Corpus + # analysis of 72 failed try_tactic_at_hole calls found 47 (65%) passed + # a raw tactic (e.g. `omega`, `rfl`, `simp_all [...]`) into a proof + # where the hole sat at a TERM position like `exact ?_` — making the + # substituted proof read `exact omega`, which Lean rejects because + # `omega` is a tactic, not a term. Automatically wrap the substituted + # tactic with `(by ...)` at term-position holes, and strip an existing + # `by ` wrapper at tactic-position holes, so the model's intent + # survives context mismatches. Holes at other positions get the raw + # tactic. + modified = _substitute_holes(original, tactic.strip()) if modified == original: return { "status": "unsupported", @@ -821,10 +828,77 @@ def extract_contract_simp_terms(task: dict[str, Any]) -> list[str]: return terms +# Term-expecting tokens/punctuation that immediately precede a `?_` hole +# when the hole is in term (expression) position rather than tactic position. +# Matches at end-of-string after the hole's predecessor text is sliced off. +_TERM_POSITION_RE = re.compile( + r"(?:" + r"\b(?:exact|refine|apply|show|have|let|suffices|exact?|refine!|exact!|" + r"use|calc|from|fun)\s*" # term-expecting keywords + r"|[⟨(,\[{]\s*" # inside anonymous constructors / tuples / lists + r"|:=\s*" # RHS of let / have := ?_ + r")$" +) _FP_LINE_COL_RE = re.compile(r"CandidateCheck\.lean:\d+:\d+:") _FP_WS_RE = re.compile(r"\s+") +def _is_term_position_hole(proof: str, hole_start: int) -> bool: + """True iff the `?_` at `hole_start` sits where Lean expects a term. + + Looks back up to 40 chars of the preceding text (stripping trailing + whitespace) and matches against known term-expecting prefixes. Used by + `_substitute_holes` to decide whether a raw tactic substitution must be + wrapped in `(by ...)` so the resulting expression type-checks. + """ + window = proof[max(0, hole_start - 40):hole_start] + # Strip trailing whitespace/newlines — `exact\n ?_` is still term position. + window_r = window.rstrip() + # Re-append a single space so the regex's trailing `\s*$` consistently + # matches with or without original whitespace. + return bool(_TERM_POSITION_RE.search(window_r + " ")) + + +def _substitute_holes(proof: str, tactic: str) -> str: + """Replace every `?_` in `proof` with a context-adapted form of `tactic`. + + At term-position holes (`exact ?_`, `⟨?_, ?_⟩`, `:= ?_`, ...) the + substitute must be a term, so wrap a raw tactic as `(by )` unless + the caller already provided a term form. At tactic-position holes the + substitute must be a tactic, so strip a leading `by ` to avoid nested + `by ... by ...` blocks. + """ + raw = tactic.strip() + # Already a term form? (leading `by `/`by\n`, or fully wrapped in parens) + starts_by = raw.startswith("by ") or raw.startswith("by\n") + fully_paren_wrapped = ( + raw.startswith("(") and raw.endswith(")") and raw.count("(") == raw.count(")") + ) + is_term_form = starts_by or fully_paren_wrapped + # Precompute the tactic-position form: strip a leading `by ` or `by\n` + # so substitution at a tactic hole doesn't nest `by`. Leave paren- + # wrapped forms alone — those often indicate grouping the caller wants + # preserved as a single tactic (`(first | a | b)`). + if starts_by: + tactic_form = raw[3:].lstrip() + else: + tactic_form = raw + # Term-position form: `(by )` unless caller already passed a term. + term_form = raw if is_term_form else f"(by {raw})" + + out: list[str] = [] + cursor = 0 + for match in HOLE_PATTERN.finditer(proof): + out.append(proof[cursor:match.start()]) + if _is_term_position_hole(proof, match.start()): + out.append(term_form) + else: + out.append(tactic_form) + cursor = match.end() + out.append(proof[cursor:]) + return "".join(out) + + def _normalize_details_fp(details: str) -> str: """Return a whitespace/line-number-agnostic fingerprint of a Lean error. From db6951556a312c4ab480a196d374c1ad8e7a7767 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 19:42:50 +0200 Subject: [PATCH 26/91] feat: strip linter.unusedSimpArgs noise from Lean output MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Corpus analysis of 37 failed-check detail blobs found 844 of 846 warnings (~99%) were the Lean 4.22 `linter.unusedSimpArgs` warning — a multi-line block (warning header + unused-arg name + "Hint: Omit..." directive + strikethrough-glyph simp reconstruction + "Note: disable with `set_option ...`" footer) that added zero repair signal but accounted for ~20 KB of the average 34 KB details blob the model sees after every failed Lean check. The noise drowns real errors and trains the model to skip the details field entirely. `_strip_noise_warnings` drops only these specific warning blocks: every block between a `CandidateCheck.lean:L:C: warning: This simp argument is unused:` header and the next Lean diagnostic header. Every error and every other warning/note/info diagnostic passes through unchanged. Verified against the full corpus (37 detail blobs): - total size: 1.28 MB → 408 KB (-68%) - average size: 34.6 KB → 11.0 KB per blob - worst case: 365 KB → 33 KB (−332 KB on one pathological task) - all 141 errors preserved (0 lost) All 8 classify_failure tests, 3 substitution tests, and the detail fingerprint invariant still pass. --- harness/interactive_runtime.py | 48 ++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index fca69620..d6b3eec4 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -302,6 +302,15 @@ def evaluate_candidate(self, candidate_text: str, *, check_goals: bool = False) ) command = ["lake", "env", "lean", "--root=.", str(check_path.relative_to(workspace))] code, output = lean_run_command(command, cwd=workspace) + # Strip the "This simp argument is unused" lint blocks from Lean + # output before returning. Corpus analysis of 37 failed-check + # detail blobs found 844/846 warnings (~99%) were this single + # linter, accounting for ~20 KB of the average 34 KB details + # blob. The noise drowns the real errors and trains the model + # to ignore the details block. Filtering preserves every real + # error and every other warning kind — only the known-useless + # linter goes away. + output = _strip_noise_warnings(output) if code != 0: return { "status": "failed", @@ -841,6 +850,45 @@ def extract_contract_simp_terms(task: dict[str, Any]) -> list[str]: ) _FP_LINE_COL_RE = re.compile(r"CandidateCheck\.lean:\d+:\d+:") _FP_WS_RE = re.compile(r"\s+") +_LEAN_BLOCK_HEADER_RE = re.compile( + r"^CandidateCheck\.lean:\d+:\d+:\s*(error|warning|note|info):" +) + + +def _strip_noise_warnings(output: str) -> str: + """Drop `linter.unusedSimpArgs` warning blocks from Lean stdout. + + Lean 4.22 emits a multi-line warning for every simp argument it deems + unused. Each block spans the header line, the unused-arg name, a + "Hint: Omit it..." directive, a 3–8 line reconstructed simp invocation + with strikethrough glyphs, and a "Note: This linter can be disabled + with `set_option linter.unusedSimpArgs false`" footer. Across the 37 + failed-check blocks in the current corpus these blocks account for + 844/846 total warnings and roughly 20 KB of the average 34 KB + details blob — pure noise from the model's point of view because + the actual repair work is always driven by errors, not by this lint. + + A block begins at a `CandidateCheck.lean:L:C: warning: This simp + argument is unused:` header and ends at the next Lean diagnostic + header (error/warning/note/info) or end-of-output. Every other + diagnostic kind (including unrelated warnings) is preserved + verbatim. + """ + if not output or "This simp argument is unused" not in output: + return output + lines = output.splitlines(keepends=True) + kept: list[str] = [] + skip = False + for line in lines: + header = _LEAN_BLOCK_HEADER_RE.match(line) + if header: + skip = ( + header.group(1) == "warning" + and "This simp argument is unused" in line + ) + if not skip: + kept.append(line) + return "".join(kept) def _is_term_position_hole(proof: str, hole_start: int) -> bool: From 6df49e3f86fd84651b7af133a47c12609e54db67 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 19:47:22 +0200 Subject: [PATCH 27/91] feat: sync tool-surface descriptions with actual runtime behavior MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The synthesized interactive tool prompt in default_agent.py still said `write_editable_proof` "Does NOT run Lean", but commit b2bd74f folded run_lean_check into write_editable_proof — it now runs Lean automatically. The `try_tactic_at_hole` description also did not mention the auto-wrapping behavior added in commit a142ad8. Encoding the corrections upstream in the tool surface rather than only downstream in repair_hints saves rounds: agents no longer have to learn from a first-round failure that `?_` is not submittable (16/29 failed tasks previously ended with `?_` in the final proof) or discover through trial that `try_tactic_at_hole` handles term-position tactics. Changes: - write_editable_proof: state it runs Lean and returns repair_hints - run_lean_check: note it is redundant right after write_editable_proof - try_tactic_at_hole: note raw tactics are auto-wrapped at term positions - Add standing `?_` probe-vs-proof warning to the typical-loop footer - Mirror the try_tactic_at_hole wording in tool_specs() for function-call schema parity Co-Authored-By: Claude Opus 4.6 --- harness/default_agent.py | 9 +++++---- harness/interactive_runtime.py | 2 +- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/harness/default_agent.py b/harness/default_agent.py index 465ddaf4..0f47c0c6 100644 --- a/harness/default_agent.py +++ b/harness/default_agent.py @@ -476,17 +476,18 @@ def _synthesized_interactive_tools_prompt() -> str: # so we enumerate generic names here instead of calling tool_specs() directly. surface = [ ("read_public_file(path)", "Read one of the task's public Lean files (impl/spec/editable)."), - ("write_editable_proof(content)", "Replace the editable proof file. Returns immediate warnings for placeholders, theorem-signature changes, hidden imports, or unfilled `?_` holes. Does NOT run Lean."), - ("run_lean_check()", "Run `lake env lean` on the editable proof. Returns pass/fail with error details, failure_class, and repair_hints. Auto-retries once on environment errors (missing .olean)."), + ("write_editable_proof(content)", "Replace the editable proof file AND automatically run the Lean check. Response reports status (passed/failed), failure_mode, details, failure_class, and repair_hints. A separate run_lean_check call is not needed after this."), + ("run_lean_check()", "Re-run `lake env lean` without changing the file (redundant immediately after write_editable_proof)."), ("inspect_lean_goals()", "Inspect goal state at explicit `?_` holes. Unsupported if no hole present."), - ("try_tactic_at_hole(tactic)", "Replace all `?_` holes with a tactic and check. Preserves original proof on failure."), + ("try_tactic_at_hole(tactic)", "Replace all `?_` holes with a tactic and check. Pass a raw tactic (e.g. `omega`, `simp_all`, `decide`); substitution auto-wraps as `(by tac)` at term positions like `exact ?_`. Preserves original proof on failure."), ("search_public_defs(query)", "Search the task's public impl/spec files for def/theorem/lemma names."), ] for name, desc in surface: lines.append(f"- `{name}` — {desc}") lines.extend([ "", - "Typical loop: write_editable_proof → run_lean_check → read repair_hints → iterate.", + "Typical loop: write_editable_proof (which runs Lean) → read repair_hints → iterate.", + "`?_` is a PROBE for `inspect_lean_goals` / `try_tactic_at_hole`, never a final proof — Lean rejects every submission containing `?_`.", "Do NOT emit `lake build` or `scripts/...`; there is no shell tool.", ]) return "\n".join(lines) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index d6b3eec4..a4f142df 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -408,7 +408,7 @@ def tool_specs(self) -> list[dict[str, Any]]: "type": "function", "function": { "name": "try_tactic_at_hole", - "description": "Try replacing all `?_` holes in the current proof with a specific tactic and check if it compiles. Preserves the original proof if it fails. Useful for testing tactics like `simp_all [...]`, `omega`, `decide`, or `duper [...]`.", + "description": "Try replacing all `?_` holes in the current proof with a specific tactic and check if it compiles. Pass a raw tactic (e.g. `omega`, `simp_all [foo]`, `decide`, `exact h`); substitution auto-wraps as `(by tac)` when the hole is at a term position like `exact ?_`. Preserves the original proof if it fails.", "parameters": { "type": "object", "additionalProperties": False, From 740ef8a7f9951329d09fcff3c7041d939c1f66c9 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 19:51:41 +0200 Subject: [PATCH 28/91] feat: cache run_lean_check result when proof text is unchanged MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Corpus analysis of 83 interactive runs (results/agent_runs/custom/ interactive-proxy) found that 201/201 — 100% — of run_lean_check calls were made immediately after a write_editable_proof that had already run Lean on the exact same content (commit b2bd74f folded the check into the write). Every one of those 201 calls re-invoked `lake env lean` for a byte-identical evaluation, wasting seconds of Lean startup per round and an entire model turn. Fix: cache the (proof_text, result) pair at the end of every run_lean_check evaluation. On the next run_lean_check call, if self.current_proof_text matches the cached key, return a deep copy of the cached result with `cached: true` and a `note` that explains the call was redundant. This both saves the Lean invocation and trains the model out of the redundant pattern over subsequent turns. Cache is only read/written inside the run_lean_check branch of execute_tool, so write_editable_proof's internal check call populates the cache on fresh content and a follow-up bare run_lean_check hits the fast path. A subsequent write_editable_proof with different content updates self.current_proof_text first, causing the next run_lean_check to miss the cache and re-run Lean, which is the desired behavior. Verified via unit harness: 3 consecutive run_lean_check calls on unchanged text trigger exactly 1 Lean invocation; a content change followed by another run_lean_check triggers a second invocation and re-caches. Co-Authored-By: Claude Opus 4.6 --- harness/interactive_runtime.py | 32 ++++++++++++++++++++++++++++++++ 1 file changed, 32 insertions(+) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index a4f142df..e9f93e73 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -1,5 +1,6 @@ from __future__ import annotations +import copy import json import os import re @@ -52,6 +53,14 @@ def __init__(self, task: dict[str, Any]) -> None: # analysis found 12/29 failing tasks hit this pattern. self._last_details_fp: str | None = None self._same_details_streak: int = 0 + # Cache of the most recent run_lean_check evaluation keyed by the + # exact proof text that produced it. A redundant run_lean_check call + # against unchanged content (corpus analysis found 201/201 — 100% — + # of run_lean_check calls were immediately after a write_editable_proof + # that had already run Lean) returns this cached result instantly + # plus a `cached: true` marker telling the model the call was + # redundant, saving a full Lean invocation and a round. + self._last_eval_cache: tuple[str, dict[str, Any]] | None = None self.paths = RuntimePaths( editable_rel_path=editable_rel_path, theorem_name=str(task["theorem_name"]), @@ -430,6 +439,26 @@ def execute_tool(self, name: str, arguments: dict[str, Any]) -> dict[str, Any]: if name == "write_editable_proof": return self.write_editable_proof(str(arguments.get("content", ""))) if name == "run_lean_check": + # Short-circuit if the proof text is unchanged since the last + # evaluation. Corpus analysis of 83 interactive runs found that + # 201/201 (100%) of run_lean_check calls were made immediately + # after a write_editable_proof that had already run Lean on the + # same content. Returning the cached evaluation saves a full + # Lean invocation (seconds) and teaches the model the call was + # redundant via the `cached: true` marker + note. + if self._last_eval_cache is not None: + cached_text, cached_result = self._last_eval_cache + if cached_text == self.current_proof_text: + reused = copy.deepcopy(cached_result) + reused["cached"] = True + reused["note"] = ( + "Proof text is unchanged since the last evaluation; " + "returning cached result without re-running Lean. " + "`write_editable_proof` already runs the Lean check — " + "a follow-up `run_lean_check` on unchanged content is " + "redundant." + ) + return reused result = self.evaluate_current() # Auto-heal environment errors (missing .olean) once before annotating. if result.get("status") == "failed" and result.get("failure_mode") == "lean_check_failed": @@ -451,6 +480,9 @@ def execute_tool(self, name: str, arguments: dict[str, Any]) -> dict[str, Any]: result["repair_hints"] = existing else: result["repair_hints"] = [existing, guidance] if existing else [guidance] + # Cache the fresh evaluation against the current proof text so a + # follow-up run_lean_check on unchanged content hits the fast path. + self._last_eval_cache = (self.current_proof_text, copy.deepcopy(result)) return result if name == "inspect_lean_goals": return self.inspect_goals() From 4f8632dbcef3a497f6c7224b65251b8d9e8f565b Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 19:55:22 +0200 Subject: [PATCH 29/91] feat: add scope-clarifying hint when search_public_defs returns empty MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Corpus analysis of 83 interactive runs (results/agent_runs/custom/ interactive-proxy) found that 55 of 75 (73%) `search_public_defs` calls returned empty results. Inspection of the failing queries shows the overwhelming majority are for core Lean / Batteries / Mathlib-style lemma names — `add_zero`, `Uint256.add`, `div_pos`, `div_mul_le`, `Nat.div_mul_le`, `sub_add`, `val_mul`, etc. — not for task-specific definitions. `search_public_defs` only indexes the task's implementation_files and specification_files. Mathlib is not a dependency of this project and standard-library searches cannot succeed via this tool. Previously the empty response gave no indication of that scope limit, leaving the model to burn successive rounds on variants of the same library query. Fix: when `matches` is empty, attach a `hint` field that (a) names the scope explicitly, (b) tells the agent Mathlib is unavailable, and (c) redirects it to `exact?`/`apply?`/`rw?` via `try_tactic_at_hole` or to tactics like `simp`/`omega`/`decide` that already know common arithmetic facts. Non-empty responses are unchanged. Co-Authored-By: Claude Opus 4.6 --- harness/interactive_runtime.py | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index e9f93e73..b9622166 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -182,6 +182,32 @@ def search_public_defs(self, query: str, *, limit: int = 20) -> dict[str, Any]: ) if len(matches) >= limit: return {"status": "ok", "query": query_text, "matches": matches, "truncated": True} + if not matches: + # Corpus analysis (83 runs) found 55/75 (73%) of search_public_defs + # calls returned empty — overwhelmingly because agents searched for + # Mathlib / core Lean library names like `Nat.div_mul_le`, + # `add_zero`, `div_pos`, etc. This tool only searches the task's + # public impl/spec files, not the standard library. Surface that + # scope limit explicitly so the agent stops burning rounds on + # library searches. + return { + "status": "ok", + "query": query_text, + "matches": matches, + "truncated": False, + "hint": ( + "No match in the task's public impl/spec files. " + "`search_public_defs` only indexes definitions inside " + "implementation_files and specification_files for this " + "task — it does NOT search Lean core, Batteries, or " + "Mathlib (Mathlib is not a dependency of this project). " + "For standard-library lemmas use `exact?` / `apply?` / " + "`rw?` via `try_tactic_at_hole`, or rely on `simp` / " + "`omega` / `decide` which already know common arithmetic " + "and boolean facts. Retry this tool only with names you " + "expect to be defined in the current task's spec/impl." + ), + } return {"status": "ok", "query": query_text, "matches": matches, "truncated": False} def inspect_goals(self) -> dict[str, Any]: From 7e2df674f2245c4be3880ee6dd92a12cdda2a001 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 19:58:25 +0200 Subject: [PATCH 30/91] feat: encode search_public_defs scope limit upstream in tool surface MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Follow-up to 4f8632d which added a scope-clarifying hint downstream on empty matches. The tool_specs() description and the synthesized interactive-tools prompt both still advertised `search_public_defs` with no scope qualifier, so agents had no way to know upfront that Mathlib / core Lean lemmas are out of scope. Burying the clarification in the empty-response path means every agent has to learn it by issuing at least one wasted query first. Corpus-backed motivation (from 4f8632d): 55 of 75 (73%) queries in the run corpus were for library names like `Nat.div_mul_le`, `add_zero`, `div_pos`, `val_mul`, etc. Encoding the scope limit in the tool description — both in the runtime's tool_specs() and in default_agent's synthesized prompt — prevents the first-query waste by the same reasoning we used in commit 6df49e3 for the `write_editable_proof` / `try_tactic_at_hole` descriptions. Co-Authored-By: Claude Opus 4.6 --- harness/default_agent.py | 2 +- harness/interactive_runtime.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/harness/default_agent.py b/harness/default_agent.py index 0f47c0c6..af0ed98e 100644 --- a/harness/default_agent.py +++ b/harness/default_agent.py @@ -480,7 +480,7 @@ def _synthesized_interactive_tools_prompt() -> str: ("run_lean_check()", "Re-run `lake env lean` without changing the file (redundant immediately after write_editable_proof)."), ("inspect_lean_goals()", "Inspect goal state at explicit `?_` holes. Unsupported if no hole present."), ("try_tactic_at_hole(tactic)", "Replace all `?_` holes with a tactic and check. Pass a raw tactic (e.g. `omega`, `simp_all`, `decide`); substitution auto-wraps as `(by tac)` at term positions like `exact ?_`. Preserves original proof on failure."), - ("search_public_defs(query)", "Search the task's public impl/spec files for def/theorem/lemma names."), + ("search_public_defs(query)", "Search the task's public impl/spec files for def/theorem/lemma names. Does NOT search Lean core / Batteries / Mathlib — use `exact?`/`apply?`/`rw?` via `try_tactic_at_hole` for standard-library lemmas."), ] for name, desc in surface: lines.append(f"- `{name}` — {desc}") diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index b9622166..2e919cfc 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -427,7 +427,7 @@ def tool_specs(self) -> list[dict[str, Any]]: "type": "function", "function": { "name": "search_public_defs", - "description": "Search public implementation/specification files for matching def/theorem/lemma names.", + "description": "Search the task's public implementation/specification files for matching def/theorem/lemma names. Scope is ONLY those task files — it does NOT search Lean core, Batteries, or Mathlib (Mathlib is not a dependency of this project). For standard-library lemmas, prefer `exact?` / `apply?` / `rw?` via `try_tactic_at_hole`, or tactics like `simp` / `omega` / `decide` that already know common arithmetic and boolean facts.", "parameters": { "type": "object", "additionalProperties": False, From f5ceebec004ada5494694af277404971d9bea262 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 20:01:37 +0200 Subject: [PATCH 31/91] feat: stop instructing agents to call run_lean_check after write_editable_proof Commit 740ef8a added a cache that turns the post-write run_lean_check into an instant no-op. That fix surfaced a root-cause: five distinct instruction sites in default_agent.py were *telling* the agent to run the redundant pair, driving the 201/201 (100%) redundant-call pattern from the start of every task: * build_user_prompt interactive workflow line * cut-off recovery nudge ("cut off. Immediately call ... then call run_lean_check") * lean_check_failed repair-message suffix * placeholder_detected / theorem_statement_mismatch retry message * empty-response nudge Since commit b2bd74f folded the Lean check into write_editable_proof, every "then call run_lean_check" is wasted: the proof is unchanged, the cache would short-circuit the call, and the agent burns a round for zero new information. Removing the instruction at the source prevents the pattern from being introduced in the first place; the cache in 740ef8a remains as a belt-and-braces fallback if the model still reaches for run_lean_check on its own. No semantic change to any non-prompt logic; the tool-name bookkeeping sets that still list "run_lean_check" are untouched because the tool remains callable (it is still the canonical way to re-check a proof if the model really wants to). Co-Authored-By: Claude Opus 4.6 --- harness/default_agent.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/harness/default_agent.py b/harness/default_agent.py index af0ed98e..1d168987 100644 --- a/harness/default_agent.py +++ b/harness/default_agent.py @@ -606,7 +606,7 @@ def build_user_prompt(task: dict[str, Any], *, interactive: bool) -> str: "You are in interactive mode with verification tools.\n" "All implementation, specification, and editable proof files are already provided below. " "Do NOT re-read them with read_public_file — start working immediately.\n" - "Workflow: call write_editable_proof with your complete proof file, then call run_lean_check to verify.\n" + "Workflow: call write_editable_proof with your complete proof file — it returns the Lean check result directly, you do NOT need a separate run_lean_check call afterward.\n" "If the check fails, read the failure_class and repair_hints in the result.\n" "For unknown_identifier errors: use search_public_defs to find correct names.\n" "For unsolved_goals: use inspect_lean_goals with a ?_ hole to see the exact goal, then write targeted tactics.\n" @@ -1878,8 +1878,8 @@ def execute_interactive_agent_task( "role": "user", "content": ( "Your response was cut off. Do not over-think. " - "Immediately call write_editable_proof with a simple proof attempt, " - "then call run_lean_check. Keep the proof short." + "Immediately call write_editable_proof with a simple proof attempt " + "(it runs the Lean check automatically). Keep the proof short." ), }) # Reset budget back to configured value after persistent overruns @@ -1939,15 +1939,15 @@ def execute_interactive_agent_task( ) if guidance: repair_msg += f"\nRepair guidance:\n{guidance}\n" - repair_msg += "\nUse write_editable_proof to write a corrected proof, then run_lean_check to verify." + repair_msg += "\nUse write_editable_proof to write a corrected proof (it runs the Lean check automatically; no separate run_lean_check needed)." transcript.append({"role": "assistant", "content": response_text or ""}) transcript.append({"role": "user", "content": repair_msg}) elif failure_mode in ("placeholder_detected", "theorem_statement_mismatch"): retry_msg = ( f"Your response did not produce a valid proof candidate (proof attempt {proof_attempts} of {config.max_attempts}, " f"failure: {failure_mode}).\n" - "Use the write_editable_proof tool to submit the complete editable Lean proof file, " - "then use run_lean_check to verify it.\n" + "Use the write_editable_proof tool to submit the complete editable Lean proof file " + "(it runs the Lean check automatically; no separate run_lean_check needed).\n" "Do not explain or analyze. Use the tools directly.\n" ) transcript.append({"role": "assistant", "content": response_text}) @@ -1957,8 +1957,8 @@ def execute_interactive_agent_task( else: # Empty response or no valid candidate: nudge model to use tools nudge_msg = ( - "You must use the write_editable_proof tool to submit your proof, " - "then call run_lean_check to verify it. Do not respond with text only.\n" + "You must use the write_editable_proof tool to submit your proof " + "(it runs the Lean check automatically). Do not respond with text only.\n" ) transcript.append({"role": "assistant", "content": response_text or ""}) transcript.append({"role": "user", "content": nudge_msg}) @@ -2040,7 +2040,7 @@ def execute_interactive_agent_task( "content": ( "Stop searching and write a proof now. The search_public_defs tool only searches " "this task's implementation and specification files, not the Lean standard library. " - "Use write_editable_proof to submit your best proof attempt, then run_lean_check to verify." + "Use write_editable_proof to submit your best proof attempt (it runs the Lean check automatically)." ), } ) From 6f45164171d8bdcac47dd34b1ea2880edc2c1f51 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 20:09:24 +0200 Subject: [PATCH 32/91] refactor: consolidate repair-hint generation into single builder Corpus analysis of 83 interactive runs found that the _build_repair_guidance pass, appended after _annotate_check_result, duplicated 68% of the hints already emitted by _build_check_hints and in some cases contradicted them (e.g. _build_check_hints said "Do NOT use `split` after simp" while _build_repair_guidance said "Use `split` to case-split on the match" for the same tool_result). Among 160 failed lean_check_failed tool_results with two or more hints: 109 (68.1%) had fully-redundant bulleted blocks, 31 (19.4%) partial overlap, 20 (12.5%) unique. Migrate the three patterns the second pass uniquely covered (failed-to-infer-binder-type, unexpected-token / expected-'by', Function-expected-at / ContractState.storage) into _build_check_hints as pattern-based appends that run regardless of failure class, then delete the now-dead _build_repair_guidance function and its call site. Preserves all previously-emitted unique advice while removing contradictory and verbatim-duplicate hints from the tool-result feedback the model reads. --- harness/interactive_runtime.py | 80 +++++++--------------------------- 1 file changed, 16 insertions(+), 64 deletions(-) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index 2e919cfc..09c31e12 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -496,16 +496,6 @@ def execute_tool(self, name: str, arguments: dict[str, Any]) -> dict[str, Any]: result = self.evaluate_current() if result.get("status") == "failed": result = self._annotate_check_result(result) - # Also add structured repair hints from main's guidance - if result.get("failure_mode") == "lean_check_failed": - guidance = _build_repair_guidance(str(result.get("details", ""))) - if guidance: - existing = result.get("repair_hints", []) - if isinstance(existing, list): - existing.append(guidance) - result["repair_hints"] = existing - else: - result["repair_hints"] = [existing, guidance] if existing else [guidance] # Cache the fresh evaluation against the current proof text so a # follow-up run_lean_check on unchanged content hits the fast path. self._last_eval_cache = (self.current_proof_text, copy.deepcopy(result)) @@ -1259,69 +1249,31 @@ def _build_check_hints(failure_class: str, details: str) -> list[str]: "the tactic so Lean knows the expected type, or (c) use `?_` (named hole) " "with `inspect_lean_goals` to see what Lean expected there before filling it." ) - return hints - -def _build_repair_guidance(details: str) -> str: - """Build structured repair guidance string from Lean error details (from main).""" - hints: list[str] = [] - if "tactic 'split' failed" in details: - hints.append( - "- Do not `split` the final post-state blindly. Prove branch-specific helper theorems first, then use `by_cases` plus `simpa`." - ) - if "no goals to be solved" in details: - hints.append( - "- A previous `simp` likely closed the goal already. Remove trailing tactics after the goal is solved." - ) - if "expected type must not contain free variables" in details: - hints.append( - "- Do not use `native_decide` or `decide` on goals that still contain parameters. First reduce to concrete equalities." - ) - if "unknown constant" in details or "Unknown identifier" in details or "unknown identifier" in details: - hints.append( - "- You are referencing a lemma or constant that does not exist in this Lean 4 environment. " - "Do not guess lemma names. Instead, use `simp` with the relevant definitions, `omega` for arithmetic, " - "or `decide`/`native_decide` for decidable propositions. Remove all references to unknown names." - ) - if "unsolved goals" in details and "match" in details: - hints.append( - "- The remaining goal contains a `match` expression. Use `split` to case-split on the match, " - "then solve each branch separately. If the match is on a ContractResult, try " - "`simp only [...]` to reduce it first, or use `cases` on the matched expression." - ) - if "unsolved goals" in details and "if " in details: - hints.append( - "- The remaining goal contains an `if` expression. Use `by_cases h : ` to split on the condition, " - "then `simp [h, ...]` in each branch. Alternatively, add the condition's hypothesis to the `simp` call." - ) - if "unsolved goals" in details and "match" not in details and "if " not in details: - hints.append( - "- Unsolved goals remain. Check that `simp` is given all necessary definitions and hypotheses." - ) - if "type mismatch" in details: - hints.append( - "- A type mismatch often means the proof term or tactic result does not match the goal. Re-read the spec and ensure your proof targets the correct type." - ) - if "simp made no progress" in details: - hints.append( - "- `simp` made no progress with the given arguments. Add more definitions to unfold, " - "or the simp arguments may already be fully reduced. Try removing the unproductive simp call." - ) + # Pattern-based hints that cut across failure classes. These used to live in + # a separate `_build_repair_guidance` pass that was appended after this + # function ran; corpus analysis showed 68% of its output was semantically + # redundant (sometimes contradictory) with the class-based hints above, so + # that pass was removed. The few patterns it uniquely covered — binder-type + # inference, Lean syntax errors, and the ContractState.storage function + # hint — are preserved here. if "failed to infer binder type" in details: hints.append( - "- Lean cannot infer a binder type. Add explicit type annotations to your helper lemma parameters." + "Lean cannot infer a binder type. Add explicit type annotations to " + "your helper lemma parameters." ) if "unexpected token" in details or "expected 'by'" in details: hints.append( - "- Syntax error. Ensure the theorem body uses `:= by` followed by tactics. " - "Do not use `:=` with a term-mode proof unless you are certain of the syntax." + "Syntax error. Ensure the theorem body uses `:= by` followed by " + "tactics. Do not use `:=` with a term-mode proof unless you are " + "certain of the syntax." ) - if "Function expected at" in details or "unknown identifier" in details: + if "Function expected at" in details: hints.append( - "- Use `s.storage 0` (function application) not `s.storage[0]` or `s.storage.0`. " - "ContractState.storage is a function `Nat → Uint256`." + "Use `s.storage 0` (function application) not `s.storage[0]` or " + "`s.storage.0`. `ContractState.storage` is a function `Nat → Uint256`." ) - return "\n".join(hints) + return hints def tool_result_json(result: dict[str, Any]) -> str: From 804a00a25c00d0486e688486b9425c539bdce469 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 20:17:24 +0200 Subject: [PATCH 33/91] docs: align run_lean_check API-tool description with prompt and runtime MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two earlier commits (740ef8a cache-when-unchanged, f5ceebe remove "then run_lean_check" from natural-language prompts) documented the post-b2bd74f reality that write_editable_proof already runs Lean and a follow-up run_lean_check on unchanged content is redundant. The API-level tool description — which is prepended by most OpenAI-compatible clients to the system prompt and is the most prominent copy the model reads before choosing a tool — still said only "Run the official harness Lean check for the current editable proof," with no mention of either redundancy or the cached-short-circuit behavior. Align it so the model learns the same thing from every source it reads. Corpus evidence reused from 740ef8a: 201/201 (100%) run_lean_check calls in 83 interactive runs followed a write_editable_proof on identical content. No code-path change. --- harness/interactive_runtime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index 09c31e12..db358aa1 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -403,7 +403,7 @@ def tool_specs(self) -> list[dict[str, Any]]: "type": "function", "function": { "name": "run_lean_check", - "description": "Run the official harness Lean check for the current editable proof.", + "description": "Re-run the Lean check on the current editable proof without modifying it. Redundant immediately after `write_editable_proof`, which already runs the check — if the proof text is unchanged since the last evaluation, this call returns a cached result tagged `cached: true` rather than re-invoking Lean.", "parameters": { "type": "object", "additionalProperties": False, From 2a2b61557af6fbbfc9ce3a0b8ce28dea06749466 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 20:28:50 +0200 Subject: [PATCH 34/91] feat: emit repair_hints on failed try_tactic_at_hole Corpus analysis of 83 interactive runs found 76/76 (100%) of failed `try_tactic_at_hole` results returned zero `repair_hints`, even though the same failure_class distribution (45 unknown_identifier, 18 unsolved_goals, 7 type_mismatch, 4 other, 1 no_goals, 1 free_variables) already has targeted hints built by `_build_check_hints` when the identical error surfaces via `run_lean_check` or `write_editable_proof`. Reuse `_build_check_hints(failure_class, details)` on the failure path so `try_tactic_at_hole` returns the same class-based advice as the other two tools. The helper is a pure function over (class, details), so reuse is safe; the change is additive (no existing key removed). This closes the consistency gap across the tool surface and gives the model a concrete next move after a probe tactic fails, instead of just a raw Lean error payload. --- harness/interactive_runtime.py | 21 ++++++++++++++++++--- 1 file changed, 18 insertions(+), 3 deletions(-) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index db358aa1..e1fbbea3 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -260,12 +260,27 @@ def try_tactic_at_hole(self, tactic: str) -> dict[str, Any]: "tactic": tactic.strip(), "details": "Tactic succeeded. Proof updated.", } - return { + # Produce the same class-based repair_hints as run_lean_check / + # write_editable_proof do on failure. Corpus analysis of 83 interactive + # runs found 76/76 (100%) of failed try_tactic_at_hole results returned + # no hints, even though the failure_class distribution (45 unknown_ + # identifier, 18 unsolved_goals, 7 type_mismatch, …) maps onto hints + # already produced by `_build_check_hints` when the same error comes + # from the other two tools. Reusing that helper keeps the advice + # consistent across the tool surface and gives the model a concrete + # next tactic to try instead of a bare error payload. + details = str(evaluation.get("details", "")) + failure_class = classify_failure(details) + result = { "status": "failed", "tactic": tactic.strip(), - "details": evaluation.get("details", "")[:2000], - "failure_class": classify_failure(str(evaluation.get("details", ""))), + "details": details[:2000], + "failure_class": failure_class, } + hints = _build_check_hints(failure_class, details) + if hints: + result["repair_hints"] = hints + return result def evaluate_current(self, *, check_goals: bool = False) -> dict[str, Any]: return self.evaluate_candidate(self.current_proof_text, check_goals=check_goals) From 39adfdf5e3ef5971790ac2674be3fb6856ac5770 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 20:36:01 +0200 Subject: [PATCH 35/91] feat: cap Lean output at 16 KB to bound context consumption MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Corpus analysis of 201 `run_lean_check` results (post noise-strip) found a heavy-tailed size distribution: median 1.4 KB, p95 32 KB, max 136 KB (pre-strip max 300 KB — a single tool call returning ~74 k tokens, enough to blow the entire context budget on one request). The tail is real Lean output: goals whose state contains deeply nested `match`/`if` chains over contract state, often 16 errors each showing a 10 KB goal. Those later errors cascade from the first one, so the first 1-2 errors are typically the only actionable content — the remainder just pushes the context budget and buries the actionable bits. Truncate stripped Lean output at 16 KB with a clear marker suffix, cut on a line boundary so we never slice mid-token. Applied in `evaluate_candidate`, so it covers every tool that surfaces Lean output (`run_lean_check`, `write_editable_proof`, `inspect_lean_goals`). Simulated on the full corpus (661 outputs): - truncated: 34 / 661 (5.1 %) - after max: 16 244 chars (was 136 332) - after p95: 16 121 chars (was 16 242) — 89 % of real cases untouched - <1 KB outputs: all unchanged First errors always preserved; the suffix tells the model output was elided and that Lean errors cascade, so it should address the top of the list first. --- harness/interactive_runtime.py | 37 ++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index e1fbbea3..8259009c 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -361,6 +361,7 @@ def evaluate_candidate(self, candidate_text: str, *, check_goals: bool = False) # error and every other warning kind — only the known-useless # linter goes away. output = _strip_noise_warnings(output) + output = _cap_lean_output(output) if code != 0: return { "status": "failed", @@ -918,6 +919,42 @@ def extract_contract_simp_terms(task: dict[str, Any]) -> list[str]: ) +_LEAN_OUTPUT_CAP_CHARS = 16000 + + +def _cap_lean_output(output: str, max_chars: int = _LEAN_OUTPUT_CAP_CHARS) -> str: + """Bound Lean-check output to a character budget the model can read. + + Corpus analysis of 201 interactive `run_lean_check` results found the + stripped-output distribution was heavy-tailed: median 1.4 KB, p95 32 KB, + max 136 KB (pre-strip max 300 KB — a single call consuming >70 k tokens). + The tail is driven by goals whose state contains deeply nested + `match`/`if` chains over contract state; 16 separate errors each + displaying a 10 KB goal easily adds up to 100 KB. That blows the + context budget and buries the first (usually most actionable) error. + + Truncate to `max_chars` with a clear marker so the first errors stay + intact and the model knows output was elided. 16 KB keeps ~89 % of + real corpus outputs untouched while capping the worst case at about + 4 k tokens. + """ + if len(output) <= max_chars: + return output + # Cut on a line boundary inside the budget so we never slice mid-token. + head = output[:max_chars] + last_newline = head.rfind("\n") + if last_newline > max_chars // 2: + head = head[:last_newline] + dropped = len(output) - len(head) + return ( + f"{head}\n" + f"[... Lean output truncated: {dropped} more characters elided to " + f"keep the tool result within the model's context budget. The first " + f"errors are preserved above — address them before expecting the " + f"later diagnostics to matter, since Lean errors cascade.]" + ) + + def _strip_noise_warnings(output: str) -> str: """Drop `linter.unusedSimpArgs` warning blocks from Lean stdout. From e1af718e5f005e0e35b7f20689d5fe2c7b433921 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 20:51:18 +0200 Subject: [PATCH 36/91] feat: stop re-truncating try_tactic_at_hole details to 2000 chars MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Corpus analysis of 83 interactive runs (78 failed try_tactic_at_hole calls) found 41/78 (53%) hit the hard `details[:2000]` cap, chopping off already-cleaned diagnostic content — goal state, context, line numbers — that `run_lean_check` surfaces in full on the identical Lean failure. The 2000-char truncation was a legacy band-aid from when Lean output averaged ~34 KB, 99% of which was `linter.unusedSimpArgs` noise. The upstream pipeline now: 1. strips that linter via `_strip_noise_warnings` (db69515), and 2. caps total output at 16 KB via `_cap_lean_output` (39adfdf). So by the time `details` reaches this site it is already bounded and noise-free. Re-truncating to 2000 chars now just discards useful signal the model needs to pick the next tactic. Dropping the extra truncation makes all three Lean-backed tools (write_editable_proof, run_lean_check, try_tactic_at_hole) surface the same error fidelity; the 16 KB pipeline cap remains the backstop against runaway growth. Co-Authored-By: Claude Opus 4.6 --- harness/interactive_runtime.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index 8259009c..b4377556 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -269,12 +269,22 @@ def try_tactic_at_hole(self, tactic: str) -> dict[str, Any]: # from the other two tools. Reusing that helper keeps the advice # consistent across the tool surface and gives the model a concrete # next tactic to try instead of a bare error payload. + # `details` is already stripped of `linter.unusedSimpArgs` noise and + # capped at `_LEAN_OUTPUT_CAP_CHARS` (16 KB) by `evaluate_candidate`. + # Earlier code re-truncated to 2000 chars — a legacy band-aid from + # before the upstream cleanup pipeline existed. Corpus analysis of + # the 78 try_tactic_at_hole failures in the current corpus found + # 41/78 (53%) hit that 2000-char cap, chopping off already-cleaned + # diagnostic content (goal state, context, line numbers) that + # run_lean_check would have returned in full on the same failure. + # Drop the extra truncation so all three tools surface the same + # error fidelity; the 16 KB pipeline cap remains the backstop. details = str(evaluation.get("details", "")) failure_class = classify_failure(details) result = { "status": "failed", "tactic": tactic.strip(), - "details": details[:2000], + "details": details, "failure_class": failure_class, } hints = _build_check_hints(failure_class, details) From 53b68c221e346a514d92300a5584394bd16f0aba Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 20:56:20 +0200 Subject: [PATCH 37/91] feat: strip Lean noise and normalize fingerprints for any source path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `_LEAN_BLOCK_HEADER_RE` and `_FP_LINE_COL_RE` hardcoded the source prefix to `CandidateCheck.lean:`, which only fires for the stub file `evaluate_candidate` writes when `check_goals=False` (i.e. the `run_lean_check` / `write_editable_proof` code paths). The `inspect_lean_goals` path runs Lean against the actual editable file — paths like `Benchmark/Generated/Foo/Bar.lean:` — and so fell through both filters untouched. Corpus analysis of 83 interactive runs found: - 32/88 (36%) of inspect_lean_goals outputs still carried full `linter.unusedSimpArgs` warning blocks (verified on the largest leaker: 9.8 KB of noise inside a 24.5 KB output — a 40% reduction with the fix applied). - Stagnation detection's error-text fingerprint left the file path in place for inspect_lean_goals outputs, so the same underlying error surfaced via two different Lean invocations produced two different fingerprints and escaped the no-progress loop detector. Broadening both regexes to `\S+\.lean:LINE:COL:` lets a single strip + fingerprint pipeline cover every tool that reports Lean diagnostics, regardless of which source file Lean named. Co-Authored-By: Claude Opus 4.6 --- harness/interactive_runtime.py | 31 +++++++++++++++++++++++-------- 1 file changed, 23 insertions(+), 8 deletions(-) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index b4377556..5532c3c3 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -922,10 +922,22 @@ def extract_contract_simp_terms(task: dict[str, Any]) -> list[str]: r"|:=\s*" # RHS of let / have := ?_ r")$" ) -_FP_LINE_COL_RE = re.compile(r"CandidateCheck\.lean:\d+:\d+:") +# Lean's diagnostic header format is `:LINE:COL: : `. +# Two code paths reach this regex family: +# 1. `evaluate_candidate` (run_lean_check / write_editable_proof) writes a +# `CandidateCheck.lean` stub and reports errors against that name. +# 2. `inspect_lean_goals` runs Lean against the actual editable file path +# (e.g. `Benchmark/Generated/Foo/Bar.lean`) because it needs `check_goals` +# to introspect the real `?_` hole — no stub wrapper. +# Corpus analysis of 83 runs found 32/88 (36%) of inspect_lean_goals outputs +# still contained `linter.unusedSimpArgs` blocks because the old, hardcoded +# `CandidateCheck\.lean:` regex silently skipped them. Accepting any +# `.lean:LINE:COL:` header lets the same strip + fingerprint logic +# apply to both code paths uniformly. +_FP_LINE_COL_RE = re.compile(r"\S+\.lean:\d+:\d+:") _FP_WS_RE = re.compile(r"\s+") _LEAN_BLOCK_HEADER_RE = re.compile( - r"^CandidateCheck\.lean:\d+:\d+:\s*(error|warning|note|info):" + r"^\S+\.lean:\d+:\d+:\s*(error|warning|note|info):" ) @@ -978,11 +990,14 @@ def _strip_noise_warnings(output: str) -> str: details blob — pure noise from the model's point of view because the actual repair work is always driven by errors, not by this lint. - A block begins at a `CandidateCheck.lean:L:C: warning: This simp - argument is unused:` header and ends at the next Lean diagnostic - header (error/warning/note/info) or end-of-output. Every other - diagnostic kind (including unrelated warnings) is preserved - verbatim. + A block begins at a `.lean:L:C: warning: This simp argument + is unused:` header and ends at the next Lean diagnostic header + (error/warning/note/info) or end-of-output. The `` prefix + is matched generically so outputs from `inspect_lean_goals` (which + runs Lean against the editable file directly, not the + `CandidateCheck.lean` stub) are stripped the same as outputs from + `run_lean_check`. Every other diagnostic kind (including unrelated + warnings) is preserved verbatim. """ if not output or "This simp argument is unused" not in output: return output @@ -1060,7 +1075,7 @@ def _substitute_holes(proof: str, tactic: str) -> str: def _normalize_details_fp(details: str) -> str: """Return a whitespace/line-number-agnostic fingerprint of a Lean error. - Strips the leading `CandidateCheck.lean:LINE:COL:` markers and collapses + Strips the leading `.lean:LINE:COL:` markers and collapses all whitespace runs so two Lean runs that differ only in formatting noise produce the same fingerprint. Truncated to 512 chars — long enough to distinguish genuinely different errors, short enough that From b9ce6aa7ec97c1bb2a0fbb6ea766063b58ecc5bc Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 21:12:56 +0200 Subject: [PATCH 38/91] fix: drop stale 'before run_lean_check' advice in placeholder warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The `placeholder_detected` write-warning told the agent to replace sorry/admit/axiom "before run_lean_check". That instruction is stale on two counts: * commit b2bd74f (2026-04-22 18:29) folded run_lean_check into write_editable_proof, so by the time this warning is returned the Lean check has already run. There is no separate "before" moment. * commit f5ceebe (2026-04-22 20:01) removed every prompt-side hint telling the agent to call run_lean_check after write_editable_proof, aligning the tool surface with the folded architecture. The current wording therefore gestures at a workflow that no longer exists, undercutting the "run_lean_check is not needed after write_editable_proof" description already in the tool surface. Corpus analysis (83 interactive runs, 372 write_editable_proof calls) found this warning fired 13 times, always alongside a Lean verdict for the same proof — so the agent was seeing a "do X before Y" hint at the exact moment it was also reading Y's output, a self-contradiction that adds no new signal. Rewording to "Lean rejects these — replace with a real tactic or a `?_` hole." keeps the actionable advice while dropping the stale sequencing language and pointing the agent at the legitimate `?_` probe workflow. Co-Authored-By: Claude Opus 4.6 --- harness/interactive_runtime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index 5532c3c3..bed909bf 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -107,7 +107,7 @@ def write_editable_proof(self, content: str, *, check: bool = True) -> dict[str, if PLACEHOLDER_PATTERN.search(self.current_proof_text): warnings.append({ "kind": "placeholder_detected", - "detail": "contains `sorry`/`admit`/`axiom`; replace before run_lean_check.", + "detail": "contains `sorry`/`admit`/`axiom`; Lean rejects these — replace with a real tactic or a `?_` hole.", }) if HIDDEN_PROOF_IMPORT_PATTERN.search(self.current_proof_text): warnings.append({ From 0f9b1f85f6bcb0d97c8fdf3fd42bff844dec51b5 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 21:17:32 +0200 Subject: [PATCH 39/91] fix: stop truncating theorem signatures at first `:=` inside `let` `_extract_theorem_signature` used the non-greedy pattern `.*?(?::=)` to find the proof marker, but that stops at the first `:=` token, which in 74 of 88 (84%) benchmark task files lives inside a `let x := ...` binding within the statement. The result is a truncated signature that makes the theorem_statement_mismatch check a false-negative for any edit past that let-binding. All 88 task files use `:= by` as the actual proof start, so anchoring on `:=\s*by\b` cleanly captures the full signature without regressing the 14 let-free cases. --- harness/interactive_runtime.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index bed909bf..310fdcdb 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -779,7 +779,7 @@ def _materialize_workspace(self, workspace: Path) -> None: def _extract_theorem_signature(self, text: str) -> str | None: short_name = self.paths.theorem_name.rsplit(".", 1)[-1] pattern = re.compile( - rf"theorem\s+{re.escape(short_name)}\b(?P.*?)(?::=)", + rf"theorem\s+{re.escape(short_name)}\b(?P.*?):=\s*by\b", re.DOTALL, ) match = pattern.search(text) From 19bb7323c04976cd92715e6d158d102634f510ea Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 21:24:07 +0200 Subject: [PATCH 40/91] fix: redirect `unknown identifier ''` hint away from search_public_defs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Corpus analysis of 83 interactive runs found 20/29 failed tasks (69%) hit "unknown identifier 'simp'" / '' errors at least once. The corpus-wide count of unknown-identifier incidents is dominated by tactic names — 17 `simp`, 10 `simpa`, 6 `omega`, 4 `exact`, 4 `native_decide`, 3 `intro`, 3 `simp_all`, 2 `by_cases` — i.e. ~60% of cases where Lean emits `unknown identifier` are tactic-in-term- position mistakes (`exact simp [...]`, `:= by_cases h`, etc.), not missing definitions. The existing `unknown_identifier` hint (both the per-failure hint and the stagnation-escalation hint) sent the model to `search_public_defs` to look for the name, which cannot help: these are language keywords, not definitions. The model would burn tool calls searching and loop. Detect the tactic-name case and emit a position-specific hint instead ("wrap with `by`, or drop the `exact`/`refine` prefix"), and skip the misleading "Use search_public_defs" hint in that branch. Non-tactic identifiers (missing lemmas, typos like `Nat.div_mul_le`) still get the original advice. --- harness/interactive_runtime.py | 49 +++++++++++++++++++++++++++++++--- 1 file changed, 45 insertions(+), 4 deletions(-) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index 310fdcdb..9871cf5c 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -21,6 +21,25 @@ ) IMPORT_PATTERN = re.compile(r"^\s*import\s+([A-Za-z0-9_.']+)\s*$", re.MULTILINE) +# Well-known Lean 4 tactics that Lean reports as "unknown identifier" when +# written in *term* position (e.g. `exact simp [...]`, `refine omega`, `:= by_cases h`). +# Corpus analysis of 83 runs: 20 of 29 failed tasks (69%) hit this at least once, +# with `simp`, `simpa`, `omega`, `exact`, `native_decide`, `intro`, `simp_all`, and +# `by_cases` accounting for 61 occurrences. The existing `unknown_identifier` hint +# sends the model to `search_public_defs`, which cannot help here — these are +# language keywords, not definitions. +_LEAN_TACTIC_NAMES = frozenset({ + "simp", "simpa", "simp_all", "dsimp", + "omega", "decide", "native_decide", + "exact", "refine", "apply", "intro", "intros", + "constructor", "cases", "induction", "by_cases", "obtain", + "unfold", "rfl", "rw", "rewrite", "ring", "linarith", "nlinarith", + "split", "left", "right", "use", "show", "have", "suffices", "let", + "trivial", "tauto", "contradiction", "assumption", "skip", + "ext", "funext", "congr", "norm_num", "field_simp", "abel", +}) +_UNKNOWN_IDENT_RE = re.compile(r"unknown (?:identifier|constant) '([^']+)'") + @dataclass(frozen=True) class RuntimePaths: @@ -603,7 +622,7 @@ def _annotate_check_result(self, result: dict[str, Any]) -> dict[str, Any]: f"You have failed {total_failures} times across different error classes. " "Step back and reconsider your proof strategy from scratch." ) - escalation = self._build_escalation_hint(failure_class) + escalation = self._build_escalation_hint(failure_class, details) if escalation: hints.append(escalation) @@ -718,7 +737,7 @@ def _filter_seen_hints(self, hints: list[str]) -> list[str]: fresh.append(hint) return fresh - def _build_escalation_hint(self, failure_class: str) -> str | None: + def _build_escalation_hint(self, failure_class: str, details: str = "") -> str | None: """Build an escalation hint when the model is stagnating on a failure class.""" terms = extract_contract_simp_terms(self._task) if terms: @@ -739,6 +758,16 @@ def _build_escalation_hint(self, failure_class: str) -> str | None: f"5. Never use bare `simp [h]` or `unfold ContractName.functionName`" ) if failure_class == "unknown_identifier": + unknown_names = _UNKNOWN_IDENT_RE.findall(details or "") + tactic_hits = [n for n in unknown_names if n in _LEAN_TACTIC_NAMES] + if tactic_hits: + name = tactic_hits[0] + return ( + f"ESCALATION: `{name}` is a TACTIC, not an identifier to search for. " + f"You are writing `{name}` in term position (after `exact`/`refine`/`apply`/`:=` or " + f"inside `⟨ ⟩`). Either wrap with `by` (e.g. `exact by {name} ...`) or drop the " + f"`exact`/`refine` prefix so `{name}` runs in tactic mode." + ) return ( "ESCALATION: Stop guessing identifier names. Use the search_public_defs tool " "to find the exact names from the implementation and specification files." @@ -1241,11 +1270,23 @@ def _build_check_hints(failure_class: str, details: str) -> list[str]: ) return hints if failure_class == "unknown_identifier": - if "decide_True" in details or "decide_False" in details: + unknown_names = _UNKNOWN_IDENT_RE.findall(details) + tactic_hits = [n for n in unknown_names if n in _LEAN_TACTIC_NAMES] + if tactic_hits: + name = tactic_hits[0] + hints.append( + f"`{name}` is a TACTIC, not an identifier. Lean reports `unknown identifier " + f"'{name}'` when a tactic is written in TERM position (after `exact`, `refine`, " + f"`apply`, `:=`, inside `⟨ ⟩`, etc.). Fix: wrap the tactic in `by` — e.g. " + f"`exact by {name} ...` or `:= by {name} ...`. If the goal is already in tactic " + f"mode, remove the `exact`/`refine` prefix and call `{name}` directly." + ) + elif "decide_True" in details or "decide_False" in details: hints.append("CRITICAL: `decide_True` and `decide_False` do not exist. Remove them. Instead, pass precondition hypotheses directly to `simp` - it handles `decide` reduction automatically.") else: hints.append("Use search_public_defs to find correct names from spec/impl files.") - hints.append("Check imports. Standard names: Nat.lt_of_not_ge, Nat.not_le_of_lt.") + if not tactic_hits: + hints.append("Check imports. Standard names: Nat.lt_of_not_ge, Nat.not_le_of_lt.") elif failure_class == "unsolved_goals": hints.append("Use inspect_lean_goals with a ?_ hole to see exact goal state.") if "if " in details or "match" in details: From b24ef3fc8bdd0347818fc510924d901644ea4633 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 21:35:23 +0200 Subject: [PATCH 41/91] fix: redirect `unknown identifier ''` hint away from search_public_defs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Corpus analysis of 83 runs under results/agent_runs/custom/interactive-proxy/ classified unknown-identifier incidents across failed tasks: 51 tactic-in-term-position (addressed by 19bb732) 34 "other" — dominated by camelCase locals: prevOwner (17), owner (6), ... 12 snake_case lemma guesses 12 qualified-name misses 4 var_like already matching a narrower pattern 38 of these (the 34 "other" + 4 var_like) are local-variable-shaped names (camelCase, no dot, no underscore, leading lowercase) affecting 6/29 failed tasks. The previous hint sent the agent to search_public_defs, which cannot resolve local binders and produced budget-wasting loops. New branch detects var-like names and instead points at inspect_lean_goals plus re-reading the theorem signature for the real parameter names, explicitly warning that search_public_defs is wrong for this shape. Tactic-in-term-position path is preserved (checked first), and the fallback "Check imports" hint still fires for snake/qualified misses that match neither category. --- harness/interactive_runtime.py | 18 +++++++++++++++++- 1 file changed, 17 insertions(+), 1 deletion(-) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index 9871cf5c..8eace91a 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -1272,6 +1272,11 @@ def _build_check_hints(failure_class: str, details: str) -> list[str]: if failure_class == "unknown_identifier": unknown_names = _UNKNOWN_IDENT_RE.findall(details) tactic_hits = [n for n in unknown_names if n in _LEAN_TACTIC_NAMES] + var_hits = [ + n for n in unknown_names + if n not in _LEAN_TACTIC_NAMES and "." not in n + and n and n[0].islower() and "_" not in n + ] if tactic_hits: name = tactic_hits[0] hints.append( @@ -1281,11 +1286,22 @@ def _build_check_hints(failure_class: str, details: str) -> list[str]: f"`exact by {name} ...` or `:= by {name} ...`. If the goal is already in tactic " f"mode, remove the `exact`/`refine` prefix and call `{name}` directly." ) + elif var_hits: + name = var_hits[0] + hints.append( + f"`{name}` looks like a LOCAL VARIABLE name, not a definition. " + f"`unknown identifier '{name}'` means `{name}` is not in scope at that point — " + f"it may have been introduced in a different branch, shadowed, or never bound. " + f"Use `inspect_lean_goals` to see the exact binders in scope at each `?_`, and " + f"re-check the theorem signature for the actual parameter names. Do NOT call " + f"search_public_defs for a local-variable-shaped name — it searches definitions, " + f"not binders." + ) elif "decide_True" in details or "decide_False" in details: hints.append("CRITICAL: `decide_True` and `decide_False` do not exist. Remove them. Instead, pass precondition hypotheses directly to `simp` - it handles `decide` reduction automatically.") else: hints.append("Use search_public_defs to find correct names from spec/impl files.") - if not tactic_hits: + if not tactic_hits and not var_hits: hints.append("Check imports. Standard names: Nat.lt_of_not_ge, Nat.not_le_of_lt.") elif failure_class == "unsolved_goals": hints.append("Use inspect_lean_goals with a ?_ hole to see exact goal state.") From 31ff5e89f3c875a28ed713fb2354029329ec2294 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 21:41:52 +0200 Subject: [PATCH 42/91] fix: warn about absent Mathlib when `unknown identifier` is Mathlib-shaped MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Corpus analysis of the 29 failed runs under results/agent_runs/custom/interactive-proxy/ shows 5 tasks (17%) stagnating on guesses that only exist in Mathlib: ethereum.../full_deposit_preserves_partial_gap: sub_eq_sub_right, add_sub_add_right_eq_sub openzeppelin.../preview_deposit_rounds_down: Nat.cast_mk, Nat.div_def, Nat.div_mul_le, Nat.le_div_mul paladin.../weth_claimed_plus_allocated: add_assoc, add_left_comm, sub_eq_add_neg, add_comm safe.../in_list_reachable: not_eq zama.../transfer_conservation: lt_of_add_lt_add_right, Nat.not_ge.mp This workspace has no Mathlib dependency — only core Lean 4, Batteries, and the task's own Benchmark.* modules. The prior fallback hint ("Check imports. Standard names: Nat.lt_of_not_ge, Nat.not_le_of_lt") was misleading because the agent kept searching for imports that cannot be added. Worse, it cited a `Nat.*` name as a "standard" example while the agent was already being burned by the Nat.* namespace. Detect Mathlib-shape names (arithmetic-prefix `add_*`/`sub_*`/`le_*`/..., common exact names like `add_assoc`/`add_comm`, and any `Nat.*`-qualified guess) and emit a hint that: (a) names the absent dependency explicitly, (b) redirects to `omega`/`ring`/`simp arith` tactics, (c) reminds the agent that search_public_defs takes a KEYWORD, not a guessed lemma name. Precedence order preserved: tactic-in-term (19bb732) ranks highest, then var_like (b24ef3f), then decide_True (legacy), then Mathlib, then generic search_public_defs. The misleading "Check imports..." trailer is suppressed when the Mathlib branch fires. --- harness/interactive_runtime.py | 49 ++++++++++++++++++++++++++++++++-- 1 file changed, 47 insertions(+), 2 deletions(-) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index 8eace91a..0bbe2e50 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -40,6 +40,35 @@ }) _UNKNOWN_IDENT_RE = re.compile(r"unknown (?:identifier|constant) '([^']+)'") +# Names that look like Mathlib lemmas (e.g. `add_sub_add_right_eq_sub`, +# `lt_of_add_lt_add_right`, `Nat.div_mul_le`). Corpus analysis of 83 runs +# found 5 of 29 failed tasks (17%) stagnating on such guesses — +# `add_sub_add_right_eq_sub`, `sub_eq_sub_right`, `add_assoc`, `add_comm`, +# `sub_eq_add_neg`, `lt_of_add_lt_add_right`, `Nat.div_mul_le`, +# `Nat.le_div_mul`, `Nat.div_def`, `Nat.cast_mk`, `Nat.not_ge.mp`, … +# This workspace has NO Mathlib dependency, so these searches can never +# succeed; the agent should be pointed at `omega`/`ring`/`simp` instead. +_MATHLIB_SHAPE_PREFIX_RE = re.compile( + r"^(add_|sub_|mul_|div_|mod_|le_|lt_|ge_|gt_|eq_|ne_|not_|neg_|pos_|zero_|one_)" +) +_MATHLIB_SHAPE_EXACT = frozenset({ + "add_assoc", "add_comm", "add_left_comm", + "mul_comm", "mul_assoc", "mul_left_comm", + "sub_zero", "zero_add", "add_zero", "mul_one", "one_mul", + "not_eq", +}) + + +def _is_mathlib_shaped(name: str) -> bool: + if name in _MATHLIB_SHAPE_EXACT: + return True + if _MATHLIB_SHAPE_PREFIX_RE.match(name): + return True + # `Nat.*` lemma guesses are overwhelmingly Mathlib-only in this corpus. + if name.startswith("Nat."): + return True + return False + @dataclass(frozen=True) class RuntimePaths: @@ -1277,6 +1306,10 @@ def _build_check_hints(failure_class: str, details: str) -> list[str]: if n not in _LEAN_TACTIC_NAMES and "." not in n and n and n[0].islower() and "_" not in n ] + mathlib_hits = [ + n for n in unknown_names + if n not in _LEAN_TACTIC_NAMES and _is_mathlib_shaped(n) + ] if tactic_hits: name = tactic_hits[0] hints.append( @@ -1300,8 +1333,20 @@ def _build_check_hints(failure_class: str, details: str) -> list[str]: elif "decide_True" in details or "decide_False" in details: hints.append("CRITICAL: `decide_True` and `decide_False` do not exist. Remove them. Instead, pass precondition hypotheses directly to `simp` - it handles `decide` reduction automatically.") else: - hints.append("Use search_public_defs to find correct names from spec/impl files.") - if not tactic_hits and not var_hits: + if mathlib_hits: + name = mathlib_hits[0] + hints.append( + f"`{name}` is a Mathlib-style lemma name, but this workspace has NO " + f"Mathlib dependency — only core Lean 4, Batteries, and the task's own " + f"`Benchmark.*` modules are importable. Do not keep guessing names like " + f"`add_sub_*`, `sub_eq_*`, `lt_of_*`, or `Nat.div_*` — they will not be " + f"found. For arithmetic goals use `omega` (linear Nat/Int), `ring` " + f"(commutative rings), or `simp arith` directly; for project helpers " + f"use search_public_defs on a keyword, not a guessed lemma name." + ) + else: + hints.append("Use search_public_defs to find correct names from spec/impl files.") + if not tactic_hits and not var_hits and not mathlib_hits: hints.append("Check imports. Standard names: Nat.lt_of_not_ge, Nat.not_le_of_lt.") elif failure_class == "unsolved_goals": hints.append("Use inspect_lean_goals with a ?_ hole to see exact goal state.") From 0c6c7adf9636771db8706ea594bc4b0041aa058e Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 21:43:53 +0200 Subject: [PATCH 43/91] fix: detect `.val` coercion asymmetry in type_mismatch details MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Corpus analysis of 83 runs under results/agent_runs/custom/interactive-proxy/: 6 of 16 `type_mismatch` incidents (38% of this class), spanning 5 of 29 failed tasks (17%), hit the same recurring shape: has type ¬x = 0 : Prop but is expected ¬x.val = 0 : Prop The `.val` projection on Uint256 / Address / Nat is missing on one side. Affected failed tasks: openzeppelin.../preview_deposit_rounds_down openzeppelin.../positive_deposit_mints_positive_shares_under_rate_bound lido.../ceildiv_sandwich safe.../remove_owner_in_list_reachable safe.../add_owner_is_owner_correctness The agent repeatedly tried `exact h` against the asymmetric goal instead of letting simp bridge the coercion. The prior hint ("Unfold definitions to align types") did not name the structural issue, so the agent cycled on `exact`-family tactics until the tool budget ran out. New hint fires only when the "has type … but is expected to have type …" substrings differ by `.val` on exactly one side, and points the agent at: - `by simpa using h` / `by simp_all` (simp bridges the projection) - `by omega` once `.val` is exposed on both sides for arithmetic goals - the underlying injectivity lemma via search_public_defs for negated- equality mismatches Precedence is preserved: the existing `decide` branch runs first (when both match), and the generic "Unfold definitions" trailer is still appended as a fallback. --- harness/interactive_runtime.py | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index 0bbe2e50..2b55306b 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -1358,6 +1358,28 @@ def _build_check_hints(failure_class: str, details: str) -> list[str]: elif failure_class == "type_mismatch": if "decide" in details: hints.append("The goal contains `decide` expressions. Pass all precondition hypotheses to `simp` and it will reduce `decide` automatically. Do NOT try to manually match `decide` types.") + # Detect the recurring Uint256/Address `.val` coercion asymmetry: + # one side of the mismatch has a `.val` projection and the other + # does not. Corpus analysis of 83 runs: 6 of 16 type_mismatch + # incidents across 5 of 29 failed tasks (17%) hit this exact + # pattern — e.g. hypothesis `hx : ¬x = 0` but goal expected + # `¬x.val = 0`. The agent repeatedly retried `exact hx` instead + # of bridging through simp/simpa. + _tm = re.search( + r"has type\s+(.{5,300}?)\s+but is expected to have type\s+(.{5,300})", + details, re.DOTALL, + ) + if _tm and (".val" in _tm.group(1)) != (".val" in _tm.group(2)): + hints.append( + "Your hypothesis differs from the expected type by a `.val` projection " + "(Uint256/Address/Nat). Do NOT keep retrying `exact h` — Lean will not " + "insert the coercion for you. Use `by simpa using h` or `by simp_all` " + "to let simp bridge the `.val`; if the goal is a Prop inequality, " + "`by omega` after exposing `.val` on both sides also works. If the " + "mismatch is inside a negation like `¬x = 0` vs `¬x.val = 0`, rewrite " + "with the underlying injectivity lemma (e.g. `Core.Uint256.val_eq_zero`, " + "`Core.Address.ofNat_eq_zero`) found via search_public_defs." + ) hints.append("Unfold definitions to align types. Check spec matches impl.") elif failure_class == "split_failed": hints.append("Do not split the post-state. Use by_cases with branch-specific helpers.") From 843e45932ba710e92d5c668e67f2c8e49b1b0b94 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 21:46:11 +0200 Subject: [PATCH 44/91] fix: propagate var-like and Mathlib routing into escalation hints MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commits b24ef3f and 31ff5e8 taught `_build_check_hints` to distinguish `unknown identifier ''` (local binder) and `unknown identifier ''` (unavailable dependency) from generic definition misses. However `_build_escalation_hint` — which fires once the agent has repeated the same failure_class multiple times — still emitted only "Stop guessing identifier names. Use the search_public_defs tool" for every non-tactic case. That meant an agent stagnating on `prevOwner` / `add_comm` / `Nat.div_mul_le` kept being told to search_public_defs even though: * local binders will never be found by search_public_defs, and * Mathlib lemmas do not exist in this workspace at all. Corpus evidence (same 83-run dataset): 6/29 failed tasks surface a var-like unknown identifier at escalation time 5/29 failed tasks surface a Mathlib-shaped unknown identifier at escalation Route the escalation hint the same way as the check hint: tactic -> wrap in `by` (existing) var_like -> inspect_lean_goals to see real binders mathlib -> omega / ring / simp arith; search_public_defs takes a keyword else -> original generic "stop guessing" fallback No behavior change when the details string is empty (falls through to the generic fallback as before). --- harness/interactive_runtime.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index 2b55306b..43ea30ee 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -797,6 +797,34 @@ def _build_escalation_hint(self, failure_class: str, details: str = "") -> str | f"inside `⟨ ⟩`). Either wrap with `by` (e.g. `exact by {name} ...`) or drop the " f"`exact`/`refine` prefix so `{name}` runs in tactic mode." ) + var_hits = [ + n for n in unknown_names + if n not in _LEAN_TACTIC_NAMES and "." not in n + and n and n[0].islower() and "_" not in n + ] + if var_hits: + name = var_hits[0] + return ( + f"ESCALATION: `{name}` is a LOCAL VARIABLE shape, not a definition. " + f"search_public_defs cannot find binders — it only searches public " + f"definitions. Call `inspect_lean_goals` on a `?_` hole to see which " + f"binders are in scope, then match the actual parameter names from the " + f"theorem signature." + ) + mathlib_hits = [ + n for n in unknown_names + if n not in _LEAN_TACTIC_NAMES and _is_mathlib_shaped(n) + ] + if mathlib_hits: + name = mathlib_hits[0] + return ( + f"ESCALATION: `{name}` is a Mathlib lemma name, but this workspace has " + f"NO Mathlib dependency. Stop searching for `add_*` / `sub_*` / `Nat.*` " + f"lemmas — they do not exist here. Close arithmetic goals with `omega` " + f"(linear Nat/Int), `ring` (commutative rings), or `simp arith`. For " + f"project helpers call search_public_defs with a KEYWORD, not a guessed " + f"lemma name." + ) return ( "ESCALATION: Stop guessing identifier names. Use the search_public_defs tool " "to find the exact names from the implementation and specification files." From 99fb42e48720bb159332f174c75715b006da7978 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 21:50:07 +0200 Subject: [PATCH 45/91] feat: classify Lean parse errors and emit syntax-targeted hint MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Corpus analysis of 83 runs under results/agent_runs/custom/interactive-proxy/: 21 parse-error events across 14 of 29 failed tasks (48%) surface one of - "unexpected token '…'" (11 incidents; `using`/`at`/`:`/`;`/`|`/…) - "unexpected identifier" (5 incidents) - "expected '{' or indented tactic sequence" (5 incidents) Most of these incidents (19/21) coexist with a classifiable semantic error and so get routed to `unknown_identifier`, `unsolved_goals`, etc. But 2 failed-task incidents collapse to `failure_class=other` because the parse error is the ONLY signal the checker emits: lido.../locked_funds_solvency unexpected token ';' zama.../burn_decreases_supply unexpected token '|' (first failure turn — lost the whole budget starting from a malformed signature) Extend `classify_failure` with a `parse_error` bucket and a dedicated hint that: (a) names the likely causes (tactic-in-term, missing `by`, stray `;`/`|`/`using`, bullet-indentation mismatch), (b) tells the agent to re-read the editable file via read_public_file to see character positions, (c) advises a clean `:= by ` rewrite over token-by-token patching. Placement: `parse_error` comes LAST in classify_failure before "other", so it never steals classification from more specific errors. The existing cross-cutting "Syntax error. Ensure the theorem body uses `:= by`…" catch-all still fires as a secondary hint. --- harness/interactive_runtime.py | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index 43ea30ee..3802fdd8 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -1276,6 +1276,19 @@ def classify_failure(details: str) -> str: return "module_not_found" if "don't know how to synthesize placeholder" in lower: return "synthesis_failed" + # Parse errors (`unexpected token '…'`, `unexpected identifier`, or the + # "expected '{' or indented tactic sequence" shape) indicate malformed + # Lean syntax rather than a semantic proof failure. Corpus analysis of + # 83 runs: 21 failed-run events across 14 tasks contain a parse error + # as one of the error lines, and 2 tasks surface it with no other + # classifiable signal (collapsing to "other"). Giving those cases an + # explicit class unlocks a targeted syntax hint. + if ( + "error: unexpected token" in lower + or "error: unexpected identifier" in lower + or "expected '{' or indented tactic sequence" in lower + ): + return "parse_error" return "other" @@ -1478,6 +1491,18 @@ def _build_check_hints(failure_class: str, details: str) -> list[str]: "the tactic so Lean knows the expected type, or (c) use `?_` (named hole) " "with `inspect_lean_goals` to see what Lean expected there before filling it." ) + elif failure_class == "parse_error": + hints.append( + "Lean rejected the proof before type-checking — the candidate contains " + "invalid Lean 4 syntax. Common causes: (a) a tactic written in term " + "position (e.g. `exact simp [...]` instead of `exact by simp [...]`), " + "(b) a `by` block without an indented tactic on the next line, (c) stray " + "`;`, `|`, or `using` tokens outside a `have`/`simpa` context, (d) a " + "`· simp [...]` branch indented less than the bullet. Re-read the " + "editable file via read_public_file to see the exact character positions " + "in the error, and rewrite the proof body as a clean `:= by ` " + "block — do not try to patch token-by-token." + ) # Pattern-based hints that cut across failure classes. These used to live in # a separate `_build_repair_guidance` pass that was appended after this From d24904db04724d324e1b6e7d1557be584b9c6053 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 21:52:11 +0200 Subject: [PATCH 46/91] fix: stop telling the agent to search_public_defs for EVERY unknown_identifier MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The interactive-mode user prompt previously instructed: "For unknown_identifier errors: use search_public_defs to find correct names." Corpus analysis of 83 runs under results/agent_runs/custom/interactive-proxy/ shows that this blanket advice misleads the agent on 24 of 29 failed tasks (83%). Those tasks hit at least one `unknown identifier '…'` where search_public_defs cannot possibly help because the missing name is one of: - a Lean TACTIC written in term position (simp, simpa, omega, native_decide, by_cases, …) — search_public_defs searches *definitions*, not syntactic keywords. Fix is `by `. - a LOCAL BINDER (prevOwner, owner, hKey, …) that is simply out-of-scope at the use site — binders are not public defs. - a MATHLIB LEMMA (add_comm, add_sub_*, Nat.div_mul_*, …). Mathlib is not a dependency in this workspace. The downstream repair_hints (commits 19bb732, b24ef3f, 31ff5e8) already route each shape correctly, but the initial instruction primes the agent to call search_public_defs first and overwrites any subsequent hint. Rewrite the bullet to: - tell the agent to read repair_hints before searching, - enumerate the three "wrong-shape" cases and their remedies, and - reserve search_public_defs for genuine project-defined names. Passed runs hit this scenario only 6/54 times (11%), so the clearer instruction should not distract correct agents while sharply reducing the misled population in failed runs. --- harness/default_agent.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/harness/default_agent.py b/harness/default_agent.py index 1d168987..f8b0d042 100644 --- a/harness/default_agent.py +++ b/harness/default_agent.py @@ -608,7 +608,7 @@ def build_user_prompt(task: dict[str, Any], *, interactive: bool) -> str: "Do NOT re-read them with read_public_file — start working immediately.\n" "Workflow: call write_editable_proof with your complete proof file — it returns the Lean check result directly, you do NOT need a separate run_lean_check call afterward.\n" "If the check fails, read the failure_class and repair_hints in the result.\n" - "For unknown_identifier errors: use search_public_defs to find correct names.\n" + "For unknown_identifier errors: read the repair_hints before searching — the missing name may be a tactic in term position (wrap in `by`), a local binder (call inspect_lean_goals instead), or a Mathlib lemma (this workspace has NO Mathlib; use `omega`/`ring`/`simp arith`). Only call search_public_defs for a genuine project-defined name from the implementation or spec file.\n" "For unsolved_goals: use inspect_lean_goals with a ?_ hole to see the exact goal, then write targeted tactics.\n" "Fix the specific error, write the corrected proof, and re-check. Do not rewrite from scratch unless the approach is fundamentally wrong.\n" "Only use read_public_file or search_public_defs if you need a definition not shown below.\n" From ec2c2004a13fbd96fbdc6f99ae40dc2c8dc49213 Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 22:00:20 +0200 Subject: [PATCH 47/91] feat: classify `cases`/`induction` on non-inductive targets MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Lean phrases `cases h` on a non-inductive major premise (implication `A → B`, function type, unreduced equality, Bool-valued `==`) as `tactic 'cases' failed, major premise type is not an inductive type` — distinct from the `constructor failed, not an inductive datatype` shape already handled by `constructor_failed`. Corpus analysis of 83 runs at results/agent_runs/custom/interactive-proxy/: the pattern occurs 22 times in one failed task (safe__owner_manager_reach__setup_owners_acyclicity.json), where the agent repeatedly retried `cases h` on an `hkey = SENTINEL → False` implication because the generic \"other\" bucket gave no actionable hint — a 22-cycle stagnation loop with no escape. Split into its own `cases_failed` class covering both `cases` and `induction` variants, with a hint that names the four actual remedies: apply the implication with an argument, `by_cases` for decidable Props, `absurd`/`.elim` for contradiction, or `Bool.ne_iff`/`beq_iff_eq` to rewrite Bool equalities before case-splitting. --- harness/interactive_runtime.py | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/harness/interactive_runtime.py b/harness/interactive_runtime.py index 3802fdd8..34e45a04 100644 --- a/harness/interactive_runtime.py +++ b/harness/interactive_runtime.py @@ -1272,6 +1272,19 @@ def classify_failure(details: str) -> str: return "omega_failed" if "tactic 'constructor' failed" in details and "not an inductive datatype" in lower: return "constructor_failed" + # `cases` / `induction` on a non-inductive target (e.g. an implication + # `A → B`, a function, or a Prop that isn't a recognised eliminator) is a + # distinct failure mode from `constructor` — Lean phrases it as "major + # premise type is not an inductive type". Corpus analysis: 22 incidents in + # 1 failed task (setup_owners_acyclicity) all repeating the same `cases h` + # on an implication, because the generic "other" bucket gave no actionable + # hint. Split into its own class so the hint can cover `intro` / + # `by_cases` / `absurd` — the actual remedies for this shape. + if ( + ("tactic 'cases' failed" in details or "tactic 'induction' failed" in details) + and "not an inductive type" in lower + ): + return "cases_failed" if "unknown module prefix" in lower: return "module_not_found" if "don't know how to synthesize placeholder" in lower: @@ -1476,6 +1489,21 @@ def _build_check_hints(failure_class: str, details: str) -> list[str]: "hypotheses if the goal is `A → B`, or (c) use `refine ⟨_, _⟩` / " "`exact ⟨_, _⟩` if you already know the witnesses for an And/Exists." ) + elif failure_class == "cases_failed": + hints.append( + "`cases` / `induction` requires an inductive-type term. The major " + "premise here is NOT inductive — most commonly it's an implication " + "`A → B` (the agent tried `cases h` where `h : A → B`), a function " + "type, or a raw equality between non-inductive values. Remedies: " + "(a) if the hypothesis is `A → B`, first produce `A` and apply it " + "(`have hb := h ha`) or `intro` if the implication is the goal; " + "(b) for a decidable Prop use `by_cases h : P` instead of `cases`; " + "(c) to derive `False` from a contradictory hypothesis use " + "`exact absurd … h` or `exact (h …).elim`; (d) for `Bool`-valued " + "equalities like `x == y = true`, rewrite with `Bool.ne_iff` / " + "`beq_iff_eq` before case-splitting. Do NOT keep retrying `cases` " + "on the same target." + ) elif failure_class == "module_not_found": hints.append( "The import path you requested is not available in this workspace. In " From 77cb481e57e835325bec8d2d67ba32b5c0a862ee Mon Sep 17 00:00:00 2001 From: Claude Date: Wed, 22 Apr 2026 22:01:53 +0200 Subject: [PATCH 48/91] feat: detect case-labelled unsolved_goals and stop suggesting re-split MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When Lean reports unsolved goals with a `case