diff --git a/experiment/README.md b/experiment/README.md
index edf4366..ddd999c 100644
--- a/experiment/README.md
+++ b/experiment/README.md
@@ -17,6 +17,16 @@ For sweep narratives and headline numbers, see [`results/README.md`](results/REA
 - [Ollama](https://ollama.com/) at `http://localhost:11434`
 - Model: `qwen2.5:7b-instruct-q4_K_M` (see `SMALL_JUDGE_MODEL` in `run_pilot.py`)
 
+**Optional** — only for OpenAI judge arms:
+
+- `OPENAI_API_KEY` in the environment
+- OpenAI judge models: `gpt-5.5`, `gpt-5.4`, `gpt-5.4-mini`, `gpt-5.4-nano` (see `OPENAI_JUDGE_MODELS` in `run_pilot.py`)
+
+**Optional** — only for Gemini judge arms:
+
+- `GOOGLE_API_KEY` in the environment
+- Gemini judge models: `gemini-2.5-pro`, `gemini-2.5-flash`, `gemini-2.5-flash-lite` (see `GEMINI_JUDGE_MODELS` in `run_pilot.py`)
+
 ```bash
 claude --version          # must work before running sweeps
 ollama pull qwen2.5:7b-instruct-q4_K_M   # only if using echo-small-judge
@@ -29,7 +39,7 @@ cd experiment
 python3 -m venv .venv
 source .venv/bin/activate
 
-pip install "langchain-core>=0.3,<0.4" "langchain>=0.3,<0.4" "langchain-ollama>=0.2,<0.4"
+pip install "langchain-core>=0.3,<0.4" "langchain>=0.3,<0.4" "langchain-ollama>=0.2,<0.4" "langchain-openai>=0.3" "langchain-google-genai>=2.0"
 ```
 
 ## Quick start (1 task)
@@ -52,7 +62,7 @@ Tasks **100–163** (64 tasks), same range the team used for main results:
 python run_pilot.py --start 100 --n-tasks 64
 ```
 
-Default runs **all 7 arms** → 448 model calls. Expect long runtime and Claude CLI usage. Confirm the 1-task run first.
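+Default runs all arms, including the OpenAI and Gemini judge arms, so set `OPENAI_API_KEY` and `GOOGLE_API_KEY` or pass `--arms` to leave those arms out. Expect long runtime, Claude CLI usage, and provider API usage for the judge arms. Confirm the 1-task run first.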
 
 Subset of arms:
 
@@ -60,6 +70,24 @@ Subset of arms:
 python run_pilot.py --start 100 --n-tasks 5 --arms haiku-only,sonnet-only,echo-small-judge
 ```
 
+Compare OpenAI judges:
+
+```bash
+python run_pilot.py --start 100 --n-tasks 10 --arms haiku-only,sonnet-only,echo-judge-openai,echo-judge-openai-gpt-5.4,echo-judge-openai-gpt-5.4-mini,echo-judge-openai-gpt-5.4-nano
+```
+
+Compare Gemini judges:
+
+```bash
+python run_pilot.py --start 100 --n-tasks 10 --arms haiku-only,sonnet-only,echo-judge-gemini-pro,echo-judge-gemini-flash,echo-judge-gemini-flash-lite
+```
+
+Compare all provider judges:
+
+```bash
+python run_pilot.py --start 100 --n-tasks 10 --arms echo-judge-openai,echo-judge-openai-gpt-5.4,echo-judge-openai-gpt-5.4-mini,echo-judge-openai-gpt-5.4-nano,echo-judge-gemini-pro,echo-judge-gemini-flash,echo-judge-gemini-flash-lite
+```
+
 ## CLI reference
 
 | Flag | Default | Meaning |
@@ -78,6 +106,13 @@ python run_pilot.py --start 100 --n-tasks 5 --arms haiku-only,sonnet-only,echo-s
 | `echo-ast` | Two Haiku personas; escalate if AST structure differs |
 | `echo-judge` | Two Haiku personas; Haiku judges equivalence |
 | `echo-small-judge` | Two Haiku personas; local Qwen 7B judge (Ollama) |
+| `echo-judge-openai` | Two Haiku personas; GPT-5.5 judges equivalence via OpenAI |
+| `echo-judge-openai-gpt-5.4` | Two Haiku personas; GPT-5.4 judges equivalence via OpenAI |
+| `echo-judge-openai-gpt-5.4-mini` | Two Haiku personas; GPT-5.4 mini judges equivalence via OpenAI |
+| `echo-judge-openai-gpt-5.4-nano` | Two Haiku personas; GPT-5.4 nano judges equivalence via OpenAI |
+| `echo-judge-gemini-pro` | Two Haiku personas; Gemini 2.5 Pro judges equivalence |
+| `echo-judge-gemini-flash` | Two Haiku personas; Gemini 2.5 Flash judges equivalence |
+| `echo-judge-gemini-flash-lite` | Two Haiku personas; Gemini 2.5 Flash-Lite judges equivalence |
 | `echo-oracle` | Two Haiku personas; escalate only if both fail tests (upper bound, not deployable) |
 
 ## Output format
@@ -91,6 +126,21 @@ Each sweep writes `results/<timestamp>_n<tasks>.jsonl`. One line per run:
 - **`passed`** — automated HumanEval tests succeeded
 - **`sub_calls`** — model calls for that task (Echo escalate → typically 3)
 
+Aggregate metrics:
+
+| Metric | Meaning |
+|--------|---------|
+| `n` | Number of tasks run for that arm |
+| `pass_rate` | Fraction of final selected implementations that passed HumanEval tests |
+| `escalation_rate` | Fraction of tasks where the arm escalated to Sonnet because its agreement check rejected the cheap Haiku pair |
+| `mean_wall_seconds` | Average elapsed seconds per task for that arm |
+| `total_sub_calls` | Total counted model calls for that arm |
+| `mean_sub_calls` | Average counted model calls per task |
+| `failures` | Number of tasks that failed tests or hit provider/harness errors |
+| `top_failure_details` | Most common failure reasons, useful for spotting provider limits, auth errors, syntax errors, and timeouts |
+
+For provider judge arms, a `sub_calls` value of 3 means two Haiku candidates plus one judge call, with the judge accepting the pair. A value of 4 means the judge ruled the candidates non-equivalent and the arm escalated to Sonnet.
+
 ## View existing results (no run)
 
 Committed JSONL files are under `results/`. Summary and interpretation: [`results/README.md`](results/README.md).
@@ -108,6 +158,8 @@ jq -s 'group_by(.arm) | map({arm: .[0].arm, n: length, passed: (map(select(.pass
 | `claude: command not found` | Install [Claude Code](https://docs.anthropic.com/en/docs/claude-code) and ensure `claude` is on your `PATH` |
 | `claude --print failed` | Log in to Claude Code; confirm Max/subscription access |
 | `echo-small-judge requires langchain-ollama` | `pip install "langchain-ollama>=0.2,<0.4"` or omit that arm |
+| `echo-judge-openai requires langchain-openai` | `pip install "langchain-openai>=0.3"` and set `OPENAI_API_KEY` |
+| `echo-judge-gemini requires langchain-google-genai` | `pip install "langchain-google-genai>=2.0"` and set `GOOGLE_API_KEY` |
 | Ollama connection errors | Start Ollama; `ollama pull qwen2.5:7b-instruct-q4_K_M`; check `SMALL_JUDGE_BASE_URL` in `run_pilot.py` |
 | Very slow runs | Expected — each call spawns `claude --print` (~seconds overhead per call) |
 
diff --git a/experiment/benchmarks/bbh.py b/experiment/benchmarks/bbh.py
index 582329b..d2aec32 100644
--- a/experiment/benchmarks/bbh.py
+++ b/experiment/benchmarks/bbh.py
@@ -50,6 +50,29 @@
 ]
 
 _CHOICE_LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+BINARY_CHOICE_TEXTS = {
+    "yes": ("Yes", "No"),
+    "no": ("Yes", "No"),
+    "true": ("True", "False"),
+    "false": ("True", "False"),
+    "valid": ("valid", "invalid"),
+    "invalid": ("valid", "invalid"),
+}
+
+
+def _clean_label(label: str) -> str:
+    return str(label).strip().strip("()").upper()
+
+
+def _clean_answer_text(text: str) -> str:
+    return re.sub(r"\s+", " ", str(text).strip().lower())
+
+
+def _synthetic_binary_choices(target: str) -> dict[str, list[str]] | None:
+    texts = BINARY_CHOICE_TEXTS.get(_clean_answer_text(target))
+    if texts is None:
+        return None
+    return {"label": ["A", "B"], "text": list(texts)}
 
 
 def _format_choices(choices: dict[str, Any]) -> str:
@@ -88,41 +111,98 @@ def normalize_gold(target: str) -> str:
     return letter
 
 
+def normalize_gold_for_choices(target: str, choices: dict[str, Any]) -> str:
+    """Normalize a gold target against a concrete choice list.
+
+    Some BBH configs have letter targets ("C"); binary configs can have text
+    targets ("Yes"/"No"). Convert either form to the matching choice label.
+    """
+    labels = [_clean_label(label) for label in choices.get("label") or []]
+    texts = [_clean_answer_text(text) for text in choices.get("text") or []]
+    # A choice list may carry text without explicit labels; fall back to
+    # positional A, B, C... so indexing `labels` by texts.index() below can't
+    # raise IndexError (mirrors the _CHOICE_LETTERS fallback in score_bbh).
+    if not labels and texts:
+        labels = list(_CHOICE_LETTERS[: len(texts)])
+    target_text = _clean_answer_text(target)
+
+    # Prefer label parsing first for single-letter targets ("C"), so a decoy
+    # choice whose *text* happens to be a bare letter can't remap the gold.
+    # Binary/textual targets ("Yes"/"No") make extract_choice() return None
+    # and fall through to text matching below.
+    letter = extract_choice(str(target))
+    if letter is not None and letter in labels:
+        return letter
+
+    if target_text in texts:
+        return labels[texts.index(target_text)]
+
+    raise ValueError(f"Could not parse gold target: {target!r}")
+
+
 def extract_choice(text: str) -> str | None:
     """Parse a multiple-choice letter from model output.
 
-    Tries explicit patterns first, then the last standalone A–Z near the end.
+    High-confidence patterns (explicit "Answer: X" / "the answer is X") are
+    matched against the FULL text, so an answer stated early and followed by
+    trailing reasoning is still recovered. Weak positional fallbacks (a lone
+    "(A)" line, a trailing single letter) are matched only against the last
+    few lines, where a stray capital is least likely to be prose.
+
+    Every capture not anchored by a required closing ")" is followed by a
+    ``(?![A-Za-z])`` guard: under ``re.I`` ``[A-Z]`` also matches lowercase,
+    so without it "the answer is straightforward" would wrongly yield "S".
+
+    Among the high-confidence patterns the *latest* match in the text wins
+    (recency across ALL pattern families, by source offset) — so a chain of
+    thought like "Answer: A ... therefore the answer is C" resolves to C even
+    though the two declarations use different phrasings / different families.
+
     Returns uppercase A–Z or None if unparseable.
     """
     if not text or not text.strip():
         return None
 
-    patterns = [
-        r"(?im)^\s*answer\s*:\s*\(?\s*([A-Z])\s*\)?\s*\.?\s*$",
-        r"(?im)^\s*answer\s*:\s*\(?\s*([A-Z])\s*\)?",
+    body = text.strip()
+    # High-confidence: explicit answer declarations, anywhere in the output.
+    high_confidence = [
+        r"(?im)^\s*answer\s*:\s*\(?\s*([A-Z])(?![A-Za-z])\s*\)?\s*\.?\s*$",
+        r"(?im)^\s*answer\s*:\s*\(?\s*([A-Z])(?![A-Za-z])\s*\)?",
+        r"(?i)\b(?:final\s+answer|answer|correct\s+answer|correct\s+choice)\s*(?:is|:)\s*\(?\s*([A-Z])(?![A-Za-z])\s*\)?",
+        r"(?i)\b(?:option|choice)\s+\(?\s*([A-Z])(?![A-Za-z])\s*\)?\s+(?:is\s+)?(?:correct|best|right)",
+        r"(?i)\b(?:therefore|so|thus),?\s+\(?\s*([A-Z])(?![A-Za-z])\s*\)?\s+(?:is\s+)?(?:correct|best|right)",
+        r"(?i)\b(?:therefore|so|thus),?\s+(?:the\s+)?(?:answer|correct\s+answer|choice)\s+is\s+\(?\s*([A-Z])(?![A-Za-z])\s*\)?",
+    ]
+    # Pick the high-confidence match with the largest source offset, so the
+    # final declaration wins regardless of which pattern family caught it.
+    best_pos, best_letter = -1, None
+    for pat in high_confidence:
+        for m in re.finditer(pat, body):
+            if m.start(1) > best_pos:
+                best_pos, best_letter = m.start(1), m.group(1)
+    if best_letter is not None:
+        return best_letter.upper()
+
+    # Weak positional fallbacks: only trusted near the end of the output.
+    tail = "\n".join(body.splitlines()[-5:])
+    weak = [
         r"(?im)^\s*\(?\s*([A-Z])\s*\)\s*$",
-        r"(?im)correct\s+(?:answer\s+is|choice\s+is)\s*\(?\s*([A-Z])\s*\)?",
-        r"\(\s*([A-Z])\s*\)",
+        r"(?i)\b(?:option|choice|answer)\s+is\s+\(?\s*([A-Z])(?![A-Za-z])\s*\)?",
+        r"(?i)\b([A-Z])(?![A-Za-z])\s*\.?\s*$",
     ]
-    for pat in patterns:
-        matches = re.findall(pat, text)
+    for pat in weak:
+        matches = re.findall(pat, tail)
         if matches:
             return matches[-1].upper()
-
-    tail = "\n".join(text.strip().splitlines()[-5:])
-    for pat in (
-        r"(?i)\b(?:option|choice|answer)\s+is\s+\(?\s*([A-Z])\s*\)?",
-        r"(?i)\b([A-Z])\s*\.?\s*$",
-    ):
-        m = re.search(pat, tail.strip())
-        if m:
-            return m.group(1).upper()
     return None
 
 
 def score_bbh(model_output: str, task: dict) -> tuple[bool, str]:
     """Grade a BBH response against task['gold']."""
     pred = extract_choice(model_output)
+    valid_labels = set(task.get("choice_labels") or _CHOICE_LETTERS)
+    if pred not in valid_labels:
+        pred = extract_choice_text(model_output, task)
     if pred is None:
         return False, "unparseable"
     gold = task["gold"]
@@ -131,17 +211,49 @@ def score_bbh(model_output: str, task: dict) -> tuple[bool, str]:
     return False, f"expected {gold} got {pred}"
 
 
+def extract_choice_text(text: str, task: dict) -> str | None:
+    """Map answer text like 'Answer: No' to its choice label for binary tasks."""
+    choices = task.get("choices") or {}
+    labels = [_clean_label(label) for label in choices.get("label") or []]
+    texts = [_clean_answer_text(choice_text) for choice_text in choices.get("text") or []]
+    if not labels or not texts:
+        return None
+
+    # Matched against the FULL text (not just the tail): an answer stated
+    # early followed by trailing reasoning must still resolve. Over-matching
+    # is harmless here because a capture is only accepted if it is in `texts`.
+    body = str(text).strip()
+    patterns = [
+        r"(?im)^\s*answer\s*:\s*(.+?)\s*\.?\s*$",
+        r"(?i)\b(?:final\s+answer|answer|correct\s+answer)\s*(?:is|:)\s*(.+?)(?:\.|\n|$)",
+    ]
+    for pat in patterns:
+        matches = re.findall(pat, body)
+        for match in reversed(matches):
+            answer = _clean_answer_text(str(match).strip().strip("()"))
+            if answer in texts:
+                return labels[texts.index(answer)]
+    return None
+
+
 def _row_to_task(subtask: str, index: int, row: dict) -> dict:
-    gold = normalize_gold(row["target"])
+    choices = row.get("choices") or _synthetic_binary_choices(row["target"])
+    if choices is None:
+        raise ValueError(
+            f"BBH subtask {subtask!r} is not multiple-choice or binary; "
+            "this harness only supports choice-label scoring."
+        )
+    gold = normalize_gold_for_choices(row["target"], choices)
     return {
         "task_id": f"bbh/{subtask}/{index}",
-        "prompt": format_prompt(row["question"], row["choices"]),
+        "prompt": format_prompt(row["question"], choices),
         "gold": gold,
+        "choice_labels": [_clean_label(label) for label in choices.get("label") or []],
         "benchmark": "bbh",
         "subtask": subtask,
         # Echo arms use task["prompt"]; keep raw fields for debugging.
         "question": row["question"],
-        "choices": row["choices"],
+        "choices": choices,
     }
 
 
diff --git a/experiment/pyproject.toml b/experiment/pyproject.toml
index cb1bc0b..86052b0 100644
--- a/experiment/pyproject.toml
+++ b/experiment/pyproject.toml
@@ -8,4 +8,6 @@ dependencies = [
   "langchain>=0.3,<0.4",
   "langchain-ollama>=0.2,<0.4",
   "datasets>=2.14",
+  "langchain-openai>=0.3",
+  "langchain-google-genai>=2.0",
 ]
diff --git a/experiment/run_pilot.py b/experiment/run_pilot.py
index d811104..1c105af 100644
--- a/experiment/run_pilot.py
+++ b/experiment/run_pilot.py
@@ -20,8 +20,10 @@
 import ast
 import json
 import re
+import sys
 import textwrap
 import time
+from collections import Counter
 from dataclasses import asdict, dataclass
 from datetime import datetime, timezone
 from pathlib import Path
@@ -132,7 +134,12 @@ def run_tests(implementation: str, task: dict) -> tuple[bool, str]:
     has_top_level_def = bool(re.search(r"^def\s", body, re.MULTILINE))
 
     if has_top_level_def:
-        program = body + "\n" + task["test"] + f"\ncheck({task['entry_point']})\n"
+        prompt_imports = "\n".join(
+            ln for ln in task["prompt"].splitlines()
+            if ln.startswith("from ") or ln.startswith("import ")
+        )
+        preamble = prompt_imports + "\n" if prompt_imports else ""
+        program = preamble + body + "\n" + task["test"] + f"\ncheck({task['entry_point']})\n"
     else:
         # Body is a function-body fragment. Two sub-cases via ast.parse:
         #   - bare expression (e.g. `[x+1 for x in l]`): wrap as `return <expr>`
@@ -147,7 +154,7 @@ def run_tests(implementation: str, task: dict) -> tuple[bool, str]:
 
     try:
         result = _sp.run(
-            ["python3", "-c", full],
+            [sys.executable, "-c", full],
             capture_output=True,
             text=True,
             timeout=TEST_TIMEOUT_SECONDS,
@@ -297,6 +304,17 @@ def arm_echo_judge(task: dict) -> tuple[str, int]:
 #   qwen2.5:7b-instruct — middle ground we're now testing
 SMALL_JUDGE_MODEL = "qwen2.5:7b-instruct-q4_K_M"
 SMALL_JUDGE_BASE_URL = "http://localhost:11434"
+OPENAI_JUDGE_MODELS = {
+    "gpt-5.5": "gpt-5.5",
+    "gpt-5.4": "gpt-5.4",
+    "gpt-5.4-mini": "gpt-5.4-mini",
+    "gpt-5.4-nano": "gpt-5.4-nano",
+}
+GEMINI_JUDGE_MODELS = {
+    "gemini-2.5-pro": "gemini-2.5-pro",
+    "gemini-2.5-flash": "gemini-2.5-flash",
+    "gemini-2.5-flash-lite": "gemini-2.5-flash-lite",
+}
 
 
 def arm_echo_small_judge(task: dict) -> tuple[str, int]:
@@ -324,6 +342,74 @@ def arm_echo_small_judge(task: dict) -> tuple[str, int]:
     return call_with_persona(sonnet, PERSONA_A, task["prompt"]), 3
 
 
+def arm_echo_judge_openai_model(task: dict, model_name: str) -> tuple[str, int]:
+    """Echo with a cross-family OpenAI judge model.
+
+    Same structure as echo-judge but the agreement call goes to OpenAI
+    instead of Haiku, removing same-family bias from the signal.
+    Requires OPENAI_API_KEY in the environment.
+    """
+    try:
+        from langchain_openai import ChatOpenAI
+    except ImportError as exc:
+        raise RuntimeError(
+            "echo-judge-openai requires langchain-openai. "
+            "Run: pip install langchain-openai"
+        ) from exc
+    pair = _haiku_pair(task["prompt"])
+    openai_judge = ChatOpenAI(model=model_name, temperature=0)
+    if judge_agree(pair["a"], pair["b"], task, judge=openai_judge):
+        return pair["a"], 3
+    sonnet = ChatClaudeCode(model="sonnet")
+    return call_with_persona(sonnet, PERSONA_A, task["prompt"]), 4
+
+
+def arm_echo_judge_openai(task: dict) -> tuple[str, int]:
+    """Default OpenAI judge arm, currently GPT-5.5."""
+    return arm_echo_judge_openai_model(task, OPENAI_JUDGE_MODELS["gpt-5.5"])
+
+
+def arm_echo_judge_openai_gpt_5_4(task: dict) -> tuple[str, int]:
+    return arm_echo_judge_openai_model(task, OPENAI_JUDGE_MODELS["gpt-5.4"])
+
+
+def arm_echo_judge_openai_gpt_5_4_mini(task: dict) -> tuple[str, int]:
+    return arm_echo_judge_openai_model(task, OPENAI_JUDGE_MODELS["gpt-5.4-mini"])
+
+
+def arm_echo_judge_openai_gpt_5_4_nano(task: dict) -> tuple[str, int]:
+    return arm_echo_judge_openai_model(task, OPENAI_JUDGE_MODELS["gpt-5.4-nano"])
+
+
+def arm_echo_judge_gemini_model(task: dict, model_name: str) -> tuple[str, int]:
+    """Echo with a Gemini judge model via Google AI Studio/Gemini API."""
+    try:
+        from langchain_google_genai import ChatGoogleGenerativeAI
+    except ImportError as exc:
+        raise RuntimeError(
+            "echo-judge-gemini requires langchain-google-genai. "
+            "Run: pip install langchain-google-genai"
+        ) from exc
+    pair = _haiku_pair(task["prompt"])
+    gemini_judge = ChatGoogleGenerativeAI(model=model_name, temperature=0)
+    if judge_agree(pair["a"], pair["b"], task, judge=gemini_judge):
+        return pair["a"], 3
+    sonnet = ChatClaudeCode(model="sonnet")
+    return call_with_persona(sonnet, PERSONA_A, task["prompt"]), 4
+
+
+def arm_echo_judge_gemini_pro(task: dict) -> tuple[str, int]:
+    return arm_echo_judge_gemini_model(task, GEMINI_JUDGE_MODELS["gemini-2.5-pro"])
+
+
+def arm_echo_judge_gemini_flash(task: dict) -> tuple[str, int]:
+    return arm_echo_judge_gemini_model(task, GEMINI_JUDGE_MODELS["gemini-2.5-flash"])
+
+
+def arm_echo_judge_gemini_flash_lite(task: dict) -> tuple[str, int]:
+    return arm_echo_judge_gemini_model(task, GEMINI_JUDGE_MODELS["gemini-2.5-flash-lite"])
+
+
 def arm_echo_oracle(task: dict) -> tuple[str, int]:
     """Oracle agreement signal: ground-truth test pass/fail.
 
@@ -352,6 +438,13 @@ def arm_echo_oracle(task: dict) -> tuple[str, int]:
     "echo-ast": arm_echo_ast,
     "echo-judge": arm_echo_judge,
     "echo-small-judge": arm_echo_small_judge,
+    "echo-judge-openai": arm_echo_judge_openai,
+    "echo-judge-openai-gpt-5.4": arm_echo_judge_openai_gpt_5_4,
+    "echo-judge-openai-gpt-5.4-mini": arm_echo_judge_openai_gpt_5_4_mini,
+    "echo-judge-openai-gpt-5.4-nano": arm_echo_judge_openai_gpt_5_4_nano,
+    "echo-judge-gemini-pro": arm_echo_judge_gemini_pro,
+    "echo-judge-gemini-flash": arm_echo_judge_gemini_flash,
+    "echo-judge-gemini-flash-lite": arm_echo_judge_gemini_flash_lite,
     "echo-oracle": arm_echo_oracle,
 }
 
@@ -374,12 +467,18 @@ def run_one(task: dict, arm_name: str, arm_fn: Callable[[dict], tuple[str, int]]
     t0 = time.perf_counter()
     try:
         output, sub_calls = arm_fn(task)
-        passed, detail = run_tests(output, task)
     except Exception as e:
         return TaskResult(
             task["task_id"], arm_name, False, f"{type(e).__name__}: {str(e)[:200]}",
             time.perf_counter() - t0, 0,
         )
+    try:
+        passed, detail = run_tests(output, task)
+    except Exception as e:
+        return TaskResult(
+            task["task_id"], arm_name, False, f"test runner {type(e).__name__}: {str(e)[:200]}",
+            time.perf_counter() - t0, sub_calls,
+        )
     return TaskResult(task["task_id"], arm_name, passed, detail, time.perf_counter() - t0, sub_calls)
 
 
@@ -391,10 +490,13 @@ def summarize(results: list[TaskResult]) -> dict:
     for arm, rs in by_arm.items():
         n = len(rs)
         passed = sum(1 for r in rs if r.passed)
-        # "Escalation rate" for Echo arms: fraction of tasks that used >2 calls
-        # (oracle and lexical both escalate when sub_calls jumps from 2 to 3).
-        # For non-Echo arms this is just an extra metric that happens to be 0%.
-        escalated = sum(1 for r in rs if r.sub_calls > 2)
+        failure_details = Counter(r.detail for r in rs if not r.passed)
+        # Escalation = Sonnet was called. Threshold depends on arm:
+        #   lexical/ast/oracle/small-judge: accept=2, escalate=3 -> threshold >2
+        #   judge/provider-judge: accept=3 (pair+judge), escalate=4 -> threshold >3
+        provider_judge_prefixes = ("echo-judge-openai", "echo-judge-gemini")
+        threshold = 3 if arm == "echo-judge" or arm.startswith(provider_judge_prefixes) else 2
+        escalated = sum(1 for r in rs if r.sub_calls > threshold)
         summary[arm] = {
             "n": n,
             "pass_rate": round(passed / n, 3) if n else None,
@@ -402,6 +504,8 @@ def summarize(results: list[TaskResult]) -> dict:
             "mean_wall_seconds": round(sum(r.wall_seconds for r in rs) / n, 2) if n else None,
             "total_sub_calls": sum(r.sub_calls for r in rs),
             "mean_sub_calls": round(sum(r.sub_calls for r in rs) / n, 2) if n else None,
+            "failures": n - passed,
+            "top_failure_details": dict(failure_details.most_common(5)),
         }
     return summary
 
diff --git a/experiment/scripts/run_bbh_pilot.py b/experiment/scripts/run_bbh_pilot.py
index 38ce526..ec14652 100644
--- a/experiment/scripts/run_bbh_pilot.py
+++ b/experiment/scripts/run_bbh_pilot.py
@@ -25,7 +25,6 @@ def run_one_bbh(task: dict, arm_name: str, arm_fn) -> TaskResult:
     t0 = time.perf_counter()
     try:
         output, sub_calls = arm_fn(task)
-        passed, detail = score_bbh(output, task)
     except Exception as e:
         return TaskResult(
             task["task_id"],
@@ -35,6 +34,17 @@ def run_one_bbh(task: dict, arm_name: str, arm_fn) -> TaskResult:
             time.perf_counter() - t0,
             0,
         )
+    try:
+        passed, detail = score_bbh(output, task)
+    except Exception as e:
+        return TaskResult(
+            task["task_id"],
+            arm_name,
+            False,
+            f"scorer {type(e).__name__}: {str(e)[:200]}",
+            time.perf_counter() - t0,
+            sub_calls,
+        )
     return TaskResult(
         task["task_id"], arm_name, passed, detail,
         time.perf_counter() - t0, sub_calls,
diff --git a/experiment/tests/test_bbh_scoring.py b/experiment/tests/test_bbh_scoring.py
index fb524af..04cad94 100644
--- a/experiment/tests/test_bbh_scoring.py
+++ b/experiment/tests/test_bbh_scoring.py
@@ -9,6 +9,7 @@
 sys.path.insert(0, str(Path(__file__).resolve().parent.parent))
 
 from benchmarks.bbh import extract_choice, format_prompt, normalize_gold, score_bbh
+from benchmarks.bbh import _row_to_task, normalize_gold_for_choices
 
 
 class TestExtractChoice(unittest.TestCase):
@@ -33,6 +34,68 @@ def test_last_line_letter(self) -> None:
             "D",
         )
 
+    def test_final_answer_sentence_beats_reasoning_option_mentions(self) -> None:
+        self.assertEqual(
+            extract_choice("I considered (A) and then (B). Therefore the answer is C."),
+            "C",
+        )
+
+    def test_correct_answer_sentence_beats_reasoning_option_mentions(self) -> None:
+        self.assertEqual(
+            extract_choice("Option (A) is tempting, but the correct answer is (C)."),
+            "C",
+        )
+
+    def test_reasoning_option_mentions_alone_are_not_parseable(self) -> None:
+        self.assertIsNone(extract_choice("I considered (A), then (B), then (C)."))
+
+    # Regression: an answer stated EARLY followed by trailing reasoning must
+    # still be recovered. Tail-only matching dropped these (cage-match #335).
+    def test_answer_stated_early_then_six_trailing_lines(self) -> None:
+        out = "The answer is C.\n\nl1\nl2\nl3\nl4\nl5\nl6"
+        self.assertEqual(extract_choice(out), "C")
+
+    def test_answer_colon_early_then_trailing_lines(self) -> None:
+        out = "Answer: C\nthanks\nbye\n.\n-\n="
+        self.assertEqual(extract_choice(out), "C")
+
+    # Regression: under re.I, [A-Z] also matches lowercase, so a broad
+    # "answer is <word>" pattern must NOT grab the first letter of prose.
+    def test_answer_is_lowercase_word_is_not_a_false_letter(self) -> None:
+        self.assertIsNone(extract_choice("After analysis, the answer is straightforward."))
+        self.assertIsNone(extract_choice("The answer is dependent on the framing."))
+        self.assertIsNone(extract_choice("So the final answer is best understood as follows."))
+
+    # Regression: the latest high-confidence declaration wins even when it
+    # uses a DIFFERENT pattern family than an earlier one (cage-match r2:
+    # Carnot). "Answer: A" early, "therefore the answer is C" late -> C.
+    def test_latest_high_confidence_wins_across_pattern_families(self) -> None:
+        self.assertEqual(
+            extract_choice("Answer: A\nLet me reconsider.\nTherefore the answer is C."),
+            "C",
+        )
+        self.assertEqual(
+            extract_choice("Answer: A\nactually, the answer is C."),
+            "C",
+        )
+
+
+class TestNormalizeGoldForChoices(unittest.TestCase):
+    # Regression: choices carrying text without an explicit "label" key must
+    # not raise IndexError (cage-match: Kelvin).
+    def test_text_choices_without_labels_fall_back_to_positional(self) -> None:
+        self.assertEqual(
+            normalize_gold_for_choices("No", {"text": ["Yes", "No"]}),
+            "B",
+        )
+
+    # Regression: a single-letter target must resolve to its own label, not be
+    # remapped by a decoy choice whose text is that same letter (cage-match:
+    # Carnot).
+    def test_single_letter_target_not_remapped_by_decoy_text(self) -> None:
+        choices = {"label": ["A", "B", "C"], "text": ["apple", "A", "cat"]}
+        self.assertEqual(normalize_gold_for_choices("A", choices), "A")
+
 
 class TestScoreBbh(unittest.TestCase):
     def _task(self, gold: str = "C") -> dict:
@@ -57,6 +120,26 @@ def test_unparseable(self) -> None:
         self.assertFalse(ok)
         self.assertEqual(detail, "unparseable")
 
+    def test_binary_answer_text_scores_against_synthetic_choices(self) -> None:
+        task = _row_to_task(
+            "causal_judgement",
+            0,
+            {"question": "Did X cause Y?", "target": "No"},
+        )
+        ok, detail = score_bbh("Reasoning...\nAnswer: No", task)
+        self.assertTrue(ok)
+        self.assertEqual(detail, "passed")
+
+    def test_binary_answer_letter_scores_against_synthetic_choices(self) -> None:
+        task = _row_to_task(
+            "causal_judgement",
+            0,
+            {"question": "Did X cause Y?", "target": "No"},
+        )
+        ok, detail = score_bbh("Answer: B", task)
+        self.assertTrue(ok)
+        self.assertEqual(detail, "passed")
+
 
 class TestFormatPrompt(unittest.TestCase):
     def test_includes_choices(self) -> None: