enspyrco · nickmeinhold · Jun 2, 2026 · Jun 2, 2026 · Jun 2, 2026 · Jun 3, 2026
diff --git a/experiment/README.md b/experiment/README.md
@@ -17,6 +17,16 @@ For sweep narratives and headline numbers, see [`results/README.md`](results/REA
 - [Ollama](https://ollama.com/) at `http://localhost:11434`
 - Model: `qwen2.5:7b-instruct-q4_K_M` (see `SMALL_JUDGE_MODEL` in `run_pilot.py`)
 
+**Optional** — only for OpenAI judge arms:
+
+- `OPENAI_API_KEY` in the environment
+- OpenAI judge models: `gpt-5.5`, `gpt-5.4`, `gpt-5.4-mini`, `gpt-5.4-nano` (see `OPENAI_JUDGE_MODELS` in `run_pilot.py`)
+
+**Optional** — only for Gemini judge arms:
+
+- `GOOGLE_API_KEY` in the environment
+- Gemini judge models: `gemini-2.5-pro`, `gemini-2.5-flash`, `gemini-2.5-flash-lite` (see `GEMINI_JUDGE_MODELS` in `run_pilot.py`)
+
 ```bash
 claude --version          # must work before running sweeps
 ollama pull qwen2.5:7b-instruct-q4_K_M   # only if using echo-small-judge
@@ -29,7 +39,7 @@ cd experiment
 python3 -m venv .venv
 source .venv/bin/activate
 
-pip install "langchain-core>=0.3,<0.4" "langchain>=0.3,<0.4" "langchain-ollama>=0.2,<0.4"
+pip install "langchain-core>=0.3,<0.4" "langchain>=0.3,<0.4" "langchain-ollama>=0.2,<0.4" "langchain-openai>=0.3" "langchain-google-genai>=2.0"
 ```
 
 ## Quick start (1 task)
@@ -52,14 +62,32 @@ Tasks **100–163** (64 tasks), same range the team used for main results:
 python run_pilot.py --start 100 --n-tasks 64
 ```
 
-Default runs **all 7 arms** → 448 model calls. Expect long runtime and Claude CLI usage. Confirm the 1-task run first.
+Default runs all arms. Expect long runtime, Claude CLI usage, and provider API usage for the judge arms. Confirm the 1-task run first.
 
 Subset of arms:
 
 ```bash
 python run_pilot.py --start 100 --n-tasks 5 --arms haiku-only,sonnet-only,echo-small-judge
 ```
 
+Compare OpenAI judges:
+
+```bash
+python run_pilot.py --start 100 --n-tasks 10 --arms haiku-only,sonnet-only,echo-judge-openai,echo-judge-openai-gpt-5.4,echo-judge-openai-gpt-5.4-mini,echo-judge-openai-gpt-5.4-nano
+```
+
+Compare Gemini judges:
+
+```bash
+python run_pilot.py --start 100 --n-tasks 10 --arms haiku-only,sonnet-only,echo-judge-gemini-pro,echo-judge-gemini-flash,echo-judge-gemini-flash-lite
+```
+
+Compare all provider judges:
+
+```bash
+python run_pilot.py --start 100 --n-tasks 10 --arms echo-judge-openai,echo-judge-openai-gpt-5.4,echo-judge-openai-gpt-5.4-mini,echo-judge-openai-gpt-5.4-nano,echo-judge-gemini-pro,echo-judge-gemini-flash,echo-judge-gemini-flash-lite
+```
+
 ## CLI reference
 
 | Flag | Default | Meaning |
@@ -78,6 +106,13 @@ python run_pilot.py --start 100 --n-tasks 5 --arms haiku-only,sonnet-only,echo-s
 | `echo-ast` | Two Haiku personas; escalate if AST structure differs |
 | `echo-judge` | Two Haiku personas; Haiku judges equivalence |
 | `echo-small-judge` | Two Haiku personas; local Qwen 7B judge (Ollama) |
+| `echo-judge-openai` | Two Haiku personas; GPT-5.5 judges equivalence via OpenAI |
+| `echo-judge-openai-gpt-5.4` | Two Haiku personas; GPT-5.4 judges equivalence via OpenAI |
+| `echo-judge-openai-gpt-5.4-mini` | Two Haiku personas; GPT-5.4 mini judges equivalence via OpenAI |
+| `echo-judge-openai-gpt-5.4-nano` | Two Haiku personas; GPT-5.4 nano judges equivalence via OpenAI |
+| `echo-judge-gemini-pro` | Two Haiku personas; Gemini 2.5 Pro judges equivalence |
+| `echo-judge-gemini-flash` | Two Haiku personas; Gemini 2.5 Flash judges equivalence |
+| `echo-judge-gemini-flash-lite` | Two Haiku personas; Gemini 2.5 Flash-Lite judges equivalence |
 | `echo-oracle` | Two Haiku personas; escalate only if both fail tests (upper bound, not deployable) |
 
 ## Output format
@@ -91,6 +126,21 @@ Each sweep writes `results/<timestamp>_n<tasks>.jsonl`. One line per run:
 - **`passed`** — automated HumanEval tests succeeded
 - **`sub_calls`** — model calls for that task (Echo escalate → typically 3)
 
+Aggregate metrics:
+
+| Metric | Meaning |
+|--------|---------|
+| `n` | Number of tasks run for that arm |
+| `pass_rate` | Fraction of final selected implementations that passed HumanEval tests |
+| `escalation_rate` | Fraction of tasks where the arm called Sonnet after the cheap pair/judge disagreed |
+| `mean_wall_seconds` | Average elapsed seconds per task for that arm |
+| `total_sub_calls` | Total counted model calls for that arm |
+| `mean_sub_calls` | Average counted model calls per task |
+| `failures` | Number of tasks that failed tests or hit provider/harness errors |
+| `top_failure_details` | Most common failure reasons, useful for spotting provider limits, auth errors, syntax errors, and timeouts |
+
+For provider judge arms, `3 calls` means two Haiku candidates plus one judge call. `4 calls` means the judge ruled the two candidates non-equivalent, so the arm escalated to Sonnet.
+
 ## View existing results (no run)
 
 Committed JSONL files are under `results/`. Summary and interpretation: [`results/README.md`](results/README.md).
@@ -108,6 +158,8 @@ jq -s 'group_by(.arm) | map({arm: .[0].arm, n: length, passed: (map(select(.pass
 | `claude: command not found` | Install [Claude Code](https://docs.anthropic.com/en/docs/claude-code) and ensure `claude` is on your `PATH` |
 | `claude --print failed` | Log in to Claude Code; confirm Max/subscription access |
 | `echo-small-judge requires langchain-ollama` | `pip install "langchain-ollama>=0.2,<0.4"` or omit that arm |
+| `echo-judge-openai requires langchain-openai` | `pip install "langchain-openai>=0.3"` and set `OPENAI_API_KEY` |
+| `echo-judge-gemini requires langchain-google-genai` | `pip install "langchain-google-genai>=2.0"` and set `GOOGLE_API_KEY` |
 | Ollama connection errors | Start Ollama; `ollama pull qwen2.5:7b-instruct-q4_K_M`; check `SMALL_JUDGE_BASE_URL` in `run_pilot.py` |
 | Very slow runs | Expected — each call spawns `claude --print` (~seconds overhead per call) |
 

diff --git a/experiment/benchmarks/bbh.py b/experiment/benchmarks/bbh.py
@@ -50,6 +50,29 @@
 ]
 
 _CHOICE_LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
+BINARY_CHOICE_TEXTS = {
+    "yes": ("Yes", "No"),
+    "no": ("Yes", "No"),
+    "true": ("True", "False"),
+    "false": ("True", "False"),
+    "valid": ("valid", "invalid"),
+    "invalid": ("valid", "invalid"),
+}
+
+
+def _clean_label(label: str) -> str:
+    return str(label).strip().strip("()").upper()
+
+
+def _clean_answer_text(text: str) -> str:
+    return re.sub(r"\s+", " ", str(text).strip().lower())
+
+
+def _synthetic_binary_choices(target: str) -> dict[str, list[str]] | None:
+    texts = BINARY_CHOICE_TEXTS.get(_clean_answer_text(target))
+    if texts is None:
+        return None
+    return {"label": ["A", "B"], "text": list(texts)}
 
 
 def _format_choices(choices: dict[str, Any]) -> str:
@@ -88,41 +111,98 @@ def normalize_gold(target: str) -> str:
     return letter
 
 
+def normalize_gold_for_choices(target: str, choices: dict[str, Any]) -> str:
+    """Normalize a gold target against a concrete choice list.
+
+    Some BBH configs have letter targets ("C"); binary configs can have text
+    targets ("Yes"/"No"). Convert either form to the matching choice label.
+    """
+    labels = [_clean_label(label) for label in choices.get("label") or []]
+    texts = [_clean_answer_text(text) for text in choices.get("text") or []]
+    # A choice list may carry text without explicit labels; fall back to
+    # positional A, B, C... so indexing labels[texts.index(...)] below can't
+    # raise IndexError (mirrors the _CHOICE_LETTERS fallback in score_bbh).
+    if not labels and texts:
+        labels = list(_CHOICE_LETTERS[: len(texts)])
+    target_text = _clean_answer_text(target)
+
+    # Prefer label parsing first for single-letter targets ("C"), so a decoy
+    # choice whose *text* happens to be a bare letter can't remap the gold.
+    # For binary/textual targets ("Yes"/"No") extract_choice() returns None,
+    # so they fall through to text matching below.
+    letter = extract_choice(str(target))
+    if letter is not None and letter in labels:
+        return letter
+
+    if target_text in texts:
+        return labels[texts.index(target_text)]
+
+    raise ValueError(f"Could not parse gold target: {target!r}")
+
+
 def extract_choice(text: str) -> str | None:
     """Parse a multiple-choice letter from model output.
 
-    Tries explicit patterns first, then the last standalone A–Z near the end.
+    High-confidence patterns (explicit "Answer: X" / "the answer is X") are
+    matched against the FULL text, so an answer stated early and followed by
+    trailing reasoning is still recovered. Weak positional fallbacks (a lone
+    "(A)" line, a trailing single letter) are matched only against the last
+    few lines, where a stray capital is least likely to be prose.
+
+    Each capture that is not already closed by a mandatory ")" is followed by a
+    ``(?![A-Za-z])`` guard: under ``re.I`` the class ``[A-Z]`` also matches
+    lowercase, so unguarded, "the answer is straightforward" would yield "S".
+
+    Among the high-confidence patterns the *latest* match in the text wins
+    (recency across ALL pattern families, by source offset) — so a chain of
+    thought like "Answer: A ... therefore the answer is C" resolves to C even
+    though the two declarations use different phrasings / different families.
+
     Returns uppercase A–Z or None if unparseable.
     """
     if not text or not text.strip():
         return None
 
-    patterns = [
-        r"(?im)^\s*answer\s*:\s*\(?\s*([A-Z])\s*\)?\s*\.?\s*$",
-        r"(?im)^\s*answer\s*:\s*\(?\s*([A-Z])\s*\)?",
+    body = text.strip()
+    # High-confidence: explicit answer declarations, anywhere in the output.
+    high_confidence = [
+        r"(?im)^\s*answer\s*:\s*\(?\s*([A-Z])(?![A-Za-z])\s*\)?\s*\.?\s*$",
+        r"(?im)^\s*answer\s*:\s*\(?\s*([A-Z])(?![A-Za-z])\s*\)?",
+        r"(?i)\b(?:final\s+answer|answer|correct\s+answer|correct\s+choice)\s*(?:is|:)\s*\(?\s*([A-Z])(?![A-Za-z])\s*\)?",
+        r"(?i)\b(?:option|choice)\s+\(?\s*([A-Z])(?![A-Za-z])\s*\)?\s+(?:is\s+)?(?:correct|best|right)",
+        r"(?i)\b(?:therefore|so|thus),?\s+\(?\s*([A-Z])(?![A-Za-z])\s*\)?\s+(?:is\s+)?(?:correct|best|right)",
+        r"(?i)\b(?:therefore|so|thus),?\s+(?:the\s+)?(?:answer|correct\s+answer|choice)\s+is\s+\(?\s*([A-Z])(?![A-Za-z])\s*\)?",
+    ]
+    # Pick the high-confidence match with the largest source offset, so the
+    # final declaration wins regardless of which pattern family caught it.
+    best_pos, best_letter = -1, None
+    for pat in high_confidence:
+        for m in re.finditer(pat, body):
+            if m.start(1) > best_pos:
+                best_pos, best_letter = m.start(1), m.group(1)
+    if best_letter is not None:
+        return best_letter.upper()
+
+    # Weak positional fallbacks: only trusted near the end of the output.
+    tail = "\n".join(body.splitlines()[-5:])
+    weak = [
         r"(?im)^\s*\(?\s*([A-Z])\s*\)\s*$",
-        r"(?im)correct\s+(?:answer\s+is|choice\s+is)\s*\(?\s*([A-Z])\s*\)?",
-        r"\(\s*([A-Z])\s*\)",
+        r"(?i)\b(?:option|choice|answer)\s+is\s+\(?\s*([A-Z])(?![A-Za-z])\s*\)?",
+        r"(?i)\b([A-Z])(?![A-Za-z])\s*\.?\s*$",
     ]
-    for pat in patterns:
-        matches = re.findall(pat, text)
+    for pat in weak:
+        matches = re.findall(pat, tail)
         if matches:
             return matches[-1].upper()
-
-    tail = "\n".join(text.strip().splitlines()[-5:])
-    for pat in (
-        r"(?i)\b(?:option|choice|answer)\s+is\s+\(?\s*([A-Z])\s*\)?",
-        r"(?i)\b([A-Z])\s*\.?\s*$",
-    ):
-        m = re.search(pat, tail.strip())
-        if m:
-            return m.group(1).upper()
     return None
 
 
 def score_bbh(model_output: str, task: dict) -> tuple[bool, str]:
     """Grade a BBH response against task['gold']."""
     pred = extract_choice(model_output)
+    valid_labels = set(task.get("choice_labels") or _CHOICE_LETTERS)
+    if pred not in valid_labels:
+        pred = extract_choice_text(model_output, task)
     if pred is None:
         return False, "unparseable"
     gold = task["gold"]
@@ -131,17 +211,49 @@ def score_bbh(model_output: str, task: dict) -> tuple[bool, str]:
     return False, f"expected {gold} got {pred}"
 
 
+def extract_choice_text(text: str, task: dict) -> str | None:
+    """Map answer text like 'Answer: No' to its choice label for binary tasks."""
+    choices = task.get("choices") or {}
+    labels = [_clean_label(label) for label in choices.get("label") or []]
+    texts = [_clean_answer_text(choice_text) for choice_text in choices.get("text") or []]
+    if not labels or not texts:
+        return None
+
+    # Matched against the FULL text (not just the tail): an answer stated
+    # early followed by trailing reasoning must still resolve. Over-matching
+    # is harmless here because a capture is only accepted if it is in `texts`.
+    body = str(text).strip()
+    patterns = [
+        r"(?im)^\s*answer\s*:\s*(.+?)\s*\.?\s*$",
+        r"(?i)\b(?:final\s+answer|answer|correct\s+answer)\s*(?:is|:)\s*(.+?)(?:\.|\n|$)",
+    ]
+    for pat in patterns:
+        matches = re.findall(pat, body)
+        for match in reversed(matches):
+            answer = _clean_answer_text(str(match).strip().strip("()"))
+            if answer in texts:
+                return labels[texts.index(answer)]
+    return None
+
+
 def _row_to_task(subtask: str, index: int, row: dict) -> dict:
-    gold = normalize_gold(row["target"])
+    choices = row.get("choices") or _synthetic_binary_choices(row["target"])
+    if choices is None:
+        raise ValueError(
+            f"BBH subtask {subtask!r} is not multiple-choice or binary; "
+            "this harness only supports choice-label scoring."
+        )
+    gold = normalize_gold_for_choices(row["target"], choices)
     return {
         "task_id": f"bbh/{subtask}/{index}",
-        "prompt": format_prompt(row["question"], row["choices"]),
+        "prompt": format_prompt(row["question"], choices),
         "gold": gold,
+        "choice_labels": [_clean_label(label) for label in choices.get("label") or []],
         "benchmark": "bbh",
         "subtask": subtask,
         # Echo arms use task["prompt"]; keep raw fields for debugging.
         "question": row["question"],
-        "choices": row["choices"],
+        "choices": choices,
     }
 
 

diff --git a/experiment/pyproject.toml b/experiment/pyproject.toml
@@ -8,4 +8,6 @@ dependencies = [
   "langchain>=0.3,<0.4",
   "langchain-ollama>=0.2,<0.4",
   "datasets>=2.14",
+  "langchain-openai>=0.3",
+  "langchain-google-genai>=2.0",
 ]