diff --git a/experiment/README.md b/experiment/README.md index edf4366..ddd999c 100644 --- a/experiment/README.md +++ b/experiment/README.md @@ -17,6 +17,16 @@ For sweep narratives and headline numbers, see [`results/README.md`](results/REA - [Ollama](https://ollama.com/) at `http://localhost:11434` - Model: `qwen2.5:7b-instruct-q4_K_M` (see `SMALL_JUDGE_MODEL` in `run_pilot.py`) +**Optional** — only for OpenAI judge arms: + +- `OPENAI_API_KEY` in the environment +- OpenAI judge models: `gpt-5.5`, `gpt-5.4`, `gpt-5.4-mini`, `gpt-5.4-nano` (see `OPENAI_JUDGE_MODELS` in `run_pilot.py`) + +**Optional** — only for Gemini judge arms: + +- `GOOGLE_API_KEY` in the environment +- Gemini judge models: `gemini-2.5-pro`, `gemini-2.5-flash`, `gemini-2.5-flash-lite` (see `GEMINI_JUDGE_MODELS` in `run_pilot.py`) + ```bash claude --version # must work before running sweeps ollama pull qwen2.5:7b-instruct-q4_K_M # only if using echo-small-judge @@ -29,7 +39,7 @@ cd experiment python3 -m venv .venv source .venv/bin/activate -pip install "langchain-core>=0.3,<0.4" "langchain>=0.3,<0.4" "langchain-ollama>=0.2,<0.4" +pip install "langchain-core>=0.3,<0.4" "langchain>=0.3,<0.4" "langchain-ollama>=0.2,<0.4" "langchain-openai>=0.3" "langchain-google-genai>=2.0" ``` ## Quick start (1 task) @@ -52,7 +62,7 @@ Tasks **100–163** (64 tasks), same range the team used for main results: python run_pilot.py --start 100 --n-tasks 64 ``` -Default runs **all 7 arms** → 448 model calls. Expect long runtime and Claude CLI usage. Confirm the 1-task run first. +Default runs all arms. Expect long runtime, Claude CLI usage, and provider API usage for the judge arms. Confirm the 1-task run first. Subset of arms: @@ -60,6 +70,24 @@ Subset of arms: python run_pilot.py --start 100 --n-tasks 5 --arms haiku-only,sonnet-only,echo-small-judge ``` +Compare OpenAI judges: + +```bash +python run_pilot.py --start 100 --n-tasks 10 --arms haiku-only,sonnet-only,echo-judge-openai,echo-judge-openai-gpt-5.4,echo-judge-openai-gpt-5.4-mini,echo-judge-openai-gpt-5.4-nano +``` + +Compare Gemini judges: + +```bash +python run_pilot.py --start 100 --n-tasks 10 --arms haiku-only,sonnet-only,echo-judge-gemini-pro,echo-judge-gemini-flash,echo-judge-gemini-flash-lite +``` + +Compare all provider judges: + +```bash +python run_pilot.py --start 100 --n-tasks 10 --arms echo-judge-openai,echo-judge-openai-gpt-5.4,echo-judge-openai-gpt-5.4-mini,echo-judge-openai-gpt-5.4-nano,echo-judge-gemini-pro,echo-judge-gemini-flash,echo-judge-gemini-flash-lite +``` + ## CLI reference | Flag | Default | Meaning | @@ -78,6 +106,13 @@ python run_pilot.py --start 100 --n-tasks 5 --arms haiku-only,sonnet-only,echo-s | `echo-ast` | Two Haiku personas; escalate if AST structure differs | | `echo-judge` | Two Haiku personas; Haiku judges equivalence | | `echo-small-judge` | Two Haiku personas; local Qwen 7B judge (Ollama) | +| `echo-judge-openai` | Two Haiku personas; GPT-5.5 judges equivalence via OpenAI | +| `echo-judge-openai-gpt-5.4` | Two Haiku personas; GPT-5.4 judges equivalence via OpenAI | +| `echo-judge-openai-gpt-5.4-mini` | Two Haiku personas; GPT-5.4 mini judges equivalence via OpenAI | +| `echo-judge-openai-gpt-5.4-nano` | Two Haiku personas; GPT-5.4 nano judges equivalence via OpenAI | +| `echo-judge-gemini-pro` | Two Haiku personas; Gemini 2.5 Pro judges equivalence | +| `echo-judge-gemini-flash` | Two Haiku personas; Gemini 2.5 Flash judges equivalence | +| `echo-judge-gemini-flash-lite` | Two Haiku personas; Gemini 2.5 Flash-Lite judges equivalence | | `echo-oracle` | Two Haiku personas; escalate only if both fail tests (upper bound, not deployable) | ## Output format @@ -91,6 +126,21 @@ Each sweep writes `results/_n.jsonl`. One line per run: - **`passed`** — automated HumanEval tests succeeded - **`sub_calls`** — model calls for that task (Echo escalate → typically 3) +Aggregate metrics: + +| Metric | Meaning | +|--------|---------| +| `n` | Number of tasks run for that arm | +| `pass_rate` | Fraction of final selected implementations that passed HumanEval tests | +| `escalation_rate` | Fraction of tasks where the arm called Sonnet after the cheap pair/judge disagreed | +| `mean_wall_seconds` | Average elapsed seconds per task for that arm | +| `total_sub_calls` | Total counted model calls for that arm | +| `mean_sub_calls` | Average counted model calls per task | +| `failures` | Number of tasks that failed tests or hit provider/harness errors | +| `top_failure_details` | Most common failure reasons, useful for spotting provider limits, auth errors, syntax errors, and timeouts | + +For provider judge arms, `3 calls` means two Haiku candidates plus one judge call. `4 calls` means the judge disagreed and the arm escalated to Sonnet. + ## View existing results (no run) Committed JSONL files are under `results/`. Summary and interpretation: [`results/README.md`](results/README.md). @@ -108,6 +158,8 @@ jq -s 'group_by(.arm) | map({arm: .[0].arm, n: length, passed: (map(select(.pass | `claude: command not found` | Install [Claude Code](https://docs.anthropic.com/en/docs/claude-code) and ensure `claude` is on your `PATH` | | `claude --print failed` | Log in to Claude Code; confirm Max/subscription access | | `echo-small-judge requires langchain-ollama` | `pip install "langchain-ollama>=0.2,<0.4"` or omit that arm | +| `echo-judge-openai requires langchain-openai` | `pip install "langchain-openai>=0.3"` and set `OPENAI_API_KEY` | +| `echo-judge-gemini requires langchain-google-genai` | `pip install "langchain-google-genai>=2.0"` and set `GOOGLE_API_KEY` | | Ollama connection errors | Start Ollama; `ollama pull qwen2.5:7b-instruct-q4_K_M`; check `SMALL_JUDGE_BASE_URL` in `run_pilot.py` | | Very slow runs | Expected — each call spawns `claude --print` (~seconds overhead per call) | diff --git a/experiment/benchmarks/bbh.py b/experiment/benchmarks/bbh.py index 582329b..d2aec32 100644 --- a/experiment/benchmarks/bbh.py +++ b/experiment/benchmarks/bbh.py @@ -50,6 +50,29 @@ ] _CHOICE_LETTERS = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" +BINARY_CHOICE_TEXTS = { + "yes": ("Yes", "No"), + "no": ("Yes", "No"), + "true": ("True", "False"), + "false": ("True", "False"), + "valid": ("valid", "invalid"), + "invalid": ("valid", "invalid"), +} + + +def _clean_label(label: str) -> str: + return str(label).strip().strip("()").upper() + + +def _clean_answer_text(text: str) -> str: + return re.sub(r"\s+", " ", str(text).strip().lower()) + + +def _synthetic_binary_choices(target: str) -> dict[str, list[str]] | None: + texts = BINARY_CHOICE_TEXTS.get(_clean_answer_text(target)) + if texts is None: + return None + return {"label": ["A", "B"], "text": list(texts)} def _format_choices(choices: dict[str, Any]) -> str: @@ -88,41 +111,98 @@ def normalize_gold(target: str) -> str: return letter +def normalize_gold_for_choices(target: str, choices: dict[str, Any]) -> str: + """Normalize a gold target against a concrete choice list. + + Some BBH configs have letter targets ("C"); binary configs can have text + targets ("Yes"/"No"). Convert either form to the matching choice label. + """ + labels = [_clean_label(label) for label in choices.get("label") or []] + texts = [_clean_answer_text(text) for text in choices.get("text") or []] + # A choice list may carry text without explicit labels; fall back to + # positional A, B, C... so the texts.index() lookups below can't raise + # IndexError (mirrors the _CHOICE_LETTERS fallback in score_bbh). + if not labels and texts: + labels = list(_CHOICE_LETTERS[: len(texts)]) + target_text = _clean_answer_text(target) + + # Prefer label parsing first for single-letter targets ("C"), so a decoy + # choice whose *text* happens to be a bare letter can't remap the gold. + # Binary/textual targets ("Yes"/"No") extract_choice()-to-None and fall + # through to text matching below. + letter = extract_choice(str(target)) + if letter is not None and letter in labels: + return letter + + if target_text in texts: + return labels[texts.index(target_text)] + + raise ValueError(f"Could not parse gold target: {target!r}") + + def extract_choice(text: str) -> str | None: """Parse a multiple-choice letter from model output. - Tries explicit patterns first, then the last standalone A–Z near the end. + High-confidence patterns (explicit "Answer: X" / "the answer is X") are + matched against the FULL text, so an answer stated early and followed by + trailing reasoning is still recovered. Weak positional fallbacks (a lone + "(A)" line, a trailing single letter) are matched only against the last + few lines, where a stray capital is least likely to be prose. + + Each capture is followed by a ``(?![A-Za-z])`` guard: under ``re.I`` the + class ``[A-Z]`` also matches lowercase, so without the guard a phrase like + "the answer is straightforward" would wrongly yield "S". + + Among the high-confidence patterns the *latest* match in the text wins + (recency across ALL pattern families, by source offset) — so a chain of + thought like "Answer: A ... therefore the answer is C" resolves to C even + though the two declarations use different phrasings / different families. + Returns uppercase A–Z or None if unparseable. """ if not text or not text.strip(): return None - patterns = [ - r"(?im)^\s*answer\s*:\s*\(?\s*([A-Z])\s*\)?\s*\.?\s*$", - r"(?im)^\s*answer\s*:\s*\(?\s*([A-Z])\s*\)?", + body = text.strip() + # High-confidence: explicit answer declarations, anywhere in the output. + high_confidence = [ + r"(?im)^\s*answer\s*:\s*\(?\s*([A-Z])(?![A-Za-z])\s*\)?\s*\.?\s*$", + r"(?im)^\s*answer\s*:\s*\(?\s*([A-Z])(?![A-Za-z])\s*\)?", + r"(?i)\b(?:final\s+answer|answer|correct\s+answer|correct\s+choice)\s*(?:is|:)\s*\(?\s*([A-Z])(?![A-Za-z])\s*\)?", + r"(?i)\b(?:option|choice)\s+\(?\s*([A-Z])(?![A-Za-z])\s*\)?\s+(?:is\s+)?(?:correct|best|right)", + r"(?i)\b(?:therefore|so|thus),?\s+\(?\s*([A-Z])(?![A-Za-z])\s*\)?\s+(?:is\s+)?(?:correct|best|right)", + r"(?i)\b(?:therefore|so|thus),?\s+(?:the\s+)?(?:answer|correct\s+answer|choice)\s+is\s+\(?\s*([A-Z])(?![A-Za-z])\s*\)?", + ] + # Pick the high-confidence match with the largest source offset, so the + # final declaration wins regardless of which pattern family caught it. + best_pos, best_letter = -1, None + for pat in high_confidence: + for m in re.finditer(pat, body): + if m.start(1) > best_pos: + best_pos, best_letter = m.start(1), m.group(1) + if best_letter is not None: + return best_letter.upper() + + # Weak positional fallbacks: only trusted near the end of the output. + tail = "\n".join(body.splitlines()[-5:]) + weak = [ r"(?im)^\s*\(?\s*([A-Z])\s*\)\s*$", - r"(?im)correct\s+(?:answer\s+is|choice\s+is)\s*\(?\s*([A-Z])\s*\)?", - r"\(\s*([A-Z])\s*\)", + r"(?i)\b(?:option|choice|answer)\s+is\s+\(?\s*([A-Z])(?![A-Za-z])\s*\)?", + r"(?i)\b([A-Z])(?![A-Za-z])\s*\.?\s*$", ] - for pat in patterns: - matches = re.findall(pat, text) + for pat in weak: + matches = re.findall(pat, tail) if matches: return matches[-1].upper() - - tail = "\n".join(text.strip().splitlines()[-5:]) - for pat in ( - r"(?i)\b(?:option|choice|answer)\s+is\s+\(?\s*([A-Z])\s*\)?", - r"(?i)\b([A-Z])\s*\.?\s*$", - ): - m = re.search(pat, tail.strip()) - if m: - return m.group(1).upper() return None def score_bbh(model_output: str, task: dict) -> tuple[bool, str]: """Grade a BBH response against task['gold'].""" pred = extract_choice(model_output) + valid_labels = set(task.get("choice_labels") or _CHOICE_LETTERS) + if pred not in valid_labels: + pred = extract_choice_text(model_output, task) if pred is None: return False, "unparseable" gold = task["gold"] @@ -131,17 +211,49 @@ def score_bbh(model_output: str, task: dict) -> tuple[bool, str]: return False, f"expected {gold} got {pred}" +def extract_choice_text(text: str, task: dict) -> str | None: + """Map answer text like 'Answer: No' to its choice label for binary tasks.""" + choices = task.get("choices") or {} + labels = [_clean_label(label) for label in choices.get("label") or []] + texts = [_clean_answer_text(choice_text) for choice_text in choices.get("text") or []] + if not labels or not texts: + return None + + # Matched against the FULL text (not just the tail): an answer stated + # early followed by trailing reasoning must still resolve. Over-matching + # is harmless here because a capture is only accepted if it is in `texts`. + body = str(text).strip() + patterns = [ + r"(?im)^\s*answer\s*:\s*(.+?)\s*\.?\s*$", + r"(?i)\b(?:final\s+answer|answer|correct\s+answer)\s*(?:is|:)\s*(.+?)(?:\.|\n|$)", + ] + for pat in patterns: + matches = re.findall(pat, body) + for match in reversed(matches): + answer = _clean_answer_text(str(match).strip().strip("()")) + if answer in texts: + return labels[texts.index(answer)] + return None + + def _row_to_task(subtask: str, index: int, row: dict) -> dict: - gold = normalize_gold(row["target"]) + choices = row.get("choices") or _synthetic_binary_choices(row["target"]) + if choices is None: + raise ValueError( + f"BBH subtask {subtask!r} is not multiple-choice or binary; " + "this harness only supports choice-label scoring." + ) + gold = normalize_gold_for_choices(row["target"], choices) return { "task_id": f"bbh/{subtask}/{index}", - "prompt": format_prompt(row["question"], row["choices"]), + "prompt": format_prompt(row["question"], choices), "gold": gold, + "choice_labels": [_clean_label(label) for label in choices.get("label") or []], "benchmark": "bbh", "subtask": subtask, # Echo arms use task["prompt"]; keep raw fields for debugging. "question": row["question"], - "choices": row["choices"], + "choices": choices, } diff --git a/experiment/pyproject.toml b/experiment/pyproject.toml index cb1bc0b..86052b0 100644 --- a/experiment/pyproject.toml +++ b/experiment/pyproject.toml @@ -8,4 +8,6 @@ dependencies = [ "langchain>=0.3,<0.4", "langchain-ollama>=0.2,<0.4", "datasets>=2.14", + "langchain-openai>=0.3", + "langchain-google-genai>=2.0", ] diff --git a/experiment/run_pilot.py b/experiment/run_pilot.py index d811104..1c105af 100644 --- a/experiment/run_pilot.py +++ b/experiment/run_pilot.py @@ -20,8 +20,10 @@ import ast import json import re +import sys import textwrap import time +from collections import Counter from dataclasses import asdict, dataclass from datetime import datetime, timezone from pathlib import Path @@ -132,7 +134,12 @@ def run_tests(implementation: str, task: dict) -> tuple[bool, str]: has_top_level_def = bool(re.search(r"^def\s", body, re.MULTILINE)) if has_top_level_def: - program = body + "\n" + task["test"] + f"\ncheck({task['entry_point']})\n" + prompt_imports = "\n".join( + ln for ln in task["prompt"].splitlines() + if ln.startswith("from ") or ln.startswith("import ") + ) + preamble = prompt_imports + "\n" if prompt_imports else "" + program = preamble + body + "\n" + task["test"] + f"\ncheck({task['entry_point']})\n" else: # Body is a function-body fragment. Two sub-cases via ast.parse: # - bare expression (e.g. `[x+1 for x in l]`): wrap as `return ` @@ -147,7 +154,7 @@ def run_tests(implementation: str, task: dict) -> tuple[bool, str]: try: result = _sp.run( - ["python3", "-c", full], + [sys.executable, "-c", full], capture_output=True, text=True, timeout=TEST_TIMEOUT_SECONDS, @@ -297,6 +304,17 @@ def arm_echo_judge(task: dict) -> tuple[str, int]: # qwen2.5:7b-instruct — middle ground we're now testing SMALL_JUDGE_MODEL = "qwen2.5:7b-instruct-q4_K_M" SMALL_JUDGE_BASE_URL = "http://localhost:11434" +OPENAI_JUDGE_MODELS = { + "gpt-5.5": "gpt-5.5", + "gpt-5.4": "gpt-5.4", + "gpt-5.4-mini": "gpt-5.4-mini", + "gpt-5.4-nano": "gpt-5.4-nano", +} +GEMINI_JUDGE_MODELS = { + "gemini-2.5-pro": "gemini-2.5-pro", + "gemini-2.5-flash": "gemini-2.5-flash", + "gemini-2.5-flash-lite": "gemini-2.5-flash-lite", +} def arm_echo_small_judge(task: dict) -> tuple[str, int]: @@ -324,6 +342,74 @@ def arm_echo_small_judge(task: dict) -> tuple[str, int]: return call_with_persona(sonnet, PERSONA_A, task["prompt"]), 3 +def arm_echo_judge_openai_model(task: dict, model_name: str) -> tuple[str, int]: + """Echo with a cross-family OpenAI judge model. + + Same structure as echo-judge but the agreement call goes to OpenAI + instead of Haiku, removing same-family bias from the signal. + Requires OPENAI_API_KEY in the environment. + """ + try: + from langchain_openai import ChatOpenAI + except ImportError as exc: + raise RuntimeError( + "echo-judge-openai requires langchain-openai. " + "Run: pip install langchain-openai" + ) from exc + pair = _haiku_pair(task["prompt"]) + openai_judge = ChatOpenAI(model=model_name, temperature=0) + if judge_agree(pair["a"], pair["b"], task, judge=openai_judge): + return pair["a"], 3 + sonnet = ChatClaudeCode(model="sonnet") + return call_with_persona(sonnet, PERSONA_A, task["prompt"]), 4 + + +def arm_echo_judge_openai(task: dict) -> tuple[str, int]: + """Default OpenAI judge arm, currently GPT-5.5.""" + return arm_echo_judge_openai_model(task, OPENAI_JUDGE_MODELS["gpt-5.5"]) + + +def arm_echo_judge_openai_gpt_5_4(task: dict) -> tuple[str, int]: + return arm_echo_judge_openai_model(task, OPENAI_JUDGE_MODELS["gpt-5.4"]) + + +def arm_echo_judge_openai_gpt_5_4_mini(task: dict) -> tuple[str, int]: + return arm_echo_judge_openai_model(task, OPENAI_JUDGE_MODELS["gpt-5.4-mini"]) + + +def arm_echo_judge_openai_gpt_5_4_nano(task: dict) -> tuple[str, int]: + return arm_echo_judge_openai_model(task, OPENAI_JUDGE_MODELS["gpt-5.4-nano"]) + + +def arm_echo_judge_gemini_model(task: dict, model_name: str) -> tuple[str, int]: + """Echo with a Gemini judge model via Google AI Studio/Gemini API.""" + try: + from langchain_google_genai import ChatGoogleGenerativeAI + except ImportError as exc: + raise RuntimeError( + "echo-judge-gemini requires langchain-google-genai. " + "Run: pip install langchain-google-genai" + ) from exc + pair = _haiku_pair(task["prompt"]) + gemini_judge = ChatGoogleGenerativeAI(model=model_name, temperature=0) + if judge_agree(pair["a"], pair["b"], task, judge=gemini_judge): + return pair["a"], 3 + sonnet = ChatClaudeCode(model="sonnet") + return call_with_persona(sonnet, PERSONA_A, task["prompt"]), 4 + + +def arm_echo_judge_gemini_pro(task: dict) -> tuple[str, int]: + return arm_echo_judge_gemini_model(task, GEMINI_JUDGE_MODELS["gemini-2.5-pro"]) + + +def arm_echo_judge_gemini_flash(task: dict) -> tuple[str, int]: + return arm_echo_judge_gemini_model(task, GEMINI_JUDGE_MODELS["gemini-2.5-flash"]) + + +def arm_echo_judge_gemini_flash_lite(task: dict) -> tuple[str, int]: + return arm_echo_judge_gemini_model(task, GEMINI_JUDGE_MODELS["gemini-2.5-flash-lite"]) + + def arm_echo_oracle(task: dict) -> tuple[str, int]: """Oracle agreement signal: ground-truth test pass/fail. @@ -352,6 +438,13 @@ def arm_echo_oracle(task: dict) -> tuple[str, int]: "echo-ast": arm_echo_ast, "echo-judge": arm_echo_judge, "echo-small-judge": arm_echo_small_judge, + "echo-judge-openai": arm_echo_judge_openai, + "echo-judge-openai-gpt-5.4": arm_echo_judge_openai_gpt_5_4, + "echo-judge-openai-gpt-5.4-mini": arm_echo_judge_openai_gpt_5_4_mini, + "echo-judge-openai-gpt-5.4-nano": arm_echo_judge_openai_gpt_5_4_nano, + "echo-judge-gemini-pro": arm_echo_judge_gemini_pro, + "echo-judge-gemini-flash": arm_echo_judge_gemini_flash, + "echo-judge-gemini-flash-lite": arm_echo_judge_gemini_flash_lite, "echo-oracle": arm_echo_oracle, } @@ -374,12 +467,18 @@ def run_one(task: dict, arm_name: str, arm_fn: Callable[[dict], tuple[str, int]] t0 = time.perf_counter() try: output, sub_calls = arm_fn(task) - passed, detail = run_tests(output, task) except Exception as e: return TaskResult( task["task_id"], arm_name, False, f"{type(e).__name__}: {str(e)[:200]}", time.perf_counter() - t0, 0, ) + try: + passed, detail = run_tests(output, task) + except Exception as e: + return TaskResult( + task["task_id"], arm_name, False, f"test runner {type(e).__name__}: {str(e)[:200]}", + time.perf_counter() - t0, sub_calls, + ) return TaskResult(task["task_id"], arm_name, passed, detail, time.perf_counter() - t0, sub_calls) @@ -391,10 +490,13 @@ def summarize(results: list[TaskResult]) -> dict: for arm, rs in by_arm.items(): n = len(rs) passed = sum(1 for r in rs if r.passed) - # "Escalation rate" for Echo arms: fraction of tasks that used >2 calls - # (oracle and lexical both escalate when sub_calls jumps from 2 to 3). - # For non-Echo arms this is just an extra metric that happens to be 0%. - escalated = sum(1 for r in rs if r.sub_calls > 2) + failure_details = Counter(r.detail for r in rs if not r.passed) + # Escalation = Sonnet was called. Threshold depends on arm: + # lexical/ast/oracle/small-judge: accept=2, escalate=3 -> threshold >2 + # judge/provider-judge: accept=3 (pair+judge), escalate=4 -> threshold >3 + provider_judge_prefixes = ("echo-judge-openai", "echo-judge-gemini") + threshold = 3 if arm == "echo-judge" or arm.startswith(provider_judge_prefixes) else 2 + escalated = sum(1 for r in rs if r.sub_calls > threshold) summary[arm] = { "n": n, "pass_rate": round(passed / n, 3) if n else None, @@ -402,6 +504,8 @@ def summarize(results: list[TaskResult]) -> dict: "mean_wall_seconds": round(sum(r.wall_seconds for r in rs) / n, 2) if n else None, "total_sub_calls": sum(r.sub_calls for r in rs), "mean_sub_calls": round(sum(r.sub_calls for r in rs) / n, 2) if n else None, + "failures": n - passed, + "top_failure_details": dict(failure_details.most_common(5)), } return summary diff --git a/experiment/scripts/run_bbh_pilot.py b/experiment/scripts/run_bbh_pilot.py index 38ce526..ec14652 100644 --- a/experiment/scripts/run_bbh_pilot.py +++ b/experiment/scripts/run_bbh_pilot.py @@ -25,7 +25,6 @@ def run_one_bbh(task: dict, arm_name: str, arm_fn) -> TaskResult: t0 = time.perf_counter() try: output, sub_calls = arm_fn(task) - passed, detail = score_bbh(output, task) except Exception as e: return TaskResult( task["task_id"], @@ -35,6 +34,17 @@ def run_one_bbh(task: dict, arm_name: str, arm_fn) -> TaskResult: time.perf_counter() - t0, 0, ) + try: + passed, detail = score_bbh(output, task) + except Exception as e: + return TaskResult( + task["task_id"], + arm_name, + False, + f"scorer {type(e).__name__}: {str(e)[:200]}", + time.perf_counter() - t0, + sub_calls, + ) return TaskResult( task["task_id"], arm_name, passed, detail, time.perf_counter() - t0, sub_calls, diff --git a/experiment/tests/test_bbh_scoring.py b/experiment/tests/test_bbh_scoring.py index fb524af..04cad94 100644 --- a/experiment/tests/test_bbh_scoring.py +++ b/experiment/tests/test_bbh_scoring.py @@ -9,6 +9,7 @@ sys.path.insert(0, str(Path(__file__).resolve().parent.parent)) from benchmarks.bbh import extract_choice, format_prompt, normalize_gold, score_bbh +from benchmarks.bbh import _row_to_task, normalize_gold_for_choices class TestExtractChoice(unittest.TestCase): @@ -33,6 +34,68 @@ def test_last_line_letter(self) -> None: "D", ) + def test_final_answer_sentence_beats_reasoning_option_mentions(self) -> None: + self.assertEqual( + extract_choice("I considered (A) and then (B). Therefore the answer is C."), + "C", + ) + + def test_correct_answer_sentence_beats_reasoning_option_mentions(self) -> None: + self.assertEqual( + extract_choice("Option (A) is tempting, but the correct answer is (C)."), + "C", + ) + + def test_reasoning_option_mentions_alone_are_not_parseable(self) -> None: + self.assertIsNone(extract_choice("I considered (A), then (B), then (C).")) + + # Regression: an answer stated EARLY followed by trailing reasoning must + # still be recovered. Tail-only matching dropped these (cage-match #335). + def test_answer_stated_early_then_six_trailing_lines(self) -> None: + out = "The answer is C.\n\nl1\nl2\nl3\nl4\nl5\nl6" + self.assertEqual(extract_choice(out), "C") + + def test_answer_colon_early_then_trailing_lines(self) -> None: + out = "Answer: C\nthanks\nbye\n.\n-\n=" + self.assertEqual(extract_choice(out), "C") + + # Regression: under re.I, [A-Z] also matches lowercase, so a broad + # "answer is " pattern must NOT grab the first letter of prose. + def test_answer_is_lowercase_word_is_not_a_false_letter(self) -> None: + self.assertIsNone(extract_choice("After analysis, the answer is straightforward.")) + self.assertIsNone(extract_choice("The answer is dependent on the framing.")) + self.assertIsNone(extract_choice("So the final answer is best understood as follows.")) + + # Regression: the latest high-confidence declaration wins even when it + # uses a DIFFERENT pattern family than an earlier one (cage-match r2: + # Carnot). "Answer: A" early, "therefore the answer is C" late -> C. + def test_latest_high_confidence_wins_across_pattern_families(self) -> None: + self.assertEqual( + extract_choice("Answer: A\nLet me reconsider.\nTherefore the answer is C."), + "C", + ) + self.assertEqual( + extract_choice("Answer: A\nactually, the answer is C."), + "C", + ) + + +class TestNormalizeGoldForChoices(unittest.TestCase): + # Regression: choices carrying text without an explicit "label" key must + # not raise IndexError (cage-match: Kelvin). + def test_text_choices_without_labels_fall_back_to_positional(self) -> None: + self.assertEqual( + normalize_gold_for_choices("No", {"text": ["Yes", "No"]}), + "B", + ) + + # Regression: a single-letter target must resolve to its own label, not be + # remapped by a decoy choice whose text is that same letter (cage-match: + # Carnot). + def test_single_letter_target_not_remapped_by_decoy_text(self) -> None: + choices = {"label": ["A", "B", "C"], "text": ["apple", "A", "cat"]} + self.assertEqual(normalize_gold_for_choices("A", choices), "A") + class TestScoreBbh(unittest.TestCase): def _task(self, gold: str = "C") -> dict: @@ -57,6 +120,26 @@ def test_unparseable(self) -> None: self.assertFalse(ok) self.assertEqual(detail, "unparseable") + def test_binary_answer_text_scores_against_synthetic_choices(self) -> None: + task = _row_to_task( + "causal_judgement", + 0, + {"question": "Did X cause Y?", "target": "No"}, + ) + ok, detail = score_bbh("Reasoning...\nAnswer: No", task) + self.assertTrue(ok) + self.assertEqual(detail, "passed") + + def test_binary_answer_letter_scores_against_synthetic_choices(self) -> None: + task = _row_to_task( + "causal_judgement", + 0, + {"question": "Did X cause Y?", "target": "No"}, + ) + ok, detail = score_bbh("Answer: B", task) + self.assertTrue(ok) + self.assertEqual(detail, "passed") + class TestFormatPrompt(unittest.TestCase): def test_includes_choices(self) -> None: