Share Harbor dataset names across benchmark wrappers.

Adds benchmarks/utils/harbor_compat.py, which holds a read-only
(MappingProxyType) table from benchmark module key to Harbor dataset name,
together with three helpers:
  * normalize_benchmark_name: strips surrounding whitespace, lowercases, and
    removes "-", "_" and "/" so CLI-style spellings resolve to one key.
  * is_harbor_covered: reports whether the normalized name is in the table.
  * get_harbor_dataset: returns the mapped name, or raises KeyError listing
    the supported keys.

The skillsbench and terminalbench INFER_DEFAULTS now read "dataset" from the
table when the config module is imported; the resulting values are the same
strings as before ("benchflow/skillsbench" and "terminal-bench@2.0").

Review note: only benchmark keys are normalized. A Harbor dataset name such
as "terminal-bench@2.0" normalizes to "terminalbench@2.0", which is not a
key, so it is reported as uncovered.

diff --git a/benchmarks/skillsbench/config.py b/benchmarks/skillsbench/config.py
index 8b55a92b..8f275037 100644
--- a/benchmarks/skillsbench/config.py
+++ b/benchmarks/skillsbench/config.py
@@ -1,8 +1,11 @@
 """SkillsBench configuration defaults."""
 
+from benchmarks.utils.harbor_compat import get_harbor_dataset
+
+
 # Default inference settings (only include values actually used by argparse)
 INFER_DEFAULTS = {
-    "dataset": "benchflow/skillsbench",
+    "dataset": get_harbor_dataset("skillsbench"),
     "output_dir": "./evaluation_outputs",
     "num_workers": 1,
 }
diff --git a/benchmarks/terminalbench/config.py b/benchmarks/terminalbench/config.py
index 35748c0f..e132c062 100644
--- a/benchmarks/terminalbench/config.py
+++ b/benchmarks/terminalbench/config.py
@@ -1,8 +1,11 @@
 """Terminal-Bench configuration defaults."""
 
+from benchmarks.utils.harbor_compat import get_harbor_dataset
+
+
 # Default inference settings (only include values actually used by argparse)
 INFER_DEFAULTS = {
-    "dataset": "terminal-bench@2.0",
+    "dataset": get_harbor_dataset("terminalbench"),
     "output_dir": "./evaluation_outputs",
     "num_workers": 1,
 }
diff --git a/benchmarks/utils/harbor_compat.py b/benchmarks/utils/harbor_compat.py
new file mode 100644
index 00000000..8ea2ca62
--- /dev/null
+++ b/benchmarks/utils/harbor_compat.py
@@ -0,0 +1,51 @@
+"""Harbor dataset names for benchmarks with Harbor-backed wrappers."""
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+from types import MappingProxyType
+
+
+HARBOR_DATASET_BY_BENCHMARK: Mapping[str, str] = MappingProxyType(
+    {
+        "gaia": "gaia",
+        "multiswebench": "multi-swe-bench",
+        "skillsbench": "benchflow/skillsbench",
+        "swebench": "swebench-verified",
+        "swebenchmultilingual": "swebench_multilingual",
+        "swebenchpro": "swebenchpro",
+        "swesmith": "swesmith",
+        "swtbench": "swtbench-verified",
+        "swegym": "swegym",
+        "terminalbench": "terminal-bench@2.0",
+    }
+)
+
+
+def normalize_benchmark_name(benchmark_name: str) -> str:
+    """Normalize benchmark names to the package/module key used in this repo."""
+    return (
+        benchmark_name.strip()
+        .lower()
+        .replace("-", "")
+        .replace("_", "")
+        .replace("/", "")
+    )
+
+
+def is_harbor_covered(benchmark_name: str) -> bool:
+    """Return whether the benchmark has a Harbor dataset mapping."""
+    return normalize_benchmark_name(benchmark_name) in HARBOR_DATASET_BY_BENCHMARK
+
+
+def get_harbor_dataset(benchmark_name: str) -> str:
+    """Return the Harbor dataset name for a covered benchmark."""
+    key = normalize_benchmark_name(benchmark_name)
+    try:
+        return HARBOR_DATASET_BY_BENCHMARK[key]
+    except KeyError as exc:
+        supported = ", ".join(sorted(HARBOR_DATASET_BY_BENCHMARK))
+        raise KeyError(
+            f"No Harbor dataset mapping for {benchmark_name!r}. "
+            f"Supported benchmarks: {supported}"
+        ) from exc
diff --git a/tests/test_harbor_compat.py b/tests/test_harbor_compat.py
new file mode 100644
index 00000000..016e1fdb
--- /dev/null
+++ b/tests/test_harbor_compat.py
@@ -0,0 +1,56 @@
+"""Tests for Harbor benchmark compatibility mappings."""
+
+import operator
+from typing import Any, cast
+
+import pytest
+
+from benchmarks.skillsbench.config import INFER_DEFAULTS as SKILLS_DEFAULTS
+from benchmarks.terminalbench.config import INFER_DEFAULTS as TERMINAL_DEFAULTS
+from benchmarks.utils.harbor_compat import (
+    HARBOR_DATASET_BY_BENCHMARK,
+    get_harbor_dataset,
+    is_harbor_covered,
+    normalize_benchmark_name,
+)
+
+
+def test_known_harbor_dataset_mappings() -> None:
+    """Test Harbor dataset names for covered OpenHands benchmark modules."""
+    assert get_harbor_dataset("swebench") == "swebench-verified"
+    assert get_harbor_dataset("swebenchpro") == "swebenchpro"
+    assert get_harbor_dataset("swebenchmultilingual") == "swebench_multilingual"
+    assert get_harbor_dataset("swtbench") == "swtbench-verified"
+    assert get_harbor_dataset("swesmith") == "swesmith"
+    assert get_harbor_dataset("multiswebench") == "multi-swe-bench"
+    assert get_harbor_dataset("gaia") == "gaia"
+    assert get_harbor_dataset("skillsbench") == "benchflow/skillsbench"
+    assert get_harbor_dataset("terminalbench") == "terminal-bench@2.0"
+    assert get_harbor_dataset("swegym") == "swegym"
+
+
+def test_harbor_dataset_mapping_is_read_only() -> None:
+    """Test callers cannot mutate the shared Harbor dataset registry."""
+    mapping_as_any = cast(Any, HARBOR_DATASET_BY_BENCHMARK)
+    with pytest.raises(TypeError):
+        operator.setitem(mapping_as_any, "commit0", "commit0")
+
+
+def test_name_normalization_accepts_cli_style_names() -> None:
+    """Test hyphen/underscore variants normalize to the same mapping key."""
+    assert normalize_benchmark_name("SWE-Bench_Pro") == "swebenchpro"
+    assert get_harbor_dataset("swe-bench-pro") == "swebenchpro"
+    assert get_harbor_dataset("terminal-bench") == "terminal-bench@2.0"
+
+
+def test_uncovered_benchmark_is_explicit() -> None:
+    """Test benchmarks without Harbor adapters are not silently mapped."""
+    assert is_harbor_covered("commit0") is False
+    with pytest.raises(KeyError, match="No Harbor dataset mapping"):
+        get_harbor_dataset("commit0")
+
+
+def test_harbor_backed_defaults_use_mapping() -> None:
+    """Test existing Harbor-backed wrappers consume the shared mapping."""
+    assert SKILLS_DEFAULTS["dataset"] == HARBOR_DATASET_BY_BENCHMARK["skillsbench"]
+    assert TERMINAL_DEFAULTS["dataset"] == HARBOR_DATASET_BY_BENCHMARK["terminalbench"]