From 9169a9305039b7b24d0e7eb7e5bf761abfe96152 Mon Sep 17 00:00:00 2001
From: Graham Neubig
Date: Thu, 28 May 2026 11:28:45 -0400
Subject: [PATCH 1/2] Add Harbor covered benchmark mapping

---
Notes (not part of the commit message):
  Adds HARBOR_DATASET_BY_BENCHMARK together with normalize_benchmark_name(),
  is_harbor_covered() and get_harbor_dataset() in
  benchmarks/utils/harbor_compat.py. Lookups strip, lowercase and drop
  "-", "_" and "/" from the given name first; names without a mapping
  raise KeyError. The skillsbench and terminalbench "dataset" defaults
  now come from the mapping and resolve to the same strings as before.

 benchmarks/skillsbench/config.py   |  5 +++-
 benchmarks/terminalbench/config.py |  5 +++-
 benchmarks/utils/harbor_compat.py  | 46 ++++++++++++++++++++++++++++++
 tests/test_harbor_compat.py        | 46 ++++++++++++++++++++++++++++++
 4 files changed, 100 insertions(+), 2 deletions(-)
 create mode 100644 benchmarks/utils/harbor_compat.py
 create mode 100644 tests/test_harbor_compat.py

diff --git a/benchmarks/skillsbench/config.py b/benchmarks/skillsbench/config.py
index 8b55a92b0..8f2750379 100644
--- a/benchmarks/skillsbench/config.py
+++ b/benchmarks/skillsbench/config.py
@@ -1,8 +1,11 @@
 """SkillsBench configuration defaults."""
 
+from benchmarks.utils.harbor_compat import get_harbor_dataset
+
+
 # Default inference settings (only include values actually used by argparse)
 INFER_DEFAULTS = {
-    "dataset": "benchflow/skillsbench",
+    "dataset": get_harbor_dataset("skillsbench"),
     "output_dir": "./evaluation_outputs",
     "num_workers": 1,
 }
diff --git a/benchmarks/terminalbench/config.py b/benchmarks/terminalbench/config.py
index 35748c0f4..e132c0627 100644
--- a/benchmarks/terminalbench/config.py
+++ b/benchmarks/terminalbench/config.py
@@ -1,8 +1,11 @@
 """Terminal-Bench configuration defaults."""
 
+from benchmarks.utils.harbor_compat import get_harbor_dataset
+
+
 # Default inference settings (only include values actually used by argparse)
 INFER_DEFAULTS = {
-    "dataset": "terminal-bench@2.0",
+    "dataset": get_harbor_dataset("terminalbench"),
     "output_dir": "./evaluation_outputs",
     "num_workers": 1,
 }
diff --git a/benchmarks/utils/harbor_compat.py b/benchmarks/utils/harbor_compat.py
new file mode 100644
index 000000000..66efd2d60
--- /dev/null
+++ b/benchmarks/utils/harbor_compat.py
@@ -0,0 +1,46 @@
+"""Harbor dataset names for benchmarks with Harbor-backed wrappers."""
+
+from __future__ import annotations
+
+
+HARBOR_DATASET_BY_BENCHMARK: dict[str, str] = {
+    "gaia": "gaia",
+    "multiswebench": "multi-swe-bench",
+    "skillsbench": "benchflow/skillsbench",
+    "swebench": "swebench-verified",
+    "swebenchmultilingual": "swebench_multilingual",
+    "swebenchpro": "swebenchpro",
+    "swesmith": "swesmith",
+    "swtbench": "swtbench-verified",
+    "swegym": "swegym",
+    "terminalbench": "terminal-bench@2.0",
+}
+
+
+def normalize_benchmark_name(benchmark_name: str) -> str:
+    """Normalize benchmark names to the package/module key used in this repo."""
+    return (
+        benchmark_name.strip()
+        .lower()
+        .replace("-", "")
+        .replace("_", "")
+        .replace("/", "")
+    )
+
+
+def is_harbor_covered(benchmark_name: str) -> bool:
+    """Return whether the benchmark has a Harbor dataset mapping."""
+    return normalize_benchmark_name(benchmark_name) in HARBOR_DATASET_BY_BENCHMARK
+
+
+def get_harbor_dataset(benchmark_name: str) -> str:
+    """Return the Harbor dataset name for a covered benchmark."""
+    key = normalize_benchmark_name(benchmark_name)
+    try:
+        return HARBOR_DATASET_BY_BENCHMARK[key]
+    except KeyError as exc:
+        supported = ", ".join(sorted(HARBOR_DATASET_BY_BENCHMARK))
+        raise KeyError(
+            f"No Harbor dataset mapping for {benchmark_name!r}. "
+            f"Supported benchmarks: {supported}"
+        ) from exc
diff --git a/tests/test_harbor_compat.py b/tests/test_harbor_compat.py
new file mode 100644
index 000000000..8765221b0
--- /dev/null
+++ b/tests/test_harbor_compat.py
@@ -0,0 +1,46 @@
+"""Tests for Harbor benchmark compatibility mappings."""
+
+import pytest
+
+from benchmarks.skillsbench.config import INFER_DEFAULTS as SKILLS_DEFAULTS
+from benchmarks.terminalbench.config import INFER_DEFAULTS as TERMINAL_DEFAULTS
+from benchmarks.utils.harbor_compat import (
+    HARBOR_DATASET_BY_BENCHMARK,
+    get_harbor_dataset,
+    is_harbor_covered,
+    normalize_benchmark_name,
+)
+
+
+def test_known_harbor_dataset_mappings() -> None:
+    """Test Harbor dataset names for covered OpenHands benchmark modules."""
+    assert get_harbor_dataset("swebench") == "swebench-verified"
+    assert get_harbor_dataset("swebenchpro") == "swebenchpro"
+    assert get_harbor_dataset("swebenchmultilingual") == "swebench_multilingual"
+    assert get_harbor_dataset("swtbench") == "swtbench-verified"
+    assert get_harbor_dataset("swesmith") == "swesmith"
+    assert get_harbor_dataset("multiswebench") == "multi-swe-bench"
+    assert get_harbor_dataset("gaia") == "gaia"
+    assert get_harbor_dataset("skillsbench") == "benchflow/skillsbench"
+    assert get_harbor_dataset("terminalbench") == "terminal-bench@2.0"
+    assert get_harbor_dataset("swegym") == "swegym"
+
+
+def test_name_normalization_accepts_cli_style_names() -> None:
+    """Test hyphen/underscore variants normalize to the same mapping key."""
+    assert normalize_benchmark_name("SWE-Bench_Pro") == "swebenchpro"
+    assert get_harbor_dataset("swe-bench-pro") == "swebenchpro"
+    assert get_harbor_dataset("terminal-bench") == "terminal-bench@2.0"
+
+
+def test_uncovered_benchmark_is_explicit() -> None:
+    """Test benchmarks without Harbor adapters are not silently mapped."""
+    assert is_harbor_covered("commit0") is False
+    with pytest.raises(KeyError, match="No Harbor dataset mapping"):
+        get_harbor_dataset("commit0")
+
+
+def test_harbor_backed_defaults_use_mapping() -> None:
+    """Test existing Harbor-backed wrappers consume the shared mapping."""
+    assert SKILLS_DEFAULTS["dataset"] == HARBOR_DATASET_BY_BENCHMARK["skillsbench"]
+    assert TERMINAL_DEFAULTS["dataset"] == HARBOR_DATASET_BY_BENCHMARK["terminalbench"]

From f1d1c4907ce5ccb12eda530ad346bbcb8e0cc493 Mon Sep 17 00:00:00 2001
From: openhands
Date: Sun, 31 May 2026 17:32:48 +0000
Subject: [PATCH 2/2] chore: address PR review feedback (#731)

Co-authored-by: openhands
---
Notes (not part of the commit message):
  Wraps HARBOR_DATASET_BY_BENCHMARK in types.MappingProxyType and
  annotates it as Mapping[str, str]; the mapping entries themselves are
  unchanged. The new test expects item assignment on the mapping to
  raise TypeError.

 benchmarks/utils/harbor_compat.py | 31 ++++++++++++++++++-------------
 tests/test_harbor_compat.py       | 10 ++++++++++
 2 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/benchmarks/utils/harbor_compat.py b/benchmarks/utils/harbor_compat.py
index 66efd2d60..8ea2ca624 100644
--- a/benchmarks/utils/harbor_compat.py
+++ b/benchmarks/utils/harbor_compat.py
@@ -2,19 +2,24 @@
 
 from __future__ import annotations
 
-
-HARBOR_DATASET_BY_BENCHMARK: dict[str, str] = {
-    "gaia": "gaia",
-    "multiswebench": "multi-swe-bench",
-    "skillsbench": "benchflow/skillsbench",
-    "swebench": "swebench-verified",
-    "swebenchmultilingual": "swebench_multilingual",
-    "swebenchpro": "swebenchpro",
-    "swesmith": "swesmith",
-    "swtbench": "swtbench-verified",
-    "swegym": "swegym",
-    "terminalbench": "terminal-bench@2.0",
-}
+from collections.abc import Mapping
+from types import MappingProxyType
+
+
+HARBOR_DATASET_BY_BENCHMARK: Mapping[str, str] = MappingProxyType(
+    {
+        "gaia": "gaia",
+        "multiswebench": "multi-swe-bench",
+        "skillsbench": "benchflow/skillsbench",
+        "swebench": "swebench-verified",
+        "swebenchmultilingual": "swebench_multilingual",
+        "swebenchpro": "swebenchpro",
+        "swesmith": "swesmith",
+        "swtbench": "swtbench-verified",
+        "swegym": "swegym",
+        "terminalbench": "terminal-bench@2.0",
+    }
+)
 
 
 def normalize_benchmark_name(benchmark_name: str) -> str:
diff --git a/tests/test_harbor_compat.py b/tests/test_harbor_compat.py
index 8765221b0..016e1fdba 100644
--- a/tests/test_harbor_compat.py
+++ b/tests/test_harbor_compat.py
@@ -1,5 +1,8 @@
 """Tests for Harbor benchmark compatibility mappings."""
 
+import operator
+from typing import Any, cast
+
 import pytest
 
 from benchmarks.skillsbench.config import INFER_DEFAULTS as SKILLS_DEFAULTS
@@ -26,6 +29,13 @@ def test_known_harbor_dataset_mappings() -> None:
     assert get_harbor_dataset("swegym") == "swegym"
 
 
+def test_harbor_dataset_mapping_is_read_only() -> None:
+    """Test callers cannot mutate the shared Harbor dataset registry."""
+    mapping_as_any = cast(Any, HARBOR_DATASET_BY_BENCHMARK)
+    with pytest.raises(TypeError):
+        operator.setitem(mapping_as_any, "commit0", "commit0")
+
+
 def test_name_normalization_accepts_cli_style_names() -> None:
     """Test hyphen/underscore variants normalize to the same mapping key."""
     assert normalize_benchmark_name("SWE-Bench_Pro") == "swebenchpro"