Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion benchmarks/skillsbench/config.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
"""SkillsBench configuration defaults."""

from benchmarks.utils.harbor_compat import get_harbor_dataset


# Default inference settings (only include values actually used by argparse).
# "dataset" is resolved through get_harbor_dataset and must appear only once:
# a repeated key in a dict literal silently keeps just the last value.
INFER_DEFAULTS = {
    "dataset": get_harbor_dataset("skillsbench"),
    "output_dir": "./evaluation_outputs",
    "num_workers": 1,
}
Expand Down
5 changes: 4 additions & 1 deletion benchmarks/terminalbench/config.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,11 @@
"""Terminal-Bench configuration defaults."""

from benchmarks.utils.harbor_compat import get_harbor_dataset


# Default inference settings (only include values actually used by argparse).
# "dataset" is resolved through get_harbor_dataset and must appear only once:
# a repeated key in a dict literal silently keeps just the last value.
INFER_DEFAULTS = {
    "dataset": get_harbor_dataset("terminalbench"),
    "output_dir": "./evaluation_outputs",
    "num_workers": 1,
}
Expand Down
51 changes: 51 additions & 0 deletions benchmarks/utils/harbor_compat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,51 @@
"""Harbor dataset names for benchmarks with Harbor-backed wrappers."""

from __future__ import annotations

from collections.abc import Mapping
from types import MappingProxyType


# (normalized benchmark key, Harbor dataset name) pairs, in declaration order.
_HARBOR_DATASET_ITEMS = (
    ("gaia", "gaia"),
    ("multiswebench", "multi-swe-bench"),
    ("skillsbench", "benchflow/skillsbench"),
    ("swebench", "swebench-verified"),
    ("swebenchmultilingual", "swebench_multilingual"),
    ("swebenchpro", "swebenchpro"),
    ("swesmith", "swesmith"),
    ("swtbench", "swtbench-verified"),
    ("swegym", "swegym"),
    ("terminalbench", "terminal-bench@2.0"),
)

# Read-only view of the registry: the backing dict is created inline and is
# not bound to any other name, so it cannot be mutated through this module.
HARBOR_DATASET_BY_BENCHMARK: Mapping[str, str] = MappingProxyType(
    dict(_HARBOR_DATASET_ITEMS)
)


# Deletion table for the separator characters "-", "_" and "/".
_SEPARATOR_TABLE = str.maketrans("", "", "-_/")


def normalize_benchmark_name(benchmark_name: str) -> str:
    """Normalize benchmark names to the package/module key used in this repo.

    Surrounding whitespace is stripped, the name is lower-cased, and every
    ``-``, ``_`` and ``/`` character is removed, so ``"SWE-Bench_Pro"``
    becomes ``"swebenchpro"``. Other characters (including interior
    whitespace, ``@`` and ``.``) are left untouched.

    Args:
        benchmark_name: Benchmark name in any accepted spelling.

    Returns:
        The normalized key string.
    """
    # A single translate pass deletes all separators at once instead of
    # chaining one ``replace`` call per character; the result is the same.
    return benchmark_name.strip().lower().translate(_SEPARATOR_TABLE)


def is_harbor_covered(benchmark_name: str) -> bool:
    """Tell whether *benchmark_name* has an entry in the Harbor dataset mapping.

    The name is normalized first, so the same spellings accepted by
    ``normalize_benchmark_name`` are accepted here.
    """
    lookup_key = normalize_benchmark_name(benchmark_name)
    return lookup_key in HARBOR_DATASET_BY_BENCHMARK


def get_harbor_dataset(benchmark_name: str) -> str:
    """Look up the Harbor dataset name registered for a benchmark.

    Args:
        benchmark_name: Benchmark name; it is normalized before the lookup.

    Returns:
        The dataset name stored in ``HARBOR_DATASET_BY_BENCHMARK``.

    Raises:
        KeyError: If the normalized name has no mapping. The message quotes
            the name as given and lists the supported keys in sorted order.
    """
    lookup_key = normalize_benchmark_name(benchmark_name)
    try:
        dataset = HARBOR_DATASET_BY_BENCHMARK[lookup_key]
    except KeyError as missing:
        known_keys = ", ".join(sorted(HARBOR_DATASET_BY_BENCHMARK))
        message = (
            f"No Harbor dataset mapping for {benchmark_name!r}. "
            f"Supported benchmarks: {known_keys}"
        )
        # Chain the original lookup failure so it stays visible as __cause__.
        raise KeyError(message) from missing
    else:
        return dataset
56 changes: 56 additions & 0 deletions tests/test_harbor_compat.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
"""Tests for Harbor benchmark compatibility mappings."""

import operator
from typing import Any, cast

import pytest

from benchmarks.skillsbench.config import INFER_DEFAULTS as SKILLS_DEFAULTS
from benchmarks.terminalbench.config import INFER_DEFAULTS as TERMINAL_DEFAULTS
from benchmarks.utils.harbor_compat import (
HARBOR_DATASET_BY_BENCHMARK,
get_harbor_dataset,
is_harbor_covered,
normalize_benchmark_name,
)


def test_known_harbor_dataset_mappings() -> None:
    """Check every covered benchmark module resolves to its Harbor dataset."""
    expected_by_benchmark = {
        "swebench": "swebench-verified",
        "swebenchpro": "swebenchpro",
        "swebenchmultilingual": "swebench_multilingual",
        "swtbench": "swtbench-verified",
        "swesmith": "swesmith",
        "multiswebench": "multi-swe-bench",
        "gaia": "gaia",
        "skillsbench": "benchflow/skillsbench",
        "terminalbench": "terminal-bench@2.0",
        "swegym": "swegym",
    }
    for benchmark, dataset in expected_by_benchmark.items():
        assert get_harbor_dataset(benchmark) == dataset


def test_harbor_dataset_mapping_is_read_only() -> None:
    """Check that item assignment on the shared registry raises TypeError."""
    # The registry is annotated as a read-only Mapping; view it as Any for
    # the deliberate write attempt below.
    registry = cast(Any, HARBOR_DATASET_BY_BENCHMARK)
    with pytest.raises(TypeError):
        operator.setitem(registry, "commit0", "commit0")


def test_name_normalization_accepts_cli_style_names() -> None:
    """Check mixed-case, hyphenated and underscored names reach the same key."""
    normalized = normalize_benchmark_name("SWE-Bench_Pro")
    assert normalized == "swebenchpro"

    # CLI-style spelling -> dataset name expected from the lookup.
    cli_lookups = (
        ("swe-bench-pro", "swebenchpro"),
        ("terminal-bench", "terminal-bench@2.0"),
    )
    for cli_name, dataset in cli_lookups:
        assert get_harbor_dataset(cli_name) == dataset


def test_uncovered_benchmark_is_explicit() -> None:
    """Check an unmapped benchmark is reported as uncovered and raises on lookup."""
    unmapped = "commit0"
    covered = is_harbor_covered(unmapped)
    assert covered is False
    with pytest.raises(KeyError, match="No Harbor dataset mapping"):
        get_harbor_dataset(unmapped)


def test_harbor_backed_defaults_use_mapping() -> None:
    """Check both wrapper configs take their default dataset from the registry."""
    defaults_by_benchmark = {
        "skillsbench": SKILLS_DEFAULTS,
        "terminalbench": TERMINAL_DEFAULTS,
    }
    for benchmark, defaults in defaults_by_benchmark.items():
        assert defaults["dataset"] == HARBOR_DATASET_BY_BENCHMARK[benchmark]
Loading