diff --git a/benchmarks/skillsbench/run_infer.py b/benchmarks/skillsbench/run_infer.py index 535a27d62..abb71d988 100644 --- a/benchmarks/skillsbench/run_infer.py +++ b/benchmarks/skillsbench/run_infer.py @@ -11,7 +11,6 @@ import argparse import json import os -import re import shutil import subprocess import sys @@ -19,10 +18,16 @@ from datetime import datetime, timezone from pathlib import Path -from pydantic import SecretStr - from benchmarks.skillsbench.config import HARBOR_DEFAULTS, INFER_DEFAULTS from benchmarks.utils.evaluation_utils import construct_eval_output_dir +from benchmarks.utils.harbor import ( + HarborCredentialMode, + check_harbor_installed as _check_harbor_installed, + convert_harbor_to_eval_output as _convert_harbor_to_eval_output, + get_supported_agent_name as _get_harbor_supported_agent_name, + get_supported_task_filter_flag as _get_harbor_supported_task_filter_flag, + run_harbor_evaluation as _run_harbor_evaluation, +) from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import LLM, get_logger @@ -62,17 +67,7 @@ def check_harbor_installed() -> bool: """Check if harbor CLI is installed and available.""" - harbor_exe = HARBOR_DEFAULTS["harbor_executable"] - try: - result = subprocess.run( - [harbor_exe, "--help"], - capture_output=True, - text=True, - timeout=10, - ) - return result.returncode == 0 - except (FileNotFoundError, subprocess.TimeoutExpired): - return False + return _check_harbor_installed(HARBOR_DEFAULTS["harbor_executable"]) def _run_command(cmd: list[str], error_message: str) -> str: @@ -90,42 +85,15 @@ def _run_command(cmd: list[str], error_message: str) -> str: def _get_supported_task_filter_flag(harbor_exe: str) -> str: """Detect whether Harbor expects --task-name or --include-task-name.""" - try: - result = subprocess.run( - [harbor_exe, "run", "--help"], - capture_output=True, - text=True, - ) - except FileNotFoundError: - return "--include-task-name" - - help_text = f"{result.stdout}\n{result.stderr}" - supported_flags = set(re.findall(r"(? str: """Detect whether Harbor exposes the OpenHands agent as openhands or openhands-sdk.""" - try: - result = subprocess.run( - [harbor_exe, "run", "--help"], - capture_output=True, - text=True, - ) - except FileNotFoundError: - return HARBOR_DEFAULTS["agent_name"] - - help_text = f"{result.stdout}\n{result.stderr}" - compact_help_text = re.sub(r"[^a-z0-9-]+", "", help_text.lower()) - if "openhands-sdk" in compact_help_text: - return "openhands-sdk" - if "openhands" in compact_help_text: - return "openhands" - return HARBOR_DEFAULTS["agent_name"] + return _get_harbor_supported_agent_name( + harbor_exe, + default_agent_name=HARBOR_DEFAULTS["agent_name"], + ) def get_skillsbench_main_commit( @@ -405,122 +373,29 @@ def run_harbor_evaluation( Returns: Path to the harbor output directory. """ - harbor_output_dir = Path(output_dir) / "harbor_output" - harbor_output_dir.mkdir(parents=True, exist_ok=True) harbor_exe = HARBOR_DEFAULTS["harbor_executable"] agent_name = _get_supported_agent_name(harbor_exe) task_filter_flag = _get_supported_task_filter_flag(harbor_exe) - # Build harbor command using harbor CLI flags. - # Use absolute path for --jobs-dir to avoid CWD-relative path issues. - cmd = [ - harbor_exe, - "run", - "--path" if dataset_is_path else "-d", - dataset, - "-a", - agent_name, - "-m", - llm.model, - "--jobs-dir", - str(harbor_output_dir.resolve()), - "--n-concurrent", - str(num_workers), - ] - - # Add specific task names if provided - if task_ids: - for task_id in task_ids: - cmd.extend( - [ - task_filter_flag, - _normalize_task_filter_value( - task_id, dataset_is_path=dataset_is_path - ), - ] - ) - - if n_limit is not None: - cmd.extend(["--n-tasks", str(n_limit)]) - - logger.info(f"Running harbor command: {' '.join(cmd)}") - logger.info(f"Output directory: {harbor_output_dir}") - - # harbor's openhands-sdk agent reads LLM credentials from the host process - # environment (os.environ), not from --ae flags which go to the sandbox. - env = os.environ.copy() - if llm.api_key: - api_key = ( - llm.api_key.get_secret_value() - if isinstance(llm.api_key, SecretStr) - else llm.api_key - ) - env["LLM_API_KEY"] = api_key - if llm.base_url: - env["LLM_BASE_URL"] = llm.base_url - - try: - result = subprocess.run( - cmd, - capture_output=True, - text=True, - env=env, - ) - - if result.returncode != 0: - if ( - task_ids - and task_filter_flag == "--task-name" - and "No such option: --task-name" in result.stderr - ): - fallback_cmd = [ - "--include-task-name" if part == "--task-name" else part - for part in cmd - ] - logger.warning( - "Harbor does not support --task-name; retrying with " - "--include-task-name" - ) - result = subprocess.run( - fallback_cmd, - capture_output=True, - text=True, - env=env, - ) - - if result.returncode != 0: - logger.error(f"Harbor command failed with code {result.returncode}") - logger.error(f"stdout: {result.stdout}") - logger.error(f"stderr: {result.stderr}") - raise RuntimeError(f"Harbor evaluation failed: {result.stderr}") - - logger.info("Harbor evaluation completed successfully") - logger.info(f"stdout: {result.stdout}") - - except FileNotFoundError: - raise RuntimeError( - "Harbor CLI not found. Please install harbor: pip install harbor" - ) - - return harbor_output_dir - - -def _find_job_dir(harbor_output_dir: Path) -> Path: - """Find the harbor job directory (timestamp-named) inside the output dir.""" - # Harbor creates a timestamp-named subdirectory (e.g., 2026-03-07__16-08-47) - # containing result.json and trial subdirectories - candidates = [ - d - for d in harbor_output_dir.iterdir() - if d.is_dir() and (d / "result.json").exists() - ] - if not candidates: - raise RuntimeError( - f"No harbor job directory found in {harbor_output_dir}. " - f"Expected a timestamp-named directory containing result.json." - ) - # Use the most recent job directory if multiple exist - return sorted(candidates)[-1] + return _run_harbor_evaluation( + llm=llm, + dataset=dataset, + output_dir=output_dir, + harbor_executable=harbor_exe, + agent_name=agent_name, + dataset_is_path=dataset_is_path, + num_workers=num_workers, + task_ids=task_ids, + n_limit=n_limit, + task_filter_flag=task_filter_flag, + normalize_task_id=lambda task_id: _normalize_task_filter_value( + task_id, + dataset_is_path=dataset_is_path, + ), + credential_mode=HarborCredentialMode.PROCESS_ENV, + retry_legacy_task_flag=True, + subprocess_run=subprocess.run, + ) def convert_harbor_to_eval_output( @@ -539,108 +414,10 @@ def convert_harbor_to_eval_output( harbor_output_dir: Path to harbor output directory. eval_output_path: Path to write the converted output.jsonl. """ - logger.info(f"Converting harbor output from {harbor_output_dir}") - - job_dir = _find_job_dir(harbor_output_dir) - logger.info(f"Using harbor job directory: {job_dir}") - - # Find trial result files (each trial dir has a result.json) - result_files = list(job_dir.glob("*/result.json")) - # Exclude the job-level result.json - result_files = [f for f in result_files if f.parent != job_dir] - - if not result_files: - raise RuntimeError( - f"No trial result files found in {job_dir}. " - f"Expected result.json files in trial subdirectories." - ) - - logger.info(f"Found {len(result_files)} trial results in {job_dir}") - - results: list[dict] = [] - errors: list[dict] = [] - - for result_file in result_files: - try: - with open(result_file) as f: - trial = json.load(f) - - instance_id = _canonicalize_instance_id( - trial.get("task_name", result_file.parent.name) - ) - - # Check for exceptions - if trial.get("exception_info"): - errors.append( - { - "instance_id": instance_id, - "error": str(trial["exception_info"]), - "test_result": {}, - } - ) - continue - - # Extract verifier results - verifier_result = trial.get("verifier_result", {}) - rewards = verifier_result.get("rewards", {}) - passed = rewards.get("reward", 0.0) > 0 - - # Extract agent metrics - agent_result = trial.get("agent_result", {}) - - eval_entry = { - "instance_id": instance_id, - "test_result": { - "trial_name": trial.get("trial_name"), - "trial_uri": trial.get("trial_uri"), - "rewards": rewards, - "passed": passed, - }, - "instruction": "", - "error": None, - "history": [], - "metrics": { - "total_prompt_tokens": agent_result.get("n_input_tokens") or 0, - "total_completion_tokens": ( - agent_result.get("n_output_tokens") or 0 - ), - "total_cost_usd": agent_result.get("cost_usd") or 0.0, - }, - } - results.append(eval_entry) - logger.info( - f"Processed trial {instance_id}: reward={rewards.get('reward', 'N/A')}" - ) - - except (json.JSONDecodeError, OSError) as e: - logger.error(f"Failed to process result file {result_file}: {e}") - errors.append( - { - "instance_id": _canonicalize_instance_id(result_file.parent.name), - "error": str(e), - "test_result": {}, - } - ) - - if not results and not errors: - raise RuntimeError(f"No trials processed from {harbor_output_dir}") - - if not results: - logger.warning( - f"All {len(errors)} trials failed in {harbor_output_dir}; " - "writing error entries for downstream reporting" - ) - - # Write results to output.jsonl - with open(eval_output_path, "w") as f: - for entry in results: - f.write(json.dumps(entry) + "\n") - for entry in errors: - f.write(json.dumps(entry) + "\n") - - logger.info( - f"Wrote {len(results)} successful + {len(errors)} failed entries " - f"to {eval_output_path}" + _convert_harbor_to_eval_output( + harbor_output_dir, + eval_output_path, + canonicalize_instance_id=_canonicalize_instance_id, ) diff --git a/benchmarks/terminalbench/run_infer.py b/benchmarks/terminalbench/run_infer.py index a01c4006d..48e785c79 100644 --- a/benchmarks/terminalbench/run_infer.py +++ b/benchmarks/terminalbench/run_infer.py @@ -16,10 +16,15 @@ from datetime import datetime, timezone from pathlib import Path -from pydantic import SecretStr - from benchmarks.terminalbench.config import HARBOR_DEFAULTS, INFER_DEFAULTS from benchmarks.utils.evaluation_utils import construct_eval_output_dir +from benchmarks.utils.harbor import ( + HarborCredentialMode, + check_harbor_installed as _check_harbor_installed, + convert_harbor_to_eval_output, + get_supported_task_filter_flag, + run_harbor_evaluation as _run_harbor_evaluation, +) from benchmarks.utils.report_costs import generate_cost_report from openhands.sdk import LLM, get_logger @@ -32,17 +37,10 @@ def check_harbor_installed() -> bool: """Check if harbor CLI is installed and available.""" - harbor_exe = HARBOR_DEFAULTS["harbor_executable"] - try: - result = subprocess.run( - [harbor_exe, "--version"], - capture_output=True, - text=True, - timeout=10, - ) - return result.returncode == 0 - except (FileNotFoundError, subprocess.TimeoutExpired): - return False + return _check_harbor_installed( + HARBOR_DEFAULTS["harbor_executable"], + probe_arg="--version", + ) def run_harbor_evaluation( @@ -66,207 +64,20 @@ def run_harbor_evaluation( Returns: Path to the harbor output directory. """ - harbor_output_dir = Path(output_dir) / "harbor_output" - harbor_output_dir.mkdir(parents=True, exist_ok=True) - harbor_exe = HARBOR_DEFAULTS["harbor_executable"] - - # Build harbor command using harbor CLI flags. - # Use absolute path for --jobs-dir to avoid CWD-relative path issues. - cmd = [ - harbor_exe, - "run", - "-d", - dataset, - "-a", - HARBOR_DEFAULTS["agent_name"], - "-m", - llm.model, - "--jobs-dir", - str(harbor_output_dir.resolve()), - "--n-concurrent", - str(num_workers), - ] - - # Pass LLM credentials as agent environment variables - if llm.api_key: - api_key = ( - llm.api_key.get_secret_value() - if isinstance(llm.api_key, SecretStr) - else llm.api_key - ) - cmd.extend(["--ae", f"LLM_API_KEY={api_key}"]) - if llm.base_url: - cmd.extend(["--ae", f"LLM_BASE_URL={llm.base_url}"]) - - # Add specific task names if provided - if task_ids: - for task_id in task_ids: - cmd.extend(["--task-name", task_id]) - - if n_limit is not None: - cmd.extend(["--n-tasks", str(n_limit)]) - - logger.info(f"Running harbor command: {' '.join(cmd)}") - logger.info(f"Output directory: {harbor_output_dir}") - - try: - result = subprocess.run( - cmd, - capture_output=True, - text=True, - ) - - if result.returncode != 0: - logger.error(f"Harbor command failed with code {result.returncode}") - logger.error(f"stdout: {result.stdout}") - logger.error(f"stderr: {result.stderr}") - raise RuntimeError(f"Harbor evaluation failed: {result.stderr}") - - logger.info("Harbor evaluation completed successfully") - logger.info(f"stdout: {result.stdout}") - - except FileNotFoundError: - raise RuntimeError( - "Harbor CLI not found. Please install harbor: pip install harbor" - ) - - return harbor_output_dir - - -def _find_job_dir(harbor_output_dir: Path) -> Path: - """Find the harbor job directory (timestamp-named) inside the output dir.""" - # Harbor creates a timestamp-named subdirectory (e.g., 2026-03-07__16-08-47) - # containing result.json and trial subdirectories - candidates = [ - d - for d in harbor_output_dir.iterdir() - if d.is_dir() and (d / "result.json").exists() - ] - if not candidates: - raise RuntimeError( - f"No harbor job directory found in {harbor_output_dir}. " - f"Expected a timestamp-named directory containing result.json." - ) - # Use the most recent job directory if multiple exist - return sorted(candidates)[-1] - - -def convert_harbor_to_eval_output( - harbor_output_dir: Path, - eval_output_path: Path, -) -> None: - """Convert harbor output to evaluation output format. - - Harbor stores trial results in a job directory structured as: - harbor_output/TIMESTAMP/TRIAL_NAME/result.json - - Each trial's result.json contains task_name, verifier_result, agent_result, - timing info, and exception details. - - Args: - harbor_output_dir: Path to harbor output directory. - eval_output_path: Path to write the converted output.jsonl. - """ - logger.info(f"Converting harbor output from {harbor_output_dir}") - - job_dir = _find_job_dir(harbor_output_dir) - logger.info(f"Using harbor job directory: {job_dir}") - - # Find trial result files (each trial dir has a result.json) - result_files = list(job_dir.glob("*/result.json")) - # Exclude the job-level result.json - result_files = [f for f in result_files if f.parent != job_dir] - - if not result_files: - raise RuntimeError( - f"No trial result files found in {job_dir}. " - f"Expected result.json files in trial subdirectories." - ) - - logger.info(f"Found {len(result_files)} trial results in {job_dir}") - - results: list[dict] = [] - errors: list[dict] = [] - - for result_file in result_files: - try: - with open(result_file) as f: - trial = json.load(f) - - instance_id = trial.get("task_name", result_file.parent.name) - - # Check for exceptions - if trial.get("exception_info"): - errors.append( - { - "instance_id": instance_id, - "error": str(trial["exception_info"]), - "test_result": {}, - } - ) - continue - - # Extract verifier results - verifier_result = trial.get("verifier_result", {}) - rewards = verifier_result.get("rewards", {}) - passed = rewards.get("reward", 0.0) > 0 - - # Extract agent metrics - agent_result = trial.get("agent_result", {}) - - eval_entry = { - "instance_id": instance_id, - "test_result": { - "trial_name": trial.get("trial_name"), - "trial_uri": trial.get("trial_uri"), - "rewards": rewards, - "passed": passed, - }, - "instruction": "", - "error": None, - "history": [], - "metrics": { - "total_prompt_tokens": agent_result.get("n_input_tokens") or 0, - "total_completion_tokens": ( - agent_result.get("n_output_tokens") or 0 - ), - "total_cost_usd": agent_result.get("cost_usd") or 0.0, - }, - } - results.append(eval_entry) - logger.info( - f"Processed trial {instance_id}: reward={rewards.get('reward', 'N/A')}" - ) - - except (json.JSONDecodeError, OSError) as e: - logger.error(f"Failed to process result file {result_file}: {e}") - errors.append( - { - "instance_id": result_file.parent.name, - "error": str(e), - "test_result": {}, - } - ) - - if not results and not errors: - raise RuntimeError(f"No trials processed from {harbor_output_dir}") - - if not results: - logger.warning( - f"All {len(errors)} trials failed in {harbor_output_dir}; " - "writing error entries for downstream reporting" - ) - - # Write results to output.jsonl - with open(eval_output_path, "w") as f: - for entry in results: - f.write(json.dumps(entry) + "\n") - for entry in errors: - f.write(json.dumps(entry) + "\n") - - logger.info( - f"Wrote {len(results)} successful + {len(errors)} failed entries " - f"to {eval_output_path}" + return _run_harbor_evaluation( + llm=llm, + dataset=dataset, + output_dir=output_dir, + harbor_executable=HARBOR_DEFAULTS["harbor_executable"], + agent_name=HARBOR_DEFAULTS["agent_name"], + num_workers=num_workers, + task_ids=task_ids, + n_limit=n_limit, + task_filter_flag=get_supported_task_filter_flag( + HARBOR_DEFAULTS["harbor_executable"] + ), + credential_mode=HarborCredentialMode.AGENT_ENV_FLAGS, + subprocess_run=subprocess.run, ) diff --git a/benchmarks/utils/harbor.py b/benchmarks/utils/harbor.py new file mode 100644 index 000000000..4dda977ce --- /dev/null +++ b/benchmarks/utils/harbor.py @@ -0,0 +1,305 @@ +"""Shared Harbor execution helpers for benchmark compatibility wrappers.""" + +from __future__ import annotations + +import json +import os +import re +import subprocess +from enum import Enum +from pathlib import Path +from typing import Any, Callable + +from openhands.sdk import LLM, get_logger + + +logger = get_logger(__name__) + + +class HarborCredentialMode(str, Enum): + """How LLM credentials are forwarded to Harbor/OpenHands SDK.""" + + AGENT_ENV_FLAGS = "agent_env_flags" + PROCESS_ENV = "process_env" + + +def check_harbor_installed( + harbor_executable: str = "harbor", + probe_arg: str = "--help", +) -> bool: + """Return whether the Harbor CLI is installed and responds successfully.""" + try: + result = subprocess.run( + [harbor_executable, probe_arg], + capture_output=True, + text=True, + timeout=10, + ) + return result.returncode == 0 + except (FileNotFoundError, subprocess.TimeoutExpired): + return False + + +def _probe_harbor_run_help(harbor_executable: str) -> str: + """Run harbor run --help and return combined stdout+stderr, or empty string if not found.""" + try: + result = subprocess.run( + [harbor_executable, "run", "--help"], + capture_output=True, + text=True, + timeout=10, + ) + return f"{result.stdout}\n{result.stderr}" + except (FileNotFoundError, subprocess.TimeoutExpired): + return "" + + +def get_supported_task_filter_flag(harbor_executable: str) -> str: + """Detect whether Harbor expects --task-name or --include-task-name.""" + help_text = _probe_harbor_run_help(harbor_executable) + supported_flags = set(re.findall(r"(? str: + """Detect whether Harbor exposes the OpenHands agent as openhands or openhands-sdk.""" + help_text = _probe_harbor_run_help(harbor_executable) + compact_help_text = re.sub(r"[^a-z0-9-]+", "", help_text.lower()) + if "openhands-sdk" in compact_help_text: + return "openhands-sdk" + if "openhands" in compact_help_text: + return "openhands" + return default_agent_name + + +def _secret_value(value: object) -> str: + if hasattr(value, "get_secret_value"): + return value.get_secret_value() # type: ignore[no-any-return, attr-defined] + return str(value) + + +def run_harbor_evaluation( + llm: LLM, + dataset: str, + output_dir: str, + *, + harbor_executable: str = "harbor", + agent_name: str = "openhands-sdk", + dataset_is_path: bool = False, + num_workers: int = 1, + task_ids: list[str] | None = None, + n_limit: int | None = None, + task_filter_flag: str = "--task-name", + normalize_task_id: Callable[[str], str] | None = None, + credential_mode: HarborCredentialMode = HarborCredentialMode.AGENT_ENV_FLAGS, + retry_legacy_task_flag: bool = False, + subprocess_run: Callable[..., Any] = subprocess.run, +) -> Path: + """Run Harbor and return the directory containing Harbor job outputs. + + The ``subprocess_run`` parameter is a testing seam; pass a fake callable + in tests rather than patching the subprocess module. + """ + harbor_output_dir = Path(output_dir) / "harbor_output" + harbor_output_dir.mkdir(parents=True, exist_ok=True) + + cmd = [ + harbor_executable, + "run", + "--path" if dataset_is_path else "-d", + dataset, + "-a", + agent_name, + "-m", + llm.model, + "--jobs-dir", + str(harbor_output_dir.resolve()), + "--n-concurrent", + str(num_workers), + ] + + env: dict[str, str] | None = None + if credential_mode == HarborCredentialMode.AGENT_ENV_FLAGS: + if llm.api_key: + cmd.extend(["--ae", f"LLM_API_KEY={_secret_value(llm.api_key)}"]) + if llm.base_url: + cmd.extend(["--ae", f"LLM_BASE_URL={llm.base_url}"]) + elif credential_mode == HarborCredentialMode.PROCESS_ENV: + env = os.environ.copy() + if llm.api_key: + env["LLM_API_KEY"] = _secret_value(llm.api_key) + if llm.base_url: + env["LLM_BASE_URL"] = llm.base_url + + if task_ids: + normalize = normalize_task_id or (lambda task_id: task_id) + for task_id in task_ids: + cmd.extend([task_filter_flag, normalize(task_id)]) + + if n_limit is not None: + cmd.extend(["--n-tasks", str(n_limit)]) + + safe_cmd = [ + "***" if prev == "--ae" and part.startswith("LLM_") else part + for prev, part in zip([""] + cmd, cmd) + ] + logger.info(f"Running harbor command: {' '.join(safe_cmd)}") + logger.info(f"Output directory: {harbor_output_dir}") + + try: + result = subprocess_run(cmd, capture_output=True, text=True, env=env) + + if ( + result.returncode != 0 + and retry_legacy_task_flag + and task_ids + and task_filter_flag == "--task-name" + and "No such option: --task-name" in result.stderr + ): + fallback_cmd = [ + "--include-task-name" if part == "--task-name" else part for part in cmd + ] + logger.warning( + "Harbor does not support --task-name; retrying with --include-task-name" + ) + result = subprocess_run( + fallback_cmd, capture_output=True, text=True, env=env + ) + + if result.returncode != 0: + logger.error(f"Harbor command failed with code {result.returncode}") + logger.error(f"stdout: {result.stdout}") + logger.error(f"stderr: {result.stderr}") + raise RuntimeError(f"Harbor evaluation failed: {result.stderr}") + + logger.info("Harbor evaluation completed successfully") + logger.info(f"stdout: {result.stdout}") + except FileNotFoundError: + raise RuntimeError( + "Harbor CLI not found. Please install harbor: pip install harbor" + ) + + return harbor_output_dir + + +def _find_job_dir(harbor_output_dir: Path) -> Path: + """Find the latest Harbor job directory inside an output directory.""" + candidates = [ + d + for d in harbor_output_dir.iterdir() + if d.is_dir() and (d / "result.json").exists() + ] + if not candidates: + raise RuntimeError( + f"No harbor job directory found in {harbor_output_dir}. " + f"Expected a timestamp-named directory containing result.json." + ) + return sorted(candidates)[-1] + + +def convert_harbor_to_eval_output( + harbor_output_dir: Path, + eval_output_path: Path, + *, + canonicalize_instance_id: Callable[[str], str] | None = None, +) -> None: + """Convert Harbor trial results to OpenHands benchmark output.jsonl format.""" + logger.info(f"Converting harbor output from {harbor_output_dir}") + + canonicalize = canonicalize_instance_id or (lambda instance_id: instance_id) + job_dir = _find_job_dir(harbor_output_dir) + logger.info(f"Using harbor job directory: {job_dir}") + + result_files = [f for f in job_dir.glob("*/result.json") if f.parent != job_dir] + if not result_files: + raise RuntimeError( + f"No trial result files found in {job_dir}. " + f"Expected result.json files in trial subdirectories." + ) + + logger.info(f"Found {len(result_files)} trial results in {job_dir}") + + results: list[dict] = [] + errors: list[dict] = [] + + for result_file in result_files: + try: + with open(result_file) as f: + trial = json.load(f) + + instance_id = canonicalize(trial.get("task_name", result_file.parent.name)) + + if trial.get("exception_info"): + errors.append( + { + "instance_id": instance_id, + "error": str(trial["exception_info"]), + "test_result": {}, + } + ) + continue + + verifier_result = trial.get("verifier_result", {}) + rewards = verifier_result.get("rewards", {}) + passed = rewards.get("reward", 0.0) > 0 + agent_result = trial.get("agent_result", {}) + + eval_entry = { + "instance_id": instance_id, + "test_result": { + "trial_name": trial.get("trial_name"), + "trial_uri": trial.get("trial_uri"), + "rewards": rewards, + "passed": passed, + }, + "instruction": "", + "error": None, + "history": [], + "metrics": { + "total_prompt_tokens": agent_result.get("n_input_tokens") or 0, + "total_completion_tokens": ( + agent_result.get("n_output_tokens") or 0 + ), + "total_cost_usd": agent_result.get("cost_usd") or 0.0, + }, + } + results.append(eval_entry) + logger.info( + f"Processed trial {instance_id}: reward={rewards.get('reward', 'N/A')}" + ) + except (json.JSONDecodeError, OSError) as e: + logger.error(f"Failed to process result file {result_file}: {e}") + errors.append( + { + "instance_id": canonicalize(result_file.parent.name), + "error": str(e), + "test_result": {}, + } + ) + + if not results and not errors: + raise RuntimeError(f"No trials processed from {harbor_output_dir}") + + if not results: + logger.warning( + f"All {len(errors)} trials failed in {harbor_output_dir}; " + "writing error entries for downstream reporting" + ) + + with open(eval_output_path, "w") as f: + for entry in results: + f.write(json.dumps(entry) + "\n") + for entry in errors: + f.write(json.dumps(entry) + "\n") + + logger.info( + f"Wrote {len(results)} successful + {len(errors)} failed entries " + f"to {eval_output_path}" + ) diff --git a/tests/test_harbor.py b/tests/test_harbor.py new file mode 100644 index 000000000..6eac84d90 --- /dev/null +++ b/tests/test_harbor.py @@ -0,0 +1,243 @@ +"""Tests for shared Harbor benchmark execution utilities.""" + +from __future__ import annotations + +from pathlib import Path +from typing import Any + +import pytest + +from benchmarks.utils.harbor import ( + HarborCredentialMode, + get_supported_agent_name, + get_supported_task_filter_flag, + run_harbor_evaluation, +) +from openhands.sdk import LLM + + +def _fake_run(returncode: int = 0, stderr: str = "") -> Any: + """Return a fake subprocess.run callable capturing calls.""" + captured: dict[str, Any] = {} + + def fake(cmd: list[str], *, capture_output: bool, text: bool, env=None) -> Any: + captured.setdefault("cmds", []).append(list(cmd)) + captured["env"] = env + return type( + "R", (), {"returncode": returncode, "stdout": "ok", "stderr": stderr} + )() + + fake.captured = captured # type: ignore[attr-defined] + return fake + + +class TestGetSupportedTaskFilterFlag: + """Tests for get_supported_task_filter_flag parsing logic.""" + + def test_returns_include_task_name_when_present( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.setattr( + "benchmarks.utils.harbor._probe_harbor_run_help", + lambda _: "--include-task-name\n--other-flag", + ) + assert get_supported_task_filter_flag("harbor") == "--include-task-name" + + def test_returns_task_name_when_include_absent( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.setattr( + "benchmarks.utils.harbor._probe_harbor_run_help", + lambda _: "--task-name Filter by task", + ) + assert get_supported_task_filter_flag("harbor") == "--task-name" + + def test_include_task_name_takes_priority_over_task_name( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.setattr( + "benchmarks.utils.harbor._probe_harbor_run_help", + lambda _: "--task-name ...\n--include-task-name ...", + ) + assert get_supported_task_filter_flag("harbor") == "--include-task-name" + + def test_falls_back_to_include_task_name_when_neither_present( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.setattr( + "benchmarks.utils.harbor._probe_harbor_run_help", + lambda _: "Options: --output-dir --help", + ) + assert get_supported_task_filter_flag("harbor") == "--include-task-name" + + def test_falls_back_when_help_text_empty( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + """Empty string is returned by _probe_harbor_run_help on timeout/not-found.""" + monkeypatch.setattr( + "benchmarks.utils.harbor._probe_harbor_run_help", + lambda _: "", + ) + assert get_supported_task_filter_flag("harbor") == "--include-task-name" + + def test_regex_boundary_rejects_partial_matches( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + """The lookbehind prevents flags that are embedded inside a word.""" + monkeypatch.setattr( + "benchmarks.utils.harbor._probe_harbor_run_help", + lambda _: "prefix--task-name (no standalone flag here)", + ) + assert get_supported_task_filter_flag("harbor") == "--include-task-name" + + +class TestGetSupportedAgentName: + """Tests for get_supported_agent_name parsing logic.""" + + def test_returns_openhands_sdk_when_present( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.setattr( + "benchmarks.utils.harbor._probe_harbor_run_help", + lambda _: "Agents: openhands-sdk, other-agent", + ) + assert get_supported_agent_name("harbor") == "openhands-sdk" + + def test_returns_openhands_when_sdk_absent( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.setattr( + "benchmarks.utils.harbor._probe_harbor_run_help", + lambda _: "Agents: openhands, other-agent", + ) + assert get_supported_agent_name("harbor") == "openhands" + + def test_openhands_sdk_takes_priority_over_openhands( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.setattr( + "benchmarks.utils.harbor._probe_harbor_run_help", + lambda _: "openhands openhands-sdk", + ) + assert get_supported_agent_name("harbor") == "openhands-sdk" + + def test_falls_back_to_default_when_neither_present( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.setattr( + "benchmarks.utils.harbor._probe_harbor_run_help", + lambda _: "Agents: some-other-agent", + ) + assert ( + get_supported_agent_name("harbor", default_agent_name="custom-agent") + == "custom-agent" + ) + + def test_falls_back_when_help_text_empty( + self, monkeypatch: pytest.MonkeyPatch + ) -> None: + monkeypatch.setattr( + "benchmarks.utils.harbor._probe_harbor_run_help", + lambda _: "", + ) + assert get_supported_agent_name("harbor") == "openhands-sdk" + + +class TestRunHarborEvaluationCredentialModes: + """Tests for run_harbor_evaluation credential injection modes.""" + + def test_agent_env_flags_mode_injects_ae_flags(self, tmp_path: Path) -> None: + """AGENT_ENV_FLAGS mode passes credentials via --ae flags; env is None.""" + run = _fake_run() + run_harbor_evaluation( + llm=LLM( + model="test/model", + api_key="my-key", + base_url="https://proxy.example.com", + ), + dataset="my-dataset", + output_dir=str(tmp_path), + credential_mode=HarborCredentialMode.AGENT_ENV_FLAGS, + subprocess_run=run, + ) + cmd = run.captured["cmds"][0] + assert "--ae" in cmd + assert any("LLM_API_KEY=my-key" in part for part in cmd) + assert any("LLM_BASE_URL=https://proxy.example.com" in part for part in cmd) + assert run.captured["env"] is None + + def test_process_env_mode_sets_env_vars(self, tmp_path: Path) -> None: + """PROCESS_ENV mode passes credentials via env dict; no --ae flags.""" + run = _fake_run() + run_harbor_evaluation( + llm=LLM( + model="test/model", + api_key="my-key", + base_url="https://proxy.example.com", + ), + dataset="my-dataset", + output_dir=str(tmp_path), + credential_mode=HarborCredentialMode.PROCESS_ENV, + subprocess_run=run, + ) + cmd = run.captured["cmds"][0] + assert "--ae" not in cmd + env = run.captured["env"] + assert env is not None + assert env["LLM_API_KEY"] == "my-key" + assert env["LLM_BASE_URL"] == "https://proxy.example.com" + + +class TestRunHarborEvaluationTaskFiltering: + """Tests for task_ids, n_limit, and fallback-retry in run_harbor_evaluation.""" + + def test_task_ids_and_n_limit_included_in_command(self, tmp_path: Path) -> None: + run = _fake_run() + run_harbor_evaluation( + llm=LLM(model="test/model"), + dataset="my-dataset", + output_dir=str(tmp_path), + task_ids=["task-a", "task-b"], + n_limit=5, + task_filter_flag="--task-name", + subprocess_run=run, + ) + cmd = run.captured["cmds"][0] + assert cmd.count("--task-name") == 2 + assert "task-a" in cmd + assert "task-b" in cmd + assert cmd[cmd.index("--n-tasks") + 1] == "5" + + def test_fallback_retry_switches_to_include_task_name(self, tmp_path: Path) -> None: + """When --task-name fails with 'No such option', retries with --include-task-name.""" + cmds: list[list[str]] = [] + + def fake_run( + cmd: list[str], *, capture_output: bool, text: bool, env=None + ) -> Any: + cmds.append(list(cmd)) + if "--task-name" in cmd: + return type( + "R", + (), + { + "returncode": 2, + "stdout": "", + "stderr": "No such option: --task-name", + }, + )() + return type("R", (), {"returncode": 0, "stdout": "ok", "stderr": ""})() + + run_harbor_evaluation( + llm=LLM(model="test/model"), + dataset="my-dataset", + output_dir=str(tmp_path), + task_ids=["task-a"], + task_filter_flag="--task-name", + retry_legacy_task_flag=True, + subprocess_run=fake_run, + ) + assert len(cmds) == 2 + assert "--task-name" in cmds[0] + assert "--include-task-name" in cmds[1] + assert "--task-name" not in cmds[1] diff --git a/tests/test_terminalbench.py b/tests/test_terminalbench.py index 39a8d6ffa..d9b62e07e 100644 --- a/tests/test_terminalbench.py +++ b/tests/test_terminalbench.py @@ -224,7 +224,15 @@ def test_run_harbor_evaluation_passes_filters_and_limits( """Test Harbor command includes task filters and n-limit for CI runs.""" captured: dict[str, list[str]] = {} - def fake_run(cmd: list[str], capture_output: bool, text: bool): + def fake_run( + cmd: list[str], capture_output: bool, text: bool, env=None, timeout=None + ): + if cmd == ["harbor", "run", "--help"]: + return type( + "Completed", + (), + {"returncode": 0, "stdout": "--include-task-name", "stderr": ""}, + )() captured["cmd"] = cmd return type( "Completed", @@ -265,7 +273,7 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool): ] assert "--jobs-dir" in cmd assert str(expected_output_dir.resolve()) in cmd - assert cmd.count("--task-name") == 2 + assert cmd.count("--include-task-name") == 2 assert "task-a" in cmd assert "task-b" in cmd assert cmd[cmd.index("--n-concurrent") + 1] == "3"