diff --git a/benchmarks/skillsbench/run_infer.py b/benchmarks/skillsbench/run_infer.py
index 535a27d62..abb71d988 100644
--- a/benchmarks/skillsbench/run_infer.py
+++ b/benchmarks/skillsbench/run_infer.py
@@ -11,7 +11,6 @@
 import argparse
 import json
 import os
-import re
 import shutil
 import subprocess
 import sys
@@ -19,10 +18,16 @@
 from datetime import datetime, timezone
 from pathlib import Path
 
-from pydantic import SecretStr
-
 from benchmarks.skillsbench.config import HARBOR_DEFAULTS, INFER_DEFAULTS
 from benchmarks.utils.evaluation_utils import construct_eval_output_dir
+from benchmarks.utils.harbor import (
+    HarborCredentialMode,
+    check_harbor_installed as _check_harbor_installed,
+    convert_harbor_to_eval_output as _convert_harbor_to_eval_output,
+    get_supported_agent_name as _get_harbor_supported_agent_name,
+    get_supported_task_filter_flag as _get_harbor_supported_task_filter_flag,
+    run_harbor_evaluation as _run_harbor_evaluation,
+)
 from benchmarks.utils.report_costs import generate_cost_report
 from openhands.sdk import LLM, get_logger
 
@@ -62,17 +67,7 @@
 
 def check_harbor_installed() -> bool:
     """Check if harbor CLI is installed and available."""
-    harbor_exe = HARBOR_DEFAULTS["harbor_executable"]
-    try:
-        result = subprocess.run(
-            [harbor_exe, "--help"],
-            capture_output=True,
-            text=True,
-            timeout=10,
-        )
-        return result.returncode == 0
-    except (FileNotFoundError, subprocess.TimeoutExpired):
-        return False
+    return _check_harbor_installed(HARBOR_DEFAULTS["harbor_executable"])
 
 
 def _run_command(cmd: list[str], error_message: str) -> str:
@@ -90,42 +85,15 @@ def _run_command(cmd: list[str], error_message: str) -> str:
 
 def _get_supported_task_filter_flag(harbor_exe: str) -> str:
     """Detect whether Harbor expects --task-name or --include-task-name."""
-    try:
-        result = subprocess.run(
-            [harbor_exe, "run", "--help"],
-            capture_output=True,
-            text=True,
-        )
-    except FileNotFoundError:
-        return "--include-task-name"
-
-    help_text = f"{result.stdout}\n{result.stderr}"
-    supported_flags = set(re.findall(r"(?<![\w-])--[a-z0-9-]+", help_text))
-    if "--include-task-name" in supported_flags:
-        return "--include-task-name"
-    if "--task-name" in supported_flags:
-        return "--task-name"
-    return "--include-task-name"
+    return _get_harbor_supported_task_filter_flag(harbor_exe)
 
 
 def _get_supported_agent_name(harbor_exe: str) -> str:
     """Detect whether Harbor exposes the OpenHands agent as openhands or openhands-sdk."""
-    try:
-        result = subprocess.run(
-            [harbor_exe, "run", "--help"],
-            capture_output=True,
-            text=True,
-        )
-    except FileNotFoundError:
-        return HARBOR_DEFAULTS["agent_name"]
-
-    help_text = f"{result.stdout}\n{result.stderr}"
-    compact_help_text = re.sub(r"[^a-z0-9-]+", "", help_text.lower())
-    if "openhands-sdk" in compact_help_text:
-        return "openhands-sdk"
-    if "openhands" in compact_help_text:
-        return "openhands"
-    return HARBOR_DEFAULTS["agent_name"]
+    return _get_harbor_supported_agent_name(
+        harbor_exe,
+        default_agent_name=HARBOR_DEFAULTS["agent_name"],
+    )
 
 
 def get_skillsbench_main_commit(
@@ -405,122 +373,29 @@ def run_harbor_evaluation(
     Returns:
         Path to the harbor output directory.
     """
-    harbor_output_dir = Path(output_dir) / "harbor_output"
-    harbor_output_dir.mkdir(parents=True, exist_ok=True)
     harbor_exe = HARBOR_DEFAULTS["harbor_executable"]
     agent_name = _get_supported_agent_name(harbor_exe)
     task_filter_flag = _get_supported_task_filter_flag(harbor_exe)
 
-    # Build harbor command using harbor CLI flags.
-    # Use absolute path for --jobs-dir to avoid CWD-relative path issues.
-    cmd = [
-        harbor_exe,
-        "run",
-        "--path" if dataset_is_path else "-d",
-        dataset,
-        "-a",
-        agent_name,
-        "-m",
-        llm.model,
-        "--jobs-dir",
-        str(harbor_output_dir.resolve()),
-        "--n-concurrent",
-        str(num_workers),
-    ]
-
-    # Add specific task names if provided
-    if task_ids:
-        for task_id in task_ids:
-            cmd.extend(
-                [
-                    task_filter_flag,
-                    _normalize_task_filter_value(
-                        task_id, dataset_is_path=dataset_is_path
-                    ),
-                ]
-            )
-
-    if n_limit is not None:
-        cmd.extend(["--n-tasks", str(n_limit)])
-
-    logger.info(f"Running harbor command: {' '.join(cmd)}")
-    logger.info(f"Output directory: {harbor_output_dir}")
-
-    # harbor's openhands-sdk agent reads LLM credentials from the host process
-    # environment (os.environ), not from --ae flags which go to the sandbox.
-    env = os.environ.copy()
-    if llm.api_key:
-        api_key = (
-            llm.api_key.get_secret_value()
-            if isinstance(llm.api_key, SecretStr)
-            else llm.api_key
-        )
-        env["LLM_API_KEY"] = api_key
-    if llm.base_url:
-        env["LLM_BASE_URL"] = llm.base_url
-
-    try:
-        result = subprocess.run(
-            cmd,
-            capture_output=True,
-            text=True,
-            env=env,
-        )
-
-        if result.returncode != 0:
-            if (
-                task_ids
-                and task_filter_flag == "--task-name"
-                and "No such option: --task-name" in result.stderr
-            ):
-                fallback_cmd = [
-                    "--include-task-name" if part == "--task-name" else part
-                    for part in cmd
-                ]
-                logger.warning(
-                    "Harbor does not support --task-name; retrying with "
-                    "--include-task-name"
-                )
-                result = subprocess.run(
-                    fallback_cmd,
-                    capture_output=True,
-                    text=True,
-                    env=env,
-                )
-
-            if result.returncode != 0:
-                logger.error(f"Harbor command failed with code {result.returncode}")
-                logger.error(f"stdout: {result.stdout}")
-                logger.error(f"stderr: {result.stderr}")
-                raise RuntimeError(f"Harbor evaluation failed: {result.stderr}")
-
-        logger.info("Harbor evaluation completed successfully")
-        logger.info(f"stdout: {result.stdout}")
-
-    except FileNotFoundError:
-        raise RuntimeError(
-            "Harbor CLI not found. Please install harbor: pip install harbor"
-        )
-
-    return harbor_output_dir
-
-
-def _find_job_dir(harbor_output_dir: Path) -> Path:
-    """Find the harbor job directory (timestamp-named) inside the output dir."""
-    # Harbor creates a timestamp-named subdirectory (e.g., 2026-03-07__16-08-47)
-    # containing result.json and trial subdirectories
-    candidates = [
-        d
-        for d in harbor_output_dir.iterdir()
-        if d.is_dir() and (d / "result.json").exists()
-    ]
-    if not candidates:
-        raise RuntimeError(
-            f"No harbor job directory found in {harbor_output_dir}. "
-            f"Expected a timestamp-named directory containing result.json."
-        )
-    # Use the most recent job directory if multiple exist
-    return sorted(candidates)[-1]
+    return _run_harbor_evaluation(
+        llm=llm,
+        dataset=dataset,
+        output_dir=output_dir,
+        harbor_executable=harbor_exe,
+        agent_name=agent_name,
+        dataset_is_path=dataset_is_path,
+        num_workers=num_workers,
+        task_ids=task_ids,
+        n_limit=n_limit,
+        task_filter_flag=task_filter_flag,
+        normalize_task_id=lambda task_id: _normalize_task_filter_value(
+            task_id,
+            dataset_is_path=dataset_is_path,
+        ),
+        credential_mode=HarborCredentialMode.PROCESS_ENV,
+        retry_legacy_task_flag=True,
+        subprocess_run=subprocess.run,
+    )
 
 
 def convert_harbor_to_eval_output(
@@ -539,108 +414,10 @@ def convert_harbor_to_eval_output(
         harbor_output_dir: Path to harbor output directory.
         eval_output_path: Path to write the converted output.jsonl.
     """
-    logger.info(f"Converting harbor output from {harbor_output_dir}")
-
-    job_dir = _find_job_dir(harbor_output_dir)
-    logger.info(f"Using harbor job directory: {job_dir}")
-
-    # Find trial result files (each trial dir has a result.json)
-    result_files = list(job_dir.glob("*/result.json"))
-    # Exclude the job-level result.json
-    result_files = [f for f in result_files if f.parent != job_dir]
-
-    if not result_files:
-        raise RuntimeError(
-            f"No trial result files found in {job_dir}. "
-            f"Expected result.json files in trial subdirectories."
-        )
-
-    logger.info(f"Found {len(result_files)} trial results in {job_dir}")
-
-    results: list[dict] = []
-    errors: list[dict] = []
-
-    for result_file in result_files:
-        try:
-            with open(result_file) as f:
-                trial = json.load(f)
-
-            instance_id = _canonicalize_instance_id(
-                trial.get("task_name", result_file.parent.name)
-            )
-
-            # Check for exceptions
-            if trial.get("exception_info"):
-                errors.append(
-                    {
-                        "instance_id": instance_id,
-                        "error": str(trial["exception_info"]),
-                        "test_result": {},
-                    }
-                )
-                continue
-
-            # Extract verifier results
-            verifier_result = trial.get("verifier_result", {})
-            rewards = verifier_result.get("rewards", {})
-            passed = rewards.get("reward", 0.0) > 0
-
-            # Extract agent metrics
-            agent_result = trial.get("agent_result", {})
-
-            eval_entry = {
-                "instance_id": instance_id,
-                "test_result": {
-                    "trial_name": trial.get("trial_name"),
-                    "trial_uri": trial.get("trial_uri"),
-                    "rewards": rewards,
-                    "passed": passed,
-                },
-                "instruction": "",
-                "error": None,
-                "history": [],
-                "metrics": {
-                    "total_prompt_tokens": agent_result.get("n_input_tokens") or 0,
-                    "total_completion_tokens": (
-                        agent_result.get("n_output_tokens") or 0
-                    ),
-                    "total_cost_usd": agent_result.get("cost_usd") or 0.0,
-                },
-            }
-            results.append(eval_entry)
-            logger.info(
-                f"Processed trial {instance_id}: reward={rewards.get('reward', 'N/A')}"
-            )
-
-        except (json.JSONDecodeError, OSError) as e:
-            logger.error(f"Failed to process result file {result_file}: {e}")
-            errors.append(
-                {
-                    "instance_id": _canonicalize_instance_id(result_file.parent.name),
-                    "error": str(e),
-                    "test_result": {},
-                }
-            )
-
-    if not results and not errors:
-        raise RuntimeError(f"No trials processed from {harbor_output_dir}")
-
-    if not results:
-        logger.warning(
-            f"All {len(errors)} trials failed in {harbor_output_dir}; "
-            "writing error entries for downstream reporting"
-        )
-
-    # Write results to output.jsonl
-    with open(eval_output_path, "w") as f:
-        for entry in results:
-            f.write(json.dumps(entry) + "\n")
-        for entry in errors:
-            f.write(json.dumps(entry) + "\n")
-
-    logger.info(
-        f"Wrote {len(results)} successful + {len(errors)} failed entries "
-        f"to {eval_output_path}"
+    _convert_harbor_to_eval_output(
+        harbor_output_dir,
+        eval_output_path,
+        canonicalize_instance_id=_canonicalize_instance_id,
     )
 
 
diff --git a/benchmarks/terminalbench/run_infer.py b/benchmarks/terminalbench/run_infer.py
index a01c4006d..48e785c79 100644
--- a/benchmarks/terminalbench/run_infer.py
+++ b/benchmarks/terminalbench/run_infer.py
@@ -16,10 +16,15 @@
 from datetime import datetime, timezone
 from pathlib import Path
 
-from pydantic import SecretStr
-
 from benchmarks.terminalbench.config import HARBOR_DEFAULTS, INFER_DEFAULTS
 from benchmarks.utils.evaluation_utils import construct_eval_output_dir
+from benchmarks.utils.harbor import (
+    HarborCredentialMode,
+    check_harbor_installed as _check_harbor_installed,
+    convert_harbor_to_eval_output,
+    get_supported_task_filter_flag,
+    run_harbor_evaluation as _run_harbor_evaluation,
+)
 from benchmarks.utils.report_costs import generate_cost_report
 from openhands.sdk import LLM, get_logger
 
@@ -32,17 +37,10 @@
 
 def check_harbor_installed() -> bool:
     """Check if harbor CLI is installed and available."""
-    harbor_exe = HARBOR_DEFAULTS["harbor_executable"]
-    try:
-        result = subprocess.run(
-            [harbor_exe, "--version"],
-            capture_output=True,
-            text=True,
-            timeout=10,
-        )
-        return result.returncode == 0
-    except (FileNotFoundError, subprocess.TimeoutExpired):
-        return False
+    return _check_harbor_installed(
+        HARBOR_DEFAULTS["harbor_executable"],
+        probe_arg="--version",
+    )
 
 
 def run_harbor_evaluation(
@@ -66,207 +64,20 @@ def run_harbor_evaluation(
     Returns:
         Path to the harbor output directory.
     """
-    harbor_output_dir = Path(output_dir) / "harbor_output"
-    harbor_output_dir.mkdir(parents=True, exist_ok=True)
-    harbor_exe = HARBOR_DEFAULTS["harbor_executable"]
-
-    # Build harbor command using harbor CLI flags.
-    # Use absolute path for --jobs-dir to avoid CWD-relative path issues.
-    cmd = [
-        harbor_exe,
-        "run",
-        "-d",
-        dataset,
-        "-a",
-        HARBOR_DEFAULTS["agent_name"],
-        "-m",
-        llm.model,
-        "--jobs-dir",
-        str(harbor_output_dir.resolve()),
-        "--n-concurrent",
-        str(num_workers),
-    ]
-
-    # Pass LLM credentials as agent environment variables
-    if llm.api_key:
-        api_key = (
-            llm.api_key.get_secret_value()
-            if isinstance(llm.api_key, SecretStr)
-            else llm.api_key
-        )
-        cmd.extend(["--ae", f"LLM_API_KEY={api_key}"])
-    if llm.base_url:
-        cmd.extend(["--ae", f"LLM_BASE_URL={llm.base_url}"])
-
-    # Add specific task names if provided
-    if task_ids:
-        for task_id in task_ids:
-            cmd.extend(["--task-name", task_id])
-
-    if n_limit is not None:
-        cmd.extend(["--n-tasks", str(n_limit)])
-
-    logger.info(f"Running harbor command: {' '.join(cmd)}")
-    logger.info(f"Output directory: {harbor_output_dir}")
-
-    try:
-        result = subprocess.run(
-            cmd,
-            capture_output=True,
-            text=True,
-        )
-
-        if result.returncode != 0:
-            logger.error(f"Harbor command failed with code {result.returncode}")
-            logger.error(f"stdout: {result.stdout}")
-            logger.error(f"stderr: {result.stderr}")
-            raise RuntimeError(f"Harbor evaluation failed: {result.stderr}")
-
-        logger.info("Harbor evaluation completed successfully")
-        logger.info(f"stdout: {result.stdout}")
-
-    except FileNotFoundError:
-        raise RuntimeError(
-            "Harbor CLI not found. Please install harbor: pip install harbor"
-        )
-
-    return harbor_output_dir
-
-
-def _find_job_dir(harbor_output_dir: Path) -> Path:
-    """Find the harbor job directory (timestamp-named) inside the output dir."""
-    # Harbor creates a timestamp-named subdirectory (e.g., 2026-03-07__16-08-47)
-    # containing result.json and trial subdirectories
-    candidates = [
-        d
-        for d in harbor_output_dir.iterdir()
-        if d.is_dir() and (d / "result.json").exists()
-    ]
-    if not candidates:
-        raise RuntimeError(
-            f"No harbor job directory found in {harbor_output_dir}. "
-            f"Expected a timestamp-named directory containing result.json."
-        )
-    # Use the most recent job directory if multiple exist
-    return sorted(candidates)[-1]
-
-
-def convert_harbor_to_eval_output(
-    harbor_output_dir: Path,
-    eval_output_path: Path,
-) -> None:
-    """Convert harbor output to evaluation output format.
-
-    Harbor stores trial results in a job directory structured as:
-        harbor_output/TIMESTAMP/TRIAL_NAME/result.json
-
-    Each trial's result.json contains task_name, verifier_result, agent_result,
-    timing info, and exception details.
-
-    Args:
-        harbor_output_dir: Path to harbor output directory.
-        eval_output_path: Path to write the converted output.jsonl.
-    """
-    logger.info(f"Converting harbor output from {harbor_output_dir}")
-
-    job_dir = _find_job_dir(harbor_output_dir)
-    logger.info(f"Using harbor job directory: {job_dir}")
-
-    # Find trial result files (each trial dir has a result.json)
-    result_files = list(job_dir.glob("*/result.json"))
-    # Exclude the job-level result.json
-    result_files = [f for f in result_files if f.parent != job_dir]
-
-    if not result_files:
-        raise RuntimeError(
-            f"No trial result files found in {job_dir}. "
-            f"Expected result.json files in trial subdirectories."
-        )
-
-    logger.info(f"Found {len(result_files)} trial results in {job_dir}")
-
-    results: list[dict] = []
-    errors: list[dict] = []
-
-    for result_file in result_files:
-        try:
-            with open(result_file) as f:
-                trial = json.load(f)
-
-            instance_id = trial.get("task_name", result_file.parent.name)
-
-            # Check for exceptions
-            if trial.get("exception_info"):
-                errors.append(
-                    {
-                        "instance_id": instance_id,
-                        "error": str(trial["exception_info"]),
-                        "test_result": {},
-                    }
-                )
-                continue
-
-            # Extract verifier results
-            verifier_result = trial.get("verifier_result", {})
-            rewards = verifier_result.get("rewards", {})
-            passed = rewards.get("reward", 0.0) > 0
-
-            # Extract agent metrics
-            agent_result = trial.get("agent_result", {})
-
-            eval_entry = {
-                "instance_id": instance_id,
-                "test_result": {
-                    "trial_name": trial.get("trial_name"),
-                    "trial_uri": trial.get("trial_uri"),
-                    "rewards": rewards,
-                    "passed": passed,
-                },
-                "instruction": "",
-                "error": None,
-                "history": [],
-                "metrics": {
-                    "total_prompt_tokens": agent_result.get("n_input_tokens") or 0,
-                    "total_completion_tokens": (
-                        agent_result.get("n_output_tokens") or 0
-                    ),
-                    "total_cost_usd": agent_result.get("cost_usd") or 0.0,
-                },
-            }
-            results.append(eval_entry)
-            logger.info(
-                f"Processed trial {instance_id}: reward={rewards.get('reward', 'N/A')}"
-            )
-
-        except (json.JSONDecodeError, OSError) as e:
-            logger.error(f"Failed to process result file {result_file}: {e}")
-            errors.append(
-                {
-                    "instance_id": result_file.parent.name,
-                    "error": str(e),
-                    "test_result": {},
-                }
-            )
-
-    if not results and not errors:
-        raise RuntimeError(f"No trials processed from {harbor_output_dir}")
-
-    if not results:
-        logger.warning(
-            f"All {len(errors)} trials failed in {harbor_output_dir}; "
-            "writing error entries for downstream reporting"
-        )
-
-    # Write results to output.jsonl
-    with open(eval_output_path, "w") as f:
-        for entry in results:
-            f.write(json.dumps(entry) + "\n")
-        for entry in errors:
-            f.write(json.dumps(entry) + "\n")
-
-    logger.info(
-        f"Wrote {len(results)} successful + {len(errors)} failed entries "
-        f"to {eval_output_path}"
+    return _run_harbor_evaluation(
+        llm=llm,
+        dataset=dataset,
+        output_dir=output_dir,
+        harbor_executable=HARBOR_DEFAULTS["harbor_executable"],
+        agent_name=HARBOR_DEFAULTS["agent_name"],
+        num_workers=num_workers,
+        task_ids=task_ids,
+        n_limit=n_limit,
+        task_filter_flag=get_supported_task_filter_flag(
+            HARBOR_DEFAULTS["harbor_executable"]
+        ),
+        credential_mode=HarborCredentialMode.AGENT_ENV_FLAGS,
+        subprocess_run=subprocess.run,
     )
 
 
diff --git a/benchmarks/utils/harbor.py b/benchmarks/utils/harbor.py
new file mode 100644
index 000000000..4dda977ce
--- /dev/null
+++ b/benchmarks/utils/harbor.py
@@ -0,0 +1,305 @@
+"""Shared Harbor execution helpers for benchmark compatibility wrappers."""
+
+from __future__ import annotations
+
+import json
+import os
+import re
+import subprocess
+from enum import Enum
+from pathlib import Path
+from typing import Any, Callable
+
+from openhands.sdk import LLM, get_logger
+
+
+logger = get_logger(__name__)
+
+
+class HarborCredentialMode(str, Enum):
+    """How LLM credentials are forwarded to Harbor/OpenHands SDK."""
+
+    AGENT_ENV_FLAGS = "agent_env_flags"  # passed as ``--ae KEY=VALUE`` CLI flags
+    PROCESS_ENV = "process_env"  # set in the Harbor subprocess environment
+
+
+def check_harbor_installed(
+    harbor_executable: str = "harbor",
+    probe_arg: str = "--help",
+) -> bool:
+    """Return whether the Harbor CLI is installed and responds successfully.
+
+    Runs ``<harbor_executable> <probe_arg>`` with a 10 second timeout. A probe
+    that cannot be launched or times out yields False instead of raising.
+    """
+    probe_cmd = [harbor_executable, probe_arg]
+    try:
+        result = subprocess.run(probe_cmd, capture_output=True, text=True, timeout=10)
+    except (OSError, subprocess.TimeoutExpired):
+        return False
+    return result.returncode == 0
+
+
+def _probe_harbor_run_help(harbor_executable: str) -> str:
+    """Return the combined stdout+stderr of ``harbor run --help``.
+
+    Returns "" if Harbor cannot be launched (missing, not executable) or the
+    probe exceeds 10 seconds, letting callers fall back to their defaults.
+    """
+    cmd = [harbor_executable, "run", "--help"]
+    try:
+        result = subprocess.run(cmd, capture_output=True, text=True, timeout=10)
+    except (OSError, subprocess.TimeoutExpired):
+        return ""
+    return f"{result.stdout}\n{result.stderr}"
+
+
+def get_supported_task_filter_flag(harbor_executable: str) -> str:
+    """Pick the task filter flag advertised by ``harbor run --help``.
+
+    Falls back to ``--include-task-name`` unless only ``--task-name`` is listed.
+    """
+    help_text = _probe_harbor_run_help(harbor_executable)
+    flags = set(re.findall(r"(?<![\w-])--[a-z0-9-]+", help_text))
+    only_task_name = "--task-name" in flags and "--include-task-name" not in flags
+    return "--task-name" if only_task_name else "--include-task-name"
+
+
+def get_supported_agent_name(
+    harbor_executable: str,
+    default_agent_name: str = "openhands-sdk",
+) -> str:
+    """Choose the OpenHands agent name mentioned in ``harbor run --help``.
+
+    Prefers ``openhands-sdk``, then ``openhands``, else ``default_agent_name``.
+    """
+    lowered = _probe_harbor_run_help(harbor_executable).lower()
+    compact = re.sub(r"[^a-z0-9-]+", "", lowered)
+    names = ("openhands-sdk", "openhands")
+    return next((name for name in names if name in compact), default_agent_name)
+
+
+def _secret_value(value: object) -> str:
+    """Return the plain string behind a secret-like object, else ``str(value)``."""
+    is_secret = hasattr(value, "get_secret_value")
+    return value.get_secret_value() if is_secret else str(value)  # type: ignore[no-any-return, attr-defined]
+
+
+def run_harbor_evaluation(
+    llm: LLM,
+    dataset: str,
+    output_dir: str,
+    *,
+    harbor_executable: str = "harbor",
+    agent_name: str = "openhands-sdk",
+    dataset_is_path: bool = False,
+    num_workers: int = 1,
+    task_ids: list[str] | None = None,
+    n_limit: int | None = None,
+    task_filter_flag: str = "--task-name",
+    normalize_task_id: Callable[[str], str] | None = None,
+    credential_mode: HarborCredentialMode = HarborCredentialMode.AGENT_ENV_FLAGS,
+    retry_legacy_task_flag: bool = False,
+    subprocess_run: Callable[..., Any] = subprocess.run,
+) -> Path:
+    """Run ``harbor run`` and return the directory holding its job outputs.
+
+    Jobs are written to ``<output_dir>/harbor_output`` (created if missing).
+    With ``AGENT_ENV_FLAGS`` the LLM key/base URL are passed as ``--ae``
+    flags; with ``PROCESS_ENV`` they are set in the subprocess environment.
+    Each task id is passed through ``normalize_task_id`` (if given) and added
+    with ``task_filter_flag``. When ``retry_legacy_task_flag`` is set and
+    Harbor rejects ``--task-name``, the run is retried once with
+    ``--include-task-name``. ``subprocess_run`` is a testing seam; pass a fake
+    callable in tests rather than patching the subprocess module.
+
+    Raises:
+        RuntimeError: If Harbor exits non-zero or its executable is not found.
+    """
+    harbor_output_dir = Path(output_dir) / "harbor_output"
+    harbor_output_dir.mkdir(parents=True, exist_ok=True)
+
+    cmd = [harbor_executable, "run", "--path" if dataset_is_path else "-d", dataset]
+    cmd += ["-a", agent_name, "-m", llm.model]
+    cmd += ["--jobs-dir", str(harbor_output_dir.resolve())]
+    cmd += ["--n-concurrent", str(num_workers)]
+
+    env: dict[str, str] | None = None
+    if credential_mode == HarborCredentialMode.AGENT_ENV_FLAGS:
+        if llm.api_key:
+            cmd.extend(["--ae", f"LLM_API_KEY={_secret_value(llm.api_key)}"])
+        if llm.base_url:
+            cmd.extend(["--ae", f"LLM_BASE_URL={llm.base_url}"])
+    elif credential_mode == HarborCredentialMode.PROCESS_ENV:
+        env = os.environ.copy()
+        if llm.api_key:
+            env["LLM_API_KEY"] = _secret_value(llm.api_key)
+        if llm.base_url:
+            env["LLM_BASE_URL"] = llm.base_url
+
+    normalize = normalize_task_id or (lambda task_id: task_id)
+    for task_id in task_ids or []:
+        cmd.extend([task_filter_flag, normalize(task_id)])
+
+    if n_limit is not None:
+        cmd.extend(["--n-tasks", str(n_limit)])
+
+    # Never log credentials: mask every LLM_* value that follows an --ae flag.
+    safe_cmd = [
+        "***" if prev == "--ae" and part.startswith("LLM_") else part
+        for prev, part in zip([""] + cmd, cmd)
+    ]
+    logger.info(f"Running harbor command: {' '.join(safe_cmd)}")
+    logger.info(f"Output directory: {harbor_output_dir}")
+
+    try:
+        result = subprocess_run(cmd, capture_output=True, text=True, env=env)
+
+        if (
+            result.returncode != 0
+            and retry_legacy_task_flag
+            and task_ids
+            and task_filter_flag == "--task-name"
+            and "No such option: --task-name" in result.stderr
+        ):
+            fallback_cmd = [
+                "--include-task-name" if part == "--task-name" else part for part in cmd
+            ]
+            logger.warning(
+                "Harbor does not support --task-name; retrying with --include-task-name"
+            )
+            result = subprocess_run(
+                fallback_cmd, capture_output=True, text=True, env=env
+            )
+
+        if result.returncode != 0:
+            logger.error(f"Harbor command failed with code {result.returncode}")
+            logger.error(f"stdout: {result.stdout}")
+            logger.error(f"stderr: {result.stderr}")
+            raise RuntimeError(f"Harbor evaluation failed: {result.stderr}")
+
+        logger.info("Harbor evaluation completed successfully")
+        logger.info(f"stdout: {result.stdout}")
+    except FileNotFoundError as err:
+        # Chain the cause so the traceback still shows which path was missing.
+        raise RuntimeError(
+            "Harbor CLI not found. Please install harbor: pip install harbor"
+        ) from err
+
+    return harbor_output_dir
+
+
+def _find_job_dir(harbor_output_dir: Path) -> Path:
+    """Return the child directory holding result.json whose path sorts last."""
+    job_dirs = [
+        child
+        for child in harbor_output_dir.iterdir()
+        if child.is_dir() and (child / "result.json").exists()
+    ]
+    if job_dirs:
+        return max(job_dirs)
+    raise RuntimeError(
+        f"No harbor job directory found in {harbor_output_dir}. "
+        f"Expected a timestamp-named directory containing result.json."
+    )
+
+
+def convert_harbor_to_eval_output(
+    harbor_output_dir: Path,
+    eval_output_path: Path,
+    *,
+    canonicalize_instance_id: Callable[[str], str] | None = None,
+) -> None:
+    """Convert Harbor trial results to OpenHands benchmark output.jsonl format.
+
+    Reads each ``<trial>/result.json`` in the job directory picked by
+    ``_find_job_dir`` and writes one JSON line per trial: evaluated trials
+    first, then error entries (``exception_info`` set or unreadable file).
+    Instance ids come from ``task_name`` (else the trial directory name) and
+    pass through ``canonicalize_instance_id``; reward > 0 counts as passed.
+
+    Raises:
+        RuntimeError: If no job directory or no trial result file is found.
+    """
+    logger.info(f"Converting harbor output from {harbor_output_dir}")
+
+    canonicalize = canonicalize_instance_id or (lambda instance_id: instance_id)
+    job_dir = _find_job_dir(harbor_output_dir)
+    logger.info(f"Using harbor job directory: {job_dir}")
+
+    result_files = [f for f in job_dir.glob("*/result.json") if f.parent != job_dir]
+    if not result_files:
+        raise RuntimeError(
+            f"No trial result files found in {job_dir}. "
+            f"Expected result.json files in trial subdirectories."
+        )
+
+    logger.info(f"Found {len(result_files)} trial results in {job_dir}")
+
+    results: list[dict] = []
+    errors: list[dict] = []
+
+    def error_entry(entry_id: str, message: str) -> dict:
+        return {"instance_id": entry_id, "error": message, "test_result": {}}
+
+    for result_file in result_files:
+        try:
+            with open(result_file, encoding="utf-8") as f:
+                trial = json.load(f)
+
+            instance_id = canonicalize(trial.get("task_name", result_file.parent.name))
+
+            if trial.get("exception_info"):
+                errors.append(error_entry(instance_id, str(trial["exception_info"])))
+                continue
+
+            # Keys may be present with a null value (e.g. verifier never ran).
+            verifier_result = trial.get("verifier_result") or {}
+            rewards = verifier_result.get("rewards") or {}
+            passed = (rewards.get("reward") or 0.0) > 0
+            agent_result = trial.get("agent_result") or {}
+
+            eval_entry = {
+                "instance_id": instance_id,
+                "test_result": {
+                    "trial_name": trial.get("trial_name"),
+                    "trial_uri": trial.get("trial_uri"),
+                    "rewards": rewards,
+                    "passed": passed,
+                },
+                "instruction": "",
+                "error": None,
+                "history": [],
+                "metrics": {
+                    "total_prompt_tokens": agent_result.get("n_input_tokens") or 0,
+                    "total_completion_tokens": (
+                        agent_result.get("n_output_tokens") or 0
+                    ),
+                    "total_cost_usd": agent_result.get("cost_usd") or 0.0,
+                },
+            }
+            results.append(eval_entry)
+            logger.info(
+                f"Processed trial {instance_id}: reward={rewards.get('reward', 'N/A')}"
+            )
+        except (json.JSONDecodeError, OSError) as e:
+            logger.error(f"Failed to process result file {result_file}: {e}")
+            errors.append(error_entry(canonicalize(result_file.parent.name), str(e)))
+
+    if not results and not errors:
+        raise RuntimeError(f"No trials processed from {harbor_output_dir}")
+
+    if not results:
+        logger.warning(
+            f"All {len(errors)} trials failed in {harbor_output_dir}; "
+            "writing error entries for downstream reporting"
+        )
+
+    with open(eval_output_path, "w") as f:
+        for entry in results + errors:
+            f.write(json.dumps(entry) + "\n")
+
+    logger.info(
+        f"Wrote {len(results)} successful + {len(errors)} failed entries "
+        f"to {eval_output_path}"
+    )
diff --git a/tests/test_harbor.py b/tests/test_harbor.py
new file mode 100644
index 000000000..6eac84d90
--- /dev/null
+++ b/tests/test_harbor.py
@@ -0,0 +1,243 @@
+"""Tests for shared Harbor benchmark execution utilities."""
+
+from __future__ import annotations
+
+from pathlib import Path
+from typing import Any
+
+import pytest
+
+from benchmarks.utils.harbor import (
+    HarborCredentialMode,
+    get_supported_agent_name,
+    get_supported_task_filter_flag,
+    run_harbor_evaluation,
+)
+from openhands.sdk import LLM
+
+
+def _fake_run(returncode: int = 0, stderr: str = "") -> Any:
+    """Build a subprocess.run stand-in that logs each command and the last env."""
+    calls: dict[str, Any] = {}
+    fields = {"returncode": returncode, "stdout": "ok", "stderr": stderr}
+
+    def fake(cmd: list[str], *, capture_output: bool, text: bool, env=None) -> Any:
+        # Copy the argv so later mutation by the caller cannot alter the record.
+        calls.setdefault("cmds", []).append(list(cmd))
+        calls["env"] = env
+        return type("R", (), dict(fields))()
+
+    fake.captured = calls  # type: ignore[attr-defined]
+    return fake
+
+
+class TestGetSupportedTaskFilterFlag:
+    """Checks how get_supported_task_filter_flag interprets Harbor help output."""
+
+    def test_returns_include_task_name_when_present(
+        self, monkeypatch: pytest.MonkeyPatch
+    ) -> None:
+        help_text = "--include-task-name\n--other-flag"
+        monkeypatch.setattr(
+            "benchmarks.utils.harbor._probe_harbor_run_help", lambda _: help_text
+        )
+        assert get_supported_task_filter_flag("harbor") == "--include-task-name"
+
+    def test_returns_task_name_when_include_absent(
+        self, monkeypatch: pytest.MonkeyPatch
+    ) -> None:
+        help_text = "--task-name  Filter by task"
+        monkeypatch.setattr(
+            "benchmarks.utils.harbor._probe_harbor_run_help", lambda _: help_text
+        )
+        assert get_supported_task_filter_flag("harbor") == "--task-name"
+
+    def test_include_task_name_takes_priority_over_task_name(
+        self, monkeypatch: pytest.MonkeyPatch
+    ) -> None:
+        help_text = "--task-name ...\n--include-task-name ..."
+        monkeypatch.setattr(
+            "benchmarks.utils.harbor._probe_harbor_run_help", lambda _: help_text
+        )
+        assert get_supported_task_filter_flag("harbor") == "--include-task-name"
+
+    def test_falls_back_to_include_task_name_when_neither_present(
+        self, monkeypatch: pytest.MonkeyPatch
+    ) -> None:
+        help_text = "Options: --output-dir --help"
+        monkeypatch.setattr(
+            "benchmarks.utils.harbor._probe_harbor_run_help", lambda _: help_text
+        )
+        assert get_supported_task_filter_flag("harbor") == "--include-task-name"
+
+    def test_falls_back_when_help_text_empty(
+        self, monkeypatch: pytest.MonkeyPatch
+    ) -> None:
+        """Blank help text from the probe yields the --include-task-name default."""
+        help_text = ""
+        monkeypatch.setattr(
+            "benchmarks.utils.harbor._probe_harbor_run_help", lambda _: help_text
+        )
+        assert get_supported_task_filter_flag("harbor") == "--include-task-name"
+
+    def test_regex_boundary_rejects_partial_matches(
+        self, monkeypatch: pytest.MonkeyPatch
+    ) -> None:
+        """A flag glued onto a preceding word must not count as a match."""
+        help_text = "prefix--task-name  (no standalone flag here)"
+        monkeypatch.setattr(
+            "benchmarks.utils.harbor._probe_harbor_run_help", lambda _: help_text
+        )
+        assert get_supported_task_filter_flag("harbor") == "--include-task-name"
+
+
+class TestGetSupportedAgentName:
+    """Checks how get_supported_agent_name picks an agent from Harbor help output."""
+
+    def test_returns_openhands_sdk_when_present(
+        self, monkeypatch: pytest.MonkeyPatch
+    ) -> None:
+        help_text = "Agents: openhands-sdk, other-agent"
+        monkeypatch.setattr(
+            "benchmarks.utils.harbor._probe_harbor_run_help", lambda _: help_text
+        )
+        assert get_supported_agent_name("harbor") == "openhands-sdk"
+
+    def test_returns_openhands_when_sdk_absent(
+        self, monkeypatch: pytest.MonkeyPatch
+    ) -> None:
+        help_text = "Agents: openhands, other-agent"
+        monkeypatch.setattr(
+            "benchmarks.utils.harbor._probe_harbor_run_help", lambda _: help_text
+        )
+        assert get_supported_agent_name("harbor") == "openhands"
+
+    def test_openhands_sdk_takes_priority_over_openhands(
+        self, monkeypatch: pytest.MonkeyPatch
+    ) -> None:
+        help_text = "openhands openhands-sdk"
+        monkeypatch.setattr(
+            "benchmarks.utils.harbor._probe_harbor_run_help", lambda _: help_text
+        )
+        assert get_supported_agent_name("harbor") == "openhands-sdk"
+
+    def test_falls_back_to_default_when_neither_present(
+        self, monkeypatch: pytest.MonkeyPatch
+    ) -> None:
+        """With no known agent listed, the caller-supplied default is returned."""
+        help_text = "Agents: some-other-agent"
+        monkeypatch.setattr(
+            "benchmarks.utils.harbor._probe_harbor_run_help", lambda _: help_text
+        )
+        fallback = "custom-agent"
+        chosen = get_supported_agent_name("harbor", default_agent_name=fallback)
+        assert chosen == fallback
+
+    def test_falls_back_when_help_text_empty(
+        self, monkeypatch: pytest.MonkeyPatch
+    ) -> None:
+        help_text = ""
+        monkeypatch.setattr(
+            "benchmarks.utils.harbor._probe_harbor_run_help", lambda _: help_text
+        )
+        assert get_supported_agent_name("harbor") == "openhands-sdk"
+
+
+class TestRunHarborEvaluationCredentialModes:
+    """Covers how run_harbor_evaluation delivers LLM credentials in each mode."""
+
+    def test_agent_env_flags_mode_injects_ae_flags(self, tmp_path: Path) -> None:
+        """AGENT_ENV_FLAGS sends credentials as --ae arguments; env stays None."""
+        recorder = _fake_run()
+        proxied_llm = LLM(
+            model="test/model", api_key="my-key", base_url="https://proxy.example.com"
+        )
+        run_harbor_evaluation(
+            llm=proxied_llm,
+            dataset="my-dataset",
+            output_dir=str(tmp_path),
+            credential_mode=HarborCredentialMode.AGENT_ENV_FLAGS,
+            subprocess_run=recorder,
+        )
+        argv = recorder.captured["cmds"][0]
+        assert "--ae" in argv
+        for pair in ("LLM_API_KEY=my-key", "LLM_BASE_URL=https://proxy.example.com"):
+            assert any(pair in token for token in argv)
+        # Credentials ride on argv in this mode, so no env mapping is handed over.
+        assert recorder.captured["env"] is None
+
+    def test_process_env_mode_sets_env_vars(self, tmp_path: Path) -> None:
+        """PROCESS_ENV sends credentials through the env mapping; no --ae arguments."""
+        recorder = _fake_run()
+        proxied_llm = LLM(
+            model="test/model", api_key="my-key", base_url="https://proxy.example.com"
+        )
+        run_harbor_evaluation(
+            llm=proxied_llm,
+            dataset="my-dataset",
+            output_dir=str(tmp_path),
+            credential_mode=HarborCredentialMode.PROCESS_ENV,
+            subprocess_run=recorder,
+        )
+        argv = recorder.captured["cmds"][0]
+        assert "--ae" not in argv
+        # The credentials must instead show up in the environment handed to the run.
+        child_env = recorder.captured["env"]
+        assert child_env is not None
+        assert child_env["LLM_API_KEY"] == "my-key"
+        assert child_env["LLM_BASE_URL"] == "https://proxy.example.com"
+
+
+class TestRunHarborEvaluationTaskFiltering:
+    """Covers task filtering, n_limit, and the filter-flag retry path."""
+
+    def test_task_ids_and_n_limit_included_in_command(self, tmp_path: Path) -> None:
+        """Each task id gets its own filter flag and n_limit maps to --n-tasks."""
+        recorder = _fake_run()
+        run_harbor_evaluation(
+            llm=LLM(model="test/model"),
+            dataset="my-dataset",
+            output_dir=str(tmp_path),
+            task_ids=["task-a", "task-b"],
+            n_limit=5,
+            task_filter_flag="--task-name",
+            subprocess_run=recorder,
+        )
+        argv = recorder.captured["cmds"][0]
+        assert argv.count("--task-name") == 2
+        assert all(task in argv for task in ("task-a", "task-b"))
+        assert argv[argv.index("--n-tasks") + 1] == "5"
+
+    def test_fallback_retry_switches_to_include_task_name(self, tmp_path: Path) -> None:
+        """A rejected --task-name triggers one retry using --include-task-name."""
+        seen: list[list[str]] = []
+        rejected = {
+            "returncode": 2,
+            "stdout": "",
+            "stderr": "No such option: --task-name",
+        }
+        accepted = {"returncode": 0, "stdout": "ok", "stderr": ""}
+
+        def fake_run(
+            cmd: list[str], *, capture_output: bool, text: bool, env=None
+        ) -> Any:
+            seen.append(list(cmd))
+            # List membership is exact, so "--include-task-name" does not trip this.
+            outcome = rejected if "--task-name" in cmd else accepted
+            return type("R", (), dict(outcome))()
+
+        # Start with --task-name so the first attempt is the one that gets rejected.
+        run_harbor_evaluation(
+            llm=LLM(model="test/model"),
+            dataset="my-dataset",
+            output_dir=str(tmp_path),
+            task_ids=["task-a"],
+            task_filter_flag="--task-name",
+            retry_legacy_task_flag=True,
+            subprocess_run=fake_run,
+        )
+        assert len(seen) == 2
+        first_try, retry = seen
+        assert "--task-name" in first_try
+        assert "--include-task-name" in retry
+        assert "--task-name" not in retry
diff --git a/tests/test_terminalbench.py b/tests/test_terminalbench.py
index 39a8d6ffa..d9b62e07e 100644
--- a/tests/test_terminalbench.py
+++ b/tests/test_terminalbench.py
@@ -224,7 +224,15 @@ def test_run_harbor_evaluation_passes_filters_and_limits(
         """Test Harbor command includes task filters and n-limit for CI runs."""
         captured: dict[str, list[str]] = {}
 
-        def fake_run(cmd: list[str], capture_output: bool, text: bool):
+        def fake_run(
+            cmd: list[str], capture_output: bool, text: bool, env=None, timeout=None
+        ):
+            if cmd == ["harbor", "run", "--help"]:
+                return type(
+                    "Completed",
+                    (),
+                    {"returncode": 0, "stdout": "--include-task-name", "stderr": ""},
+                )()
             captured["cmd"] = cmd
             return type(
                 "Completed",
@@ -265,7 +273,7 @@ def fake_run(cmd: list[str], capture_output: bool, text: bool):
         ]
         assert "--jobs-dir" in cmd
         assert str(expected_output_dir.resolve()) in cmd
-        assert cmd.count("--task-name") == 2
+        assert cmd.count("--include-task-name") == 2
         assert "task-a" in cmd
         assert "task-b" in cmd
         assert cmd[cmd.index("--n-concurrent") + 1] == "3"