Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 9 additions & 0 deletions scripts/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,7 +27,9 @@
cleanup_agent_sessions,
ensure_agent_exists,
execute_openclaw_task,
ModelValidationError,
slugify_model,
validate_openrouter_model,
)
from lib_grading import GradeResult, grade_task
from lib_tasks import Task, TaskLoader
Expand Down Expand Up @@ -487,6 +489,13 @@ def main():
# Use a shared workspace for the agent - we'll copy fixtures per task
agent_workspace = Path(f"/tmp/pinchbench/{run_id}/agent_workspace")

# Validate model exists before wasting time on tasks
try:
validate_openrouter_model(args.model)
except ModelValidationError as exc:
logger.error("❌ %s", exc)
sys.exit(1)

ensure_agent_exists(agent_id, args.model, agent_workspace)
cleanup_agent_sessions(agent_id)

Expand Down
106 changes: 106 additions & 0 deletions scripts/lib_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,19 @@
import time
from pathlib import Path
from typing import Any, Dict, List
from urllib import error, request

from lib_tasks import Task


logger = logging.getLogger(__name__)


class ModelValidationError(Exception):
    """Signal that a model ID is invalid or cannot be accessed."""


# Character limit read once at import time from the PINCHBENCH_MAX_MSG_CHARS
# environment variable (default "4000"); a non-integer value raises ValueError.
# NOTE(review): presumably caps the size of a message sent to OpenClaw —
# confirm against the use site.
MAX_OPENCLAW_MESSAGE_CHARS = int(os.environ.get("PINCHBENCH_MAX_MSG_CHARS", "4000"))


Expand All @@ -35,6 +43,104 @@ def normalize_model_id(model_id: str) -> str:
return f"openrouter/{model_id}"


def validate_openrouter_model(model_id: str, timeout_seconds: float = 10.0) -> bool:
    """
    Validate that a model ID exists on OpenRouter.

    Validation is best-effort: the function only fails hard when OpenRouter
    positively reports the model as missing (HTTP 404). Anything that prevents
    a definite answer (no API key, a non-404 API error, a network failure or
    timeout) is logged and treated as "valid" so a flaky API never blocks a run.

    Args:
        model_id: Model ID (with or without openrouter/ prefix)
        timeout_seconds: Timeout applied to each HTTP request

    Returns:
        True if the model was confirmed, or if validation was skipped or
        could not be completed.

    Raises:
        ModelValidationError: If OpenRouter answers 404 for the model. The
            message includes "did you mean" suggestions when the model
            catalog could be fetched and parsed.
    """
    # Only needed for the suggestion step, so keep the import local to the
    # function instead of relying on the module's import block.
    import difflib

    # Strip openrouter/ prefix if present
    bare_model_id = model_id
    if bare_model_id.startswith("openrouter/"):
        bare_model_id = bare_model_id[len("openrouter/"):]

    # Skip validation for non-OpenRouter models (no "provider/model" shape)
    if "/" not in bare_model_id:
        logger.info("Skipping model validation for non-OpenRouter model: %s", model_id)
        return True

    api_key = os.environ.get("OPENROUTER_API_KEY")
    if not api_key:
        logger.warning("OPENROUTER_API_KEY not set, skipping model validation")
        return True

    logger.info("🔍 Validating model: %s", bare_model_id)

    headers = {
        "Authorization": f"Bearer {api_key}",
        "HTTP-Referer": "https://pinchbench.com",
        "X-Title": "PinchBench",
    }

    # First, try the specific model endpoint (fast path for valid models)
    encoded_model_id = bare_model_id.replace("/", "%2F")
    specific_endpoint = f"https://openrouter.ai/api/v1/models/{encoded_model_id}"
    req = request.Request(specific_endpoint, headers=headers, method="GET")
    try:
        # The response body is irrelevant; a successful status is the answer.
        with request.urlopen(req, timeout=timeout_seconds):
            pass
    except error.HTTPError as exc:
        if exc.code != 404:
            logger.warning("OpenRouter API error during validation: %s", exc)
            return True
        # 404: model not found - fall through to fetch the catalog for suggestions
    except OSError as exc:
        # URLError is an OSError, and so are the raw socket timeouts and
        # connection resets that urllib does not always wrap in URLError.
        logger.warning("Network error during model validation: %s", exc)
        return True
    else:
        logger.info("✅ Model validated: %s", bare_model_id)
        return True

    error_msg = f"Model '{bare_model_id}' not found on OpenRouter."

    # Model not found - fetch full catalog for "did you mean" suggestions
    catalog_endpoint = "https://openrouter.ai/api/v1/models"
    req = request.Request(catalog_endpoint, headers=headers, method="GET")
    try:
        with request.urlopen(req, timeout=timeout_seconds) as resp:
            data = json.loads(resp.read().decode("utf-8"))
    except error.HTTPError as exc:
        logger.warning("OpenRouter API error fetching model catalog: %s", exc)
        raise ModelValidationError(error_msg) from exc
    except OSError as exc:
        logger.warning("Network error fetching model catalog: %s", exc)
        raise ModelValidationError(error_msg) from exc
    except ValueError as exc:
        # Covers both json.JSONDecodeError and UnicodeDecodeError.
        logger.warning("Failed to parse OpenRouter response: %s", exc)
        raise ModelValidationError(error_msg) from exc

    # Be defensive about the payload shape: a malformed catalog must only cost
    # us the suggestions, never replace the "not found" error with a crash.
    models = data.get("data") if isinstance(data, dict) else None
    if not isinstance(models, list):
        models = []
    model_ids = {
        m["id"]
        for m in models
        if isinstance(m, dict) and isinstance(m.get("id"), str) and m["id"]
    }

    # Substring containment catches truncated or over-specified IDs.
    bare_lower = bare_model_id.lower()
    close_matches = sorted(
        mid
        for mid in model_ids
        if bare_lower in mid.lower() or mid.lower() in bare_lower
    )[:5]

    if not close_matches:
        # Containment cannot detect real typos (a misspelled ID is not a
        # substring of the correct one), so fall back to fuzzy matching.
        # Results come back best-match-first and are kept in that order.
        by_lower = {mid.lower(): mid for mid in sorted(model_ids)}
        close_matches = [
            by_lower[key]
            for key in difflib.get_close_matches(
                bare_lower, list(by_lower), n=5, cutoff=0.8
            )
        ]

    if close_matches:
        close_matches_str = ", ".join(close_matches)
        error_msg += f" Did you mean: {close_matches_str}?"
    else:
        # Try to suggest based on provider (the part before the first "/")
        provider = bare_model_id.split("/")[0]
        if provider:
            provider_models = sorted(
                m for m in model_ids if m.startswith(f"{provider}/")
            )
            if provider_models:
                error_msg += f" Available {provider} models: {', '.join(provider_models[:5])}"

    raise ModelValidationError(error_msg)


def _get_agent_workspace(agent_id: str) -> Path | None:
"""Get the workspace path for an agent from OpenClaw config."""
try:
Expand Down