diff --git a/scripts/benchmark.py b/scripts/benchmark.py
index 36b5e05..32380e6 100644
--- a/scripts/benchmark.py
+++ b/scripts/benchmark.py
@@ -27,7 +27,9 @@
     cleanup_agent_sessions,
     ensure_agent_exists,
     execute_openclaw_task,
+    ModelValidationError,
     slugify_model,
+    validate_openrouter_model,
 )
 from lib_grading import GradeResult, grade_task
 from lib_tasks import Task, TaskLoader
@@ -487,6 +489,13 @@ def main():
     # Use a shared workspace for the agent - we'll copy fixtures per task
     agent_workspace = Path(f"/tmp/pinchbench/{run_id}/agent_workspace")
 
+    # Validate model exists before wasting time on tasks
+    try:
+        validate_openrouter_model(args.model)
+    except ModelValidationError as exc:
+        logger.error("❌ %s", exc)
+        sys.exit(1)
+
     ensure_agent_exists(agent_id, args.model, agent_workspace)
     cleanup_agent_sessions(agent_id)
 
diff --git a/scripts/lib_agent.py b/scripts/lib_agent.py
index 24e456d..ac8fba5 100644
--- a/scripts/lib_agent.py
+++ b/scripts/lib_agent.py
@@ -11,11 +11,19 @@
 import time
 from pathlib import Path
 from typing import Any, Dict, List
+from urllib import error, request
 
 from lib_tasks import Task
 
 
 logger = logging.getLogger(__name__)
+
+
+class ModelValidationError(Exception):
+    """Raised when a model ID is invalid or inaccessible."""
+    pass
+
+
 MAX_OPENCLAW_MESSAGE_CHARS = int(os.environ.get("PINCHBENCH_MAX_MSG_CHARS", "4000"))
 
 
@@ -35,6 +43,104 @@ def normalize_model_id(model_id: str) -> str:
     return f"openrouter/{model_id}"
 
 
+def validate_openrouter_model(model_id: str, timeout_seconds: float = 10.0) -> bool:
+    """
+    Validate that a model ID exists on OpenRouter.
+
+    Args:
+        model_id: Model ID (with or without openrouter/ prefix)
+        timeout_seconds: HTTP request timeout
+
+    Returns:
+        True if model is valid and accessible
+
+    Raises:
+        ModelValidationError: If model doesn't exist or validation fails
+    """
+    # Strip openrouter/ prefix if present
+    bare_model_id = model_id
+    if bare_model_id.startswith("openrouter/"):
+        bare_model_id = bare_model_id[len("openrouter/"):]
+
+    # Skip validation for non-OpenRouter models
+    if "/" not in bare_model_id:
+        logger.info("Skipping model validation for non-OpenRouter model: %s", model_id)
+        return True
+
+    api_key = os.environ.get("OPENROUTER_API_KEY")
+    if not api_key:
+        logger.warning("OPENROUTER_API_KEY not set, skipping model validation")
+        return True
+
+    logger.info("🔍 Validating model: %s", bare_model_id)
+
+    headers = {
+        "Authorization": f"Bearer {api_key}",
+        "HTTP-Referer": "https://pinchbench.com",
+        "X-Title": "PinchBench",
+    }
+
+    # First, try the specific model endpoint (fast path for valid models)
+    encoded_model_id = bare_model_id.replace("/", "%2F")
+    specific_endpoint = f"https://openrouter.ai/api/v1/models/{encoded_model_id}"
+    req = request.Request(specific_endpoint, headers=headers, method="GET")
+    try:
+        with request.urlopen(req, timeout=timeout_seconds):
+            # Model exists - validation passed
+            logger.info("✅ Model validated: %s", bare_model_id)
+            return True
+    except error.HTTPError as exc:
+        if exc.code == 404:
+            # Model not found - fall through to fetch full catalog for suggestions
+            pass
+        else:
+            logger.warning("OpenRouter API error during validation: %s", exc)
+            return True
+    except error.URLError as exc:
+        logger.warning("Network error during model validation: %s", exc)
+        return True
+
+    # Model not found - fetch full catalog for "did you mean" suggestions
+    catalog_endpoint = "https://openrouter.ai/api/v1/models"
+    req = request.Request(catalog_endpoint, headers=headers, method="GET")
+    try:
+        with request.urlopen(req, timeout=timeout_seconds) as resp:
+            data = json.loads(resp.read().decode("utf-8"))
+    except error.HTTPError as exc:
+        logger.warning("OpenRouter API error fetching model catalog: %s", exc)
+        raise ModelValidationError(f"Model '{bare_model_id}' not found on OpenRouter.") from exc
+    except error.URLError as exc:
+        logger.warning("Network error fetching model catalog: %s", exc)
+        raise ModelValidationError(f"Model '{bare_model_id}' not found on OpenRouter.") from exc
+    except json.JSONDecodeError as exc:
+        logger.warning("Failed to parse OpenRouter response: %s", exc)
+        raise ModelValidationError(f"Model '{bare_model_id}' not found on OpenRouter.") from exc
+
+    models = data.get("data", [])
+    model_ids = {m.get("id") for m in models if isinstance(m, dict) and m.get("id")}
+
+    # Check for close matches (typos)
+    close_matches = []
+    bare_lower = bare_model_id.lower()
+    for mid in model_ids:
+        if bare_lower in mid.lower() or mid.lower() in bare_lower:
+            close_matches.append(mid)
+
+    error_msg = f"Model '{bare_model_id}' not found on OpenRouter."
+    if close_matches:
+        close_matches_str = ", ".join(sorted(close_matches)[:5])
+        error_msg += f" Did you mean: {close_matches_str}?"
+    else:
+        # Try to suggest based on provider
+        provider = bare_model_id.split("/")[0] if "/" in bare_model_id else None
+        if provider:
+            provider_models = [m for m in model_ids if m.startswith(f"{provider}/")]
+            if provider_models:
+                error_msg += f" Available {provider} models: {', '.join(sorted(provider_models)[:5])}"
+
+    raise ModelValidationError(error_msg)
+
+
 def _get_agent_workspace(agent_id: str) -> Path | None:
     """Get the workspace path for an agent from OpenClaw config."""
     try: