Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions scripts/benchmark.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,6 +18,7 @@
import os
import statistics
import subprocess
import time
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

WARNING: Duplicate `import time` — this line adds `import time`, but the module is already imported at line 23. Remove the duplicate import.

Suggested change
import time

import sys
import time
from pathlib import Path
Expand Down Expand Up @@ -414,6 +415,8 @@ def main():
"stderr": execution_error,
}
try:
# Small delay to ensure any pending file writes are flushed to disk
time.sleep(2)
grade = grade_task(task=task, execution_result=result, skill_dir=skill_dir, verbose=args.verbose)
except Exception as exc:
if execution_error:
Expand Down
48 changes: 39 additions & 9 deletions scripts/lib_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,8 +24,9 @@ def slugify_model(model_id: str) -> str:

def normalize_model_id(model_id: str) -> str:
    """Ensure model id is provider-qualified for OpenClaw.

    An id that already contains a ``/`` is treated as provider-qualified
    (e.g. ``minimax-cn/MiniMax-M2.5`` or anything under ``openrouter/``) and
    is returned unchanged. A bare id gets the ``openrouter/`` prefix.

    Args:
        model_id: Model identifier, with or without a provider prefix.

    Returns:
        The provider-qualified model id.
    """
    # Only ids that already name a provider return early; a bare id must
    # fall through so it actually receives the default provider prefix.
    # ``"/" in model_id`` also covers ids that start with "openrouter/".
    if "/" in model_id:
        return model_id
    return f"openrouter/{model_id}"
Expand All @@ -44,8 +45,8 @@ def _get_agent_workspace(agent_id: str) -> Path | None:
return None

# Parse the agent list output to find workspace
# OpenClaw normalizes colons to dashes in agent names, so check both.
normalized_id = agent_id.replace(":", "-")
# OpenClaw normalizes colons to dashes and lowercases agent names
normalized_id = agent_id.replace(":", "-").lower()
lines = list_result.stdout.split("\n")
found_agent = False
for line in lines:
Expand Down Expand Up @@ -100,9 +101,9 @@ def ensure_agent_exists(agent_id: str, model_id: str, workspace_dir: Path) -> bo
# Extract agent name: "- bench-foo-4-5" or "- main (default)"
name_part = line[2:].split()[0] if line[2:].strip() else ""
if name_part:
existing_agents.add(name_part)
normalized_id = agent_id.replace(":", "-")
if agent_id in existing_agents or normalized_id in existing_agents:
existing_agents.add(name_part.lower())
normalized_id = agent_id.replace(":", "-").lower()
if agent_id.lower() in existing_agents or normalized_id in existing_agents:
# Agent exists — check if workspace matches
current_workspace = _get_agent_workspace(agent_id)
if current_workspace is not None and current_workspace.resolve() == workspace_dir.resolve():
Expand Down Expand Up @@ -207,15 +208,44 @@ def prepare_task_workspace(skill_dir: Path, run_id: str, task: Task, agent_id: s
logger.error("Workspace file not found: %s", source)
raise

# Remove bootstrap files that would trigger the onboarding flow
# These interfere with benchmark tasks
for bootstrap_file in ["BOOTSTRAP.md", "SOUL.md", "USER.md", "IDENTITY.md"]:
bootstrap_path = workspace / bootstrap_file
if bootstrap_path.exists():
try:
bootstrap_path.unlink()
logger.info("Removed bootstrap file: %s", bootstrap_file)
except OSError as exc:
logger.warning("Failed to remove %s: %s", bootstrap_file, exc)

# Copy skills from main workspace to benchmark workspace
# This enables benchmark agents to use installed skills like nano-pdf
main_skills_dir = Path.home() / ".openclaw" / "workspace" / "skills"
if main_skills_dir.exists():
dest_skills_dir = workspace / "skills"
dest_skills_dir.mkdir(parents=True, exist_ok=True)
for skill_dir_src in main_skills_dir.iterdir():
if skill_dir_src.is_dir():
dest_skill_dir = dest_skills_dir / skill_dir_src.name
# Copy skill directory
import shutil
if dest_skill_dir.exists():
shutil.rmtree(dest_skill_dir)
shutil.copytree(skill_dir_src, dest_skill_dir)
logger.info("Copied skill to benchmark workspace: %s", skill_dir_src.name)

return workspace


def _get_agent_store_dir(agent_id: str) -> Path:
base_dir = Path.home() / ".openclaw" / "agents"
# OpenClaw normalizes agent IDs to lowercase and replaces colons with dashes
normalized_id = agent_id.replace(":", "-").lower()
direct_dir = base_dir / agent_id
if direct_dir.exists():
return direct_dir
normalized_dir = base_dir / agent_id.replace(":", "-")
normalized_dir = base_dir / normalized_id
if normalized_dir.exists():
return normalized_dir
return direct_dir
Expand All @@ -234,7 +264,7 @@ def _resolve_session_id_from_store(agent_id: str) -> str | None:
if not isinstance(sessions_payload, dict):
return None

normalized_id = agent_id.replace(":", "-")
normalized_id = agent_id.replace(":", "-").lower()
preferred_keys = [
f"agent:{agent_id}:main",
f"agent:{agent_id}:default",
Expand Down