From 0cccb85455b165af7da7bd15421c2462a8680315 Mon Sep 17 00:00:00 2001 From: zhuanghaoz Date: Mon, 9 Mar 2026 20:48:08 +0800 Subject: [PATCH 1/2] fix: normalize agent IDs and remove bootstrap files for benchmark - Fix agent ID normalization to handle lowercase transformation - Remove BOOTSTRAP.md, SOUL.md, USER.md, IDENTITY.md before running tasks - Fix model ID normalization to preserve provider-qualified models (e.g., minimax-cn/) These fixes ensure benchmark tasks work correctly with OpenClaw agents. --- scripts/lib_agent.py | 32 +++++++++++++++++++++++--------- 1 file changed, 23 insertions(+), 9 deletions(-) diff --git a/scripts/lib_agent.py b/scripts/lib_agent.py index ab3e867..9b29bf1 100644 --- a/scripts/lib_agent.py +++ b/scripts/lib_agent.py @@ -24,8 +24,9 @@ def slugify_model(model_id: str) -> str: def normalize_model_id(model_id: str) -> str: """Ensure model id is provider-qualified for OpenClaw.""" - if "/" not in model_id: - return model_id + # Don't add prefix if it's already provider-qualified (has / and not openrouter) + if "/" in model_id and not model_id.startswith("openrouter/"): + return model_id # Keep as-is (e.g., minimax-cn/MiniMax-M2.5) if model_id.startswith("openrouter/"): return model_id return f"openrouter/{model_id}" @@ -44,8 +45,8 @@ def _get_agent_workspace(agent_id: str) -> Path | None: return None # Parse the agent list output to find workspace - # OpenClaw normalizes colons to dashes in agent names, so check both. - normalized_id = agent_id.replace(":", "-") + # OpenClaw normalizes colons to dashes and lowercases agent names + normalized_id = agent_id.replace(":", "-").lower() lines = list_result.stdout.split("\n") found_agent = False for line in lines: @@ -100,9 +101,9 @@ def ensure_agent_exists(agent_id: str, model_id: str, workspace_dir: Path) -> bo # Extract agent name: "- bench-foo-4-5" or "- main (default)" name_part = line[2:].split()[0] if line[2:].strip() else "" if name_part: - existing_agents.add(name_part) - normalized_id = agent_id.replace(":", "-") - if agent_id in existing_agents or normalized_id in existing_agents: + existing_agents.add(name_part.lower()) + normalized_id = agent_id.replace(":", "-").lower() + if agent_id.lower() in existing_agents or normalized_id in existing_agents: # Agent exists — check if workspace matches current_workspace = _get_agent_workspace(agent_id) if current_workspace is not None and current_workspace.resolve() == workspace_dir.resolve(): @@ -207,15 +208,28 @@ def prepare_task_workspace(skill_dir: Path, run_id: str, task: Task, agent_id: s logger.error("Workspace file not found: %s", source) raise + # Remove bootstrap files that would trigger the onboarding flow + # These interfere with benchmark tasks + for bootstrap_file in ["BOOTSTRAP.md", "SOUL.md", "USER.md", "IDENTITY.md"]: + bootstrap_path = workspace / bootstrap_file + if bootstrap_path.exists(): + try: + bootstrap_path.unlink() + logger.info("Removed bootstrap file: %s", bootstrap_file) + except OSError as exc: + logger.warning("Failed to remove %s: %s", bootstrap_file, exc) + return workspace def _get_agent_store_dir(agent_id: str) -> Path: base_dir = Path.home() / ".openclaw" / "agents" + # OpenClaw normalizes agent IDs to lowercase and replaces colons with dashes + normalized_id = agent_id.replace(":", "-").lower() direct_dir = base_dir / agent_id if direct_dir.exists(): return direct_dir - normalized_dir = base_dir / agent_id.replace(":", "-") + normalized_dir = base_dir / normalized_id if normalized_dir.exists(): return normalized_dir return direct_dir @@ -234,7 +248,7 @@ def _resolve_session_id_from_store(agent_id: str) -> str | None: if not isinstance(sessions_payload, dict): return None - normalized_id = agent_id.replace(":", "-") + normalized_id = agent_id.replace(":", "-").lower() preferred_keys = [ f"agent:{agent_id}:main", f"agent:{agent_id}:default", From 46445b2744fbe7099ba5ae9ef4af5c36b07080c8 Mon Sep 17 00:00:00 2001 From: zhuanghaoz Date: Mon, 9 Mar 2026 23:58:16 +0800 Subject: [PATCH 2/2] fix: copy skills to benchmark workspace and add delay before grading - Copy skills from main workspace to benchmark workspace so agents can use nano-pdf - Add 2-second delay before grading to ensure files are flushed to disk - Fix model ID normalization to preserve provider-qualified models --- scripts/benchmark.py | 3 +++ scripts/lib_agent.py | 16 ++++++++++++++++ 2 files changed, 19 insertions(+) diff --git a/scripts/benchmark.py b/scripts/benchmark.py index 8437b0f..ef63787 100644 --- a/scripts/benchmark.py +++ b/scripts/benchmark.py @@ -18,6 +18,7 @@ import os import statistics import subprocess +import time import sys import time from pathlib import Path @@ -414,6 +415,8 @@ def main(): "stderr": execution_error, } try: + # Small delay to ensure any pending file writes are flushed to disk + time.sleep(2) grade = grade_task(task=task, execution_result=result, skill_dir=skill_dir, verbose=args.verbose) except Exception as exc: if execution_error: diff --git a/scripts/lib_agent.py b/scripts/lib_agent.py index 9b29bf1..4d37257 100644 --- a/scripts/lib_agent.py +++ b/scripts/lib_agent.py @@ -219,6 +219,22 @@ def prepare_task_workspace(skill_dir: Path, run_id: str, task: Task, agent_id: s except OSError as exc: logger.warning("Failed to remove %s: %s", bootstrap_file, exc) + # Copy skills from main workspace to benchmark workspace + # This enables benchmark agents to use installed skills like nano-pdf + main_skills_dir = Path.home() / ".openclaw" / "workspace" / "skills" + if main_skills_dir.exists(): + dest_skills_dir = workspace / "skills" + dest_skills_dir.mkdir(parents=True, exist_ok=True) + for skill_dir_src in main_skills_dir.iterdir(): + if skill_dir_src.is_dir(): + dest_skill_dir = dest_skills_dir / skill_dir_src.name + # Copy skill directory + import shutil + if dest_skill_dir.exists(): + shutil.rmtree(dest_skill_dir) + shutil.copytree(skill_dir_src, dest_skill_dir) + logger.info("Copied skill to benchmark workspace: %s", skill_dir_src.name) + return workspace