diff --git a/datasets/codex-cli-tools/codex-cli-skills.evalset.json b/datasets/codex-cli-tools/codex-cli-skills.evalset.json new file mode 100644 index 00000000..c1f19761 --- /dev/null +++ b/datasets/codex-cli-tools/codex-cli-skills.evalset.json @@ -0,0 +1,37 @@ +{ + "scenarios": [ + { + "id": "cloud-sql-list-instances-01", + "starting_prompt": "list all Cloud SQL instances in project ext-test-cloud-sql-postgres", + "conversation_plan": "Ask the agent to list instances in project ext-test-cloud-sql-postgres. Once all instances are listed if daily-ci-evals-db exist get its state and validate its RUNNABLE", + "expected_trajectory": [ + "list_instances.js", + "get_instance.js" + ], + "expected_skills": [ + "cloud-sql-postgres-admin" + ], + "env": { + "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres" + }, + "kind": "tools", + "max_turns": 3 + }, + { + "id": "csql-instance-not-found-failure", + "starting_prompt": "Update the instance 'non-existent-db-123' to have 8 cores.", + "conversation_plan": "The user asks to interact with an instance named 'non-existent-db-123' in ext-test-cloud-sql-postgres project that doesn't exist. The agent should try to get the instance details or update it directly, fail to find it, and inform the user. 
The user will then ask to list instances to find the correct name.", + "expected_trajectory": [ + "list_instances.js" + ], + "expected_skills": [ + "cloud-sql-postgres-admin" + ], + "env": { + "GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres" + }, + "kind": "tools", + "max_turns": 4 + } + ] +} \ No newline at end of file diff --git a/datasets/codex-cli-tools/example_run_skills_config.yaml b/datasets/codex-cli-tools/example_run_skills_config.yaml new file mode 100644 index 00000000..d0625164 --- /dev/null +++ b/datasets/codex-cli-tools/example_run_skills_config.yaml @@ -0,0 +1,40 @@ +############################################################ +### Dataset / Eval Items +############################################################ +dataset_config: datasets/codex-cli-tools/codex-cli-skills.evalset.json +dataset_format: agent-format + +# Orchestrator Configuration +orchestrator: agent +model_config: datasets/model_configs/codex_cli_skills_model.yaml +simulated_user_model_config: datasets/model_configs/gemini_2.5_pro_model.yaml + +# Concurrency: number of scenarios to run in parallel. +# Set to 1 for sequential runs (easier to follow logs, avoids session conflicts +# on the shared sandboxed ~/.codex store). 
+runners: + agent_runners: 1 + +############################################################ +### Scorer Related Configs +############################################################ +scorers: + skills_trajectory: + enforce_order: false + skills_best_practices: + model_config: datasets/model_configs/gemini_2.5_pro_model.yaml + goal_completion: + model_config: datasets/model_configs/gemini_2.5_pro_model.yaml + behavioral_metrics: + model_config: datasets/model_configs/gemini_2.5_pro_model.yaml + turn_count: {} + end_to_end_latency: {} + tool_call_latency: {} + token_consumption: {} + +############################################################ +### Reporting Related Configs +############################################################ +reporting: + csv: + output_directory: 'results' diff --git a/datasets/model_configs/codex_cli_skills_model.yaml b/datasets/model_configs/codex_cli_skills_model.yaml new file mode 100644 index 00000000..c66df2de --- /dev/null +++ b/datasets/model_configs/codex_cli_skills_model.yaml @@ -0,0 +1,28 @@ +# OpenAI Codex CLI with declarative Skills support. 
+codex_cli_version: "@openai/codex@latest" + +generator: codex_cli + +model: "gpt-5.5" + +openai_api_key_secret: "projects/549584275235/secrets/OPENAI_API_KEY/versions/1" + +pricing: + input_per_million_usd: 1.25 + cached_input_per_million_usd: 0.125 + output_per_million_usd: 10.0 + +env: + GOOGLE_CLOUD_PROJECT: "ext-test-cloud-sql-postgres" + GOOGLE_CLOUD_LOCATION: "us-central1" + +setup: + skills: + # Example of cloning and linking a custom plugin marketplace repo + - action: install_from_repo + url: "https://github.com/gemini-cli-extensions/cloud-sql-postgresql" + + + # Example of linking a custom local skill package + # - action: link + # path: "/path/to/local/skill" diff --git a/evalbench/evaluator/agentevaluator.py b/evalbench/evaluator/agentevaluator.py index b2f3ca17..c4b060ee 100644 --- a/evalbench/evaluator/agentevaluator.py +++ b/evalbench/evaluator/agentevaluator.py @@ -176,7 +176,7 @@ def process_scenario( accumulated_tools.extend(tools) # Extract skills from generator output - if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator)): + if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, CodexCliGenerator)): skills = self.generator.extract_skills(result.stdout) accumulated_skills.extend(skills) diff --git a/evalbench/generators/models/codex_cli.py b/evalbench/generators/models/codex_cli.py index d30b54c4..20a0af61 100644 --- a/evalbench/generators/models/codex_cli.py +++ b/evalbench/generators/models/codex_cli.py @@ -4,6 +4,7 @@ import json import logging import re +import shutil import sys import threading import time @@ -65,9 +66,11 @@ def __init__(self, querygenerator_config): os.path.join(".venv", "fake_home_codex")) self.codex_config_dir = os.path.join(self.fake_home, ".codex") + self.skills_dir = os.path.join(self.codex_config_dir, "skills") os.makedirs(self.fake_home, exist_ok=True) os.makedirs(self.codex_config_dir, exist_ok=True) + os.makedirs(self.skills_dir, exist_ok=True) self.env = 
def _codex_base_cmd(self) -> list:
    """Return the argv prefix used to invoke the Codex CLI.

    A `codex_cli_version` that looks like an npm package spec (starts with
    "@" or contains "/") is launched through `npm exec --yes <spec> --`;
    any other value is used directly as the executable name.
    """
    cli = self.codex_cli_version
    if cli.startswith("@") or "/" in cli:
        return ["npm", "exec", "--yes", cli, "--"]
    return [cli]


def _run_marketplace_add(self, source_path: str, env: dict, timeout: int = 300):
    """Run `<codex> plugin marketplace add <source_path>` non-interactively.

    Args:
        source_path: Directory passed to `plugin marketplace add`.
        env: Environment for the child process.
        timeout: Seconds to wait before giving up, so that a stalled
            `npm exec` download cannot block evaluation setup forever.

    Returns:
        The `subprocess.CompletedProcess` (stdout/stderr captured as text).

    Raises:
        OSError: If the executable cannot be started.
        subprocess.TimeoutExpired: If the command exceeds `timeout`.
    """
    cmd = self._codex_base_cmd() + ["plugin", "marketplace", "add", source_path]
    return subprocess.run(
        cmd,
        check=False,
        capture_output=True,
        text=True,
        env=env,
        timeout=timeout,
    )


def _replicate_host_skill(self, real_skills_dir: str, skill_name: str):
    """Copy one skill folder from the host skills dir into the sandbox.

    Any previous sandbox entry with the same name (directory, file or
    symlink) is replaced. Failures are logged, never raised.
    """
    real_skill_path = os.path.join(real_skills_dir, skill_name)
    fake_skill_path = os.path.join(self.skills_dir, skill_name)

    if not os.path.exists(real_skill_path):
        logging.warning(
            f"Requested skill '{skill_name}' not found at {real_skill_path}."
        )
        return

    logging.info(f"Syncing skill: {skill_name}")
    try:
        # Removal lives inside the try so a stale file/symlink or a
        # permission error is logged instead of aborting the whole setup.
        if os.path.isdir(fake_skill_path) and not os.path.islink(fake_skill_path):
            shutil.rmtree(fake_skill_path)
        elif os.path.lexists(fake_skill_path):
            os.remove(fake_skill_path)
        shutil.copytree(real_skill_path, fake_skill_path)
    except Exception as e:  # best-effort setup: log and move on
        logging.error(f"Failed to copy skill {skill_name}: {e}")


def _clone_skill_repo(self, skill_config: dict, env: dict):
    """Shallow-clone the repo named by `skill_config["url"]` into the sandbox.

    The URL may carry a `#<branch-or-tag>` fragment, which is passed to
    `git clone --branch`. The clone lands in
    `<fake_home>/.codex/repos/<repo_name>` and replaces any previous clone.

    Returns:
        The clone directory on success, otherwise None (the reason is logged).
    """
    url = skill_config.get("url")
    if not url:
        logging.warning(
            f"Missing 'url' for install_from_repo skill config: {skill_config}"
        )
        return None

    # Support version/branch pinning via URL fragments (#)
    clone_url, _, version_tag = url.partition("#")
    repo_name = re.sub(r"\.git$", "", clone_url.rstrip("/").split("/")[-1])
    if not repo_name or repo_name in (".", ".."):
        # An empty/dot name would make clone_target resolve to the shared
        # repos directory (or its parent), which is wiped below.
        logging.warning(
            f"Cannot derive a repository name from skill repo URL '{url}'. Skipping."
        )
        return None

    clone_target = os.path.join(self.fake_home, ".codex", "repos", repo_name)
    git_cmd = ["git", "clone", "--depth", "1"]
    if version_tag:
        git_cmd.extend(["--branch", version_tag])
    git_cmd.extend([clone_url, clone_target])

    logging.info(f"Cloning skill repo '{url}' to {clone_target}")
    try:
        if os.path.exists(clone_target):
            shutil.rmtree(clone_target)
        os.makedirs(os.path.dirname(clone_target), exist_ok=True)
        res = subprocess.run(
            git_cmd,
            capture_output=True,
            text=True,
            check=False,
            env=env,
            timeout=120,
        )
    except Exception as e:  # best-effort setup: log and move on
        logging.error(f"Exception cloning skill repo '{url}': {e}")
        return None

    if res.returncode != 0:
        logging.error(f"Failed to clone skill repo '{url}': {res.stderr.strip()}")
        return None
    return clone_target


def _setup_skills(self, skills: list):
    """Install the skills declared under `setup.skills` into the sandbox.

    Each entry is handled independently and failures are logged rather than
    raised, so one bad entry does not prevent the others from being set up.

    Supported entry shapes:
      * str: name of a folder under `<real_home>/.codex/skills/`, copied to
        the sandboxed `self.skills_dir`.
      * dict with `action: link` and `path`: the existing local path is
        passed to `<codex> plugin marketplace add`.
      * dict with `action: install_from_repo` and `url` (optionally
        `url#branch`): the repo is shallow-cloned under
        `<fake_home>/.codex/repos/` and the clone is passed to
        `<codex> plugin marketplace add`.

    Args:
        skills: List of entries as described above; empty/None is a no-op.
    """
    if not skills:
        return

    real_skills_dir = os.path.join(self.real_home, ".codex", "skills")

    # Child processes see the host environment overlaid with the sandbox env.
    setup_env = os.environ.copy()
    setup_env.update(self.env)

    for skill_config in skills:
        # Paradigm 1: Local Sandbox Replication
        if isinstance(skill_config, str):
            self._replicate_host_skill(real_skills_dir, skill_config)
            continue

        if not isinstance(skill_config, dict):
            logging.warning(
                f"Unsupported or malformed skill config: {skill_config}"
            )
            continue

        # Paradigm 2: Declarative Command Setup
        action = skill_config.get("action")
        path = skill_config.get("path")

        source_path = None
        if action == "link" and path:
            if not os.path.exists(path):
                logging.warning(
                    f"Skill path to link '{path}' does not exist. Skipping link action."
                )
                continue
            logging.info(f"Linking skill from path: {path}")
            source_path = os.path.abspath(path)
        elif action == "install_from_repo":
            source_path = self._clone_skill_repo(skill_config, setup_env)
        else:
            logging.warning(
                f"Unsupported or malformed skill config: {skill_config}"
            )

        if not source_path:
            continue

        try:
            result = self._run_marketplace_add(source_path, setup_env)
        except Exception as e:  # best-effort setup: log and move on
            logging.error(f"Failed to execute skill action '{action}': {e}")
            continue
        if result.returncode != 0:
            logging.error(
                f"Failed to execute skill action '{action}'. Output: {result.stdout}, Error: {result.stderr}"
            )


def _setup_skills_from_dir(self, skills_dir_path: str):
    """Pass a local skills directory to `<codex> plugin marketplace add`.

    Handles the `setup.skills_dir` configuration key. A missing directory
    or a failing command is logged, never raised.

    Args:
        skills_dir_path: Path to the local directory to register.
    """
    if not os.path.isdir(skills_dir_path):
        logging.warning(f"Skills directory not found: {skills_dir_path}")
        return

    setup_env = os.environ.copy()
    setup_env.update(self.env)

    logging.info(f"Linking skills directory: {skills_dir_path}")
    try:
        result = self._run_marketplace_add(
            os.path.abspath(skills_dir_path), setup_env
        )
    except Exception as e:  # best-effort setup: log and move on
        logging.error(f"Exception linking skills directory '{skills_dir_path}': {e}")
        return
    if result.returncode != 0:
        logging.error(
            f"Failed to link skills directory '{skills_dir_path}': {result.stderr or result.stdout}"
        )
@staticmethod
def _collect_skills(skills_root: str, into: set):
    """Add the names of the immediate subdirectories of `skills_root` to `into`.

    Only one level is scanned (no recursion). Plain files are ignored, and
    so are hidden directories such as `.git` / `.github`, which exist in
    cloned repositories but are not skills. Does nothing when `skills_root`
    is not a directory.

    Args:
        skills_root: Directory whose child folders are treated as skills.
        into: Set that receives the folder basenames (mutated in place).
    """
    if not os.path.isdir(skills_root):
        return
    for entry in os.listdir(skills_root):
        if entry.startswith("."):
            continue
        if os.path.isdir(os.path.join(skills_root, entry)):
            into.add(entry)


def _get_installed_skills(self) -> set[str]:
    """Return the folder basenames of every skill known to the sandbox.

    Scans:
      1. Replicated host skills in `self.skills_dir`.
      2. Cloned repositories under `<fake_home>/.codex/repos/`.
      3. The directory configured as `setup.skills_dir`.
      4. Paths configured with `action: link` under `setup.skills`.

    For sources 2 and 3 the `skills/` subfolder is scanned when it exists,
    otherwise the directory itself.
    """
    installed = set()
    setup_config = self.setup_config or {}

    def collect_suite(suite_root):
        # Prefer a conventional `skills/` subfolder; fall back to the root.
        skills_sub = os.path.join(suite_root, "skills")
        self._collect_skills(
            skills_sub if os.path.isdir(skills_sub) else suite_root, installed
        )

    # Source 1: Replicated host folders (directories only, same rule as the
    # other sources, so stray files are not reported as skills).
    self._collect_skills(self.skills_dir, installed)

    # Source 2: Dynamically cloned remote marketplace suites
    repos_dir = os.path.join(self.fake_home, ".codex", "repos")
    if os.path.isdir(repos_dir):
        for repo in os.listdir(repos_dir):
            collect_suite(os.path.join(repos_dir, repo))

    # Source 3: Top-level linked monolithic marketplace directory
    skills_dir_path = setup_config.get("skills_dir")
    if skills_dir_path:
        collect_suite(skills_dir_path)

    # Source 4: Explicitly linked individual paths. `or []` tolerates a
    # `skills:` key that is present but empty (None) in the YAML.
    for skill_cfg in setup_config.get("skills") or []:
        if isinstance(skill_cfg, dict) and skill_cfg.get("action") == "link":
            link_p = skill_cfg.get("path")
            if link_p and os.path.isdir(link_p):
                installed.add(os.path.basename(os.path.abspath(link_p)))

    return installed


def _tool_stats_by_name(self, stdout: str) -> dict:
    """Return the `stats.tools.byName` mapping from parsed CLI output.

    Returns an empty dict when the output cannot be indexed that way or the
    value is not a mapping.
    """
    output_json = self.parse_response(stdout)
    try:
        by_name = output_json["stats"]["tools"]["byName"]
    except (KeyError, TypeError):
        return {}
    return by_name if isinstance(by_name, dict) else {}


@staticmethod
def _tool_call_params(tool_stat) -> list:
    """Return the dict-shaped entries of `tool_stat["parameters"]`.

    Tolerates a missing/None `parameters` value and skips entries that are
    not dicts, so callers can use `.get()` on every returned item.
    """
    if not isinstance(tool_stat, dict):
        return []
    params = tool_stat.get("parameters")
    if not isinstance(params, (list, tuple)):
        return []
    return [p for p in params if isinstance(p, dict)]


def extract_skills(self, stdout: str) -> list[str]:
    """Return the names of skills activated during a run, in first-seen order.

    Two patterns are recognised in the `stats.tools.byName` mapping:
      1. A tool whose name equals an installed skill folder name.
      2. Calls to the `activate_skill` / `Skill` tools, whose parameters carry
         the skill name under `skill_name`, `skillName`, `skill` or `name`.

    Args:
        stdout: Raw CLI output, parsed with `self.parse_response`.

    Returns:
        De-duplicated list of skill names; empty when no tool stats exist.
    """
    by_name = self._tool_stats_by_name(stdout)
    if not by_name:
        return []

    installed_skills = self._get_installed_skills()
    items = []

    # Pattern 1: Direct tool exposure matching known skill suites
    for tool_name in by_name:
        if tool_name in installed_skills and tool_name not in items:
            items.append(tool_name)

    # Pattern 2: Built-in skill invocation wrappers
    for t_key in ("activate_skill", "Skill"):
        for params in self._tool_call_params(by_name.get(t_key)):
            name = (
                params.get("skill_name")
                or params.get("skillName")
                or params.get("skill")
                or params.get("name")
            )
            # Only string names are meaningful for trajectory matching.
            if isinstance(name, str) and name and name not in items:
                items.append(name)

    return items


def extract_skill_scripts(self, stdout: str) -> list[str]:
    """Return skill script file names invoked through shell-style tools.

    Looks at the `command` parameter of every call to a tool named `bash`,
    `shell` or `command_execution` (case-insensitive) and collects each
    `<name>.<ext>` that directly follows a `/scripts/` path segment, e.g.
    `list_instances.js`. Every match in a command is collected, so chained
    commands (`a.js && b.js`) report both scripts.

    Args:
        stdout: Raw CLI output, parsed with `self.parse_response`.

    Returns:
        De-duplicated list of script names in first-seen order.
    """
    scripts = []
    for tool_name, tstat in self._tool_stats_by_name(stdout).items():
        if not isinstance(tool_name, str):
            continue
        if tool_name.lower() not in ("bash", "shell", "command_execution"):
            continue
        for params in self._tool_call_params(tstat):
            command = params.get("command", "")
            if isinstance(command, (list, tuple)):
                # argv-style command: flatten so the path regex can see it.
                command = " ".join(str(part) for part in command)
            elif not isinstance(command, str):
                continue
            for match in re.finditer(
                r'/scripts/([a-zA-Z0-9_-]+\.[a-zA-Z0-9]+)', command
            ):
                if match.group(1) not in scripts:
                    scripts.append(match.group(1))
    return scripts
+ "or run a generator first (Claude, Codex, or Gemini) to create the sandbox skills folder." ) def _find_skill_md(self, skill_name: str) -> str | None: