Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 37 additions & 0 deletions datasets/codex-cli-tools/codex-cli-skills.evalset.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
{
"scenarios": [
{
"id": "cloud-sql-list-instances-01",
"starting_prompt": "list all Cloud SQL instances in project ext-test-cloud-sql-postgres",
"conversation_plan": "Ask the agent to list instances in project ext-test-cloud-sql-postgres. Once all instances are listed, if daily-ci-evals-db exists, get its state and validate that it is RUNNABLE.",
"expected_trajectory": [
"list_instances.js",
"get_instance.js"
],
"expected_skills": [
"cloud-sql-postgres-admin"
],
"env": {
"GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
},
"kind": "tools",
"max_turns": 3
},
{
"id": "csql-instance-not-found-failure",
"starting_prompt": "Update the instance 'non-existent-db-123' to have 8 cores.",
"conversation_plan": "The user asks to update an instance named 'non-existent-db-123', which does not exist in the ext-test-cloud-sql-postgres project. The agent should try to get the instance details or update it directly, fail to find it, and inform the user. The user will then ask to list instances to find the correct name.",
"expected_trajectory": [
"list_instances.js"
],
"expected_skills": [
"cloud-sql-postgres-admin"
],
"env": {
"GOOGLE_CLOUD_PROJECT": "ext-test-cloud-sql-postgres"
},
"kind": "tools",
"max_turns": 4
}
]
}
40 changes: 40 additions & 0 deletions datasets/codex-cli-tools/example_run_skills_config.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,40 @@
############################################################
### Dataset / Eval Items
############################################################
# Scenario file to run and the format it is written in.
dataset_config: datasets/codex-cli-tools/codex-cli-skills.evalset.json
dataset_format: agent-format

# Orchestrator Configuration
# NOTE(review): `model_config` presumably configures the agent under test and
# `simulated_user_model_config` the model playing the user - confirm against
# the orchestrator.
orchestrator: agent
model_config: datasets/model_configs/codex_cli_skills_model.yaml
simulated_user_model_config: datasets/model_configs/gemini_2.5_pro_model.yaml

# Concurrency: number of scenarios to run in parallel.
# Set to 1 for sequential runs (easier to follow logs, avoids session conflicts
# on the shared sandboxed ~/.codex store).
runners:
  agent_runners: 1

############################################################
### Scorer Related Configs
############################################################
# One entry per scorer. `{}` is an explicit empty mapping (no options set),
# which keeps the key from parsing as null.
scorers:
  skills_trajectory:
    # NOTE(review): presumably lets the expected trajectory match in any
    # order - confirm against the scorer implementation.
    enforce_order: false
  # The three scorers below each reference a model config of their own.
  skills_best_practices:
    model_config: datasets/model_configs/gemini_2.5_pro_model.yaml
  goal_completion:
    model_config: datasets/model_configs/gemini_2.5_pro_model.yaml
  behavioral_metrics:
    model_config: datasets/model_configs/gemini_2.5_pro_model.yaml
  turn_count: {}
  end_to_end_latency: {}
  tool_call_latency: {}
  token_consumption: {}

############################################################
### Reporting Related Configs
############################################################
reporting:
  csv:
    output_directory: 'results'
28 changes: 28 additions & 0 deletions datasets/model_configs/codex_cli_skills_model.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
# OpenAI Codex CLI with declarative Skills support.
# NOTE(review): `@latest` is an unpinned tag, so the CLI version can change
# between runs - consider pinning an exact version for reproducible results.
codex_cli_version: "@openai/codex@latest"

generator: codex_cli

model: "gpt-5.5"

# Reference to a stored secret (a resource path), not the key value itself.
openai_api_key_secret: "projects/549584275235/secrets/OPENAI_API_KEY/versions/1"

# Per-million-token rates in USD.
# NOTE(review): confirm these rates match the model configured above.
pricing:
  input_per_million_usd: 1.25
  cached_input_per_million_usd: 0.125
  output_per_million_usd: 10.0

# Values are quoted so they stay strings.
env:
  GOOGLE_CLOUD_PROJECT: "ext-test-cloud-sql-postgres"
  GOOGLE_CLOUD_LOCATION: "us-central1"

setup:
  skills:
    # Example of cloning and linking a custom plugin marketplace repo
    - action: install_from_repo
      url: "https://github.com/gemini-cli-extensions/cloud-sql-postgresql"

    # Example of linking a custom local skill package
    # - action: link
    #   path: "/path/to/local/skill"
2 changes: 1 addition & 1 deletion evalbench/evaluator/agentevaluator.py
Original file line number Diff line number Diff line change
Expand Up @@ -176,7 +176,7 @@ def process_scenario(
accumulated_tools.extend(tools)

# Extract skills from generator output
if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator)):
if isinstance(self.generator, (GeminiCliGenerator, ClaudeCodeGenerator, CodexCliGenerator)):
skills = self.generator.extract_skills(result.stdout)
accumulated_skills.extend(skills)

Expand Down
Loading
Loading