OpenDCAI · Dingxingdi · Apr 15, 2026 · Apr 16, 2026 · Apr 16, 2026 · Apr 16, 2026
diff --git a/benchmark/code_benchmark.jsonl b/benchmark/code_benchmark.jsonl
@@ -0,0 +1,2 @@
+{"id": "code_read_001", "question": "Use code tools to inspect the demo repository. What default name does the app greet? Reply with the name only.", "answer": "AgentFlow"}
+{"id": "code_edit_001", "question": "Update the demo repository so `python tests/smoke_test.py` succeeds. Preserve the config-driven greeting behavior, verify the fix with that command, then reply with exactly `smoke test passed`.", "answer": "smoke test passed", "metadata": {"target_files": ["app.py"], "check_command": "python tests/smoke_test.py"}}
diff --git a/benchmark/mcp_canvas_benchmark.jsonl b/benchmark/mcp_canvas_benchmark.jsonl
@@ -0,0 +1,2 @@
+{"id": "mcp_canvas_001", "question": "Use Canvas MCP tools to list the first three course names in alphabetical order. Reply as a comma-separated list only.", "answer": ""}
+{"id": "mcp_canvas_002", "question": "Use Canvas MCP tools to find one course and report its course code plus enrollment count as code=<code>, enrolled=<int>.", "answer": ""}
diff --git a/benchmark/mcp_snowflake_benchmark.jsonl b/benchmark/mcp_snowflake_benchmark.jsonl
@@ -0,0 +1,2 @@
+{"id": "mcp_snowflake_001", "question": "Use Snowflake MCP tools to list the first three tables visible in the default schema in alphabetical order. Reply as a comma-separated list only.", "answer": ""}
+{"id": "mcp_snowflake_002", "question": "Use Snowflake MCP tools to compute one small aggregate from a mock table and reply as key=value.", "answer": ""}
diff --git a/benchmark/mcp_train_benchmark.jsonl b/benchmark/mcp_train_benchmark.jsonl
@@ -0,0 +1,2 @@
+{"id": "mcp_train_001", "question": "Use rail_12306 MCP tools to list the first three station names available in the local mock dataset in alphabetical order. Reply as a comma-separated list only.", "answer": ""}
+{"id": "mcp_train_002", "question": "Use rail_12306 MCP tools to find one route and reply with departure=<station>, arrival=<station>.", "answer": ""}
diff --git a/benchmark/mcp_woocommerce_benchmark.jsonl b/benchmark/mcp_woocommerce_benchmark.jsonl
@@ -0,0 +1,2 @@
+{"id": "mcp_woocommerce_001", "question": "Use WooCommerce MCP tools to list the first three product names in alphabetical order. Reply as a comma-separated list only.", "answer": ""}
+{"id": "mcp_woocommerce_002", "question": "Use WooCommerce MCP tools to identify one customer email and that customer's order count. Reply as email=<email>, orders=<int>.", "answer": ""}
diff --git a/benchmark/mcp_yahoo_finance_benchmark.jsonl b/benchmark/mcp_yahoo_finance_benchmark.jsonl
@@ -0,0 +1,2 @@
+{"id": "mcp_yahoo_finance_001", "question": "Use Yahoo Finance MCP tools to list the first three ticker symbols available in the mock dataset in alphabetical order. Reply as a comma-separated list only.", "answer": ""}
+{"id": "mcp_yahoo_finance_002", "question": "Use Yahoo Finance MCP tools to compare two available mock tickers and reply with the one that has the larger price as symbol=<ticker>.", "answer": ""}
diff --git a/benchmark/mcp_youtube_benchmark.jsonl b/benchmark/mcp_youtube_benchmark.jsonl
@@ -0,0 +1,2 @@
+{"id": "mcp_youtube_001", "question": "Use YouTube MCP tools to list the first three video titles returned by the local mock dataset in alphabetical order. Reply as a comma-separated list only.", "answer": ""}
+{"id": "mcp_youtube_002", "question": "Use YouTube Transcript MCP tools to find one video and report the video id plus transcript language as video=<id>, language=<lang>.", "answer": ""}
diff --git a/configs/sandbox-server/code_config.json b/configs/sandbox-server/code_config.json
@@ -0,0 +1,21 @@
+{
+  "server": {
+    "url": "http://127.0.0.1:18890",
+    "port": 18890,
+    "session_ttl": 300
+  },
+  "resources": {
+    "code": {
+      "enabled": true,
+      "description": "Lightweight coding backend powered by vendored internal tools",
+      "backend_class": "sandbox.server.backends.resources.code.CodeBackend",
+      "config": {
+        "workspace_root": "/tmp/agentflow_code"
+      }
+    }
+  },
+  "warmup": {
+    "enabled": false,
+    "resources": []
+  }
+}
diff --git a/configs/sandbox-server/mcp_config.json b/configs/sandbox-server/mcp_config.json
@@ -10,14 +10,26 @@
       "description": "Toolathlon-GYM MCP backend",
       "backend_class": "sandbox.server.backends.resources.mcp.toolathlon_gym.ToolathlonGymBackend",
       "config": {
-        "enabled_mcp_servers": ["filesystem", "terminal", "snowflake"],
+        "mcp_servers_path": "${TOOLATHLON_GYM_ROOT}/local_servers",
+        "enabled_mcp_servers": [
+          "canvas",
+          "snowflake",
+          "woocommerce",
+          "yahoo-finance",
+          "youtube",
+          "youtube-transcript",
+          "rail_12306",
+          "filesystem"
+        ],
         "workspace_root": "${TOOLATHLON_WORKSPACE_ROOT:-/tmp/agentflow_mcp}",
         "env_overrides": {
-          "PGHOST": "${PGHOST:-toolathlon_pg}",
+          "PGHOST": "${PGHOST:-localhost}",
           "PGPORT": "${PGPORT:-5432}",
           "PGUSER": "${PGUSER:-eigent}",
           "PGPASSWORD": "${PGPASSWORD:-camel}",
-          "PGDATABASE": "${PGDATABASE:-toolathlon_gym}"
+          "PGDATABASE": "${PGDATABASE:-toolathlon_gym}",
+          "CANVAS_DOMAIN": "${CANVAS_DOMAIN:-localhost:8080}",
+          "WORDPRESS_SITE_URL": "${WORDPRESS_SITE_URL:-http://localhost:8081}"
         }
       }
     }

diff --git a/configs/synthesis/code_config.json b/configs/synthesis/code_config.json
@@ -0,0 +1,44 @@
+{
+  "model_name": "openai/gpt-oss-120b",
+  "api_key": "${OPENAI_API_KEY}",
+  "base_url": "${OPENAI_API_URL}",
+  "max_depth": 10,
+  "branching_factor": 2,
+  "depth_threshold": 2,
+  "min_depth": 2,
+  "max_selected_traj": 1,
+  "path_similarity_threshold": 0.7,
+  "sandbox_server_url": "http://127.0.0.1:18890",
+  "sandbox_auto_start": false,
+  "sandbox_config_path": "configs/sandbox-server/code_config.json",
+  "resource_types": ["code"],
+  "resource_init_configs": {
+    "code": {
+      "content": {
+        "source_dir": "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo"
+      }
+    }
+  },
+  "available_tools": ["code-*"],
+  "sampling_tips": [
+    "Inspect the repository before proposing edits.",
+    "Use code-bash only for lightweight checks that fit the bundled demo repo."
+  ],
+  "synthesis_tips": [
+    "Generate repo-grounded QA only.",
+    "Prefer file-path, function-behavior, and small edit-validation questions over open-ended design prompts."
+  ],
+  "qa_examples": [
+    {
+      "question": "Which file stores the greeting suffix used by the demo app? Reply with the relative file path only.",
+      "answer": "config/app_config.json"
+    },
+    {
+      "question": "What string does `build_message()` return before any edits? Reply with the exact string only.",
+      "answer": "Hello, AgentFlow?"
+    }
+  ],
+  "seed_description": "Coding demo repository prompts",
+  "seeds_file": "seeds/code/seeds.jsonl",
+  "output_dir": "results/code"
+}
diff --git a/configs/synthesis/mcp_canvas_config.json b/configs/synthesis/mcp_canvas_config.json
@@ -0,0 +1,38 @@
+{
+  "model_name": "openai/gpt-oss-120b",
+  "api_key": "${OPENAI_API_KEY}",
+  "base_url": "${OPENAI_API_URL}",
+  "max_depth": 12,
+  "branching_factor": 2,
+  "depth_threshold": 2,
+  "min_depth": 2,
+  "max_selected_traj": 1,
+  "path_similarity_threshold": 0.7,
+  "sandbox_server_url": "http://127.0.0.1:18890",
+  "sandbox_auto_start": false,
+  "sandbox_config_path": "configs/sandbox-server/mcp_config.json",
+  "resource_types": ["mcp"],
+  "resource_init_configs": {},
+  "available_tools": ["mcp:canvas.*", "mcp:filesystem.*"],
+  "sampling_tips": [
+    "Inspect courses, assignments, and enrollments before drafting any question.",
+    "Use filesystem tools only for scratch notes or short saved artifacts."
+  ],
+  "synthesis_tips": [
+    "Generate domain-grounded factual QA only.",
+    "Keep answers short and directly verifiable from Canvas MCP tool outputs."
+  ],
+  "qa_examples": [
+    {
+      "question": "If a Canvas tool result shows a course with code HIST-201 and 28 enrolled students, how should the answer be formatted?",
+      "answer": "code=HIST-201, enrolled=28"
+    },
+    {
+      "question": "If the first three course names in alphabetical order are Biology 101, Chemistry Lab, and World History, how should the answer be returned?",
+      "answer": "Biology 101, Chemistry Lab, World History"
+    }
+  ],
+  "seed_description": "Canvas MCP prompts",
+  "seeds_file": "seeds/mcp/canvas_seeds.jsonl",
+  "output_dir": "results/mcp_canvas"
+}
diff --git a/configs/synthesis/mcp_snowflake_config.json b/configs/synthesis/mcp_snowflake_config.json
@@ -0,0 +1,38 @@
+{
+  "model_name": "openai/gpt-oss-120b",
+  "api_key": "${OPENAI_API_KEY}",
+  "base_url": "${OPENAI_API_URL}",
+  "max_depth": 12,
+  "branching_factor": 2,
+  "depth_threshold": 2,
+  "min_depth": 2,
+  "max_selected_traj": 1,
+  "path_similarity_threshold": 0.7,
+  "sandbox_server_url": "http://127.0.0.1:18890",
+  "sandbox_auto_start": false,
+  "sandbox_config_path": "configs/sandbox-server/mcp_config.json",
+  "resource_types": ["mcp"],
+  "resource_init_configs": {},
+  "available_tools": ["mcp:snowflake.*", "mcp:filesystem.*"],
+  "sampling_tips": [
+    "Inspect schemas and table names before choosing a reporting question.",
+    "Use filesystem tools only for scratch notes or short saved artifacts."
+  ],
+  "synthesis_tips": [
+    "Generate domain-grounded factual QA only.",
+    "Keep answers short and directly verifiable from Snowflake MCP query outputs."
+  ],
+  "qa_examples": [
+    {
+      "question": "If the first three visible tables are CUSTOMERS, LINE_ITEMS, and ORDERS, how should the answer be returned?",
+      "answer": "CUSTOMERS, LINE_ITEMS, ORDERS"
+    },
+    {
+      "question": "If a Snowflake aggregate query returns total_orders=125, how should the answer be formatted?",
+      "answer": "total_orders=125"
+    }
+  ],
+  "seed_description": "Snowflake MCP prompts",
+  "seeds_file": "seeds/mcp/snowflake_seeds.jsonl",
+  "output_dir": "results/mcp_snowflake"
+}
diff --git a/configs/synthesis/mcp_train_config.json b/configs/synthesis/mcp_train_config.json
@@ -0,0 +1,38 @@
+{
+  "model_name": "openai/gpt-oss-120b",
+  "api_key": "${OPENAI_API_KEY}",
+  "base_url": "${OPENAI_API_URL}",
+  "max_depth": 12,
+  "branching_factor": 2,
+  "depth_threshold": 2,
+  "min_depth": 2,
+  "max_selected_traj": 1,
+  "path_similarity_threshold": 0.7,
+  "sandbox_server_url": "http://127.0.0.1:18890",
+  "sandbox_auto_start": false,
+  "sandbox_config_path": "configs/sandbox-server/mcp_config.json",
+  "resource_types": ["mcp"],
+  "resource_init_configs": {},
+  "available_tools": ["mcp:rail_12306.*", "mcp:filesystem.*"],
+  "sampling_tips": [
+    "Inspect stations, routes, and train options before drafting a travel lookup question.",
+    "Use filesystem tools only for scratch notes or short saved artifacts."
+  ],
+  "synthesis_tips": [
+    "Generate domain-grounded factual QA only.",
+    "Keep answers short and directly verifiable from rail_12306 MCP tool outputs."
+  ],
+  "qa_examples": [
+    {
+      "question": "If the first three station names alphabetically are Beijing, Hangzhou, and Shanghai, how should the answer be returned?",
+      "answer": "Beijing, Hangzhou, Shanghai"
+    },
+    {
+      "question": "If a route lookup shows departure Shanghai and arrival Nanjing, how should the answer be formatted?",
+      "answer": "departure=Shanghai, arrival=Nanjing"
+    }
+  ],
+  "seed_description": "Train MCP prompts",
+  "seeds_file": "seeds/mcp/train_seeds.jsonl",
+  "output_dir": "results/mcp_train"
+}
diff --git a/configs/synthesis/mcp_woocommerce_config.json b/configs/synthesis/mcp_woocommerce_config.json
@@ -0,0 +1,38 @@
+{
+  "model_name": "openai/gpt-oss-120b",
+  "api_key": "${OPENAI_API_KEY}",
+  "base_url": "${OPENAI_API_URL}",
+  "max_depth": 12,
+  "branching_factor": 2,
+  "depth_threshold": 2,
+  "min_depth": 2,
+  "max_selected_traj": 1,
+  "path_similarity_threshold": 0.7,
+  "sandbox_server_url": "http://127.0.0.1:18890",
+  "sandbox_auto_start": false,
+  "sandbox_config_path": "configs/sandbox-server/mcp_config.json",
+  "resource_types": ["mcp"],
+  "resource_init_configs": {},
+  "available_tools": ["mcp:woocommerce.*", "mcp:filesystem.*"],
+  "sampling_tips": [
+    "Inspect customers, products, and orders before selecting a small store question.",
+    "Use filesystem tools only for scratch notes or short saved artifacts."
+  ],
+  "synthesis_tips": [
+    "Generate domain-grounded factual QA only.",
+    "Keep answers short and directly verifiable from WooCommerce MCP tool outputs."
+  ],
+  "qa_examples": [
+    {
+      "question": "If the first three product names alphabetically are Backpack, Coffee Mug, and Notebook, how should the answer be returned?",
+      "answer": "Backpack, Coffee Mug, Notebook"
+    },
+    {
+      "question": "If a customer email is alex@example.com and that customer has 3 orders, how should the answer be formatted?",
+      "answer": "email=alex@example.com, orders=3"
+    }
+  ],
+  "seed_description": "WooCommerce MCP prompts",
+  "seeds_file": "seeds/mcp/woocommerce_seeds.jsonl",
+  "output_dir": "results/mcp_woocommerce"
+}
diff --git a/configs/synthesis/mcp_yahoo_finance_config.json b/configs/synthesis/mcp_yahoo_finance_config.json
@@ -0,0 +1,38 @@
+{
+  "model_name": "openai/gpt-oss-120b",
+  "api_key": "${OPENAI_API_KEY}",
+  "base_url": "${OPENAI_API_URL}",
+  "max_depth": 12,
+  "branching_factor": 2,
+  "depth_threshold": 2,
+  "min_depth": 2,
+  "max_selected_traj": 1,
+  "path_similarity_threshold": 0.7,
+  "sandbox_server_url": "http://127.0.0.1:18890",
+  "sandbox_auto_start": false,
+  "sandbox_config_path": "configs/sandbox-server/mcp_config.json",
+  "resource_types": ["mcp"],
+  "resource_init_configs": {},
+  "available_tools": ["mcp:yahoo-finance.*", "mcp:filesystem.*"],
+  "sampling_tips": [
+    "Inspect available tickers and quote fields before drafting a finance lookup question.",
+    "Use filesystem tools only for scratch notes or short saved artifacts."
+  ],
+  "synthesis_tips": [
+    "Generate domain-grounded factual QA only.",
+    "Keep answers short and directly verifiable from Yahoo Finance MCP tool outputs."
+  ],
+  "qa_examples": [
+    {
+      "question": "If the available tickers sorted alphabetically begin with AAPL, MSFT, and NVDA, how should the answer be returned?",
+      "answer": "AAPL, MSFT, NVDA"
+    },
+    {
+      "question": "If one comparison shows MSFT has the larger price, how should the answer be formatted?",
+      "answer": "symbol=MSFT"
+    }
+  ],
+  "seed_description": "Yahoo Finance MCP prompts",
+  "seeds_file": "seeds/mcp/yahoo_finance_seeds.jsonl",
+  "output_dir": "results/mcp_yahoo_finance"
+}
diff --git a/configs/synthesis/mcp_youtube_config.json b/configs/synthesis/mcp_youtube_config.json
@@ -0,0 +1,38 @@
+{
+  "model_name": "openai/gpt-oss-120b",
+  "api_key": "${OPENAI_API_KEY}",
+  "base_url": "${OPENAI_API_URL}",
+  "max_depth": 12,
+  "branching_factor": 2,
+  "depth_threshold": 2,
+  "min_depth": 2,
+  "max_selected_traj": 1,
+  "path_similarity_threshold": 0.7,
+  "sandbox_server_url": "http://127.0.0.1:18890",
+  "sandbox_auto_start": false,
+  "sandbox_config_path": "configs/sandbox-server/mcp_config.json",
+  "resource_types": ["mcp"],
+  "resource_init_configs": {},
+  "available_tools": ["mcp:youtube.*", "mcp:youtube-transcript.*", "mcp:filesystem.*"],
+  "sampling_tips": [
+    "Inspect video metadata first, then use transcript tools only when language or transcript details matter.",
+    "Use filesystem tools only for scratch notes or short saved artifacts."
+  ],
+  "synthesis_tips": [
+    "Generate domain-grounded factual QA only.",
+    "Keep answers short and directly verifiable from YouTube MCP tool outputs."
+  ],
+  "qa_examples": [
+    {
+      "question": "If the first three video titles alphabetically are Intro to Databases, MCP Demo, and Testing Walkthrough, how should the answer be returned?",
+      "answer": "Intro to Databases, MCP Demo, Testing Walkthrough"
+    },
+    {
+      "question": "If a transcript lookup shows video id abc123 with language en, how should the answer be formatted?",
+      "answer": "video=abc123, language=en"
+    }
+  ],
+  "seed_description": "YouTube MCP prompts",
+  "seeds_file": "seeds/mcp/youtube_seeds.jsonl",
+  "output_dir": "results/mcp_youtube"
+}
diff --git a/configs/trajectory/code_trajectory.json b/configs/trajectory/code_trajectory.json
@@ -0,0 +1,31 @@
+{
+  "benchmark_name": "code_trajectory",
+  "model_name": "openai/gpt-oss-120b",
+  "api_key": "${OPENAI_API_KEY}",
+  "base_url": "${OPENAI_API_URL}",
+  "max_turns": 12,
+  "available_tools": ["code-*"],
+  "sandbox_server_url": "http://127.0.0.1:18890",
+  "sandbox_auto_start": false,
+  "sandbox_config_path": "configs/sandbox-server/code_config.json",
+  "resource_types": ["code"],
+  "resource_init_configs": {
+    "code": {
+      "content": {
+        "source_dir": "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo"
+      }
+    }
+  },
+  "system_prompt": [
+    "You are a coding assistant working inside a small repository.",
+    "Inspect files before editing them.",
+    "When a task asks for verification, run the requested command inside the coding workspace before giving the final answer."
+  ],
+  "evaluate_results": false,
+  "data_path": "benchmark/code_benchmark.jsonl",
+  "output_dir": "trajectory_results/code",
+  "save_results": true,
+  "save_trajectories": true,
+  "trajectory_only": true,
+  "save_summary": false
+}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		{"id": "code_read_001", "question": "Use code tools to inspect the demo repository. What default name does the app greet? Reply with the name only.", "answer": "AgentFlow"}
		{"id": "code_edit_001", "question": "Update the demo repository so `python tests/smoke_test.py` succeeds. Preserve the config-driven greeting behavior, verify the fix with that command, then reply with exactly `smoke test passed`.", "answer": "smoke test passed", "metadata": {"target_files": ["app.py"], "check_command": "python tests/smoke_test.py"}}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		{"id": "mcp_canvas_001", "question": "Use Canvas MCP tools to list the first three course names in alphabetical order. Reply as a comma-separated list only.", "answer": ""}
		{"id": "mcp_canvas_002", "question": "Use Canvas MCP tools to find one course and report its course code plus enrollment count as code=<code>, enrolled=<int>.", "answer": ""}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		{"id": "mcp_snowflake_001", "question": "Use Snowflake MCP tools to list the first three tables visible in the default schema in alphabetical order. Reply as a comma-separated list only.", "answer": ""}
		{"id": "mcp_snowflake_002", "question": "Use Snowflake MCP tools to compute one small aggregate from a mock table and reply as key=value.", "answer": ""}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		{"id": "mcp_train_001", "question": "Use rail_12306 MCP tools to list the first three station names available in the local mock dataset in alphabetical order. Reply as a comma-separated list only.", "answer": ""}
		{"id": "mcp_train_002", "question": "Use rail_12306 MCP tools to find one route and reply with departure=<station>, arrival=<station>.", "answer": ""}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		{"id": "mcp_woocommerce_001", "question": "Use WooCommerce MCP tools to list the first three product names in alphabetical order. Reply as a comma-separated list only.", "answer": ""}
		{"id": "mcp_woocommerce_002", "question": "Use WooCommerce MCP tools to identify one customer email and that customer's order count. Reply as email=<email>, orders=<int>.", "answer": ""}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		{"id": "mcp_yahoo_finance_001", "question": "Use Yahoo Finance MCP tools to list the first three ticker symbols available in the mock dataset in alphabetical order. Reply as a comma-separated list only.", "answer": ""}
		{"id": "mcp_yahoo_finance_002", "question": "Use Yahoo Finance MCP tools to compare two available mock tickers and reply with the one that has the larger price as symbol=<ticker>.", "answer": ""}
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1,2 @@
		{"id": "mcp_youtube_001", "question": "Use YouTube MCP tools to list the first three video titles returned by the local mock dataset in alphabetical order. Reply as a comma-separated list only.", "answer": ""}
		{"id": "mcp_youtube_002", "question": "Use YouTube Transcript MCP tools to find one video and report the video id plus transcript language as video=<id>, language=<lang>.", "answer": ""}