Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
40 commits
Select commit. Hold Shift + click to select a range.
702999a
docs: add code backend single-repo vendoring design
Dingxingdi Apr 15, 2026
a49ce05
docs: align code real smoke plan with mcp
Dingxingdi Apr 16, 2026
1672c2d
Vendor code backend tools
Dingxingdi Apr 16, 2026
a2c0bd1
Fix vendored tool upstream contracts
Dingxingdi Apr 16, 2026
96dc16e
Rewrite code backend around vendored tools
Dingxingdi Apr 16, 2026
3255926
Make vendored bash tool non-blocking
Dingxingdi Apr 16, 2026
652cf7b
Clean up canceled vendored bash subprocesses
Dingxingdi Apr 16, 2026
255a377
Restore vendored bash text-mode decoding
Dingxingdi Apr 16, 2026
c20cc19
Align code backend schema docs and config tests
Dingxingdi Apr 16, 2026
5d38fda
Add code rollout real smoke gating
Dingxingdi Apr 16, 2026
2d52b29
Fix code real smoke collection gating
Dingxingdi Apr 16, 2026
7937314
Set code real smoke turn budget
Dingxingdi Apr 16, 2026
9be169a
Fix lazy loading for code backend imports
Dingxingdi Apr 16, 2026
3108a59
Adapt code backend smoke test to vendored config
Dingxingdi Apr 16, 2026
b7fc32d
Tighten code backend loader smoke test
Dingxingdi Apr 16, 2026
0a5baac
Restore eager resource exports
Dingxingdi Apr 16, 2026
38adac8
Align MCP tests with eager imports
Dingxingdi Apr 16, 2026
529dcb0
Merge main into code backend vendor tools
Dingxingdi Apr 16, 2026
f5e8ce7
Align MCP test and desktop env requirements with main
Dingxingdi Apr 16, 2026
6876b06
Restore MCP backend test formatting from main
Dingxingdi Apr 16, 2026
c1f6bf6
Fix vendored code bash and grep failures
Dingxingdi Apr 16, 2026
b6d8d7b
Add code backend failure regressions
Dingxingdi Apr 16, 2026
35149a5
Quote interpreter in bash backend test
Dingxingdi Apr 16, 2026
4f42a5f
docs: add MCP and Coding examples design spec
Dingxingdi Apr 20, 2026
e758ef2
docs: refine MCP and Coding examples spec
Dingxingdi Apr 20, 2026
85d4496
docs: correct MCP local_servers path in spec
Dingxingdi Apr 20, 2026
e58bc0d
test: lock MCP example sandbox contract
Dingxingdi Apr 21, 2026
cbd1ad4
Tighten MCP backend config template assertions
Dingxingdi Apr 21, 2026
54e712a
Add failing MCP example config and asset tests
Dingxingdi Apr 21, 2026
ad33358
feat: add MCP example assets
Dingxingdi Apr 21, 2026
8f736e7
docs: add MCP example guide
Dingxingdi Apr 21, 2026
22d3fc0
Clarify MCP server name mapping in MCPAgent doc
Dingxingdi Apr 21, 2026
6628e1e
test: add coding example contract tests
Dingxingdi Apr 21, 2026
f092925
Fix synthesis pytest import path
Dingxingdi Apr 21, 2026
4d4e670
feat: add Coding example assets
Dingxingdi Apr 21, 2026
d665dd9
fix: include demo helper module
Dingxingdi Apr 21, 2026
237ac5a
Fix coding smoke test runtime contract
Dingxingdi Apr 21, 2026
2900503
docs: add Coding example guide
Dingxingdi Apr 21, 2026
2fcf88e
Fix coding example doc contract
Dingxingdi Apr 21, 2026
c72c658
Add verified MCP and coding examples
Dingxingdi Apr 21, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions benchmark/code_benchmark.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"id": "code_read_001", "question": "Use code tools to inspect the demo repository. What default name does the app greet? Reply with the name only.", "answer": "AgentFlow"}
{"id": "code_edit_001", "question": "Update the demo repository so `python tests/smoke_test.py` succeeds. Preserve the config-driven greeting behavior, verify the fix with that command, then reply with exactly `smoke test passed`.", "answer": "smoke test passed", "metadata": {"target_files": ["app.py"], "check_command": "python tests/smoke_test.py"}}
2 changes: 2 additions & 0 deletions benchmark/mcp_canvas_benchmark.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"id": "mcp_canvas_001", "question": "Use Canvas MCP tools to list the first three course names in alphabetical order. Reply as a comma-separated list only.", "answer": ""}
{"id": "mcp_canvas_002", "question": "Use Canvas MCP tools to find one course and report its course code plus enrollment count as code=<code>, enrolled=<int>.", "answer": ""}
2 changes: 2 additions & 0 deletions benchmark/mcp_snowflake_benchmark.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"id": "mcp_snowflake_001", "question": "Use Snowflake MCP tools to list the first three tables visible in the default schema in alphabetical order. Reply as a comma-separated list only.", "answer": ""}
{"id": "mcp_snowflake_002", "question": "Use Snowflake MCP tools to compute one small aggregate from a mock table and reply as <metric>=<value>.", "answer": ""}
2 changes: 2 additions & 0 deletions benchmark/mcp_train_benchmark.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"id": "mcp_train_001", "question": "Use rail_12306 MCP tools to list the first three station names available in the local mock dataset in alphabetical order. Reply as a comma-separated list only.", "answer": ""}
{"id": "mcp_train_002", "question": "Use rail_12306 MCP tools to find one route and reply with departure=<station>, arrival=<station>.", "answer": ""}
2 changes: 2 additions & 0 deletions benchmark/mcp_woocommerce_benchmark.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"id": "mcp_woocommerce_001", "question": "Use WooCommerce MCP tools to list the first three product names in alphabetical order. Reply as a comma-separated list only.", "answer": ""}
{"id": "mcp_woocommerce_002", "question": "Use WooCommerce MCP tools to identify one customer email and that customer's order count. Reply as email=<email>, orders=<int>.", "answer": ""}
2 changes: 2 additions & 0 deletions benchmark/mcp_yahoo_finance_benchmark.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"id": "mcp_yahoo_finance_001", "question": "Use Yahoo Finance MCP tools to list the first three ticker symbols available in the mock dataset in alphabetical order. Reply as a comma-separated list only.", "answer": ""}
{"id": "mcp_yahoo_finance_002", "question": "Use Yahoo Finance MCP tools to compare two available mock tickers and reply with the one that has the larger price as symbol=<ticker>.", "answer": ""}
2 changes: 2 additions & 0 deletions benchmark/mcp_youtube_benchmark.jsonl
Original file line number Diff line number Diff line change
@@ -0,0 +1,2 @@
{"id": "mcp_youtube_001", "question": "Use YouTube MCP tools to list the first three video titles returned by the local mock dataset in alphabetical order. Reply as a comma-separated list only.", "answer": ""}
{"id": "mcp_youtube_002", "question": "Use YouTube Transcript MCP tools to find one video and report the video id plus transcript language as video=<id>, language=<lang>.", "answer": ""}
21 changes: 21 additions & 0 deletions configs/sandbox-server/code_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
{
"server": {
"url": "http://127.0.0.1:18890",
"port": 18890,
"session_ttl": 300
},
"resources": {
"code": {
"enabled": true,
"description": "Lightweight coding backend powered by vendored internal tools",
"backend_class": "sandbox.server.backends.resources.code.CodeBackend",
"config": {
"workspace_root": "/tmp/agentflow_code"
}
}
},
"warmup": {
"enabled": false,
"resources": []
}
}
18 changes: 15 additions & 3 deletions configs/sandbox-server/mcp_config.json
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,26 @@
"description": "Toolathlon-GYM MCP backend",
"backend_class": "sandbox.server.backends.resources.mcp.toolathlon_gym.ToolathlonGymBackend",
"config": {
"enabled_mcp_servers": ["filesystem", "terminal", "snowflake"],
"mcp_servers_path": "${TOOLATHLON_GYM_ROOT}/local_servers",
"enabled_mcp_servers": [
"canvas",
"snowflake",
"woocommerce",
"yahoo-finance",
"youtube",
"youtube-transcript",
"rail_12306",
"filesystem"
],
"workspace_root": "${TOOLATHLON_WORKSPACE_ROOT:-/tmp/agentflow_mcp}",
"env_overrides": {
"PGHOST": "${PGHOST:-toolathlon_pg}",
"PGHOST": "${PGHOST:-localhost}",
"PGPORT": "${PGPORT:-5432}",
"PGUSER": "${PGUSER:-eigent}",
"PGPASSWORD": "${PGPASSWORD:-camel}",
"PGDATABASE": "${PGDATABASE:-toolathlon_gym}"
"PGDATABASE": "${PGDATABASE:-toolathlon_gym}",
"CANVAS_DOMAIN": "${CANVAS_DOMAIN:-localhost:8080}",
"WORDPRESS_SITE_URL": "${WORDPRESS_SITE_URL:-http://localhost:8081}"
}
}
}
Expand Down
44 changes: 44 additions & 0 deletions configs/synthesis/code_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
{
"model_name": "openai/gpt-oss-120b",
"api_key": "${OPENAI_API_KEY}",
"base_url": "${OPENAI_API_URL}",
"max_depth": 10,
"branching_factor": 2,
"depth_threshold": 2,
"min_depth": 2,
"max_selected_traj": 1,
"path_similarity_threshold": 0.7,
"sandbox_server_url": "http://127.0.0.1:18890",
"sandbox_auto_start": false,
"sandbox_config_path": "configs/sandbox-server/code_config.json",
"resource_types": ["code"],
"resource_init_configs": {
"code": {
"content": {
"source_dir": "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo"
}
}
},
"available_tools": ["code-*"],
"sampling_tips": [
"Inspect the repository before proposing edits.",
"Use code-bash only for lightweight checks that fit the bundled demo repo."
],
"synthesis_tips": [
"Generate repo-grounded QA only.",
"Prefer file-path, function-behavior, and small edit-validation questions over open-ended design prompts."
],
"qa_examples": [
{
"question": "Which file stores the greeting suffix used by the demo app? Reply with the relative file path only.",
"answer": "config/app_config.json"
},
{
"question": "What string does `build_message()` return before any edits? Reply with the exact string only.",
"answer": "Hello, AgentFlow?"
}
],
"seed_description": "Coding demo repository prompts",
"seeds_file": "seeds/code/seeds.jsonl",
"output_dir": "results/code"
}
38 changes: 38 additions & 0 deletions configs/synthesis/mcp_canvas_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"model_name": "openai/gpt-oss-120b",
"api_key": "${OPENAI_API_KEY}",
"base_url": "${OPENAI_API_URL}",
"max_depth": 12,
"branching_factor": 2,
"depth_threshold": 2,
"min_depth": 2,
"max_selected_traj": 1,
"path_similarity_threshold": 0.7,
"sandbox_server_url": "http://127.0.0.1:18890",
"sandbox_auto_start": false,
"sandbox_config_path": "configs/sandbox-server/mcp_config.json",
"resource_types": ["mcp"],
"resource_init_configs": {},
"available_tools": ["mcp:canvas.*", "mcp:filesystem.*"],
"sampling_tips": [
"Inspect courses, assignments, and enrollments before drafting any question.",
"Prefer filesystem tools only for scratch notes or short saved artifacts."
],
"synthesis_tips": [
"Generate domain-grounded factual QA only.",
"Keep answers short and directly verifiable from Canvas MCP tool outputs."
],
"qa_examples": [
{
"question": "If a Canvas tool result shows a course with code HIST-201 and 28 enrolled students, how should the answer be formatted?",
"answer": "code=HIST-201, enrolled=28"
},
{
"question": "If the first three course names in alphabetical order are Biology 101, Chemistry Lab, and World History, how should the answer be returned?",
"answer": "Biology 101, Chemistry Lab, World History"
}
],
"seed_description": "Canvas MCP prompts",
"seeds_file": "seeds/mcp/canvas_seeds.jsonl",
"output_dir": "results/mcp_canvas"
}
38 changes: 38 additions & 0 deletions configs/synthesis/mcp_snowflake_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"model_name": "openai/gpt-oss-120b",
"api_key": "${OPENAI_API_KEY}",
"base_url": "${OPENAI_API_URL}",
"max_depth": 12,
"branching_factor": 2,
"depth_threshold": 2,
"min_depth": 2,
"max_selected_traj": 1,
"path_similarity_threshold": 0.7,
"sandbox_server_url": "http://127.0.0.1:18890",
"sandbox_auto_start": false,
"sandbox_config_path": "configs/sandbox-server/mcp_config.json",
"resource_types": ["mcp"],
"resource_init_configs": {},
"available_tools": ["mcp:snowflake.*", "mcp:filesystem.*"],
"sampling_tips": [
"Inspect schemas and table names before choosing a reporting question.",
"Prefer filesystem tools only for scratch notes or short saved artifacts."
],
"synthesis_tips": [
"Generate domain-grounded factual QA only.",
"Keep answers short and directly verifiable from Snowflake MCP query outputs."
],
"qa_examples": [
{
"question": "If the first three visible tables are CUSTOMERS, LINE_ITEMS, and ORDERS, how should the answer be returned?",
"answer": "CUSTOMERS, LINE_ITEMS, ORDERS"
},
{
"question": "If a Snowflake aggregate query returns total_orders=125, how should the answer be formatted?",
"answer": "total_orders=125"
}
],
"seed_description": "Snowflake MCP prompts",
"seeds_file": "seeds/mcp/snowflake_seeds.jsonl",
"output_dir": "results/mcp_snowflake"
}
38 changes: 38 additions & 0 deletions configs/synthesis/mcp_train_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"model_name": "openai/gpt-oss-120b",
"api_key": "${OPENAI_API_KEY}",
"base_url": "${OPENAI_API_URL}",
"max_depth": 12,
"branching_factor": 2,
"depth_threshold": 2,
"min_depth": 2,
"max_selected_traj": 1,
"path_similarity_threshold": 0.7,
"sandbox_server_url": "http://127.0.0.1:18890",
"sandbox_auto_start": false,
"sandbox_config_path": "configs/sandbox-server/mcp_config.json",
"resource_types": ["mcp"],
"resource_init_configs": {},
"available_tools": ["mcp:rail_12306.*", "mcp:filesystem.*"],
"sampling_tips": [
"Inspect stations, routes, and train options before drafting a travel lookup question.",
"Prefer filesystem tools only for scratch notes or short saved artifacts."
],
"synthesis_tips": [
"Generate domain-grounded factual QA only.",
"Keep answers short and directly verifiable from rail_12306 MCP tool outputs."
],
"qa_examples": [
{
"question": "If the first three station names alphabetically are Beijing, Hangzhou, and Shanghai, how should the answer be returned?",
"answer": "Beijing, Hangzhou, Shanghai"
},
{
"question": "If a route lookup shows departure Shanghai and arrival Nanjing, how should the answer be formatted?",
"answer": "departure=Shanghai, arrival=Nanjing"
}
],
"seed_description": "Train MCP prompts",
"seeds_file": "seeds/mcp/train_seeds.jsonl",
"output_dir": "results/mcp_train"
}
38 changes: 38 additions & 0 deletions configs/synthesis/mcp_woocommerce_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"model_name": "openai/gpt-oss-120b",
"api_key": "${OPENAI_API_KEY}",
"base_url": "${OPENAI_API_URL}",
"max_depth": 12,
"branching_factor": 2,
"depth_threshold": 2,
"min_depth": 2,
"max_selected_traj": 1,
"path_similarity_threshold": 0.7,
"sandbox_server_url": "http://127.0.0.1:18890",
"sandbox_auto_start": false,
"sandbox_config_path": "configs/sandbox-server/mcp_config.json",
"resource_types": ["mcp"],
"resource_init_configs": {},
"available_tools": ["mcp:woocommerce.*", "mcp:filesystem.*"],
"sampling_tips": [
"Inspect customers, products, and orders before selecting a small store question.",
"Prefer filesystem tools only for scratch notes or short saved artifacts."
],
"synthesis_tips": [
"Generate domain-grounded factual QA only.",
"Keep answers short and directly verifiable from WooCommerce MCP tool outputs."
],
"qa_examples": [
{
"question": "If the first three product names alphabetically are Backpack, Coffee Mug, and Notebook, how should the answer be returned?",
"answer": "Backpack, Coffee Mug, Notebook"
},
{
"question": "If a customer email is alex@example.com and that customer has 3 orders, how should the answer be formatted?",
"answer": "email=alex@example.com, orders=3"
}
],
"seed_description": "WooCommerce MCP prompts",
"seeds_file": "seeds/mcp/woocommerce_seeds.jsonl",
"output_dir": "results/mcp_woocommerce"
}
38 changes: 38 additions & 0 deletions configs/synthesis/mcp_yahoo_finance_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"model_name": "openai/gpt-oss-120b",
"api_key": "${OPENAI_API_KEY}",
"base_url": "${OPENAI_API_URL}",
"max_depth": 12,
"branching_factor": 2,
"depth_threshold": 2,
"min_depth": 2,
"max_selected_traj": 1,
"path_similarity_threshold": 0.7,
"sandbox_server_url": "http://127.0.0.1:18890",
"sandbox_auto_start": false,
"sandbox_config_path": "configs/sandbox-server/mcp_config.json",
"resource_types": ["mcp"],
"resource_init_configs": {},
"available_tools": ["mcp:yahoo-finance.*", "mcp:filesystem.*"],
"sampling_tips": [
"Inspect available tickers and quote fields before drafting a finance lookup question.",
"Prefer filesystem tools only for scratch notes or short saved artifacts."
],
"synthesis_tips": [
"Generate domain-grounded factual QA only.",
"Keep answers short and directly verifiable from Yahoo Finance MCP tool outputs."
],
"qa_examples": [
{
"question": "If the available tickers sorted alphabetically begin with AAPL, MSFT, and NVDA, how should the answer be returned?",
"answer": "AAPL, MSFT, NVDA"
},
{
"question": "If one comparison shows MSFT has the larger price, how should the answer be formatted?",
"answer": "symbol=MSFT"
}
],
"seed_description": "Yahoo Finance MCP prompts",
"seeds_file": "seeds/mcp/yahoo_finance_seeds.jsonl",
"output_dir": "results/mcp_yahoo_finance"
}
38 changes: 38 additions & 0 deletions configs/synthesis/mcp_youtube_config.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
{
"model_name": "openai/gpt-oss-120b",
"api_key": "${OPENAI_API_KEY}",
"base_url": "${OPENAI_API_URL}",
"max_depth": 12,
"branching_factor": 2,
"depth_threshold": 2,
"min_depth": 2,
"max_selected_traj": 1,
"path_similarity_threshold": 0.7,
"sandbox_server_url": "http://127.0.0.1:18890",
"sandbox_auto_start": false,
"sandbox_config_path": "configs/sandbox-server/mcp_config.json",
"resource_types": ["mcp"],
"resource_init_configs": {},
"available_tools": ["mcp:youtube.*", "mcp:youtube-transcript.*", "mcp:filesystem.*"],
"sampling_tips": [
"Inspect video metadata first, then use transcript tools only when language or transcript details matter.",
"Prefer filesystem tools only for scratch notes or short saved artifacts."
],
"synthesis_tips": [
"Generate domain-grounded factual QA only.",
"Keep answers short and directly verifiable from YouTube MCP tool outputs."
],
"qa_examples": [
{
"question": "If the first three video titles alphabetically are Intro to Databases, MCP Demo, and Testing Walkthrough, how should the answer be returned?",
"answer": "Intro to Databases, MCP Demo, Testing Walkthrough"
},
{
"question": "If a transcript lookup shows video id abc123 with language en, how should the answer be formatted?",
"answer": "video=abc123, language=en"
}
],
"seed_description": "YouTube MCP prompts",
"seeds_file": "seeds/mcp/youtube_seeds.jsonl",
"output_dir": "results/mcp_youtube"
}
31 changes: 31 additions & 0 deletions configs/trajectory/code_trajectory.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,31 @@
{
"benchmark_name": "code_trajectory",
"model_name": "openai/gpt-oss-120b",
"api_key": "${OPENAI_API_KEY}",
"base_url": "${OPENAI_API_URL}",
"max_turns": 12,
"available_tools": ["code-*"],
"sandbox_server_url": "http://127.0.0.1:18890",
"sandbox_auto_start": false,
"sandbox_config_path": "configs/sandbox-server/code_config.json",
"resource_types": ["code"],
"resource_init_configs": {
"code": {
"content": {
"source_dir": "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo"
}
}
},
"system_prompt": [
"You are a coding assistant working inside a small repository.",
"Inspect files before editing them.",
"When a task asks for verification, run the requested command inside the coding workspace before giving the final answer."
],
"evaluate_results": false,
"data_path": "benchmark/code_benchmark.jsonl",
"output_dir": "trajectory_results/code",
"save_results": true,
"save_trajectories": true,
"trajectory_only": true,
"save_summary": false
}
Loading