diff --git a/benchmark/code_benchmark.jsonl b/benchmark/code_benchmark.jsonl new file mode 100644 index 0000000..2d6c7b2 --- /dev/null +++ b/benchmark/code_benchmark.jsonl @@ -0,0 +1,2 @@ +{"id": "code_read_001", "question": "Use code tools to inspect the demo repository. What default name does the app greet? Reply with the name only.", "answer": "AgentFlow"} +{"id": "code_edit_001", "question": "Update the demo repository so `python tests/smoke_test.py` succeeds. Preserve the config-driven greeting behavior, verify the fix with that command, then reply with exactly `smoke test passed`.", "answer": "smoke test passed", "metadata": {"target_files": ["app.py"], "check_command": "python tests/smoke_test.py"}} diff --git a/benchmark/mcp_canvas_benchmark.jsonl b/benchmark/mcp_canvas_benchmark.jsonl new file mode 100644 index 0000000..91076fd --- /dev/null +++ b/benchmark/mcp_canvas_benchmark.jsonl @@ -0,0 +1,2 @@ +{"id": "mcp_canvas_001", "question": "Use Canvas MCP tools to list the first three course names in alphabetical order. Reply as a comma-separated list only.", "answer": ""} +{"id": "mcp_canvas_002", "question": "Use Canvas MCP tools to find one course and report its course code plus enrollment count as code=, enrolled=.", "answer": ""} diff --git a/benchmark/mcp_snowflake_benchmark.jsonl b/benchmark/mcp_snowflake_benchmark.jsonl new file mode 100644 index 0000000..5612f4e --- /dev/null +++ b/benchmark/mcp_snowflake_benchmark.jsonl @@ -0,0 +1,2 @@ +{"id": "mcp_snowflake_001", "question": "Use Snowflake MCP tools to list the first three tables visible in the default schema in alphabetical order. 
Reply as a comma-separated list only.", "answer": ""} +{"id": "mcp_snowflake_002", "question": "Use Snowflake MCP tools to compute one small aggregate from a mock table and reply as key=value.", "answer": ""} diff --git a/benchmark/mcp_train_benchmark.jsonl b/benchmark/mcp_train_benchmark.jsonl new file mode 100644 index 0000000..40897a7 --- /dev/null +++ b/benchmark/mcp_train_benchmark.jsonl @@ -0,0 +1,2 @@ +{"id": "mcp_train_001", "question": "Use rail_12306 MCP tools to list the first three station names available in the local mock dataset in alphabetical order. Reply as a comma-separated list only.", "answer": ""} +{"id": "mcp_train_002", "question": "Use rail_12306 MCP tools to find one route and reply with departure=, arrival=.", "answer": ""} diff --git a/benchmark/mcp_woocommerce_benchmark.jsonl b/benchmark/mcp_woocommerce_benchmark.jsonl new file mode 100644 index 0000000..5af2943 --- /dev/null +++ b/benchmark/mcp_woocommerce_benchmark.jsonl @@ -0,0 +1,2 @@ +{"id": "mcp_woocommerce_001", "question": "Use WooCommerce MCP tools to list the first three product names in alphabetical order. Reply as a comma-separated list only.", "answer": ""} +{"id": "mcp_woocommerce_002", "question": "Use WooCommerce MCP tools to identify one customer email and that customer's order count. Reply as email=, orders=.", "answer": ""} diff --git a/benchmark/mcp_yahoo_finance_benchmark.jsonl b/benchmark/mcp_yahoo_finance_benchmark.jsonl new file mode 100644 index 0000000..66fb80e --- /dev/null +++ b/benchmark/mcp_yahoo_finance_benchmark.jsonl @@ -0,0 +1,2 @@ +{"id": "mcp_yahoo_finance_001", "question": "Use Yahoo Finance MCP tools to list the first three ticker symbols available in the mock dataset in alphabetical order. 
Reply as a comma-separated list only.", "answer": ""} +{"id": "mcp_yahoo_finance_002", "question": "Use Yahoo Finance MCP tools to compare two available mock tickers and reply with the one that has the larger price as symbol=.", "answer": ""} diff --git a/benchmark/mcp_youtube_benchmark.jsonl b/benchmark/mcp_youtube_benchmark.jsonl new file mode 100644 index 0000000..7528cc2 --- /dev/null +++ b/benchmark/mcp_youtube_benchmark.jsonl @@ -0,0 +1,2 @@ +{"id": "mcp_youtube_001", "question": "Use YouTube MCP tools to list the first three video titles returned by the local mock dataset in alphabetical order. Reply as a comma-separated list only.", "answer": ""} +{"id": "mcp_youtube_002", "question": "Use YouTube Transcript MCP tools to find one video and report the video id plus transcript language as video=, language=.", "answer": ""} diff --git a/configs/sandbox-server/code_config.json b/configs/sandbox-server/code_config.json new file mode 100644 index 0000000..22bd72a --- /dev/null +++ b/configs/sandbox-server/code_config.json @@ -0,0 +1,21 @@ +{ + "server": { + "url": "http://127.0.0.1:18890", + "port": 18890, + "session_ttl": 300 + }, + "resources": { + "code": { + "enabled": true, + "description": "Lightweight coding backend powered by vendored internal tools", + "backend_class": "sandbox.server.backends.resources.code.CodeBackend", + "config": { + "workspace_root": "/tmp/agentflow_code" + } + } + }, + "warmup": { + "enabled": false, + "resources": [] + } +} diff --git a/configs/sandbox-server/mcp_config.json b/configs/sandbox-server/mcp_config.json index 18baf9c..639597b 100644 --- a/configs/sandbox-server/mcp_config.json +++ b/configs/sandbox-server/mcp_config.json @@ -10,14 +10,26 @@ "description": "Toolathlon-GYM MCP backend", "backend_class": "sandbox.server.backends.resources.mcp.toolathlon_gym.ToolathlonGymBackend", "config": { - "enabled_mcp_servers": ["filesystem", "terminal", "snowflake"], + "mcp_servers_path": "${TOOLATHLON_GYM_ROOT}/local_servers", + 
"enabled_mcp_servers": [ + "canvas", + "snowflake", + "woocommerce", + "yahoo-finance", + "youtube", + "youtube-transcript", + "rail_12306", + "filesystem" + ], "workspace_root": "${TOOLATHLON_WORKSPACE_ROOT:-/tmp/agentflow_mcp}", "env_overrides": { - "PGHOST": "${PGHOST:-toolathlon_pg}", + "PGHOST": "${PGHOST:-localhost}", "PGPORT": "${PGPORT:-5432}", "PGUSER": "${PGUSER:-eigent}", "PGPASSWORD": "${PGPASSWORD:-camel}", - "PGDATABASE": "${PGDATABASE:-toolathlon_gym}" + "PGDATABASE": "${PGDATABASE:-toolathlon_gym}", + "CANVAS_DOMAIN": "${CANVAS_DOMAIN:-localhost:8080}", + "WORDPRESS_SITE_URL": "${WORDPRESS_SITE_URL:-http://localhost:8081}" } } } diff --git a/configs/synthesis/code_config.json b/configs/synthesis/code_config.json new file mode 100644 index 0000000..eb8bb98 --- /dev/null +++ b/configs/synthesis/code_config.json @@ -0,0 +1,44 @@ +{ + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_depth": 10, + "branching_factor": 2, + "depth_threshold": 2, + "min_depth": 2, + "max_selected_traj": 1, + "path_similarity_threshold": 0.7, + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/code_config.json", + "resource_types": ["code"], + "resource_init_configs": { + "code": { + "content": { + "source_dir": "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo" + } + } + }, + "available_tools": ["code-*"], + "sampling_tips": [ + "Inspect the repository before proposing edits.", + "Use code-bash only for lightweight checks that fit the bundled demo repo." + ], + "synthesis_tips": [ + "Generate repo-grounded QA only.", + "Prefer file-path, function-behavior, and small edit-validation questions over open-ended design prompts." + ], + "qa_examples": [ + { + "question": "Which file stores the greeting suffix used by the demo app? 
Reply with the relative file path only.", + "answer": "config/app_config.json" + }, + { + "question": "What string does `build_message()` return before any edits? Reply with the exact string only.", + "answer": "Hello, AgentFlow?" + } + ], + "seed_description": "Coding demo repository prompts", + "seeds_file": "seeds/code/seeds.jsonl", + "output_dir": "results/code" +} diff --git a/configs/synthesis/mcp_canvas_config.json b/configs/synthesis/mcp_canvas_config.json new file mode 100644 index 0000000..efd7d85 --- /dev/null +++ b/configs/synthesis/mcp_canvas_config.json @@ -0,0 +1,38 @@ +{ + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_depth": 12, + "branching_factor": 2, + "depth_threshold": 2, + "min_depth": 2, + "max_selected_traj": 1, + "path_similarity_threshold": 0.7, + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/mcp_config.json", + "resource_types": ["mcp"], + "resource_init_configs": {}, + "available_tools": ["mcp:canvas.*", "mcp:filesystem.*"], + "sampling_tips": [ + "Inspect courses, assignments, and enrollments before drafting any question.", + "Prefer filesystem tools only for scratch notes or short saved artifacts." + ], + "synthesis_tips": [ + "Generate domain-grounded factual QA only.", + "Keep answers short and directly verifiable from Canvas MCP tool outputs." 
+ ], + "qa_examples": [ + { + "question": "If a Canvas tool result shows a course with code HIST-201 and 28 enrolled students, how should the answer be formatted?", + "answer": "code=HIST-201, enrolled=28" + }, + { + "question": "If the first three course names in alphabetical order are Biology 101, Chemistry Lab, and World History, how should the answer be returned?", + "answer": "Biology 101, Chemistry Lab, World History" + } + ], + "seed_description": "Canvas MCP prompts", + "seeds_file": "seeds/mcp/canvas_seeds.jsonl", + "output_dir": "results/mcp_canvas" +} diff --git a/configs/synthesis/mcp_snowflake_config.json b/configs/synthesis/mcp_snowflake_config.json new file mode 100644 index 0000000..063dc38 --- /dev/null +++ b/configs/synthesis/mcp_snowflake_config.json @@ -0,0 +1,38 @@ +{ + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_depth": 12, + "branching_factor": 2, + "depth_threshold": 2, + "min_depth": 2, + "max_selected_traj": 1, + "path_similarity_threshold": 0.7, + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/mcp_config.json", + "resource_types": ["mcp"], + "resource_init_configs": {}, + "available_tools": ["mcp:snowflake.*", "mcp:filesystem.*"], + "sampling_tips": [ + "Inspect schemas and table names before choosing a reporting question.", + "Prefer filesystem tools only for scratch notes or short saved artifacts." + ], + "synthesis_tips": [ + "Generate domain-grounded factual QA only.", + "Keep answers short and directly verifiable from Snowflake MCP query outputs." 
+ ], + "qa_examples": [ + { + "question": "If the first three visible tables are CUSTOMERS, LINE_ITEMS, and ORDERS, how should the answer be returned?", + "answer": "CUSTOMERS, LINE_ITEMS, ORDERS" + }, + { + "question": "If a Snowflake aggregate query returns total_orders=125, how should the answer be formatted?", + "answer": "total_orders=125" + } + ], + "seed_description": "Snowflake MCP prompts", + "seeds_file": "seeds/mcp/snowflake_seeds.jsonl", + "output_dir": "results/mcp_snowflake" +} diff --git a/configs/synthesis/mcp_train_config.json b/configs/synthesis/mcp_train_config.json new file mode 100644 index 0000000..5bd67b8 --- /dev/null +++ b/configs/synthesis/mcp_train_config.json @@ -0,0 +1,38 @@ +{ + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_depth": 12, + "branching_factor": 2, + "depth_threshold": 2, + "min_depth": 2, + "max_selected_traj": 1, + "path_similarity_threshold": 0.7, + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/mcp_config.json", + "resource_types": ["mcp"], + "resource_init_configs": {}, + "available_tools": ["mcp:rail_12306.*", "mcp:filesystem.*"], + "sampling_tips": [ + "Inspect stations, routes, and train options before drafting a travel lookup question.", + "Prefer filesystem tools only for scratch notes or short saved artifacts." + ], + "synthesis_tips": [ + "Generate domain-grounded factual QA only.", + "Keep answers short and directly verifiable from rail_12306 MCP tool outputs." 
+ ], + "qa_examples": [ + { + "question": "If the first three station names alphabetically are Beijing, Hangzhou, and Shanghai, how should the answer be returned?", + "answer": "Beijing, Hangzhou, Shanghai" + }, + { + "question": "If a route lookup shows departure Shanghai and arrival Nanjing, how should the answer be formatted?", + "answer": "departure=Shanghai, arrival=Nanjing" + } + ], + "seed_description": "Train MCP prompts", + "seeds_file": "seeds/mcp/train_seeds.jsonl", + "output_dir": "results/mcp_train" +} diff --git a/configs/synthesis/mcp_woocommerce_config.json b/configs/synthesis/mcp_woocommerce_config.json new file mode 100644 index 0000000..926b9fc --- /dev/null +++ b/configs/synthesis/mcp_woocommerce_config.json @@ -0,0 +1,38 @@ +{ + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_depth": 12, + "branching_factor": 2, + "depth_threshold": 2, + "min_depth": 2, + "max_selected_traj": 1, + "path_similarity_threshold": 0.7, + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/mcp_config.json", + "resource_types": ["mcp"], + "resource_init_configs": {}, + "available_tools": ["mcp:woocommerce.*", "mcp:filesystem.*"], + "sampling_tips": [ + "Inspect customers, products, and orders before selecting a small store question.", + "Prefer filesystem tools only for scratch notes or short saved artifacts." + ], + "synthesis_tips": [ + "Generate domain-grounded factual QA only.", + "Keep answers short and directly verifiable from WooCommerce MCP tool outputs." 
+ ], + "qa_examples": [ + { + "question": "If the first three product names alphabetically are Backpack, Coffee Mug, and Notebook, how should the answer be returned?", + "answer": "Backpack, Coffee Mug, Notebook" + }, + { + "question": "If a customer email is alex@example.com and that customer has 3 orders, how should the answer be formatted?", + "answer": "email=alex@example.com, orders=3" + } + ], + "seed_description": "WooCommerce MCP prompts", + "seeds_file": "seeds/mcp/woocommerce_seeds.jsonl", + "output_dir": "results/mcp_woocommerce" +} diff --git a/configs/synthesis/mcp_yahoo_finance_config.json b/configs/synthesis/mcp_yahoo_finance_config.json new file mode 100644 index 0000000..3eb2f4d --- /dev/null +++ b/configs/synthesis/mcp_yahoo_finance_config.json @@ -0,0 +1,38 @@ +{ + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_depth": 12, + "branching_factor": 2, + "depth_threshold": 2, + "min_depth": 2, + "max_selected_traj": 1, + "path_similarity_threshold": 0.7, + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/mcp_config.json", + "resource_types": ["mcp"], + "resource_init_configs": {}, + "available_tools": ["mcp:yahoo-finance.*", "mcp:filesystem.*"], + "sampling_tips": [ + "Inspect available tickers and quote fields before drafting a finance lookup question.", + "Prefer filesystem tools only for scratch notes or short saved artifacts." + ], + "synthesis_tips": [ + "Generate domain-grounded factual QA only.", + "Keep answers short and directly verifiable from Yahoo Finance MCP tool outputs." 
+ ], + "qa_examples": [ + { + "question": "If the available tickers sorted alphabetically begin with AAPL, MSFT, and NVDA, how should the answer be returned?", + "answer": "AAPL, MSFT, NVDA" + }, + { + "question": "If one comparison shows MSFT has the larger price, how should the answer be formatted?", + "answer": "symbol=MSFT" + } + ], + "seed_description": "Yahoo Finance MCP prompts", + "seeds_file": "seeds/mcp/yahoo_finance_seeds.jsonl", + "output_dir": "results/mcp_yahoo_finance" +} diff --git a/configs/synthesis/mcp_youtube_config.json b/configs/synthesis/mcp_youtube_config.json new file mode 100644 index 0000000..dfd987f --- /dev/null +++ b/configs/synthesis/mcp_youtube_config.json @@ -0,0 +1,38 @@ +{ + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_depth": 12, + "branching_factor": 2, + "depth_threshold": 2, + "min_depth": 2, + "max_selected_traj": 1, + "path_similarity_threshold": 0.7, + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/mcp_config.json", + "resource_types": ["mcp"], + "resource_init_configs": {}, + "available_tools": ["mcp:youtube.*", "mcp:youtube-transcript.*", "mcp:filesystem.*"], + "sampling_tips": [ + "Inspect video metadata first, then use transcript tools only when language or transcript details matter.", + "Prefer filesystem tools only for scratch notes or short saved artifacts." + ], + "synthesis_tips": [ + "Generate domain-grounded factual QA only.", + "Keep answers short and directly verifiable from YouTube MCP tool outputs." 
+ ], + "qa_examples": [ + { + "question": "If the first three video titles alphabetically are Intro to Databases, MCP Demo, and Testing Walkthrough, how should the answer be returned?", + "answer": "Intro to Databases, MCP Demo, Testing Walkthrough" + }, + { + "question": "If a transcript lookup shows video id abc123 with language en, how should the answer be formatted?", + "answer": "video=abc123, language=en" + } + ], + "seed_description": "YouTube MCP prompts", + "seeds_file": "seeds/mcp/youtube_seeds.jsonl", + "output_dir": "results/mcp_youtube" +} diff --git a/configs/trajectory/code_trajectory.json b/configs/trajectory/code_trajectory.json new file mode 100644 index 0000000..2265f0a --- /dev/null +++ b/configs/trajectory/code_trajectory.json @@ -0,0 +1,31 @@ +{ + "benchmark_name": "code_trajectory", + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_turns": 12, + "available_tools": ["code-*"], + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/code_config.json", + "resource_types": ["code"], + "resource_init_configs": { + "code": { + "content": { + "source_dir": "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo" + } + } + }, + "system_prompt": [ + "You are a coding assistant working inside a small repository.", + "Inspect files before editing them.", + "When a task asks for verification, run the requested command inside the coding workspace before giving the final answer." 
+ ], + "evaluate_results": false, + "data_path": "benchmark/code_benchmark.jsonl", + "output_dir": "trajectory_results/code", + "save_results": true, + "save_trajectories": true, + "trajectory_only": true, + "save_summary": false +} diff --git a/configs/trajectory/mcp_canvas_trajectory.json b/configs/trajectory/mcp_canvas_trajectory.json new file mode 100644 index 0000000..b6458ed --- /dev/null +++ b/configs/trajectory/mcp_canvas_trajectory.json @@ -0,0 +1,24 @@ +{ + "benchmark_name": "mcp_canvas_trajectory", + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_turns": 20, + "available_tools": ["mcp:canvas.*", "mcp:filesystem.*"], + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/mcp_config.json", + "resource_types": ["mcp"], + "resource_init_configs": {}, + "system_prompt": [ + "You are an MCP domain assistant.", + "Use only the available MCP tools to answer the question.", + "Reply with the final answer only unless the question explicitly asks for another format." 
+ ], + "evaluate_results": false, + "data_path": "benchmark/mcp_canvas_benchmark.jsonl", + "output_dir": "trajectory_results/mcp_canvas", + "save_results": true, + "save_trajectories": true, + "trajectory_only": true +} diff --git a/configs/trajectory/mcp_snowflake_trajectory.json b/configs/trajectory/mcp_snowflake_trajectory.json new file mode 100644 index 0000000..a087b61 --- /dev/null +++ b/configs/trajectory/mcp_snowflake_trajectory.json @@ -0,0 +1,24 @@ +{ + "benchmark_name": "mcp_snowflake_trajectory", + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_turns": 20, + "available_tools": ["mcp:snowflake.*", "mcp:filesystem.*"], + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/mcp_config.json", + "resource_types": ["mcp"], + "resource_init_configs": {}, + "system_prompt": [ + "You are an MCP domain assistant.", + "Use only the available MCP tools to answer the question.", + "Reply with the final answer only unless the question explicitly asks for another format." 
+ ], + "evaluate_results": false, + "data_path": "benchmark/mcp_snowflake_benchmark.jsonl", + "output_dir": "trajectory_results/mcp_snowflake", + "save_results": true, + "save_trajectories": true, + "trajectory_only": true +} diff --git a/configs/trajectory/mcp_train_trajectory.json b/configs/trajectory/mcp_train_trajectory.json new file mode 100644 index 0000000..d29e844 --- /dev/null +++ b/configs/trajectory/mcp_train_trajectory.json @@ -0,0 +1,24 @@ +{ + "benchmark_name": "mcp_train_trajectory", + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_turns": 20, + "available_tools": ["mcp:rail_12306.*", "mcp:filesystem.*"], + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/mcp_config.json", + "resource_types": ["mcp"], + "resource_init_configs": {}, + "system_prompt": [ + "You are an MCP domain assistant.", + "Use only the available MCP tools to answer the question.", + "Reply with the final answer only unless the question explicitly asks for another format." 
+ ], + "evaluate_results": false, + "data_path": "benchmark/mcp_train_benchmark.jsonl", + "output_dir": "trajectory_results/mcp_train", + "save_results": true, + "save_trajectories": true, + "trajectory_only": true +} diff --git a/configs/trajectory/mcp_woocommerce_trajectory.json b/configs/trajectory/mcp_woocommerce_trajectory.json new file mode 100644 index 0000000..0f17d05 --- /dev/null +++ b/configs/trajectory/mcp_woocommerce_trajectory.json @@ -0,0 +1,24 @@ +{ + "benchmark_name": "mcp_woocommerce_trajectory", + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_turns": 20, + "available_tools": ["mcp:woocommerce.*", "mcp:filesystem.*"], + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/mcp_config.json", + "resource_types": ["mcp"], + "resource_init_configs": {}, + "system_prompt": [ + "You are an MCP domain assistant.", + "Use only the available MCP tools to answer the question.", + "Reply with the final answer only unless the question explicitly asks for another format." 
+ ], + "evaluate_results": false, + "data_path": "benchmark/mcp_woocommerce_benchmark.jsonl", + "output_dir": "trajectory_results/mcp_woocommerce", + "save_results": true, + "save_trajectories": true, + "trajectory_only": true +} diff --git a/configs/trajectory/mcp_yahoo_finance_trajectory.json b/configs/trajectory/mcp_yahoo_finance_trajectory.json new file mode 100644 index 0000000..e4ad8b2 --- /dev/null +++ b/configs/trajectory/mcp_yahoo_finance_trajectory.json @@ -0,0 +1,24 @@ +{ + "benchmark_name": "mcp_yahoo_finance_trajectory", + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_turns": 20, + "available_tools": ["mcp:yahoo-finance.*", "mcp:filesystem.*"], + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/mcp_config.json", + "resource_types": ["mcp"], + "resource_init_configs": {}, + "system_prompt": [ + "You are an MCP domain assistant.", + "Use only the available MCP tools to answer the question.", + "Reply with the final answer only unless the question explicitly asks for another format." 
+ ], + "evaluate_results": false, + "data_path": "benchmark/mcp_yahoo_finance_benchmark.jsonl", + "output_dir": "trajectory_results/mcp_yahoo_finance", + "save_results": true, + "save_trajectories": true, + "trajectory_only": true +} diff --git a/configs/trajectory/mcp_youtube_trajectory.json b/configs/trajectory/mcp_youtube_trajectory.json new file mode 100644 index 0000000..17bed8c --- /dev/null +++ b/configs/trajectory/mcp_youtube_trajectory.json @@ -0,0 +1,24 @@ +{ + "benchmark_name": "mcp_youtube_trajectory", + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_turns": 20, + "available_tools": ["mcp:youtube.*", "mcp:youtube-transcript.*", "mcp:filesystem.*"], + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/mcp_config.json", + "resource_types": ["mcp"], + "resource_init_configs": {}, + "system_prompt": [ + "You are an MCP domain assistant.", + "Use only the available MCP tools to answer the question.", + "Reply with the final answer only unless the question explicitly asks for another format." + ], + "evaluate_results": false, + "data_path": "benchmark/mcp_youtube_benchmark.jsonl", + "output_dir": "trajectory_results/mcp_youtube", + "save_results": true, + "save_trajectories": true, + "trajectory_only": true +} diff --git a/docs/superpowers/plans/2026-04-16-code-backend-single-repo-vendoring.md b/docs/superpowers/plans/2026-04-16-code-backend-single-repo-vendoring.md new file mode 100644 index 0000000..e282aed --- /dev/null +++ b/docs/superpowers/plans/2026-04-16-code-backend-single-repo-vendoring.md @@ -0,0 +1,727 @@ +# Code Backend Single-Repository Vendoring Implementation Plan + +> **For agentic workers:** REQUIRED: Use superpowers:subagent-driven-development (if subagents available) or superpowers:executing-plans to implement this plan. Steps use checkbox (`- [ ]`) syntax for tracking. 
+ +**Goal:** Make AgentFlow's `code` backend self-contained by vendoring the six upstream-style code tools into this repository, removing `claude_code_root`/`allow_bash`/`bash_timeout_seconds`, and adding an opt-in real rollout smoke. + +**Architecture:** Keep `code` as a session-scoped sandbox backend with AgentFlow-owned workspace lifecycle and path-boundary enforcement. Replace all external-root loading with a small internal `code_vendor` package, route all six tools through the same vendored `tool.call(...)` path, and verify the result with updated sandbox tests plus an MCP-style env-gated real rollout smoke. + +**Tech Stack:** Python 3.10, pytest, FastAPI sandbox server, pathlib/shutil, vendored upstream-style tool classes, RolloutPipeline, real LLM smoke via env-gated pytest collection + pytest CLI options + +--- + +**Known baseline:** In this worktree, `PYTHONPATH=. pytest -q sandbox/tests/test_code_backend.py sandbox/tests/test_code_tool_schemas.py sandbox/tests/test_sandbox_config_loading.py rollout/tests/test_config.py rollout/tests/test_integration.py` passes (`68 passed, 2 skipped`). `pip install -r requirements.txt` still hits the existing unrelated `pyxcursor` dependency-resolution issue from the VM stack; do not block this plan on that package. + +## File Map + +### New files + +- `sandbox/server/backends/resources/code_vendor/__init__.py` + Internal package export surface for the vendored six-tool compatibility layer. +- `sandbox/server/backends/resources/code_vendor/tool.py` + Minimal vendored `Tool` base class used by the six code tools. +- `sandbox/server/backends/resources/code_vendor/file_tools.py` + Vendored upstream-style `ReadTool`, `GlobTool`, `GrepTool`, and `BashTool`. +- `sandbox/server/backends/resources/code_vendor/edit_tools.py` + Vendored upstream-style `EditTool` and `WriteTool`. +- `sandbox/tests/test_code_vendor_tools.py` + Focused behavior-contract tests for the vendored tool package independent of `CodeBackend`. 
+- `rollout/tests/conftest.py` + Mirror the MCP real-smoke collection-gating pattern for `code` rollout tests and add pytest CLI options for real-smoke credentials. +- `rollout/tests/test_code_real_smoke.py` + Opt-in real rollout smoke that starts sandbox, uses the real LLM path, and proves at least one real `code:*` call happens against a temporary fixture repo. + +### Modified files + +- `sandbox/server/backends/resources/code.py` + Remove external-root logic and `bash` special casing; load vendored tools directly and execute all six through the same code path. +- `configs/sandbox-server/code_config.json` + Remove deleted config fields and present the `code` backend as a native AgentFlow capability with only `workspace_root`. +- `sandbox/tests/test_code_backend.py` + Delete obsolete external-root/bash-wrapper tests, keep valid workspace/boundary coverage, and rewrite tool-loading expectations around internal vendoring. +- `sandbox/tool_schemas/code_tools.py` + Update `code-bash` description to remove backend-config-dependent availability wording. +- `sandbox/tests/test_code_tool_schemas.py` + Update schema assertions to match the new `code-bash` description and keep the rest of the parameter contract coverage. +- `sandbox/tests/test_sandbox_config_loading.py` + Replace the `CLAUDE_CODE_ROOT` env-expansion test with a config-loading assertion for the simplified `code` backend template. + +### Intentionally unchanged files + +- `rollout/core/config.py` +- `rollout/core/runner.py` +- `rollout/pipeline.py` +- `sandbox/tool_schemas/__init__.py` +- `sandbox/server/backends/resources/__init__.py` + +The rollout and backend registration plumbing already supports the target design. Do not widen scope into rollout engine rewrites or unrelated backend refactors. 
+ +## Chunk 1: Vendor the Upstream-Style Tool Subset + +### Task 1: Add the internal `code_vendor` package and behavior-contract tests + +**Files:** +- Create: `sandbox/server/backends/resources/code_vendor/__init__.py` +- Create: `sandbox/server/backends/resources/code_vendor/tool.py` +- Create: `sandbox/server/backends/resources/code_vendor/file_tools.py` +- Create: `sandbox/server/backends/resources/code_vendor/edit_tools.py` +- Create: `sandbox/tests/test_code_vendor_tools.py` + +- [ ] **Step 1: Write the failing vendored-tool tests** + +Create `sandbox/tests/test_code_vendor_tools.py` with focused tests like: + +```python +import asyncio +from pathlib import Path +from types import SimpleNamespace + +from sandbox.server.backends.resources.code_vendor.file_tools import ( + BashTool, + GlobTool, + GrepTool, + ReadTool, +) +from sandbox.server.backends.resources.code_vendor.edit_tools import EditTool, WriteTool + + +def test_read_tool_returns_line_numbered_content(tmp_path): + target = tmp_path / "demo.py" + target.write_text("first\nsecond\n", encoding="utf-8") + ctx = SimpleNamespace(cwd=str(tmp_path)) + + result = asyncio.run(ReadTool().call({"file_path": str(target)}, ctx)) + + assert "1" in result + assert "first" in result + assert "second" in result + + +def test_edit_tool_requires_unique_match(tmp_path): + target = tmp_path / "demo.py" + target.write_text("x = 1\nx = 1\n", encoding="utf-8") + ctx = SimpleNamespace(cwd=str(tmp_path)) + + result = asyncio.run( + EditTool().call( + { + "file_path": str(target), + "old_string": "x = 1", + "new_string": "x = 2", + }, + ctx, + ) + ) + + assert result.startswith("Error:") + assert "appears" in result + + +def test_bash_tool_combines_stdout_and_stderr(tmp_path): + ctx = SimpleNamespace(cwd=str(tmp_path)) + + result = asyncio.run( + BashTool().call( + { + "command": "python -c \"import sys; print('out'); print('err', file=sys.stderr)\"" + }, + ctx, + ) + ) + + assert result == "out\n\n[stderr]:\nerr" +``` + +- [ ] 
**Step 2: Run the vendored-tool tests to verify they fail** + +Run: + +```bash +PYTHONPATH=. pytest -q sandbox/tests/test_code_vendor_tools.py +``` + +Expected: FAIL because the `code_vendor` package does not exist yet. + +- [ ] **Step 3: Add the vendored tool package** + +Create `sandbox/server/backends/resources/code_vendor/tool.py` with a minimal base: + +```python +from __future__ import annotations + +from abc import ABC, abstractmethod + + +class Tool(ABC): + name: str + description: str + + @property + @abstractmethod + def input_schema(self) -> dict: + ... + + @abstractmethod + async def call(self, args: dict, ctx) -> str: + ... + + def is_read_only(self, args: dict) -> bool: + return False +``` + +Create `sandbox/server/backends/resources/code_vendor/file_tools.py` with vendored upstream-style implementations: + +```python +from __future__ import annotations + +import subprocess +from pathlib import Path + +from .tool import Tool + + +class BashTool(Tool): + name = "Bash" + description = "Execute a shell command and return stdout/stderr." + + @property + def input_schema(self) -> dict: + return { + "type": "object", + "properties": {"command": {"type": "string", "description": "Shell command to run"}}, + "required": ["command"], + } + + async def call(self, args: dict, ctx) -> str: + result = subprocess.run( + args["command"], + shell=True, + capture_output=True, + text=True, + cwd=ctx.cwd, + ) + out = result.stdout + if result.stderr: + out += f"\n[stderr]:\n{result.stderr}" + return out.strip() or "(no output)" + + def is_read_only(self, args: dict) -> bool: + return False +``` + +Add matching vendored implementations for `ReadTool`, `GlobTool`, `GrepTool`, `EditTool`, and `WriteTool`, preserving the current upstream-style semantics already described in the approved spec. Keep imports package-local only; do not carry over `log.py`, `trace.py`, or a vendored tool executor. 
+ +Create `sandbox/server/backends/resources/code_vendor/__init__.py` to export the six tool classes. + +- [ ] **Step 4: Run the vendored-tool tests** + +Run: + +```bash +PYTHONPATH=. pytest -q sandbox/tests/test_code_vendor_tools.py +``` + +Expected: PASS. The vendored tool package exists and captures the expected upstream-style behavior. + +- [ ] **Step 5: Commit the vendored tool package** + +```bash +git add sandbox/server/backends/resources/code_vendor/__init__.py \ + sandbox/server/backends/resources/code_vendor/tool.py \ + sandbox/server/backends/resources/code_vendor/file_tools.py \ + sandbox/server/backends/resources/code_vendor/edit_tools.py \ + sandbox/tests/test_code_vendor_tools.py +git commit -m "feat: vendor code backend tool subset" +``` + +## Chunk 2: Simplify `CodeBackend` to Use Vendored Tools Only + +### Task 2: Rewrite `CodeBackend` around the internal tool package + +**Files:** +- Modify: `sandbox/server/backends/resources/code.py` +- Modify: `sandbox/tests/test_code_backend.py` +- Modify: `configs/sandbox-server/code_config.json` + +- [ ] **Step 1: Rewrite the backend tests around the new design** + +In `sandbox/tests/test_code_backend.py`: + +- delete external-root helper factories such as `create_fake_claude_code_root()` and `create_marker_claude_code_root()` +- delete all tests whose only purpose is external-root loading, root-local support modules, or per-root loader isolation +- delete all `allow_bash` and `bash_timeout_seconds` tests +- add/keep failing tests like: + +```python +def build_backend_config(tmp_path): + return BackendConfig( + enabled=True, + default_config={ + "workspace_root": str(tmp_path / "agentflow_code"), + }, + description="Code backend", + ) + + +def test_initialize_does_not_require_external_root(tmp_path): + module = load_code_backend_module() + backend = module.CodeBackend(config=build_backend_config(tmp_path)) + + session = asyncio.run(backend.initialize("runner_123", {})) + + assert 
Path(session["workspace"]).exists() + + +def test_load_code_tools_uses_internal_vendor_package(tmp_path): + module = load_code_backend_module() + backend = module.CodeBackend(config=build_backend_config(tmp_path)) + + tools = backend._load_code_tools() + + assert set(tools) == {"read", "glob", "grep", "bash", "edit", "write"} + assert tools["bash"].__class__.__module__.endswith("code_vendor.file_tools") + + +def test_tool_executor_runs_bash_via_vendored_tool(tmp_path): + module = load_code_backend_module() + backend = module.CodeBackend(config=build_backend_config(tmp_path)) + fake_server = FakeServer() + backend.bind_server(fake_server) + workspace = tmp_path / "agentflow_code" / "worker-1" + workspace.mkdir(parents=True) + + executor = ToolExecutor( + tools=fake_server._tools, + tool_name_index={}, + tool_resource_types=fake_server._tool_resource_types, + resource_router=FakeResourceRouter( + {"session_id": "sid", "data": {"workspace": str(workspace)}} + ), + ) + + result = asyncio.run( + executor.execute( + action="code:bash", + params={"command": "pwd"}, + worker_id="worker-1", + trace_id="trace-1", + ) + ) + + assert result["code"] == ErrorCode.SUCCESS + assert result["data"].strip() == str(workspace.resolve(strict=False)) +``` + +- [ ] **Step 2: Run the rewritten backend tests to verify they fail** + +Run: + +```bash +PYTHONPATH=. pytest -q sandbox/tests/test_code_backend.py -k "does_not_require_external_root or internal_vendor_package or runs_bash_via_vendored_tool" +``` + +Expected: FAIL because `CodeBackend` still depends on `claude_code_root` and still special-cases `bash`. 
+ +- [ ] **Step 3: Rewrite `sandbox/server/backends/resources/code.py`** + +Update `CodeBackend` to: + +- keep only `workspace_root` in its default config +- rename the internal tool loader to something neutral like `_load_code_tools()` +- import vendored classes directly, for example: + +```python +from sandbox.server.backends.resources.code_vendor.file_tools import ( + BashTool, + GlobTool, + GrepTool, + ReadTool, +) +from sandbox.server.backends.resources.code_vendor.edit_tools import EditTool, WriteTool +``` + +- cache vendored instances per backend instance: + +```python +self._tool_instances = { + "read": ReadTool(), + "glob": GlobTool(), + "grep": GrepTool(), + "bash": BashTool(), + "edit": EditTool(), + "write": WriteTool(), +} +``` + +- remove these methods entirely: + - `_get_claude_code_root()` + - `_validate_claude_code_root_prerequisites()` + - `_load_root_support_modules()` + - `_temporary_module_aliases()` + - `_load_module_from_path()` + - `_run_bash_command()` + +- remove any `tool_name == "bash"` branch in `_dispatch()` +- after session/workspace validation and path normalization, always run: + +```python +tool = self._load_code_tools()[tool_name] +ctx = SimpleNamespace(cwd=str(workspace)) +result = await tool.call(normalized_params, ctx) +``` + +- keep AgentFlow-owned path normalization and workspace identity enforcement exactly as the valid existing tests expect + +Update `configs/sandbox-server/code_config.json` so the `code` backend config becomes: + +```json +"config": { + "workspace_root": "/tmp/agentflow_code" +} +``` + +and update the description string to describe the backend as vendored/internal rather than powered by an external repository. + +- [ ] **Step 4: Run the backend regression subset** + +Run: + +```bash +PYTHONPATH=. pytest -q sandbox/tests/test_code_backend.py +``` + +Expected: PASS after the obsolete tests/helpers are removed and the remaining coverage is adapted to the internal vendored model. 
+ +- [ ] **Step 5: Commit the backend simplification** + +```bash +git add sandbox/server/backends/resources/code.py \ + sandbox/tests/test_code_backend.py \ + configs/sandbox-server/code_config.json +git commit -m "refactor: vendor code backend runtime" +``` + +## Chunk 3: Refresh Schema and Config Tests Around the New Contract + +### Task 3: Update schema docs, config-loading tests, and obsolete assertions + +**Files:** +- Modify: `sandbox/tool_schemas/code_tools.py` +- Modify: `sandbox/tests/test_code_tool_schemas.py` +- Modify: `sandbox/tests/test_sandbox_config_loading.py` + +- [ ] **Step 1: Write the failing schema/config assertions** + +Update the tests to the new expected contract: + +```python +def test_code_bash_description_mentions_workspace_shell_execution(): + schema = _code_schemas_by_name()["code-bash"] + description = schema["description"].lower() + + assert "workspace" in description + assert "shell command" in description + assert "backend config" not in description + + +def test_load_server_config_keeps_workspace_root_for_code_backend(tmp_path): + config_path = tmp_path / "code_config.json" + raw_config = { + "resources": { + "code": { + "enabled": True, + "config": { + "workspace_root": "/tmp/agentflow_code" + }, + } + } + } + config_path.write_text(json.dumps(raw_config), encoding="utf-8") + + sandbox = Sandbox(config=SandboxConfig(server_config_path=str(config_path))) + loaded = sandbox._load_server_config() + + assert loaded["resources"]["code"]["config"]["workspace_root"] == "/tmp/agentflow_code" +``` + +- [ ] **Step 2: Run the schema/config tests to verify they fail** + +Run: + +```bash +PYTHONPATH=. pytest -q sandbox/tests/test_code_tool_schemas.py sandbox/tests/test_sandbox_config_loading.py +``` + +Expected: FAIL because the current schema text still mentions backend-config-dependent availability and the config-loading test still asserts `CLAUDE_CODE_ROOT` expansion. 
+ +- [ ] **Step 3: Update schema text and config-loading coverage** + +In `sandbox/tool_schemas/code_tools.py`, change `code-bash` to something like: + +```python +{ + "name": "code-bash", + "description": "Run a shell command in the coding workspace using the current workspace as the working directory.", + "parameters": [ + { + "name": "command", + "type": "string", + "description": "Shell command to execute.", + "required": True, + } + ], +} +``` + +In `sandbox/tests/test_sandbox_config_loading.py`, replace the `CLAUDE_CODE_ROOT` test with the simplified `workspace_root` expectation and remove `monkeypatch.delenv("CLAUDE_CODE_ROOT", ...)`. + +- [ ] **Step 4: Run the updated schema/config regression suite** + +Run: + +```bash +PYTHONPATH=. pytest -q sandbox/tests/test_code_tool_schemas.py \ + sandbox/tests/test_sandbox_config_loading.py \ + rollout/tests/test_config.py +``` + +Expected: PASS. The code tool docs and config-loading tests now reflect the new single-repository contract. + +- [ ] **Step 5: Commit the schema/config cleanup** + +```bash +git add sandbox/tool_schemas/code_tools.py \ + sandbox/tests/test_code_tool_schemas.py \ + sandbox/tests/test_sandbox_config_loading.py +git commit -m "test: align code backend schema and config coverage" +``` + +## Chunk 4: Add the Opt-In Real Rollout Smoke + +### Task 4: Add MCP-style env-gated real `code` rollout smoke support + +**Files:** +- Create: `rollout/tests/conftest.py` +- Create: `rollout/tests/test_code_real_smoke.py` + +- [ ] **Step 1: Write the smoke test first** + +Create `rollout/tests/test_code_real_smoke.py` with a real-smoke shape like: + +```python +import json +from pathlib import Path + +from rollout import RolloutConfig, RolloutPipeline + + +def test_code_real_rollout_smoke(tmp_path, code_real_settings): + fixture_repo = tmp_path / "fixture_repo" + fixture_repo.mkdir() + nested = fixture_repo / "nested" + nested.mkdir() + token = "AF_CODE_SMOKE_TOKEN_7F3A91" + (nested / 
"TOKEN.txt").write_text(token + "\n", encoding="utf-8") + + benchmark_path = tmp_path / "benchmark.jsonl" + benchmark_path.write_text( + json.dumps( + { + "id": "code-real-smoke", + "question": "Use the available code tools to read nested/TOKEN.txt. Reply with only the exact token.", + "answer": token, + } + ) + + "\n", + encoding="utf-8", + ) + + config = RolloutConfig( + benchmark_name="code_real_smoke", + data_path=str(benchmark_path), + output_dir=str(tmp_path / "out"), + model_name=code_real_settings["model"], + api_key=code_real_settings["api_key"], + base_url=code_real_settings["base_url"], + max_turns=5, + available_tools=["code-*"], + resource_types=["code"], + resource_init_configs={"code": {"content": {"source_dir": str(fixture_repo)}}}, + sandbox_config_path="configs/sandbox-server/code_config.json", + sandbox_auto_start=True, + evaluate_results=False, + save_trajectories=True, + number_of_tasks=1, + ) + + summary = RolloutPipeline(config, output_dir=str(tmp_path / "out")).run() + + assert summary.total_tasks == 1 + assert summary.successful_tasks == 1 +``` + +- [ ] **Step 2: Run collection to verify the smoke currently fails** + +Run: + +```bash +PYTHONPATH=. pytest -q rollout/tests/test_code_real_smoke.py --collect-only +``` + +Expected: FAIL because the new smoke file or its `code_real_settings` fixture does not exist yet. 
+ +- [ ] **Step 3: Add MCP-style collection gating and credential CLI options** + +Create `rollout/tests/conftest.py`: + +```python +import os +from pathlib import Path + +import pytest + + +_REAL_CODE_TEST_FILES = { + "test_code_real_smoke.py", +} + + +def pytest_addoption(parser): + parser.addoption("--real-api-key", action="store", default="") + parser.addoption("--real-base-url", action="store", default="") + parser.addoption("--real-model", action="store", default="") + + +def pytest_ignore_collect(collection_path, config): + if os.environ.get("AGENTFLOW_RUN_CODE_REAL") == "1": + return False + + path = Path(str(collection_path)) + return path.name in _REAL_CODE_TEST_FILES + + +@pytest.fixture +def code_real_settings(request): + api_key = request.config.getoption("--real-api-key") + base_url = request.config.getoption("--real-base-url") + model = request.config.getoption("--real-model") + if not api_key or not base_url or not model: + pytest.skip( + "Provide --real-api-key, --real-base-url, and --real-model to run code_real smoke tests." + ) + return {"api_key": api_key, "base_url": base_url, "model": model} +``` + +Then complete `rollout/tests/test_code_real_smoke.py` so it also: + +- locates the results file written by `RolloutPipeline` +- loads the single saved result +- asserts there is at least one `code:*` tool call in the trajectory +- asserts the final answer equals the unique token +- asserts the token appears in the tool-result chain + +- [ ] **Step 4: Verify collection gating behavior** + +Run: + +```bash +PYTHONPATH=. pytest -q rollout/tests/test_code_real_smoke.py --collect-only +``` + +Expected: no tests collected from this file unless opt-in is enabled, matching the MCP real-smoke pattern. + +Run: + +```bash +AGENTFLOW_RUN_CODE_REAL=1 PYTHONPATH=. pytest -q rollout/tests/test_code_real_smoke.py --collect-only +``` + +Expected: PASS. The real-smoke file is now collected explicitly, matching MCP's env-gated behavior. 
+ +- [ ] **Step 5: Commit the real-smoke scaffolding** + +```bash +git add rollout/tests/conftest.py \ + rollout/tests/test_code_real_smoke.py +git commit -m "test: add opt-in code rollout real smoke" +``` + +## Chunk 5: Final Verification and Live Smoke Run + +### Task 5: Run the full targeted regression suite and the real smoke with supplied credentials + +**Files:** +- No code changes expected + +- [ ] **Step 1: Run the full targeted regression suite** + +Run: + +```bash +PYTHONPATH=. pytest -q sandbox/tests/test_code_vendor_tools.py \ + sandbox/tests/test_code_backend.py \ + sandbox/tests/test_code_tool_schemas.py \ + sandbox/tests/test_sandbox_config_loading.py \ + rollout/tests/test_config.py \ + rollout/tests/test_integration.py +``` + +Expected: PASS. The vendored tool package, the simplified backend, schema/config coverage, and rollout baseline all pass together. + +- [ ] **Step 2: Run the real rollout smoke with explicit credentials** + +Run: + +```bash +AGENTFLOW_RUN_CODE_REAL=1 PYTHONPATH=. pytest -q rollout/tests/test_code_real_smoke.py \ + --real-api-key '' \ + --real-base-url '' \ + --real-model '' \ + -s +``` + +Expected: PASS. Sandbox starts, the `code` session is created, at least one real `code:*` tool call occurs, and the final answer matches the unique token from the fixture repo. 
+ +- [ ] **Step 3: Inspect the real-smoke output** + +Verify in the saved trajectory/result payload that: + +- the trajectory contains at least one `code:*` tool call +- at least one `code:read` or `code:glob` appears +- the token from `nested/TOKEN.txt` is present in tool-result observations +- the final predicted answer equals the token exactly + +- [ ] **Step 4: Commit the integrated result** + +```bash +git status --short +git add sandbox/server/backends/resources/code_vendor \ + sandbox/server/backends/resources/code.py \ + configs/sandbox-server/code_config.json \ + sandbox/tests/test_code_vendor_tools.py \ + sandbox/tests/test_code_backend.py \ + sandbox/tool_schemas/code_tools.py \ + sandbox/tests/test_code_tool_schemas.py \ + sandbox/tests/test_sandbox_config_loading.py \ + rollout/tests/conftest.py \ + rollout/tests/test_code_real_smoke.py +git commit -m "refactor: vendor code backend tools into agentflow" +``` + +- [ ] **Step 5: Record final verification notes** + +Capture: + +- exact targeted pytest command and pass result +- exact `AGENTFLOW_RUN_CODE_REAL=1 ... pytest ...` command used +- whether the real smoke passed +- any residual risk, especially around powerful `bash` behavior and live-model variability + +## Execution Notes + +- Use `PYTHONPATH=.` for pytest commands in this repository unless the execution harness already injects the repo root. +- Keep path-boundary enforcement in `CodeBackend`; do not push it into the vendored tool files. +- Do not leave any `claude_code_root`, `allow_bash`, or `bash_timeout_seconds` references behind in tests or config templates. +- Do not widen the real-smoke scope into general rollout refactoring. + +Plan complete and saved to `docs/superpowers/plans/2026-04-16-code-backend-single-repo-vendoring.md`. Ready to execute? 
diff --git a/docs/superpowers/plans/2026-04-20-mcp-and-coding-examples-implementation.md b/docs/superpowers/plans/2026-04-20-mcp-and-coding-examples-implementation.md new file mode 100644 index 0000000..7730586 --- /dev/null +++ b/docs/superpowers/plans/2026-04-20-mcp-and-coding-examples-implementation.md @@ -0,0 +1,1575 @@ +# MCP and Coding Examples Implementation Plan + +> **For agentic workers:** REQUIRED: Use superpowers:subagent-driven-development (if subagents available) or superpowers:executing-plans to implement this plan. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add official MCP and Coding example docs, configs, seeds, demo assets, and validation coverage so both backends have runnable three-step examples aligned with the existing `examples/` set. + +**Architecture:** Implement the work in four isolated chunks. Chunk 1 locks the shared MCP sandbox template contract so the example path has a stable server subset and `mcp_servers_path` behavior. Chunk 2 adds the six-domain MCP example assets and doc with MCP-specific tests. Chunk 3 adds the Coding example assets, bundled demo repo, and doc with Coding-specific tests. Chunk 4 runs the final combined verification suite plus optional real-environment dry runs without expanding scope to training or infer. + +**Tech Stack:** Python, pytest, JSON, JSONL, Markdown, AgentFlow sandbox config loader, synthesis config loader, rollout config loader, Toolathlon-GYM MCP backend, Code backend + +--- + +**Assumptions and Guardrails** + +- The approved spec at `docs/superpowers/specs/2026-04-20-mcp-and-coding-examples-design.md` remains the source of truth. +- Keep `examples/MCPAgent.md` and `examples/CodingAgent.md` at the same granularity as `examples/DSAgent.md`: Overview, Prerequisites, Pipeline Overview, Step 1, Step 2, Step 3, Configuration Reference, FAQ. +- Official committed docs and configs must stay generic. 
Do not mention `/home/a1/sdb/dxd/DataFlow` or any other machine-local absolute path in committed example assets. +- MCP examples assume a local `toolathlon_gym` checkout is already initialized and running before AgentFlow starts. AgentFlow does not bootstrap Toolathlon-GYM services in this task. +- Coding examples use a committed demo repo by default through `${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo`, but the docs must also say `source_dir` can be overridden to point at a user-provided repo. +- Both new docs must explicitly stop at Step 3 and explain that training / deployment / infer are not covered yet, matching the DS-style scope the user approved. +- If the current synthesis pipeline still writes to the repo’s shared aggregation directory instead of respecting per-config `output_dir`, document the actual observed behavior in the new example docs rather than broadening this task into a synthesis pipeline refactor. + +## File Map + +### Shared MCP sandbox contract + +- Modify: `configs/sandbox-server/mcp_config.json` + Responsibility: official MCP sandbox entry point for the example path; declare `mcp_servers_path`, the approved MCP server subset, localhost-style env defaults, and warmup settings. +- Modify: `sandbox/tests/test_sandbox_config_loading.py` + Responsibility: verify `${TOOLATHLON_GYM_ROOT}/local_servers` survives env expansion when the env var is unset. +- Modify: `sandbox/tests/test_mcp_backend.py` + Responsibility: verify the checked-in MCP sandbox template exposes the exact approved config contract, including warmup. +- Modify: `sandbox/tests/test_mcp_client.py` + Responsibility: verify MCP YAML resolution still maps `${local_servers_paths}` to the JSON config’s `mcp_servers_path`. + +### MCP example validation and assets + +- Create: `synthesis/tests/test_mcp_example_synthesis_configs.py` + Responsibility: validate all six MCP synthesis configs, tool exposure, seed references, and empty `resource_init_configs`. 
+- Create: `rollout/tests/test_mcp_example_assets.py` + Responsibility: validate all six MCP rollout configs plus the MCP seeds and benchmark files. +- Create: `rollout/tests/test_mcp_example_doc.py` + Responsibility: validate `examples/MCPAgent.md` has the DS-style structure and the exact prerequisite/config references the user approved. +- Create: `examples/MCPAgent.md` +- Create: `configs/synthesis/mcp_canvas_config.json` +- Create: `configs/synthesis/mcp_snowflake_config.json` +- Create: `configs/synthesis/mcp_woocommerce_config.json` +- Create: `configs/synthesis/mcp_yahoo_finance_config.json` +- Create: `configs/synthesis/mcp_youtube_config.json` +- Create: `configs/synthesis/mcp_train_config.json` +- Create: `configs/trajectory/mcp_canvas_trajectory.json` +- Create: `configs/trajectory/mcp_snowflake_trajectory.json` +- Create: `configs/trajectory/mcp_woocommerce_trajectory.json` +- Create: `configs/trajectory/mcp_yahoo_finance_trajectory.json` +- Create: `configs/trajectory/mcp_youtube_trajectory.json` +- Create: `configs/trajectory/mcp_train_trajectory.json` +- Create: `seeds/mcp/canvas_seeds.jsonl` +- Create: `seeds/mcp/snowflake_seeds.jsonl` +- Create: `seeds/mcp/woocommerce_seeds.jsonl` +- Create: `seeds/mcp/yahoo_finance_seeds.jsonl` +- Create: `seeds/mcp/youtube_seeds.jsonl` +- Create: `seeds/mcp/train_seeds.jsonl` +- Create: `benchmark/mcp_canvas_benchmark.jsonl` +- Create: `benchmark/mcp_snowflake_benchmark.jsonl` +- Create: `benchmark/mcp_woocommerce_benchmark.jsonl` +- Create: `benchmark/mcp_yahoo_finance_benchmark.jsonl` +- Create: `benchmark/mcp_youtube_benchmark.jsonl` +- Create: `benchmark/mcp_train_benchmark.jsonl` + +### Coding example validation and assets + +- Create: `synthesis/tests/test_code_example_synthesis_config.py` + Responsibility: validate the Coding synthesis config, `code-*` tool exposure, and repo-local `source_dir` contract. 
+- Create: `rollout/tests/test_code_example_assets.py` + Responsibility: validate the Coding rollout config, bundled demo repo, seed data, and mixed read/edit benchmark tasks. +- Create: `rollout/tests/test_code_example_doc.py` + Responsibility: validate `examples/CodingAgent.md` has the DS-style structure, exact `AGENTFLOW_REPO_ROOT` setup steps, and no machine-local path leakage. +- Create: `examples/CodingAgent.md` +- Create: `configs/synthesis/code_config.json` +- Create: `configs/trajectory/code_trajectory.json` +- Create: `seeds/code/seeds.jsonl` +- Create: `seeds/code/seed/demo_repo/README.md` +- Create: `seeds/code/seed/demo_repo/app.py` +- Create: `seeds/code/seed/demo_repo/config/app_config.json` +- Create: `seeds/code/seed/demo_repo/lib/helpers.py` +- Create: `seeds/code/seed/demo_repo/tests/smoke_test.py` +- Create: `benchmark/code_benchmark.jsonl` + +## Chunk 1: Shared MCP Sandbox Contract + +### Task 1: Lock the official MCP sandbox template + +**Files:** +- Modify: `configs/sandbox-server/mcp_config.json` +- Modify: `sandbox/tests/test_sandbox_config_loading.py` +- Modify: `sandbox/tests/test_mcp_backend.py` +- Modify: `sandbox/tests/test_mcp_client.py` + +- [ ] **Step 1: Add the failing MCP contract tests** + +Add these exact assertions. 
+ +In `sandbox/tests/test_sandbox_config_loading.py`, add: + +```python +def test_load_server_config_keeps_required_mcp_servers_path_placeholder_when_env_missing( + tmp_path, monkeypatch +): + monkeypatch.delenv("TOOLATHLON_GYM_ROOT", raising=False) + + config_path = tmp_path / "mcp_config.json" + raw_config = { + "resources": { + "mcp": { + "enabled": True, + "config": { + "mcp_servers_path": "${TOOLATHLON_GYM_ROOT}/local_servers" + }, + } + } + } + config_path.write_text(json.dumps(raw_config), encoding="utf-8") + + sandbox = Sandbox(config=SandboxConfig(server_config_path=str(config_path))) + loaded = sandbox._load_server_config() + + assert ( + loaded["resources"]["mcp"]["config"]["mcp_servers_path"] + == "${TOOLATHLON_GYM_ROOT}/local_servers" + ) +``` + +In `sandbox/tests/test_mcp_backend.py`, replace the current lightweight template parse check with: + +```python +def test_mcp_config_template_declares_example_server_subset(monkeypatch): + monkeypatch.delenv("TOOLATHLON_GYM_ROOT", raising=False) + monkeypatch.delenv("PGHOST", raising=False) + monkeypatch.delenv("PGPORT", raising=False) + monkeypatch.delenv("PGUSER", raising=False) + monkeypatch.delenv("PGPASSWORD", raising=False) + monkeypatch.delenv("PGDATABASE", raising=False) + monkeypatch.delenv("CANVAS_DOMAIN", raising=False) + monkeypatch.delenv("WORDPRESS_SITE_URL", raising=False) + + loader = ConfigLoader() + config_path = ( + Path(__file__).resolve().parents[2] + / "configs" + / "sandbox-server" + / "mcp_config.json" + ) + + config = loader.load(str(config_path)) + mcp_resource = config.resources["mcp"] + mcp_config = mcp_resource.config + + assert mcp_resource.backend_class == ( + "sandbox.server.backends.resources.mcp.toolathlon_gym.ToolathlonGymBackend" + ) + assert mcp_config["mcp_servers_path"] == "${TOOLATHLON_GYM_ROOT}/local_servers" + assert mcp_config["enabled_mcp_servers"] == [ + "canvas", + "snowflake", + "woocommerce", + "yahoo-finance", + "youtube", + "youtube-transcript", + "rail_12306", 
+ "filesystem", + ] + assert mcp_config["env_overrides"] == { + "PGHOST": "localhost", + "PGPORT": "5432", + "PGUSER": "eigent", + "PGPASSWORD": "camel", + "PGDATABASE": "toolathlon_gym", + "CANVAS_DOMAIN": "localhost:8080", + "WORDPRESS_SITE_URL": "http://localhost:8081", + } + assert config.warmup.enabled is True + assert config.warmup.resources == ["mcp"] +``` + +In `sandbox/tests/test_mcp_client.py`, add: + +```python +def test_load_mcp_process_config_resolves_toolathlon_local_servers_path(tmp_path): + module = load_mcp_client_module() + config_dir = tmp_path / "configs" / "mcp_servers" + config_dir.mkdir(parents=True) + (config_dir / "filesystem.yaml").write_text( + """ +type: stdio +name: filesystem +params: + command: node + args: + - ${local_servers_paths}/filesystem/environment/dist/index.js + - ${agent_workspace} + """.strip() + + "\n", + encoding="utf-8", + ) + + config = module.load_mcp_process_config( + server_name="filesystem", + agent_workspace="/tmp/agentflow-worker", + mcp_servers_path="/tmp/toolathlon/local_servers", + config_dir=config_dir, + ) + + assert config.command == "node" + assert config.args == [ + "/tmp/toolathlon/local_servers/filesystem/environment/dist/index.js", + "/tmp/agentflow-worker", + ] +``` + +- [ ] **Step 2: Run the MCP contract tests and confirm they fail** + +Run: + +```bash +pytest \ + sandbox/tests/test_sandbox_config_loading.py::test_load_server_config_keeps_required_mcp_servers_path_placeholder_when_env_missing \ + sandbox/tests/test_mcp_backend.py::test_mcp_config_template_declares_example_server_subset \ + sandbox/tests/test_mcp_client.py::test_load_mcp_process_config_resolves_toolathlon_local_servers_path \ + -v +``` + +Expected: the suite should fail before the template update; likely causes are the missing `mcp_servers_path`, the old server subset, or the old `PGHOST` default, but the exact failing assertion may vary slightly if the branch state drifts. 
+ +- [ ] **Step 3: Update `configs/sandbox-server/mcp_config.json` to match the approved example contract** + +Make `resources.mcp.config` match this exact shape: + +```json +{ + "mcp_servers_path": "${TOOLATHLON_GYM_ROOT}/local_servers", + "enabled_mcp_servers": [ + "canvas", + "snowflake", + "woocommerce", + "yahoo-finance", + "youtube", + "youtube-transcript", + "rail_12306", + "filesystem" + ], + "workspace_root": "${TOOLATHLON_WORKSPACE_ROOT:-/tmp/agentflow_mcp}", + "env_overrides": { + "PGHOST": "${PGHOST:-localhost}", + "PGPORT": "${PGPORT:-5432}", + "PGUSER": "${PGUSER:-eigent}", + "PGPASSWORD": "${PGPASSWORD:-camel}", + "PGDATABASE": "${PGDATABASE:-toolathlon_gym}", + "CANVAS_DOMAIN": "${CANVAS_DOMAIN:-localhost:8080}", + "WORDPRESS_SITE_URL": "${WORDPRESS_SITE_URL:-http://localhost:8081}" + } +} +``` + +Keep: + +```json +"warmup": { + "enabled": true, + "resources": ["mcp"] +} +``` + +Do not add `terminal` back into the official example template. + +- [ ] **Step 4: Re-run the MCP contract tests and verify they pass** + +Run the same pytest command from Step 2. + +Expected: PASS for all three tests. 
+ +- [ ] **Step 5: Commit the MCP sandbox contract change** + +```bash +git add \ + configs/sandbox-server/mcp_config.json \ + sandbox/tests/test_sandbox_config_loading.py \ + sandbox/tests/test_mcp_backend.py \ + sandbox/tests/test_mcp_client.py +git commit -m "test: lock MCP example sandbox contract" +``` + +## Chunk 2: MCP Example Assets and Guide + +### Task 2: Add MCP config and asset tests first + +**Files:** +- Create: `synthesis/tests/test_mcp_example_synthesis_configs.py` +- Create: `rollout/tests/test_mcp_example_assets.py` + +- [ ] **Step 1: Write the failing synthesis-config test file** + +Create `synthesis/tests/test_mcp_example_synthesis_configs.py` with: + +```python +import json +from pathlib import Path + +import pytest + +from synthesis.core.config import SynthesisConfig + +REPO_ROOT = Path(__file__).resolve().parents[2] +EXPECTED = { + "canvas": { + "tools": ["mcp:canvas.*", "mcp:filesystem.*"], + "seeds_file": "seeds/mcp/canvas_seeds.jsonl", + "seed_description": "Canvas MCP prompts", + }, + "snowflake": { + "tools": ["mcp:snowflake.*", "mcp:filesystem.*"], + "seeds_file": "seeds/mcp/snowflake_seeds.jsonl", + "seed_description": "Snowflake MCP prompts", + }, + "woocommerce": { + "tools": ["mcp:woocommerce.*", "mcp:filesystem.*"], + "seeds_file": "seeds/mcp/woocommerce_seeds.jsonl", + "seed_description": "WooCommerce MCP prompts", + }, + "yahoo_finance": { + "tools": ["mcp:yahoo-finance.*", "mcp:filesystem.*"], + "seeds_file": "seeds/mcp/yahoo_finance_seeds.jsonl", + "seed_description": "Yahoo Finance MCP prompts", + }, + "youtube": { + "tools": [ + "mcp:youtube.*", + "mcp:youtube-transcript.*", + "mcp:filesystem.*", + ], + "seeds_file": "seeds/mcp/youtube_seeds.jsonl", + "seed_description": "YouTube MCP prompts", + }, + "train": { + "tools": ["mcp:rail_12306.*", "mcp:filesystem.*"], + "seeds_file": "seeds/mcp/train_seeds.jsonl", + "seed_description": "Train MCP prompts", + }, +} + + +@pytest.mark.parametrize("domain", sorted(EXPECTED)) +def 
test_mcp_synthesis_config_contract(domain): + expected = EXPECTED[domain] + config_path = REPO_ROOT / "configs" / "synthesis" / f"mcp_{domain}_config.json" + raw = json.loads(config_path.read_text(encoding="utf-8")) + config = SynthesisConfig.from_json(str(config_path)) + raw_init = raw.get("resource_init_configs", {}) + + assert config.sandbox_config_path == "configs/sandbox-server/mcp_config.json" + assert config.sandbox_auto_start is False + assert config.resource_types == ["mcp"] + assert raw_init in ({}, {"mcp": {"content": {}}}) + assert config.resource_init_configs in ({}, {"mcp": {"content": {}}}) + assert config.model_name == "openai/gpt-oss-120b" + assert config.api_key == "${OPENAI_API_KEY}" + assert config.base_url == "${OPENAI_API_URL}" + assert config.max_depth == 12 + assert config.branching_factor == 2 + assert config.depth_threshold == 2 + assert config.min_depth == 2 + assert config.max_selected_traj == 1 + assert config.path_similarity_threshold == 0.7 + assert config.available_tools == expected["tools"] + assert config.seeds_file == expected["seeds_file"] + assert config.output_dir == f"results/mcp_{domain}" + assert raw["seed_description"] == expected["seed_description"] + assert len(config.qa_examples) >= 2 + assert config.sampling_tips.strip() + assert config.synthesis_tips.strip() +``` + +- [ ] **Step 2: Write the failing rollout-asset test file** + +Create `rollout/tests/test_mcp_example_assets.py` with: + +```python +import json +from pathlib import Path + +import pytest + +from rollout.core.config import RolloutConfig + +REPO_ROOT = Path(__file__).resolve().parents[2] +EXPECTED = { + "canvas": { + "tools": ["mcp:canvas.*", "mcp:filesystem.*"], + "benchmark": "benchmark/mcp_canvas_benchmark.jsonl", + "benchmark_name": "mcp_canvas_trajectory", + }, + "snowflake": { + "tools": ["mcp:snowflake.*", "mcp:filesystem.*"], + "benchmark": "benchmark/mcp_snowflake_benchmark.jsonl", + "benchmark_name": "mcp_snowflake_trajectory", + }, + 
"woocommerce": { + "tools": ["mcp:woocommerce.*", "mcp:filesystem.*"], + "benchmark": "benchmark/mcp_woocommerce_benchmark.jsonl", + "benchmark_name": "mcp_woocommerce_trajectory", + }, + "yahoo_finance": { + "tools": ["mcp:yahoo-finance.*", "mcp:filesystem.*"], + "benchmark": "benchmark/mcp_yahoo_finance_benchmark.jsonl", + "benchmark_name": "mcp_yahoo_finance_trajectory", + }, + "youtube": { + "tools": [ + "mcp:youtube.*", + "mcp:youtube-transcript.*", + "mcp:filesystem.*", + ], + "benchmark": "benchmark/mcp_youtube_benchmark.jsonl", + "benchmark_name": "mcp_youtube_trajectory", + }, + "train": { + "tools": ["mcp:rail_12306.*", "mcp:filesystem.*"], + "benchmark": "benchmark/mcp_train_benchmark.jsonl", + "benchmark_name": "mcp_train_trajectory", + }, +} + + +def _read_jsonl(path: Path): + return [ + json.loads(line) + for line in path.read_text(encoding="utf-8").splitlines() + if line.strip() + ] + + +@pytest.mark.parametrize("domain", sorted(EXPECTED)) +def test_mcp_rollout_config_contract(domain): + expected = EXPECTED[domain] + config_path = REPO_ROOT / "configs" / "trajectory" / f"mcp_{domain}_trajectory.json" + raw = json.loads(config_path.read_text(encoding="utf-8")) + config = RolloutConfig.from_json(str(config_path)) + raw_init = raw.get("resource_init_configs", {}) + + assert config.benchmark_name == expected["benchmark_name"] + assert config.data_path == expected["benchmark"] + assert config.model_name == "openai/gpt-oss-120b" + assert config.api_key == "${OPENAI_API_KEY}" + assert config.base_url == "${OPENAI_API_URL}" + assert config.max_turns == 20 + assert config.available_tools == expected["tools"] + assert config.sandbox_config_path == "configs/sandbox-server/mcp_config.json" + assert config.sandbox_auto_start is False + assert config.resource_types == ["mcp"] + assert raw_init in ({}, {"mcp": {"content": {}}}) + assert config.resource_init_configs in ({}, {"mcp": {"content": {}}}) + assert "MCP domain assistant" in config.system_prompt + assert 
"Use only the available MCP tools" in config.system_prompt + assert "Reply with the final answer only" in config.system_prompt + assert config.evaluate_results is False + assert config.output_dir == f"trajectory_results/mcp_{domain}" + assert config.save_results is True + assert config.trajectory_only is True + assert config.save_trajectories is True + + +@pytest.mark.parametrize("domain", sorted(EXPECTED)) +def test_mcp_seed_files_are_two_row_jsonl(domain): + seed_path = REPO_ROOT / "seeds" / "mcp" / f"{domain}_seeds.jsonl" + rows = _read_jsonl(seed_path) + + assert len(rows) == 2 + assert all(set(row.keys()) == {"content", "kwargs"} for row in rows) + assert all(isinstance(row["content"], str) and row["content"].strip() for row in rows) + assert all(row["kwargs"] == {} for row in rows) + + +@pytest.mark.parametrize("domain", sorted(EXPECTED)) +def test_mcp_benchmark_files_have_two_row_jsonl_contract(domain): + benchmark_path = REPO_ROOT / "benchmark" / f"mcp_{domain}_benchmark.jsonl" + rows = _read_jsonl(benchmark_path) + + assert len(rows) == 2 + assert all(set(row.keys()) == {"id", "question", "answer"} for row in rows) + assert all(isinstance(row["question"], str) and row["question"].strip() for row in rows) + assert all(isinstance(row["answer"], str) for row in rows) +``` + +- [ ] **Step 3: Run the new MCP tests and verify they fail because the assets do not exist yet** + +Run: + +```bash +pytest \ + synthesis/tests/test_mcp_example_synthesis_configs.py \ + rollout/tests/test_mcp_example_assets.py \ + -v +``` + +Expected: FAIL with missing-file errors for the new MCP configs, seeds, and benchmarks. 
+ +### Task 3: Create the six-domain MCP seeds, benchmarks, and configs + +**Files:** +- Create: `configs/synthesis/mcp_canvas_config.json` +- Create: `configs/synthesis/mcp_snowflake_config.json` +- Create: `configs/synthesis/mcp_woocommerce_config.json` +- Create: `configs/synthesis/mcp_yahoo_finance_config.json` +- Create: `configs/synthesis/mcp_youtube_config.json` +- Create: `configs/synthesis/mcp_train_config.json` +- Create: `configs/trajectory/mcp_canvas_trajectory.json` +- Create: `configs/trajectory/mcp_snowflake_trajectory.json` +- Create: `configs/trajectory/mcp_woocommerce_trajectory.json` +- Create: `configs/trajectory/mcp_yahoo_finance_trajectory.json` +- Create: `configs/trajectory/mcp_youtube_trajectory.json` +- Create: `configs/trajectory/mcp_train_trajectory.json` +- Create: `seeds/mcp/canvas_seeds.jsonl` +- Create: `seeds/mcp/snowflake_seeds.jsonl` +- Create: `seeds/mcp/woocommerce_seeds.jsonl` +- Create: `seeds/mcp/yahoo_finance_seeds.jsonl` +- Create: `seeds/mcp/youtube_seeds.jsonl` +- Create: `seeds/mcp/train_seeds.jsonl` +- Create: `benchmark/mcp_canvas_benchmark.jsonl` +- Create: `benchmark/mcp_snowflake_benchmark.jsonl` +- Create: `benchmark/mcp_woocommerce_benchmark.jsonl` +- Create: `benchmark/mcp_yahoo_finance_benchmark.jsonl` +- Create: `benchmark/mcp_youtube_benchmark.jsonl` +- Create: `benchmark/mcp_train_benchmark.jsonl` + +- [ ] **Step 1: Create the six MCP seed files** + +Use exactly two rows per file, each shaped as `{"content": "...", "kwargs": {}}`. 
+ +Seed rows: + +- `seeds/mcp/canvas_seeds.jsonl` + - `Use the available Canvas MCP tools to inspect courses, assignments, and enrollment information.` + - `Find a small Canvas reporting task that can be answered from the mock data and save intermediate notes with filesystem tools if helpful.` +- `seeds/mcp/snowflake_seeds.jsonl` + - `Use the available Snowflake MCP tools to inspect schemas, tables, and small analytical queries in the mock warehouse.` + - `Find one compact warehouse reporting question that can be answered from the available Snowflake tools.` +- `seeds/mcp/woocommerce_seeds.jsonl` + - `Use the WooCommerce MCP tools to inspect customers, products, and orders in the mock store.` + - `Find one small sales or operations question that can be answered from the WooCommerce mock data.` +- `seeds/mcp/yahoo_finance_seeds.jsonl` + - `Use the Yahoo Finance MCP tools to inspect the mock ticker and market data available locally.` + - `Find one small finance lookup or comparison question that can be answered directly from the available tools.` +- `seeds/mcp/youtube_seeds.jsonl` + - `Use the YouTube and YouTube Transcript MCP tools to inspect mock video metadata and transcript data.` + - `Find one small content-discovery or transcript lookup question that can be answered from the local mock data.` +- `seeds/mcp/train_seeds.jsonl` + - `Use the rail_12306 MCP tools to inspect mock train, station, and route information.` + - `Find one small travel-planning or route lookup question that can be answered from the available railway tools.` + +- [ ] **Step 2: Create the six MCP benchmark files** + +Use exactly two rows per file with schema `{"id": "...", "question": "...", "answer": "..."}`. + +Questions: + +- `benchmark/mcp_canvas_benchmark.jsonl` + - `{"id": "mcp_canvas_001", "question": "Use Canvas MCP tools to list the first three course names in alphabetical order. 
Reply as a comma-separated list only.", "answer": ""}` + - `{"id": "mcp_canvas_002", "question": "Use Canvas MCP tools to find one course and report its course code plus enrollment count as code=, enrolled=.", "answer": ""}` +- `benchmark/mcp_snowflake_benchmark.jsonl` + - `{"id": "mcp_snowflake_001", "question": "Use Snowflake MCP tools to list the first three tables visible in the default schema in alphabetical order. Reply as a comma-separated list only.", "answer": ""}` + - `{"id": "mcp_snowflake_002", "question": "Use Snowflake MCP tools to compute one small aggregate from a mock table and reply as key=value.", "answer": ""}` +- `benchmark/mcp_woocommerce_benchmark.jsonl` + - `{"id": "mcp_woocommerce_001", "question": "Use WooCommerce MCP tools to list the first three product names in alphabetical order. Reply as a comma-separated list only.", "answer": ""}` + - `{"id": "mcp_woocommerce_002", "question": "Use WooCommerce MCP tools to identify one customer email and that customer's order count. Reply as email=, orders=.", "answer": ""}` +- `benchmark/mcp_yahoo_finance_benchmark.jsonl` + - `{"id": "mcp_yahoo_finance_001", "question": "Use Yahoo Finance MCP tools to list the first three ticker symbols available in the mock dataset in alphabetical order. Reply as a comma-separated list only.", "answer": ""}` + - `{"id": "mcp_yahoo_finance_002", "question": "Use Yahoo Finance MCP tools to compare two available mock tickers and reply with the one that has the larger price as symbol=.", "answer": ""}` +- `benchmark/mcp_youtube_benchmark.jsonl` + - `{"id": "mcp_youtube_001", "question": "Use YouTube MCP tools to list the first three video titles returned by the local mock dataset in alphabetical order. 
Reply as a comma-separated list only.", "answer": ""}` + - `{"id": "mcp_youtube_002", "question": "Use YouTube Transcript MCP tools to find one video and report the video id plus transcript language as video=, language=.", "answer": ""}` +- `benchmark/mcp_train_benchmark.jsonl` + - `{"id": "mcp_train_001", "question": "Use rail_12306 MCP tools to list the first three station names available in the local mock dataset in alphabetical order. Reply as a comma-separated list only.", "answer": ""}` + - `{"id": "mcp_train_002", "question": "Use rail_12306 MCP tools to find one route and reply with departure=, arrival=.", "answer": ""}` + +During Chunk 2, keep all twelve `answer` fields as empty strings. Do not guess or invent answers here; Chunk 4 Task 9 is the required live grounding step that will fill them from the prepared mock environment. + +- [ ] **Step 3: Create the six MCP synthesis configs** + +Use `configs/synthesis/ds_config.json` for field ordering, but make each MCP file follow this exact contract: + +```json +{ + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_depth": 12, + "branching_factor": 2, + "depth_threshold": 2, + "min_depth": 2, + "max_selected_traj": 1, + "path_similarity_threshold": 0.7, + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/mcp_config.json", + "resource_types": ["mcp"], + "resource_init_configs": {}, + "available_tools": ["mcp:.*", "mcp:filesystem.*"], + "sampling_tips": [ + "Domain-specific MCP exploration guidance", + "Prefer using filesystem tools only for scratch notes or short saved artifacts." + ], + "synthesis_tips": [ + "Generate domain-grounded factual QA only.", + "Keep answers short and directly verifiable from tool outputs." 
+ ], + "qa_examples": [ + {"question": "Example question 1", "answer": "Example answer 1"}, + {"question": "Example question 2", "answer": "Example answer 2"} + ], + "seed_description": "Domain-specific MCP prompts", + "seeds_file": "seeds/mcp/_seeds.jsonl", + "output_dir": "results/mcp_" +} +``` + +Exact tool mappings: + +- `canvas`: `["mcp:canvas.*", "mcp:filesystem.*"]` +- `snowflake`: `["mcp:snowflake.*", "mcp:filesystem.*"]` +- `woocommerce`: `["mcp:woocommerce.*", "mcp:filesystem.*"]` +- `yahoo_finance`: `["mcp:yahoo-finance.*", "mcp:filesystem.*"]` +- `youtube`: `["mcp:youtube.*", "mcp:youtube-transcript.*", "mcp:filesystem.*"]` +- `train`: `["mcp:rail_12306.*", "mcp:filesystem.*"]` + +Exact `seed_description` strings: + +- `Canvas MCP prompts` +- `Snowflake MCP prompts` +- `WooCommerce MCP prompts` +- `Yahoo Finance MCP prompts` +- `YouTube MCP prompts` +- `Train MCP prompts` + +Each file must use 2-3 domain-specific `qa_examples`. Keep them domain-grounded and format-focused, but do not invent benchmark answers that depend on live mock data. + +For `resource_init_configs`, the committed files should prefer: + +```json +"resource_init_configs": {} +``` + +but the tests must also accept: + +```json +"resource_init_configs": { + "mcp": { + "content": {} + } +} +``` + +because the approved spec allows MCP init content to be omitted or explicitly empty. 
+ 
+- [ ] **Step 4: Create the six MCP rollout configs**
+ 
+Use `configs/trajectory/ds_trajectory.json` for field ordering, but make each MCP rollout config follow this exact contract, replacing each `<domain>` placeholder with the domain name and reusing the exact tool mappings from Step 3:
+ 
+```json
+{
+  "benchmark_name": "mcp_<domain>_trajectory",
+  "model_name": "openai/gpt-oss-120b",
+  "api_key": "${OPENAI_API_KEY}",
+  "base_url": "${OPENAI_API_URL}",
+  "max_turns": 20,
+  "available_tools": ["mcp:<domain>.*", "mcp:filesystem.*"],
+  "sandbox_server_url": "http://127.0.0.1:18890",
+  "sandbox_auto_start": false,
+  "sandbox_config_path": "configs/sandbox-server/mcp_config.json",
+  "resource_types": ["mcp"],
+  "resource_init_configs": {},
+  "system_prompt": [
+    "You are an MCP domain assistant.",
+    "Use only the available MCP tools to answer the question.",
+    "Reply with the final answer only unless the question explicitly asks for another format."
+  ],
+  "evaluate_results": false,
+  "data_path": "benchmark/mcp_<domain>_benchmark.jsonl",
+  "output_dir": "trajectory_results/mcp_<domain>",
+  "save_results": true,
+  "save_trajectories": true,
+  "trajectory_only": true
+}
+```
+ 
+For `youtube`, the `available_tools` list must include both `mcp:youtube.*` and `mcp:youtube-transcript.*`.
+ 
+As in Step 3, prefer `"resource_init_configs": {}` in the committed files, but keep the tests permissive enough to allow the spec-approved explicit-empty MCP init form.
+ 
+- [ ] **Step 5: Run the MCP synthesis-config and rollout-asset tests and make them pass**
+ 
+Run:
+ 
+```bash
+pytest \
+  synthesis/tests/test_mcp_example_synthesis_configs.py \
+  rollout/tests/test_mcp_example_assets.py \
+  -v
+```
+ 
+Expected: PASS.
+ 
+- [ ] **Step 6: Defer MCP benchmark answer grounding to Chunk 4**
+ 
+Keep this chunk deterministic. Do not block Chunk 2 on external MCP services or LLM credentials. The required live answer-grounding step for MCP benchmarks happens later in Chunk 4 Task 9.
+ +- [ ] **Step 7: Commit the MCP assets and configs** + +```bash +git add \ + synthesis/tests/test_mcp_example_synthesis_configs.py \ + rollout/tests/test_mcp_example_assets.py \ + configs/synthesis/mcp_*.json \ + configs/trajectory/mcp_*.json \ + seeds/mcp \ + benchmark/mcp_*.jsonl +git commit -m "feat: add MCP example assets" +``` + +### Task 4: Add the MCP guide and lock the doc contract + +**Files:** +- Create: `rollout/tests/test_mcp_example_doc.py` +- Create: `examples/MCPAgent.md` + +- [ ] **Step 1: Write the failing MCP doc contract test** + +Create `rollout/tests/test_mcp_example_doc.py` with: + +```python +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def test_mcp_example_doc_has_required_sections_and_exact_prerequisite_contract(): + content = (REPO_ROOT / "examples" / "MCPAgent.md").read_text(encoding="utf-8") + + required_sections = [ + "## Overview", + "## Prerequisites", + "## Pipeline Overview", + "## Step 1: Start the Sandbox Server", + "## Step 2: Synthesize QA Data", + "## Step 3: Synthesize Trajectory Data", + "## Configuration Reference", + "## FAQ", + ] + for section in required_sections: + assert section in content + + required_strings = [ + "configs/sandbox-server/mcp_config.json", + "configs/synthesis/mcp_canvas_config.json", + "configs/synthesis/mcp_snowflake_config.json", + "configs/synthesis/mcp_woocommerce_config.json", + "configs/synthesis/mcp_yahoo_finance_config.json", + "configs/synthesis/mcp_youtube_config.json", + "configs/synthesis/mcp_train_config.json", + "configs/trajectory/mcp_canvas_trajectory.json", + "configs/trajectory/mcp_snowflake_trajectory.json", + "configs/trajectory/mcp_woocommerce_trajectory.json", + "configs/trajectory/mcp_yahoo_finance_trajectory.json", + "configs/trajectory/mcp_youtube_trajectory.json", + "configs/trajectory/mcp_train_trajectory.json", + "export TOOLATHLON_GYM_ROOT=", + "${TOOLATHLON_GYM_ROOT}/local_servers", + "./start_sandbox_server.sh --config 
configs/sandbox-server/mcp_config.json", + "node", + "uv", + "PGHOST", + "PGPORT", + "PGUSER", + "PGPASSWORD", + "PGDATABASE", + "CANVAS_DOMAIN", + "WORDPRESS_SITE_URL", + ] + for needle in required_strings: + assert needle in content + + lowered = content.lower() + assert "/home/" not in content + assert "training" in lowered + assert "deployment" in lowered + assert "infer" in lowered + assert "not covered" in lowered + assert "Step 4" not in content + assert "Step 5" not in content +``` + +- [ ] **Step 2: Run the doc contract test and confirm it fails** + +Run: + +```bash +pytest rollout/tests/test_mcp_example_doc.py -v +``` + +Expected: FAIL because `examples/MCPAgent.md` does not exist yet. + +- [ ] **Step 3: Write `examples/MCPAgent.md` in DS-style structure** + +The guide must: + +- open with a DS-style three-step title, not a five-step title +- cover the six domains: `canvas`, `snowflake`, `woocommerce`, `yahoo_finance`, `youtube`, `train` +- explain that the example uses `configs/sandbox-server/mcp_config.json` +- include prerequisites for: + - `cd AgentFlow` + - `export OPENAI_API_KEY=...` + - `export OPENAI_API_URL=...` + - `export TOOLATHLON_GYM_ROOT=/path/to/toolathlon_gym` + - an already prepared and already running Toolathlon-GYM environment + - `node` and `uv` +- mention that the MCP server bundle is resolved from `${TOOLATHLON_GYM_ROOT}/local_servers` +- list the env override fields surfaced by `mcp_config.json` +- give one Step 1 command: + +```bash +./start_sandbox_server.sh --config configs/sandbox-server/mcp_config.json +``` + +- give Step 2 commands for all six synthesis configs +- give Step 3 commands for all six rollout configs +- explicitly say later training / deployment / infer are not covered yet +- keep explanations short and example-oriented; do not add internal orchestration advice about restarting sandboxes between domains + +- [ ] **Step 4: Run the MCP doc test and a targeted MCP suite** + +Run: + +```bash +pytest \ + 
rollout/tests/test_mcp_example_doc.py \ + synthesis/tests/test_mcp_example_synthesis_configs.py \ + rollout/tests/test_mcp_example_assets.py \ + -v +``` + +Expected: PASS. + +- [ ] **Step 5: Commit the MCP guide** + +```bash +git add rollout/tests/test_mcp_example_doc.py examples/MCPAgent.md +git commit -m "docs: add MCP example guide" +``` + +## Chunk 3: Coding Example Assets and Guide + +### Task 5: Add Coding config and asset tests first + +**Files:** +- Create: `synthesis/tests/test_code_example_synthesis_config.py` +- Create: `rollout/tests/test_code_example_assets.py` + +- [ ] **Step 1: Write the failing Coding synthesis-config test** + +Create `synthesis/tests/test_code_example_synthesis_config.py` with: + +```python +import json +from pathlib import Path + +from synthesis.core.config import SynthesisConfig + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def test_code_synthesis_config_contract(): + config_path = REPO_ROOT / "configs" / "synthesis" / "code_config.json" + raw = json.loads(config_path.read_text(encoding="utf-8")) + config = SynthesisConfig.from_json(str(config_path)) + + assert config.sandbox_config_path == "configs/sandbox-server/code_config.json" + assert config.sandbox_auto_start is False + assert config.resource_types == ["code"] + assert config.resource_init_configs == { + "code": { + "content": { + "source_dir": "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo" + } + } + } + assert config.available_tools == ["code-*"] + assert config.seeds_file == "seeds/code/seeds.jsonl" + assert raw["seed_description"] == "Coding demo repository prompts" + assert len(config.qa_examples) >= 2 + assert config.sampling_tips.strip() + assert config.synthesis_tips.strip() +``` + +- [ ] **Step 2: Write the failing Coding rollout-asset test** + +Create `rollout/tests/test_code_example_assets.py` with: + +```python +import json +from pathlib import Path + +from rollout.core.config import RolloutConfig + +REPO_ROOT = Path(__file__).resolve().parents[2] 
+ + +def _read_jsonl(path: Path): + return [ + json.loads(line) + for line in path.read_text(encoding="utf-8").splitlines() + if line.strip() + ] + + +def test_code_rollout_config_contract(): + config_path = REPO_ROOT / "configs" / "trajectory" / "code_trajectory.json" + config = RolloutConfig.from_json(str(config_path)) + + assert config.benchmark_name == "code_trajectory" + assert config.data_path == "benchmark/code_benchmark.jsonl" + assert config.available_tools == ["code-*"] + assert config.sandbox_config_path == "configs/sandbox-server/code_config.json" + assert config.sandbox_auto_start is False + assert config.resource_types == ["code"] + assert config.resource_init_configs == { + "code": { + "content": { + "source_dir": "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo" + } + } + } + assert config.evaluate_results is False + assert config.trajectory_only is True + assert config.save_trajectories is True + assert config.save_summary is False + + +def test_code_seed_file_contract(): + rows = _read_jsonl(REPO_ROOT / "seeds" / "code" / "seeds.jsonl") + + assert len(rows) == 2 + assert all(set(row.keys()) == {"content", "kwargs"} for row in rows) + assert all(isinstance(row["content"], str) and row["content"].strip() for row in rows) + assert all(row["kwargs"] == {} for row in rows) + + +def test_code_benchmark_contract(): + rows = _read_jsonl(REPO_ROOT / "benchmark" / "code_benchmark.jsonl") + + assert len(rows) == 2 + assert all({"id", "question", "answer"} <= set(row.keys()) for row in rows) + assert rows[0]["id"] == "code_read_001" + assert "metadata" not in rows[0] + assert rows[1]["id"] == "code_edit_001" + assert "tests/smoke_test.py" in rows[1]["question"] + assert rows[1]["answer"] == "smoke test passed" + assert rows[1]["metadata"] == { + "target_files": ["app.py"], + "check_command": "python tests/smoke_test.py", + } + assert all("/home/" not in json.dumps(row, ensure_ascii=False) for row in rows) + assert all("DataFlow" not in json.dumps(row, 
ensure_ascii=False) for row in rows) + + +def test_code_demo_repo_contract(): + repo_root = REPO_ROOT / "seeds" / "code" / "seed" / "demo_repo" + + required_paths = [ + repo_root / "README.md", + repo_root / "app.py", + repo_root / "config" / "app_config.json", + repo_root / "lib" / "helpers.py", + repo_root / "tests" / "smoke_test.py", + ] + for path in required_paths: + assert path.exists(), path + + smoke_test = (repo_root / "tests" / "smoke_test.py").read_text(encoding="utf-8") + assert "build_message" in smoke_test + assert "SMOKE_OK" in smoke_test +``` + +- [ ] **Step 3: Run the new Coding tests and verify they fail because the assets do not exist yet** + +Run: + +```bash +pytest \ + synthesis/tests/test_code_example_synthesis_config.py \ + rollout/tests/test_code_example_assets.py \ + -v +``` + +Expected: FAIL with missing-file errors for the new Coding config, seed, benchmark, and demo repo files. + +### Task 6: Create the Coding demo repo, seeds, benchmark, and configs + +**Files:** +- Create: `configs/synthesis/code_config.json` +- Create: `configs/trajectory/code_trajectory.json` +- Create: `seeds/code/seeds.jsonl` +- Create: `seeds/code/seed/demo_repo/README.md` +- Create: `seeds/code/seed/demo_repo/app.py` +- Create: `seeds/code/seed/demo_repo/config/app_config.json` +- Create: `seeds/code/seed/demo_repo/lib/helpers.py` +- Create: `seeds/code/seed/demo_repo/tests/smoke_test.py` +- Create: `benchmark/code_benchmark.jsonl` + +- [ ] **Step 1: Create the committed demo repo contents** + +Use these exact file contents. + +`seeds/code/seed/demo_repo/README.md` + +```md +# Coding Example Demo Repo + +This tiny repository is bundled for AgentFlow's CodingAgent example. + +- `app.py` builds a greeting string. +- `config/app_config.json` stores the expected name and suffix. +- `lib/helpers.py` contains the formatting helper. +- `tests/smoke_test.py` is the verification command used by the rollout example. 
+``` + +`seeds/code/seed/demo_repo/config/app_config.json` + +```json +{ + "default_name": "AgentFlow", + "suffix": "!" +} +``` + +`seeds/code/seed/demo_repo/lib/helpers.py` + +```python +def render_greeting(name: str, suffix: str) -> str: + return f"Hello, {name}{suffix}" +``` + +`seeds/code/seed/demo_repo/app.py` + +```python +import json +from pathlib import Path + +from lib.helpers import render_greeting + + +CONFIG_PATH = Path(__file__).parent / "config" / "app_config.json" + + +def load_config() -> dict: + return json.loads(CONFIG_PATH.read_text(encoding="utf-8")) + + +def build_message() -> str: + config = load_config() + return render_greeting(config["default_name"], "?") + + +if __name__ == "__main__": + print(build_message()) +``` + +`seeds/code/seed/demo_repo/tests/smoke_test.py` + +```python +from app import build_message + + +def main() -> None: + message = build_message() + assert message == "Hello, AgentFlow!", message + print("SMOKE_OK") + + +if __name__ == "__main__": + main() +``` + +Keep the `app.py` / `smoke_test.py` mismatch intentional here: the hard-coded `"?"` is the planned edit-task bug that the bundled Coding benchmark will exercise later. + +- [ ] **Step 2: Create the Coding seed and benchmark files** + +`seeds/code/seeds.jsonl` must contain exactly: + +```jsonl +{"content": "Inspect the demo repository and trace how the greeting is assembled from config and helper code.", "kwargs": {}} +{"content": "Look for a small repository bug that can be fixed with a minimal edit and validated with the committed smoke test.", "kwargs": {}} +``` + +`benchmark/code_benchmark.jsonl` must contain exactly two rows: + +```jsonl +{"id": "code_read_001", "question": "Use code tools to inspect the demo repository. What default name does the app greet? Reply with the name only.", "answer": "AgentFlow"} +{"id": "code_edit_001", "question": "Update the demo repository so `python tests/smoke_test.py` succeeds. 
Preserve the config-driven greeting behavior, verify the fix with that command, then reply with exactly `smoke test passed`.", "answer": "smoke test passed", "metadata": {"target_files": ["app.py"], "check_command": "python tests/smoke_test.py"}} +``` + +- [ ] **Step 3: Create the Coding synthesis config** + +Create `configs/synthesis/code_config.json` with this exact contract: + +```json +{ + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_depth": 10, + "branching_factor": 2, + "depth_threshold": 2, + "min_depth": 2, + "max_selected_traj": 1, + "path_similarity_threshold": 0.7, + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/code_config.json", + "resource_types": ["code"], + "resource_init_configs": { + "code": { + "content": { + "source_dir": "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo" + } + } + }, + "available_tools": ["code-*"], + "sampling_tips": [ + "Inspect the repository before proposing edits.", + "Use code-bash only for lightweight checks that fit the bundled demo repo." + ], + "synthesis_tips": [ + "Generate repo-grounded QA only.", + "Prefer file-path, function-behavior, and small edit-validation questions over open-ended design prompts." + ], + "qa_examples": [ + { + "question": "Which file stores the greeting suffix used by the demo app? Reply with the relative file path only.", + "answer": "config/app_config.json" + }, + { + "question": "What string does `build_message()` return before any edits? Reply with the exact string only.", + "answer": "Hello, AgentFlow?" 
+ } + ], + "seed_description": "Coding demo repository prompts", + "seeds_file": "seeds/code/seeds.jsonl", + "output_dir": "results/code" +} +``` + +- [ ] **Step 4: Create the Coding rollout config** + +Create `configs/trajectory/code_trajectory.json` with this exact contract: + +```json +{ + "benchmark_name": "code_trajectory", + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_turns": 12, + "available_tools": ["code-*"], + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/code_config.json", + "resource_types": ["code"], + "resource_init_configs": { + "code": { + "content": { + "source_dir": "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo" + } + } + }, + "system_prompt": [ + "You are a coding assistant working inside a small repository.", + "Inspect files before editing them.", + "When a task asks for verification, run the requested command inside the coding workspace before giving the final answer." + ], + "evaluate_results": false, + "data_path": "benchmark/code_benchmark.jsonl", + "output_dir": "trajectory_results/code", + "save_results": true, + "save_trajectories": true, + "trajectory_only": true, + "save_summary": false +} +``` + +- [ ] **Step 5: Run the Coding config and asset tests plus deterministic backend coverage** + +Run: + +```bash +pytest \ + synthesis/tests/test_code_example_synthesis_config.py \ + rollout/tests/test_code_example_assets.py \ + sandbox/tests/test_code_backend.py \ + sandbox/tests/test_code_tool_schemas.py \ + -v +``` + +Expected: PASS. + +- [ ] **Step 6: Defer the credential-dependent live Coding rollout to Chunk 4** + +Do not block this chunk on external credentials or a running sandbox. The required gate for Chunk 3 is the deterministic pytest suite from Step 5. Perform the representative live Coding rollout later using Chunk 4 Task 10. 
+ +- [ ] **Step 7: Commit the Coding assets** + +```bash +git add \ + synthesis/tests/test_code_example_synthesis_config.py \ + rollout/tests/test_code_example_assets.py \ + configs/synthesis/code_config.json \ + configs/trajectory/code_trajectory.json \ + seeds/code \ + benchmark/code_benchmark.jsonl +git commit -m "feat: add Coding example assets" +``` + +### Task 7: Add the Coding guide and lock the doc contract + +**Files:** +- Create: `rollout/tests/test_code_example_doc.py` +- Create: `examples/CodingAgent.md` + +- [ ] **Step 1: Write the failing Coding doc contract test** + +Create `rollout/tests/test_code_example_doc.py` with: + +```python +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def test_coding_example_doc_has_required_sections_and_repo_root_contract(): + content = (REPO_ROOT / "examples" / "CodingAgent.md").read_text(encoding="utf-8") + + required_sections = [ + "## Overview", + "## Prerequisites", + "## Pipeline Overview", + "## Step 1: Start the Sandbox Server", + "## Step 2: Synthesize QA Data", + "## Step 3: Synthesize Trajectory Data", + "## Configuration Reference", + "## FAQ", + ] + for section in required_sections: + assert section in content + + required_strings = [ + "cd AgentFlow", + "export AGENTFLOW_REPO_ROOT=$(pwd)", + "code-*", + "configs/sandbox-server/code_config.json", + "configs/synthesis/code_config.json", + "configs/trajectory/code_trajectory.json", + "benchmark/code_benchmark.jsonl", + "seeds/code/seeds.jsonl", + "seeds/code/seed/demo_repo", + "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo", + "source_dir", + "./start_sandbox_server.sh --config configs/sandbox-server/code_config.json", + "python tests/smoke_test.py", + "training / deployment / infer are not covered yet", + ] + for needle in required_strings: + assert needle in content + + assert "/home/a1/sdb/dxd/DataFlow" not in content + assert "DataFlow" not in content + assert "Step 4" not in content + assert "Step 5" not in content 
+``` + +- [ ] **Step 2: Run the Coding doc test and confirm it fails** + +Run: + +```bash +pytest rollout/tests/test_code_example_doc.py -v +``` + +Expected: FAIL because `examples/CodingAgent.md` does not exist yet. + +- [ ] **Step 3: Write `examples/CodingAgent.md` in DS-style structure** + +The guide must: + +- use a DS-style three-step title +- explain that CodingAgent uses the `code` backend’s six tools through `code-*` +- include prerequisites for: + - `cd AgentFlow` + - `export OPENAI_API_KEY=...` + - `export OPENAI_API_URL=...` + - `export AGENTFLOW_REPO_ROOT=$(pwd)` +- document the default committed repo path `${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo` +- say users can replace `resource_init_configs.code.content.source_dir` with their own repo path if desired +- use `configs/sandbox-server/code_config.json` for Step 1 +- use `configs/synthesis/code_config.json` for Step 2 +- use `configs/trajectory/code_trajectory.json` for Step 3 +- mention `benchmark/code_benchmark.jsonl` +- mention the bundled read-only plus edit-task example style +- explicitly say later training / deployment / infer are not covered yet +- keep the explanation example-oriented; do not mention the user’s local DataFlow path + +- [ ] **Step 4: Run the Coding doc test and the targeted Coding suite** + +Run: + +```bash +pytest \ + rollout/tests/test_code_example_doc.py \ + synthesis/tests/test_code_example_synthesis_config.py \ + rollout/tests/test_code_example_assets.py \ + sandbox/tests/test_code_backend.py \ + sandbox/tests/test_code_tool_schemas.py \ + -v +``` + +Expected: PASS. + +- [ ] **Step 5: Commit the Coding guide** + +```bash +git add rollout/tests/test_code_example_doc.py examples/CodingAgent.md +git commit -m "docs: add Coding example guide" +``` + +## Chunk 4: Final Verification and Local Dry Runs + +### Task 8: Run the required deterministic verification suite + +**Files:** +- No planned file changes. 
If any failures appear here, fix them in the owning chunk and create a normal corrective commit before reporting completion. + +- [ ] **Step 1: Run the full targeted example suite** + +Run: + +```bash +pytest \ + sandbox/tests/test_sandbox_config_loading.py \ + sandbox/tests/test_mcp_backend.py \ + sandbox/tests/test_mcp_client.py \ + sandbox/tests/test_code_backend.py \ + sandbox/tests/test_code_tool_schemas.py \ + synthesis/tests/test_mcp_example_synthesis_configs.py \ + synthesis/tests/test_code_example_synthesis_config.py \ + rollout/tests/test_mcp_example_assets.py \ + rollout/tests/test_mcp_example_doc.py \ + rollout/tests/test_code_example_assets.py \ + rollout/tests/test_code_example_doc.py \ + -v +``` + +Expected: PASS. + +- [ ] **Step 2: Record final status without creating a verification-only commit** + +If Step 1 is green, continue to Task 9 to finalize the MCP benchmark answers. If a verification failure requires code or doc fixes, make the fix in the owning chunk, rerun the affected tests, and create a normal corrective commit rather than a “verification only” commit. + +### Task 9: Finalize MCP benchmark answers against the prepared Toolathlon-GYM environment + +**Files:** +- Modify: `benchmark/mcp_canvas_benchmark.jsonl` +- Modify: `benchmark/mcp_snowflake_benchmark.jsonl` +- Modify: `benchmark/mcp_woocommerce_benchmark.jsonl` +- Modify: `benchmark/mcp_yahoo_finance_benchmark.jsonl` +- Modify: `benchmark/mcp_youtube_benchmark.jsonl` +- Modify: `benchmark/mcp_train_benchmark.jsonl` + +This task is required to convert the placeholder MCP benchmark answers from Chunk 2 into real benchmark answers. If `OPENAI_API_KEY`, `OPENAI_API_URL`, or the prepared Toolathlon-GYM environment are unavailable, stop here and report a blocker instead of inventing answers. 
+ +- [ ] **Step 1: Start the MCP sandbox server** + +Run in a dedicated terminal from the AgentFlow repo root: + +```bash +export TOOLATHLON_GYM_ROOT=/path/to/toolathlon_gym +./start_sandbox_server.sh --config configs/sandbox-server/mcp_config.json +``` + +Expected: the MCP sandbox stays running while Step 2 and Step 3 execute. + +- [ ] **Step 2: Run one MCP synthesis smoke check** + +Run: + +```bash +export TOOLATHLON_GYM_ROOT=/path/to/toolathlon_gym +export OPENAI_API_KEY=... +export OPENAI_API_URL=... +python synthesis/pipeline.py \ + --config configs/synthesis/mcp_canvas_config.json \ + --seeds seeds/mcp/canvas_seeds.jsonl \ + --output-dir /tmp/agentflow-mcp-canvas-synth +``` + +Expected: the command starts successfully with the checked-in MCP config. If the current synthesis pipeline still writes to the repo’s shared aggregation directory, confirm that new QA / trajectory rows appear there; otherwise confirm output appears under `/tmp/agentflow-mcp-canvas-synth/`. + +- [ ] **Step 3: Run the six MCP rollout configs and transcribe grounded answers** + +Run: + +```bash +export TOOLATHLON_GYM_ROOT=/path/to/toolathlon_gym +export OPENAI_API_KEY=... +export OPENAI_API_URL=... 
+python -m rollout.pipeline --config configs/trajectory/mcp_canvas_trajectory.json --output-dir /tmp/agentflow-mcp-canvas-check
+python -m rollout.pipeline --config configs/trajectory/mcp_snowflake_trajectory.json --output-dir /tmp/agentflow-mcp-snowflake-check
+python -m rollout.pipeline --config configs/trajectory/mcp_woocommerce_trajectory.json --output-dir /tmp/agentflow-mcp-woocommerce-check
+python -m rollout.pipeline --config configs/trajectory/mcp_yahoo_finance_trajectory.json --output-dir /tmp/agentflow-mcp-yahoo-finance-check
+python -m rollout.pipeline --config configs/trajectory/mcp_youtube_trajectory.json --output-dir /tmp/agentflow-mcp-youtube-check
+python -m rollout.pipeline --config configs/trajectory/mcp_train_trajectory.json --output-dir /tmp/agentflow-mcp-train-check
+```
+
+For each domain:
+
+- open the newest results JSONL under the matching `/tmp/agentflow-mcp-<domain>-check/` directory
+- for each task, verify the candidate answer against the saved trajectory’s MCP tool output (`trajectory.messages` tool entries and/or `trajectory.tool_calls[*].result`) rather than trusting only `predicted_answer`
+- copy the tool-supported final textual answers into the corresponding two rows in `benchmark/mcp_<domain>_benchmark.jsonl`
+
+Expected: all six rollout commands start successfully, each output directory contains a results JSONL file, and every committed MCP benchmark answer is backed by observed tool output.
+ +- [ ] **Step 4: Validate that all MCP benchmark answers are now populated** + +Run: + +```bash +python - <<'PY' +import json +from pathlib import Path + +for path in sorted(Path("benchmark").glob("mcp_*_benchmark.jsonl")): + rows = [ + json.loads(line) + for line in path.read_text(encoding="utf-8").splitlines() + if line.strip() + ] + assert rows, path + assert all(isinstance(row["answer"], str) and row["answer"].strip() for row in rows), path +print("MCP benchmark answers verified") +PY +``` + +Expected: the script prints `MCP benchmark answers verified`. + +- [ ] **Step 5: Re-run the MCP asset contract test after editing the benchmark files** + +Run: + +```bash +pytest rollout/tests/test_mcp_example_assets.py -v +``` + +Expected: PASS. + +- [ ] **Step 6: Commit the grounded MCP benchmark answers** + +```bash +git add \ + benchmark/mcp_canvas_benchmark.jsonl \ + benchmark/mcp_snowflake_benchmark.jsonl \ + benchmark/mcp_woocommerce_benchmark.jsonl \ + benchmark/mcp_yahoo_finance_benchmark.jsonl \ + benchmark/mcp_youtube_benchmark.jsonl \ + benchmark/mcp_train_benchmark.jsonl +git commit -m "test: ground MCP benchmark answers" +``` + +### Task 10: Run optional Coding live smoke checks when LLM credentials are available + +**Files:** +- No planned file changes. Skip this task if `OPENAI_API_KEY` or `OPENAI_API_URL` are unavailable. Record the skip reason in the execution report. + +- [ ] **Step 1: Start the Code sandbox server** + +Run in a dedicated terminal from the AgentFlow repo root: + +```bash +./start_sandbox_server.sh --config configs/sandbox-server/code_config.json +``` + +Expected: the Code sandbox stays running while Step 2-4 execute. + +- [ ] **Step 2: Run one Coding synthesis smoke check with the bundled repo** + +Run: + +```bash +# from the AgentFlow repo root +export OPENAI_API_KEY=... +export OPENAI_API_URL=... 
+export AGENTFLOW_REPO_ROOT=$(pwd) +python synthesis/pipeline.py \ + --config configs/synthesis/code_config.json \ + --seeds seeds/code/seeds.jsonl \ + --output-dir /tmp/agentflow-code-synth-check +``` + +Expected: the command starts successfully with the checked-in Coding synthesis config. If the current synthesis pipeline still writes to the repo’s shared aggregation directory, confirm that new QA / trajectory rows appear there; otherwise confirm output appears under `/tmp/agentflow-code-synth-check/`. + +- [ ] **Step 3: Run one Coding rollout smoke check with the bundled repo** + +Run: + +```bash +# from the AgentFlow repo root +export OPENAI_API_KEY=... +export OPENAI_API_URL=... +export AGENTFLOW_REPO_ROOT=$(pwd) +python -m rollout.pipeline \ + --config configs/trajectory/code_trajectory.json \ + --task-ids code_edit_001 \ + --output-dir /tmp/agentflow-code-final-check +``` + +Expected: the rollout starts successfully, copies the bundled demo repo into the code workspace, and writes results JSONL under `/tmp/agentflow-code-final-check/`. + +- [ ] **Step 4: Optionally prove the documented `source_dir` override path works with another local repo** + +Run: + +```bash +# from the AgentFlow repo root +export OPENAI_API_KEY=... +export OPENAI_API_URL=... 
+export AGENTFLOW_REPO_ROOT=$(pwd) +export LOCAL_CODE_REPO=/abs/path/to/local/repo +python - <<'PY' +import json +import os +from pathlib import Path + +src = Path("configs/trajectory/code_trajectory.json") +dst = Path("/tmp/code_trajectory_local_repo.json") +payload = json.loads(src.read_text(encoding="utf-8")) +payload["resource_init_configs"]["code"]["content"]["source_dir"] = os.environ["LOCAL_CODE_REPO"] +payload["data_path"] = "/tmp/code_override_benchmark.jsonl" +dst.write_text(json.dumps(payload, indent=2), encoding="utf-8") +Path("/tmp/code_override_benchmark.jsonl").write_text( + json.dumps( + { + "id": "code_override_read_001", + "question": "Use code tools to inspect the repository and reply with the relative path of any one file located at the repository root.", + } + ) + + "\n", + encoding="utf-8", +) +print(dst) +PY +python -m rollout.pipeline \ + --config /tmp/code_trajectory_local_repo.json \ + --max-tasks 1 \ + --output-dir /tmp/agentflow-code-local-repo-check +``` + +Expected: the override-repo rollout starts successfully against the temporary generic benchmark and writes a result file, proving the documented `source_dir` override works without hard-coding any machine-local repo path into committed assets. + +Plan complete and saved to `docs/superpowers/plans/2026-04-20-mcp-and-coding-examples-implementation.md`. Ready to execute? 
diff --git a/docs/superpowers/specs/2026-04-15-code-backend-single-repo-vendoring-design.md b/docs/superpowers/specs/2026-04-15-code-backend-single-repo-vendoring-design.md new file mode 100644 index 0000000..7085867 --- /dev/null +++ b/docs/superpowers/specs/2026-04-15-code-backend-single-repo-vendoring-design.md @@ -0,0 +1,356 @@ +## Code Backend Single-Repository Vendoring Design + +Date: 2026-04-15 +Status: Approved for planning +Supersedes: `docs/superpowers/specs/2026-04-15-code-backend-design.md` + +## Summary + +AgentFlow's current `code` backend depends on an external `claude-code-py` source tree through `claude_code_root`. That makes the feature non-portable, couples runtime behavior to a closed or separately managed repository, and introduces an avoidable configuration requirement. + +This design replaces that approach with an internal vendored compatibility layer inside AgentFlow. AgentFlow will vendor the minimal upstream code-tool subset it actually needs, and the `code` backend will load those vendored classes directly from the AgentFlow repository. + +The backend remains a session-scoped resource backend with per-worker workspaces and AgentFlow-owned path-boundary enforcement. The six exposed coding tools remain: + +- `code:read` +- `code:glob` +- `code:grep` +- `code:bash` +- `code:edit` +- `code:write` + +Unlike the current implementation, all six tools, including `bash`, will execute through vendored upstream-style tool classes. AgentFlow will stop treating `bash` as a separately wrapped special case. + +## Problem Statement + +The current design has three architectural problems: + +- It requires `claude_code_root` in sandbox config, which breaks single-repository portability. +- It relies on dynamic source loading from another tree, which is brittle and hard to reason about. 
+- It contains an internal inconsistency: `code:bash` is nominally part of the reused six-tool set, but in practice it bypasses the loaded upstream `BashTool` and runs through an AgentFlow-specific subprocess wrapper. + +These are not desirable "advanced configuration" choices. They are design mistakes for a feature that should ship as a self-contained AgentFlow capability. + +## Goals + +- Make the `code` backend runnable from the AgentFlow repository alone. +- Keep the six coding tools behaviorally aligned with the upstream lightweight tool implementations. +- Remove all dependency on external `claude-code-py` runtime paths and dynamic import plumbing. +- Keep `code` as a session-scoped backend with isolated worker workspaces. +- Preserve AgentFlow's existing rollout and sandbox abstractions. +- Add a clear test strategy that covers unit behavior, backend integration, and a real rollout smoke path. + +## Non-Goals + +- Do not vendor the full `claude-code-py` runtime. +- Do not vendor query loops, skills, tracing, memory loading, or sub-agent functionality. +- Do not add hard shell sandboxing. +- Do not redesign rollout configuration, sandbox protocols, or tool schema conventions. +- Do not keep backward compatibility for `claude_code_root`, `allow_bash`, or `bash_timeout_seconds`. + +## Core Decisions + +### 1. Vendor a minimal compatibility layer + +AgentFlow will vendor only the minimal code-tool slice needed for the `code` backend: + +- a minimal `Tool` base class +- `ReadTool` +- `GlobTool` +- `GrepTool` +- `BashTool` +- `EditTool` +- `WriteTool` + +The vendored code should be a small, clearly bounded package inside AgentFlow, with only the minimum import adjustments required to make it internal and self-contained. + +### 2. Remove the external-root model completely + +The new design deletes the idea that AgentFlow should discover coding tools from another source tree at runtime. 
+ +Delete these concepts from implementation, config, tests, and docs: + +- `claude_code_root` +- dynamic import of upstream files +- root-local support-module loading +- compatibility tests that verify loading from an external tree + +This is an intentional removal, not a soft deprecation. + +### 3. Treat `bash` as a normal member of the six-tool set + +The vendored `BashTool` will be used the same way as the other five vendored tools: through the common tool-loading path and `tool.call(params, ctx)` execution model. + +Delete these concepts from implementation, config, tests, and docs: + +- `allow_bash` +- `bash_timeout_seconds` +- AgentFlow-specific `_run_bash_command()` behavior +- config-availability messaging for `code-bash` + +The `code` backend will expose all six tools all the time. + +### 4. Keep AgentFlow-owned environment boundaries + +Vendoring the tool classes does not move workspace safety into the vendored code. AgentFlow still owns: + +- per-worker workspace creation +- `source_dir` copying +- worker/session identity checks +- file-path normalization relative to workspace +- path-escape rejection for file-oriented tools + +This separation keeps the vendored code small and keeps environment policy at the backend boundary where AgentFlow already owns session state. + +## Architecture + +### Vendored package layout + +Add a dedicated internal package for the vendored code tools, for example: + +- `sandbox/server/backends/resources/code_vendor/__init__.py` +- `sandbox/server/backends/resources/code_vendor/tool.py` +- `sandbox/server/backends/resources/code_vendor/file_tools.py` +- `sandbox/server/backends/resources/code_vendor/edit_tools.py` + +The package name should make it obvious that this is a bounded internal compatibility layer, not a general-purpose reimplementation of `claude-code-py`. 
+ +### Backend responsibilities + +`CodeBackend` remains responsible for: + +- registering the six `code:*` bridge tools +- creating and cleaning per-worker workspaces +- copying optional `source_dir` contents into the session workspace +- validating the session workspace against the worker id +- enforcing file-path boundaries for file-oriented tools +- instantiating and caching the six vendored tool classes + +`CodeBackend` no longer needs: + +- `_get_claude_code_root()` +- `_validate_claude_code_root_prerequisites()` +- `_load_root_support_modules()` +- dynamic module alias installation +- `_run_bash_command()` +- any `bash`-only dispatch branch + +### Runtime flow + +The runtime flow becomes: + +1. `initialize()` validates `worker_id`, prepares a staged workspace, and optionally copies `source_dir`. +2. The backend ensures vendored tool instances are loaded from the repository itself. +3. The staged workspace becomes the active workspace. +4. Bridge dispatch resolves the worker session workspace. +5. For file-oriented tools, AgentFlow normalizes and bounds path-like parameters to the workspace. +6. The backend creates a minimal context adapter with `cwd=`. +7. All six tools execute through the same vendored `tool.call(...)` path. +8. AgentFlow wraps results into standard backend success/error responses. + +### Minimal context adapter + +The vendored six-tool subset only needs a tiny runtime context: + +```python +SimpleNamespace(cwd=str(workspace)) +``` + +No full agent runtime model is needed. 
+ +## Tool Behavior Contract + +The tool surface remains unchanged: + +- prompt-visible schemas stay `code-read`, `code-glob`, `code-grep`, `code-bash`, `code-edit`, `code-write` +- runtime names stay `code:read`, `code:glob`, `code:grep`, `code:bash`, `code:edit`, `code:write` +- parameter names remain aligned with the vendored upstream tool classes + +Behaviorally, the backend should preserve: + +- line-numbered `read` output +- recursive globbing behavior +- recursive grep behavior with optional file filter +- exact-match edit semantics with uniqueness checks +- full-file overwrite semantics for `write` +- upstream-style shell execution behavior for `bash` + +AgentFlow should not add new `bash`-specific runtime policy once vendoring is complete. + +## Configuration Design + +### Backend config + +After the redesign, `code` backend config should keep only what is still meaningfully owned by AgentFlow: + +- `workspace_root` + +The config example should therefore look like: + +```json +{ + "server": { + "url": "http://127.0.0.1:18890", + "port": 18890, + "session_ttl": 300 + }, + "resources": { + "code": { + "enabled": true, + "description": "Lightweight coding backend with vendored upstream-style tools", + "backend_class": "sandbox.server.backends.resources.code.CodeBackend", + "config": { + "workspace_root": "/tmp/agentflow_code" + } + } + }, + "warmup": { + "enabled": false, + "resources": [] + } +} +``` + +### Session init config + +Session init remains intentionally small: + +- `source_dir`: optional directory copied into the session workspace + +Rollout-facing use stays: + +- `resource_types=["code"]` +- `available_tools=["code-*"]` +- `resource_init_configs["code"]["content"]["source_dir"]` + +## Testing Strategy + +The testing changes must be explicit. This work is not only about adding tests; it also requires deleting tests and rewriting tests that lock in the old design mistake. 
+ +### Delete old tests + +Delete tests that exist only to validate the old external-root or AgentFlow-specific bash wrapper model, including categories such as: + +- external `claude_code_root` requirement +- fake external upstream roots +- dynamic loading from another repository +- root-local support-module loading +- isolated-per-root loader behavior +- `allow_bash` gating +- `bash_timeout_seconds` +- AgentFlow-specific `bash` input validation that no longer exists in the vendored-upstream model +- config-template checks that still mention deleted fields +- env-var expansion tests whose only purpose was `CLAUDE_CODE_ROOT` + +### Modify existing tests + +Keep and adapt the tests that remain valid for the new architecture: + +- tool registration tests +- workspace initialization and `source_dir` copy tests +- workspace recreation tests +- cleanup safety tests +- `worker_id` validation tests +- session workspace identity and boundary tests +- file-path normalization and escape rejection tests +- successful bridge dispatch and standard response-shape tests +- tool schema presence/filtering/parameter-contract tests + +Schema tests must update descriptions so `code-bash` no longer claims backend-config-dependent availability. + +### Add new tests + +Add new focused tests for the vendored model: + +- vendored tool loading from the internal package +- all six tools executing through the same tool-call path +- vendored `BashTool` behavior contract +- vendored `EditTool`/`WriteTool` behavior contract where existing bridge tests do not already cover it + +### Add a rollout-facing smoke test + +Add one end-to-end rollout smoke that exercises the real rollout-to-sandbox-to-code-backend path. 
+ +The smoke should: + +- live under `rollout/tests/` +- follow the MCP real-smoke opt-in pattern +- not be collected in default pytest execution +- require explicit manual invocation, for example by setting `AGENTFLOW_RUN_CODE_REAL=1` +- use a real LLM response path and a real sandbox/code backend path + +This smoke should not mock sandbox components. It should really: + +- start sandbox +- create a `code` session +- copy a tiny fixture repo into the workspace +- expose `code-*` tools through rollout +- execute at least one real `code:*` tool call + +Recommended smoke structure: + +1. Create a temporary fixture repo with a uniquely identifiable file, for example `nested/TOKEN.txt`. +2. Write a hard-to-guess token into that file. +3. Create a one-task benchmark asking the agent to use code tools to read the file and return only the exact token. +4. Run `RolloutPipeline` with: + - `available_tools=["code-*"]` + - `resource_types=["code"]` + - `resource_init_configs["code"]["content"]["source_dir"]=fixture_repo` + - `sandbox_config_path="configs/sandbox-server/code_config.json"` + - `sandbox_auto_start=True` + - `number_of_tasks=1` + - `evaluate_results=False` + - `save_trajectories=True` +5. Assert that: + - the task succeeds + - the trajectory contains at least one `code:*` tool call + - the final answer equals the token + - the token appears in the observed tool-result chain + +Credential provisioning for that opt-in real smoke remains an execution-time concern and must not be hardcoded into repository defaults. 
+ +## Documentation Changes + +Update all user-facing and internal docs that still describe the deleted design: + +- `configs/sandbox-server/code_config.json` +- `sandbox/tests/test_sandbox_config_loading.py` +- `sandbox/tests/test_code_tool_schemas.py` +- any code-backend README/tutorial snippets +- the prior `2026-04-15-code-backend-design.md` should be treated as superseded + +The resulting documentation should consistently present the `code` backend as a native AgentFlow capability. + +## Risks + +- Vendored code can drift from future upstream changes. + Mitigation: treat the vendored subset as an intentionally frozen internal compatibility layer and cover it with explicit behavior tests. + +- Real rollout smoke tests can be flaky because they depend on live model behavior and external connectivity. + Mitigation: keep them opt-in and strongly constrain the task prompt and fixture. + +- `bash` remains powerful because it executes shell commands relative to the workspace but without OS-level isolation. + Mitigation: document this clearly as an inherent property of the `code` backend rather than disguising it behind partial configuration toggles. + +## Recommended Implementation Order + +1. Vendor the minimal upstream six-tool compatibility layer into AgentFlow. +2. Simplify `CodeBackend` to load vendored tools directly and remove all external-root logic. +3. Remove `bash` special handling so all six tools share one execution path. +4. Simplify `code` backend config to `workspace_root` only. +5. Update schema descriptions and sandbox config examples. +6. Delete old tests tied to the removed design. +7. Adapt retained backend and schema tests. +8. Add vendored-tool behavior coverage. +9. Add the opt-in real rollout smoke. + +## Decision + +AgentFlow should stop treating the `code` backend as a thin adapter over an external source tree and instead ship a self-contained, vendored upstream-style compatibility layer inside the repository. 
+ +This restores the intended product boundary: + +- AgentFlow owns the coding environment as a native feature +- all six code tools are internally available +- rollout and sandbox integration stay unchanged +- the repository becomes portable again diff --git a/docs/superpowers/specs/2026-04-20-mcp-and-coding-examples-design.md b/docs/superpowers/specs/2026-04-20-mcp-and-coding-examples-design.md new file mode 100644 index 0000000..e34502c --- /dev/null +++ b/docs/superpowers/specs/2026-04-20-mcp-and-coding-examples-design.md @@ -0,0 +1,506 @@ +## MCP and Coding Examples Design + +Date: 2026-04-20 +Status: Approved for planning + +## Summary + +AgentFlow now has MCP and `code` backends, but the repository still lacks official example guides and matching example assets for them. This design adds two new examples that stay aligned with the current example set in structure, tone, and scope: + +- `examples/MCPAgent.md` +- `examples/CodingAgent.md` + +Both examples will follow the lighter three-step pattern already used by `DSAgent`: + +- Step 1: Start the Sandbox Server +- Step 2: Synthesize QA Data +- Step 3: Synthesize Trajectory Data + +They will ship with the configs, seeds, and benchmark files needed to make those steps runnable. + +## Problem Statement + +The repository already contains: + +- a working MCP backend with 25 Toolathlon-GYM MCP servers +- a working `code` backend with six coding tools +- sandbox config templates for both backends +- tests that prove both backends work at the backend level + +What is missing is the user-facing example layer: + +- no official `MCPAgent` example document +- no official `CodingAgent` example document +- no matching synthesis configs +- no matching trajectory configs +- no seeds for either example +- no benchmark data for MCP rollout +- no committed benchmark data for coding rollout + +Without these assets, the new backends are discoverable in code but not presented as first-class AgentFlow example workflows. 
+ +## Goals + +- Add official MCP and Coding example guides under `examples/`. +- Keep both guides stylistically aligned with the current example set. +- Keep document granularity aligned with current examples rather than exposing design or orchestration internals. +- Limit both examples to the currently practical scope: sandbox startup, QA synthesis, and trajectory rollout. +- Add the configs and data assets needed to support those examples. +- Use demo-scale seeds and benchmarks that are easy to run and verify. +- Make MCP example tasks operate against the initialized Toolathlon-GYM mock database. +- Make Coding example tasks operate against a known demo repository copied into the coding workspace. + +## Non-Goals + +- Do not turn either example into a full Toolathlon task replay framework. +- Do not document or expose internal validation strategy, subagent orchestration, or workspace isolation rationale in the example docs. +- Do not add Step 4 and Step 5 sections for training, deployment, or infer/eval. +- Do not introduce extra registry files or config-generation layers just to organize domains. +- Do not require MCP `task_dir`, `initial_workspace`, or `preprocess` flows for the example path. +- Do not make CodingAgent depend on an arbitrary external repository whose contents are unknown to the repository. + +## User-Facing Outcome + +After this work, the repository will present MCP and Coding the same way it already presents RAG, Doc, DS, and Text2SQL: + +- a dedicated example document +- a sandbox config entry point +- a synthesis config +- a trajectory config +- seed data +- benchmark data where rollout needs it + +The examples will read like the existing examples and will not require readers to understand internal backend architecture. + +## Core Decisions + +### 1. Both new examples use the three-step DS-style structure + +`MCPAgent.md` and `CodingAgent.md` will both mirror the scope of `examples/DSAgent.md` rather than the five-step examples. 
+ +Each document will include: + +- Overview +- Prerequisites +- Pipeline Overview +- Step 1: Start the Sandbox Server +- Step 2: Synthesize QA Data +- Step 3: Synthesize Trajectory Data +- Configuration Reference +- FAQ + +Each document will explicitly note that the repository currently covers data synthesis and trajectory rollout for that example, but not the later training/deployment/infer stages as an official example workflow. + +### 2. MCPAgent is a domain-level demo over the initialized Toolathlon-GYM database + +The MCP example will use Toolathlon-GYM as the backing environment, but it will not replay full Toolathlon task directories. + +Instead, it will use: + +- the initialized Toolathlon-GYM mock PostgreSQL database +- the MCP servers exposed through AgentFlow +- small domain-level seeds +- small domain-level rollout benchmarks + +This keeps the example aligned with the rest of the repository's example style while still using the real MCP domain data. + +### 3. MCPAgent reuses `configs/sandbox-server/mcp_config.json` + +The existing MCP sandbox config path remains the canonical entry point: + +- `configs/sandbox-server/mcp_config.json` + +This file will be updated so its default `enabled_mcp_servers` matches the exact server subset needed by the official example domains, rather than the full 25-server backend surface: + +- `canvas` +- `snowflake` +- `woocommerce` +- `yahoo-finance` +- `youtube` +- `youtube-transcript` +- `rail_12306` +- `filesystem` + +This keeps Step 1 aligned with the example scope while still allowing all six documented domains to run from the shared MCP sandbox entry point. + +This is an intentional example-oriented default, not a removal of backend capability. The current checked-in `mcp_config.json` already enables only a subset of servers today, and full-surface MCP usage will remain available by expanding `enabled_mcp_servers` in the same file or in a user-local copy outside the official examples. 
+ +The same config will also define an explicit MCP server path contract so the checked-in MCP YAMLs can resolve `${local_servers_paths}` at runtime. The planned default is an environment-backed path such as: + +- `mcp_servers_path: "${TOOLATHLON_GYM_ROOT}/local_servers"` + +The implementation will rely on the existing MCP backend translation layer: `ToolathlonGymBackend` passes `mcp_servers_path` into the MCP YAML loader, and that loader substitutes the value into `${local_servers_paths}` when resolving each bundled server YAML. +In other words, `mcp_servers_path` is the JSON config field name, while `${local_servers_paths}` is the existing MCP YAML placeholder name for the same `local_servers/` directory. + +The current repository version of `configs/sandbox-server/mcp_config.json` does not yet define `mcp_servers_path`; adding that field is part of this example work. + +No separate `mcp_all_config.json` or metadata registry file will be introduced. + +### 4. MCP tool exposure is defined by server wildcard, not hand-picked tool names + +For each MCP domain config, `available_tools` will expose tools via server wildcard patterns such as: + +- `mcp:canvas.*` +- `mcp:snowflake.*` +- `mcp:filesystem.*` + +This avoids silent omission of tools from an included MCP server namespace and keeps the config surface simple. + +### 5. MCP resource init config stays minimal + +The MCP backend supports session init fields such as: + +- `task_dir` +- `copy_initial_workspace` +- `run_preprocess` +- `launch_time` + +Those fields are only needed when running task-directory-style Toolathlon tasks. + +For the MCP example path in this design, the session only needs a normal MCP workspace plus access to the initialized mock database through the configured MCP servers. Therefore: + +- `resource_types` will include `["mcp"]` +- `resource_init_configs.mcp.content` will be omitted or empty in example configs + +### 6. 
CodingAgent uses a repository-local demo repo via `source_dir` + +The coding backend always creates its own workspace, but a meaningful coding task needs actual repository contents inside that workspace. + +Therefore the official Coding example will use: + +- a small demo repository committed inside AgentFlow +- `resource_types=["code"]` +- `resource_init_configs["code"]["content"]["source_dir"]` pointing to that demo repository through the explicit repo-root contract `${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo` + +AgentFlow's config loader already expands `${VAR}` placeholders before backend initialization, so no new path resolver is needed for this contract. The official docs will require exporting `AGENTFLOW_REPO_ROOT` after `cd AgentFlow`, and Step 2 / Step 3 will use that variable consistently. Users can later replace `source_dir` with their own repository path, but the official example will ship with a known default so its seeds and benchmark remain correct. + +### 7. CodingAgent uses demo-scale mixed tasks + +CodingAgent will use a small mixed task set: + +- read-only repository inspection tasks +- one or more controlled edit tasks with straightforward verification + +This preserves the intended coding flavor without making the example depend on large or fragile repository setups. + +### 8. Keep the docs at current example granularity + +The example docs should not explain design trade-offs, internal isolation, subagent strategy, or backend reasoning unless the current example set already does so. 
+ +They should look and read like the existing repository examples: + +- concrete commands +- config file references +- short explanations of required inputs +- key config field summaries +- brief FAQs + +## MCPAgent Design + +### Covered domains + +The MCP example will cover the six data-rich Toolathlon-GYM domains already reflected in current MCP integration smoke tests: + +- `canvas` +- `snowflake` +- `woocommerce` +- `yahoo_finance` +- `youtube` +- `train` + +The docs will cover all six domains, but each domain is still a small demo workflow rather than a long end-to-end enterprise task. + +Server name mapping will follow the current MCP backend naming: + +- `yahoo_finance` uses MCP server `yahoo-finance` +- `train` uses MCP server `rail_12306` + +### Sandbox prerequisites and server subset + +`examples/MCPAgent.md` will document the minimum local prerequisites needed for Step 1 to be runnable: + +- a local `toolathlon_gym` checkout that has already completed its own setup and is running before AgentFlow starts +- `TOOLATHLON_GYM_ROOT` pointing to that checkout +- the MCP server bundle reachable at `${TOOLATHLON_GYM_ROOT}/local_servers` +- required local runtimes such as `node` and `uv` +- the following planned example defaults in `configs/sandbox-server/mcp_config.json`: + - `PGHOST=localhost` + - `PGPORT=5432` + - `PGUSER=eigent` + - `PGPASSWORD=camel` + - `PGDATABASE=toolathlon_gym` + - `CANVAS_DOMAIN=localhost:8080` + - `WORDPRESS_SITE_URL=http://localhost:8081` + +AgentFlow will not bootstrap the Toolathlon-GYM services itself in the official example. If a local setup differs from those defaults, the doc will show them as explicit override points in `mcp_config.json`. + +The checked-in MCP sandbox config will enable only the shared example subset listed in Core Decision 3, so warmup behavior matches the domains covered by the example doc. 
+ +### Files to add + +Add: + +- `examples/MCPAgent.md` +- `configs/synthesis/mcp_canvas_config.json` +- `configs/synthesis/mcp_snowflake_config.json` +- `configs/synthesis/mcp_woocommerce_config.json` +- `configs/synthesis/mcp_yahoo_finance_config.json` +- `configs/synthesis/mcp_youtube_config.json` +- `configs/synthesis/mcp_train_config.json` +- `configs/trajectory/mcp_canvas_trajectory.json` +- `configs/trajectory/mcp_snowflake_trajectory.json` +- `configs/trajectory/mcp_woocommerce_trajectory.json` +- `configs/trajectory/mcp_yahoo_finance_trajectory.json` +- `configs/trajectory/mcp_youtube_trajectory.json` +- `configs/trajectory/mcp_train_trajectory.json` +- `seeds/mcp/canvas_seeds.jsonl` +- `seeds/mcp/snowflake_seeds.jsonl` +- `seeds/mcp/woocommerce_seeds.jsonl` +- `seeds/mcp/yahoo_finance_seeds.jsonl` +- `seeds/mcp/youtube_seeds.jsonl` +- `seeds/mcp/train_seeds.jsonl` +- `benchmark/mcp_canvas_benchmark.jsonl` +- `benchmark/mcp_snowflake_benchmark.jsonl` +- `benchmark/mcp_woocommerce_benchmark.jsonl` +- `benchmark/mcp_yahoo_finance_benchmark.jsonl` +- `benchmark/mcp_youtube_benchmark.jsonl` +- `benchmark/mcp_train_benchmark.jsonl` + +Modify: + +- `configs/sandbox-server/mcp_config.json` + +### Synthesis config shape + +Each `configs/synthesis/mcp__config.json` will follow the same structure as existing synthesis configs and will include: + +- model settings +- sandbox settings +- `resource_types: ["mcp"]` +- `available_tools` using MCP server wildcards +- domain-specific `sampling_tips` +- domain-specific `synthesis_tips` +- small `qa_examples` +- `seeds_file` +- `output_dir` + +These configs will rely on the shared MCP sandbox startup path above rather than redefining server startup details per domain. 
+ +### Trajectory config shape + +Each `configs/trajectory/mcp__trajectory.json` will follow the same shape as existing rollout trajectory configs and will include: + +- `benchmark_name` +- model settings +- sandbox settings +- `resource_types: ["mcp"]` +- `available_tools` using MCP server wildcards +- `system_prompt` +- `data_path` +- `output_dir` +- `save_results` +- `save_trajectories` +- `trajectory_only: true` +- `evaluate_results: false` + +### MCP domain tool exposure + +Planned MCP wildcard exposure: + +- `canvas` + - `mcp:canvas.*` + - `mcp:filesystem.*` + +- `snowflake` + - `mcp:snowflake.*` + - `mcp:filesystem.*` + +- `woocommerce` + - `mcp:woocommerce.*` + - `mcp:filesystem.*` + +- `yahoo_finance` + - `mcp:yahoo-finance.*` + - `mcp:filesystem.*` + +- `youtube` + - `mcp:youtube.*` + - `mcp:youtube-transcript.*` + - `mcp:filesystem.*` + +- `train` + - `mcp:rail_12306.*` + - `mcp:filesystem.*` + +### Seeds and benchmark style + +MCP seeds and benchmarks will be demo-scale and domain-focused. + +They should validate that the MCP backend and domain server set work cleanly inside AgentFlow, not replicate the full complexity of Toolathlon task packs. + +Expected MCP task style: + +- query real mock data from the target domain +- optionally save a result artifact into the workspace through filesystem tools +- produce answers that are easy to verify in a small benchmark + +Benchmark correctness will be defined by the final textual answer in each benchmark row. Workspace artifact creation is allowed as an illustrative side effect, but it is not required for benchmark success and will not be treated as a scoring criterion in the official example data. 
+ +Examples of target task shape: + +- list a small set of course or user information from Canvas +- query a small Snowflake-backed table result +- inspect WooCommerce customer or order data +- fetch Yahoo Finance stock information +- search YouTube content or transcript metadata +- look up railway station or route information + +### MCP example document shape + +`examples/MCPAgent.md` will stay at the same granularity as current examples: + +- one shared sandbox setup section +- one synthesis step covering all six domain configs +- one trajectory step covering all six domain configs +- a compact configuration reference +- a short FAQ + +It will not explain internal domain orchestration, workspace strategy, or design rationale. + +## CodingAgent Design + +### Sandbox entry point + +`examples/CodingAgent.md` will reuse the existing coding sandbox config entry point: + +- `configs/sandbox-server/code_config.json` + +Step 1 in the Coding example will start that config directly, matching the current repository pattern of reusing a checked-in sandbox config rather than introducing a second coding sandbox file. + +### Files to add + +Add: + +- `examples/CodingAgent.md` +- `configs/synthesis/code_config.json` +- `configs/trajectory/code_trajectory.json` +- `seeds/code/seeds.jsonl` +- `seeds/code/seed/demo_repo/README.md` +- `seeds/code/seed/demo_repo/app.py` +- `seeds/code/seed/demo_repo/config/app_config.json` +- `seeds/code/seed/demo_repo/lib/helpers.py` +- `seeds/code/seed/demo_repo/tests/smoke_test.py` +- `benchmark/code_benchmark.jsonl` + +The demo repository under `seeds/code/seed/demo_repo/` should be small, stable, and easy to understand. 
+ +### Synthesis config shape + +`configs/synthesis/code_config.json` will follow existing synthesis config structure and include: + +- model settings +- sandbox settings +- `resource_types: ["code"]` +- `resource_init_configs.code.content.source_dir` using the explicit repo-root contract `${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo` +- `available_tools: ["code-*"]` +- coding-specific `sampling_tips` +- coding-specific `synthesis_tips` +- small `qa_examples` +- `seeds_file` +- `output_dir` + +### Trajectory config shape + +`configs/trajectory/code_trajectory.json` will follow existing rollout config structure and include: + +- `benchmark_name` +- model settings +- sandbox settings +- `resource_types: ["code"]` +- `resource_init_configs.code.content.source_dir` using the same explicit repo-root contract `${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo` +- `available_tools: ["code-*"]` +- coding-specific `system_prompt` +- `data_path` +- `output_dir` +- `save_results` +- `save_trajectories` +- `trajectory_only: true` +- `evaluate_results: false` + +### Demo repository shape + +The demo repository should be intentionally small and support both task types: + +- repository inspection +- controlled edit and verification + +The repo should include a few files such as: + +- `README.md` describing the tiny app +- `app.py` as the main entry file +- `config/app_config.json` with one or two settings used by the app +- `lib/helpers.py` with at least one helper imported by `app.py` +- `tests/smoke_test.py` for a minimal verification path + +The goal is not realism through size. The goal is stable, example-quality coding tasks. + +### Seeds and benchmark style + +Coding seeds should focus on repository understanding prompts. 
+ +Coding benchmark tasks should be few and simple, mixing: + +- read-only tasks such as locating files, reading configuration values, or identifying relationships +- edit tasks such as replacing a placeholder string or updating a simple setting + +The benchmark should be authored against the committed demo repository so expected answers remain stable. + +Benchmark contract: + +- read-only tasks will use the standard `id` + `question` + `answer` shape +- edit tasks will still run under `trajectory_only: true` and `evaluate_results: false`, so they are for trajectory capture rather than auto-grading +- edit-task rows will include a short expected completion statement in `answer` plus metadata such as `target_files` and `check_command` to document the intended post-run verification path +- those extra verification fields will live under benchmark `metadata`, so existing rollout loaders can safely ignore them +- the recommended verification path for edit tasks will be the committed `tests/smoke_test.py`, not rollout-time automatic scoring + +### Coding example document shape + +`examples/CodingAgent.md` will mirror the style and scope of `examples/DSAgent.md`: + +- Overview +- Prerequisites +- Pipeline Overview +- Step 1: Start the Sandbox Server +- Step 2: Synthesize QA Data +- Step 3: Synthesize Trajectory Data +- Configuration Reference +- FAQ + +It will explicitly state that the repository currently covers data synthesis and trajectory rollout for this coding example, but not later training/deployment/infer steps as part of the official example. 
+ +## Validation Expectations + +The implementation should be considered correct only if: + +- the new example docs match the style and granularity of current examples +- the new configs parse successfully +- `configs/sandbox-server/mcp_config.json` resolves MCP server executables through the documented `TOOLATHLON_GYM_ROOT` contract +- MCP synthesis and rollout configs align with the real MCP backend surface +- Coding synthesis and rollout configs align with the real code backend surface +- the demo seeds and benchmarks are internally consistent with the assets they target +- representative runs can be executed by following the example documents +- Coding Step 1, QA synthesis, and rollout remain runnable when the documented `cd AgentFlow` plus `export AGENTFLOW_REPO_ROOT=$(pwd)` prerequisite is followed + +## Open Questions Resolved + +- Use `configs/sandbox-server/mcp_config.json` directly: yes +- Add a separate MCP registry file: no +- Use server wildcards instead of hand-picked MCP tool names: yes +- Use Toolathlon task-directory initialization for the example path: no +- Use Toolathlon-GYM mock database as the MCP data source: yes +- Add official infer/eval steps for MCP or Coding examples: no +- Use a repository-local demo repo for CodingAgent: yes + +## Implementation Readiness + +This design is ready for implementation planning. The work is focused, bounded, and does not require redesigning backend behavior. The main deliverables are user-facing docs, example configs, and small example data assets. diff --git a/examples/CodingAgent.md b/examples/CodingAgent.md new file mode 100644 index 0000000..8d754c2 --- /dev/null +++ b/examples/CodingAgent.md @@ -0,0 +1,139 @@ +# CodingAgent: Repository QA / Edit Agent — 3-Step Example Guide (No Training/Deployment/Inference) + +This guide explains how to use AgentFlow's CodingAgent example to synthesize QA data and trajectory data for a small repository workflow. 
+ +CodingAgent uses the `code` backend's six tools through `code-*`, so the same setup covers repository inspection, search, shell validation, and small file edits. + +## Overview + +CodingAgent is a repository-grounded coding agent example. It works against a local code workspace and uses six code tools exposed through `code-*`: + +- `code-read` +- `code-glob` +- `code-grep` +- `code-bash` +- `code-edit` +- `code-write` + +The bundled example is intentionally small and example-oriented. It includes both a read-only question style and an edit-task style based on the committed demo repository, so you can synthesize data for repository inspection and minimal bug-fix workflows from the same assets. + +## Prerequisites + +Install and enter the repository: + +```bash +git clone https://github.com/OpenDCAI/AgentFlow +cd AgentFlow +pip install -e . +``` + +Configure model access and the repo-root contract used by the committed code example configs: + +```bash +export OPENAI_API_KEY=YOUR_KEY +export OPENAI_API_URL=https://openrouter.ai/api/v1 +export AGENTFLOW_REPO_ROOT=$(pwd) +``` + +The default committed repository for this example lives at `${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo`. + +Seed prompts are stored in `seeds/code/seeds.jsonl`. If you want to run the same pipeline on your own repository instead of the bundled demo repo, replace `resource_init_configs.code.content.source_dir` with your own repository path. + +## Pipeline Overview + +This CodingAgent example uses a simple three-step flow: + +```text +Step 1 Sandbox Server -> Step 2 QA Synthesis -> Step 3 Trajectory Data +``` + +The assets in this repo are already aligned for that flow: + +- Sandbox config: `configs/sandbox-server/code_config.json` +- QA synthesis config: `configs/synthesis/code_config.json` +- Trajectory rollout config: `configs/trajectory/code_trajectory.json` +- Benchmark file: `benchmark/code_benchmark.jsonl` + +The benchmark mixes a bundled read-only plus edit-task example style. 
One task asks the agent to inspect the repo and answer a question; another asks it to make a minimal fix and verify it with `python tests/smoke_test.py`. + +## Step 1: Start the Sandbox Server + +Start the sandbox server before synthesis or rollout: + +```bash +./start_sandbox_server.sh --config configs/sandbox-server/code_config.json +``` + +This launches the code resource backend and prepares per-run workspaces under the sandbox workspace root. + +## Step 2: Synthesize QA Data + +Use the committed synthesis config to generate repository-grounded QA from the coding seeds: + +```bash +python3 synthesis/pipeline.py \ + --config configs/synthesis/code_config.json \ + --seeds seeds/code/seeds.jsonl \ + --output-dir results/code +``` + +> Note: in this repo, synthesis currently writes to the fixed aggregation directory `results/ds_synthesized_qa/`, even if you provide a different `--output-dir`. + +- QA: `results/ds_synthesized_qa/synthesized_qa.jsonl` +- Trajectory: `results/ds_synthesized_qa/trajectories.jsonl` + +By default, the synthesis config initializes the code resource from `${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo` through `resource_init_configs.code.content.source_dir`. + +The committed prompts are designed around the bundled demo repository and support both repository-reading questions and a lightweight edit-validation workflow. + +## Step 3: Synthesize Trajectory Data + +Use rollout to generate trajectory-only records with the committed benchmark: + +```bash +python -m rollout.pipeline \ + --config configs/trajectory/code_trajectory.json \ + --output-dir trajectory_results/code +``` + +This config reads tasks from `benchmark/code_benchmark.jsonl` and keeps the same default repo-root contract via `${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo`. + +One bundled task explicitly validates the edit workflow by asking the agent to run `python tests/smoke_test.py` after making a minimal fix. 
+ +## Configuration Reference + +### Sandbox config + +`configs/sandbox-server/code_config.json` enables the `code` resource and points the sandbox to a temporary workspace root. + +### Synthesis config + +`configs/synthesis/code_config.json` defines: + +- `available_tools` as `code-*` +- `seeds_file` as `seeds/code/seeds.jsonl` +- `resource_init_configs.code.content.source_dir` as `${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo` + +If you want to use a different repository, update `source_dir` to your own path while keeping the rest of the pipeline structure the same. + +### Trajectory config + +`configs/trajectory/code_trajectory.json` defines: + +- `available_tools` as `code-*` +- `data_path` as `benchmark/code_benchmark.jsonl` +- `resource_init_configs.code.content.source_dir` as `${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo` + +## FAQ + +### What repository does the example use by default? + +The committed default is `seeds/code/seed/demo_repo`, resolved in config as `${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo`. + +### Can I point the example at my own repository? + +Yes. Replace `resource_init_configs.code.content.source_dir` with your own repo path in the synthesis or trajectory config you want to run. + +### Does this guide cover training or deployment? + +No. Later training / deployment / infer are not covered yet, so this guide stops after QA synthesis and trajectory generation. diff --git a/examples/MCPAgent.md b/examples/MCPAgent.md new file mode 100644 index 0000000..6c48595 --- /dev/null +++ b/examples/MCPAgent.md @@ -0,0 +1,187 @@ +# MCPAgent: MCP Tool Agent — 3-Step Example Guide (No Training/Deployment/Inference) + +This guide explains how to use AgentFlow's MCP example pipeline to generate QA data and trajectory data for six domains: `canvas`, `snowflake`, `woocommerce`, `yahoo_finance`, `youtube`, and `train`. + +This example is intentionally limited to sandbox startup, QA synthesis, and trajectory rollout. 
Later training, deployment, and infer workflows are not covered yet. + +## Table of Contents + +- [Overview](#overview) +- [Prerequisites](#prerequisites) +- [Pipeline Overview](#pipeline-overview) +- [Step 1: Start the Sandbox Server](#step-1-start-the-sandbox-server) +- [Step 2: Synthesize QA Data](#step-2-synthesize-qa-data) +- [Step 3: Synthesize Trajectory Data](#step-3-synthesize-trajectory-data) +- [Configuration Reference](#configuration-reference) +- [FAQ](#faq) + +--- + +## Overview + +MCPAgent is an example agent that talks to Toolathlon-GYM MCP servers through AgentFlow's sandbox server. The shared sandbox config is `configs/sandbox-server/mcp_config.json`, and the MCP server bundle is resolved from `${TOOLATHLON_GYM_ROOT}/local_servers`. + +The current example scope covers six domains: + +- `canvas` +- `snowflake` +- `woocommerce` +- `yahoo_finance` +- `youtube` +- `train` + +## Prerequisites + +Before running the example: + +- `cd AgentFlow` +- `export OPENAI_API_KEY=...` +- `export OPENAI_API_URL=...` +- `export TOOLATHLON_GYM_ROOT=/path/to/toolathlon_gym` +- Have an already prepared and already running Toolathlon-GYM environment +- Ensure `node` and `uv` are installed + +## Pipeline Overview + +The verified MCP example pipeline in this repo is: + +```text +Sandbox Setup -> QA Synthesis -> Trajectory Rollout +``` + +All synthesis configs use the shared sandbox config `configs/sandbox-server/mcp_config.json`. The example stops after QA synthesis and trajectory rollout, so training, deployment, and infer/evaluation flows are not covered. 
+ +## Step 1: Start the Sandbox Server + +Start the sandbox server once before running synthesis or rollout: + +```bash +./start_sandbox_server.sh --config configs/sandbox-server/mcp_config.json +``` + +## Step 2: Synthesize QA Data + +Run the synthesis pipeline once per domain: + +```bash +python3 synthesis/pipeline.py \ + --config configs/synthesis/mcp_canvas_config.json \ + --seeds seeds/mcp/canvas_seeds.jsonl \ + --output-dir results/mcp_canvas + +python3 synthesis/pipeline.py \ + --config configs/synthesis/mcp_snowflake_config.json \ + --seeds seeds/mcp/snowflake_seeds.jsonl \ + --output-dir results/mcp_snowflake + +python3 synthesis/pipeline.py \ + --config configs/synthesis/mcp_woocommerce_config.json \ + --seeds seeds/mcp/woocommerce_seeds.jsonl \ + --output-dir results/mcp_woocommerce + +python3 synthesis/pipeline.py \ + --config configs/synthesis/mcp_yahoo_finance_config.json \ + --seeds seeds/mcp/yahoo_finance_seeds.jsonl \ + --output-dir results/mcp_yahoo_finance + +python3 synthesis/pipeline.py \ + --config configs/synthesis/mcp_youtube_config.json \ + --seeds seeds/mcp/youtube_seeds.jsonl \ + --output-dir results/mcp_youtube + +python3 synthesis/pipeline.py \ + --config configs/synthesis/mcp_train_config.json \ + --seeds seeds/mcp/train_seeds.jsonl \ + --output-dir results/mcp_train +``` + +> Note: in this repo, synthesis currently writes to the fixed aggregation directory `results/ds_synthesized_qa/`, even if you provide a different `--output-dir`. + +- QA: `results/ds_synthesized_qa/synthesized_qa.jsonl` +- Trajectory: `results/ds_synthesized_qa/trajectories.jsonl` + +These runs synthesize QA pairs and save the corresponding tool-use traces for the selected MCP domain. 
+ +## Step 3: Synthesize Trajectory Data + +Run the rollout pipeline for trajectory-only data: + +```bash +python -m rollout.pipeline \ + --config configs/trajectory/mcp_canvas_trajectory.json \ + --output-dir trajectory_results/mcp_canvas + +python -m rollout.pipeline \ + --config configs/trajectory/mcp_snowflake_trajectory.json \ + --output-dir trajectory_results/mcp_snowflake + +python -m rollout.pipeline \ + --config configs/trajectory/mcp_woocommerce_trajectory.json \ + --output-dir trajectory_results/mcp_woocommerce + +python -m rollout.pipeline \ + --config configs/trajectory/mcp_yahoo_finance_trajectory.json \ + --output-dir trajectory_results/mcp_yahoo_finance + +python -m rollout.pipeline \ + --config configs/trajectory/mcp_youtube_trajectory.json \ + --output-dir trajectory_results/mcp_youtube + +python -m rollout.pipeline \ + --config configs/trajectory/mcp_train_trajectory.json \ + --output-dir trajectory_results/mcp_train +``` + +This stage produces rollout trajectories only. Training, deployment, and infer-oriented serving flows are not covered in this example yet. + +## Configuration Reference + +### Shared Sandbox Config + +File: `configs/sandbox-server/mcp_config.json` + +Important fields: + +- `mcp_servers_path`: `${TOOLATHLON_GYM_ROOT}/local_servers` +- `enabled_mcp_servers`: includes the concrete MCP server identifiers from `configs/sandbox-server/mcp_config.json`, plus shared helpers such as `filesystem` and the YouTube transcript server. Most example domains use the same name as the server, but some differ: `yahoo_finance -> yahoo-finance` and `train -> rail_12306`. 
+- `env_overrides`: `PGHOST`, `PGPORT`, `PGUSER`, `PGPASSWORD`, `PGDATABASE`, `CANVAS_DOMAIN`, `WORDPRESS_SITE_URL` + +### Synthesis Configs + +Files: + +- `configs/synthesis/mcp_canvas_config.json` +- `configs/synthesis/mcp_snowflake_config.json` +- `configs/synthesis/mcp_woocommerce_config.json` +- `configs/synthesis/mcp_yahoo_finance_config.json` +- `configs/synthesis/mcp_youtube_config.json` +- `configs/synthesis/mcp_train_config.json` + +These configs point to the shared MCP sandbox and the domain-specific seeds for QA synthesis. + +### Trajectory Configs + +Files: + +- `configs/trajectory/mcp_canvas_trajectory.json` +- `configs/trajectory/mcp_snowflake_trajectory.json` +- `configs/trajectory/mcp_woocommerce_trajectory.json` +- `configs/trajectory/mcp_yahoo_finance_trajectory.json` +- `configs/trajectory/mcp_youtube_trajectory.json` +- `configs/trajectory/mcp_train_trajectory.json` + +These configs run trajectory-only rollout for each MCP domain. + +## FAQ + +### What does this example cover? + +It covers sandbox startup, QA synthesis, and trajectory rollout for the six MCP domains in this repo. + +### Which external tools do I need ready first? + +You need a prepared Toolathlon-GYM environment, plus `node` and `uv`, because the MCP backend launches Toolathlon-GYM local servers from `${TOOLATHLON_GYM_ROOT}/local_servers`. + +### Why are training and deployment missing? + +This example is scoped to data generation only. Later training, deployment, and infer/evaluation workflows are not covered yet. 
diff --git a/rollout/core/config.py b/rollout/core/config.py index b67ca75..ac8c532 100644 --- a/rollout/core/config.py +++ b/rollout/core/config.py @@ -8,6 +8,8 @@ from typing import Dict, List, Any, Optional from dataclasses import dataclass, field, fields +from sandbox.server.config_loader import expand_env_vars + # Optional yaml support yaml = None try: @@ -84,6 +86,7 @@ def from_dict(cls, config_dict: Dict[str, Any]) -> 'RolloutConfig': valid_fields = {f.name for f in fields(cls)} filtered = {k: v for k, v in config_dict.items() if k in valid_fields} + filtered = expand_env_vars(filtered) # Normalize text fields (allow list[str] for easier editing) def _normalize_text_field(v: Any) -> str: diff --git a/rollout/tests/conftest.py b/rollout/tests/conftest.py new file mode 100644 index 0000000..8c6c13d --- /dev/null +++ b/rollout/tests/conftest.py @@ -0,0 +1,134 @@ +import os +from pathlib import Path + +import pytest + + +_REAL_CODE_TEST_FILES = { + "test_code_real_smoke.py", +} + + +def _code_real_enabled(): + return os.environ.get("AGENTFLOW_RUN_CODE_REAL") == "1" + + +def pytest_ignore_collect(collection_path, config): + del config + if _code_real_enabled(): + return False + + path = Path(str(collection_path)) + return path.name in _REAL_CODE_TEST_FILES + + +def pytest_addoption(parser): + group = parser.getgroup("agentflow-code-real") + group.addoption( + "--real-api-key", + action="store", + default=None, + help="API key for opt-in real code rollout smoke tests.", + ) + group.addoption( + "--real-base-url", + action="store", + default=None, + help="Base URL for opt-in real code rollout smoke tests.", + ) + group.addoption( + "--real-model", + action="store", + default=None, + help="Model name for opt-in real code rollout smoke tests.", + ) + + +def pytest_configure(config): + config.addinivalue_line( + "markers", + "code_real: opt-in real code rollout smoke tests", + ) + + +def pytest_collection_modifyitems(config, items): + if _code_real_enabled(): + return 
+ + deselected = [] + kept = [] + for item in items: + if item.get_closest_marker("code_real") is None: + kept.append(item) + continue + + if Path(str(item.fspath)).name in _REAL_CODE_TEST_FILES: + deselected.append(item) + else: + kept.append(item) + + if deselected: + items[:] = kept + config.hook.pytest_deselected(items=deselected) + + +def _get_real_credentials(config): + return { + "api_key": config.getoption("--real-api-key"), + "base_url": config.getoption("--real-base-url"), + "model": config.getoption("--real-model"), + } + + +def _missing_real_credential_options(config): + credentials = _get_real_credentials(config) + return [ + option_name + for option_name, value in ( + ("--real-api-key", credentials["api_key"]), + ("--real-base-url", credentials["base_url"]), + ("--real-model", credentials["model"]), + ) + if not value + ] + + +def pytest_runtest_setup(item): + if item.get_closest_marker("code_real") is None: + return + + if not _code_real_enabled(): + pytest.skip("set AGENTFLOW_RUN_CODE_REAL=1 to run real code rollout smoke tests") + + missing = _missing_real_credential_options(item.config) + if missing: + pytest.skip( + "code_real tests require all of " + "--real-api-key, --real-base-url, and --real-model" + ) + + +@pytest.fixture +def real_llm_credentials(request): + credentials = _get_real_credentials(request.config) + if _missing_real_credential_options(request.config): + pytest.skip( + "code_real tests require all of " + "--real-api-key, --real-base-url, and --real-model" + ) + return credentials + + +@pytest.fixture +def real_api_key(real_llm_credentials): + return real_llm_credentials["api_key"] + + +@pytest.fixture +def real_base_url(real_llm_credentials): + return real_llm_credentials["base_url"] + + +@pytest.fixture +def real_model(real_llm_credentials): + return real_llm_credentials["model"] diff --git a/rollout/tests/test_code_example_assets.py b/rollout/tests/test_code_example_assets.py new file mode 100644 index 0000000..87200e1 --- 
/dev/null +++ b/rollout/tests/test_code_example_assets.py @@ -0,0 +1,171 @@ +import json +import shutil +import subprocess +import sys +from pathlib import Path + +from rollout.core.config import RolloutConfig + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def _read_jsonl(path: Path): + return [ + json.loads(line) + for line in path.read_text(encoding="utf-8").splitlines() + if line.strip() + ] + + +def test_code_rollout_config_contract_expands_repo_root_when_set(monkeypatch): + config_path = REPO_ROOT / "configs" / "trajectory" / "code_trajectory.json" + monkeypatch.setenv("AGENTFLOW_REPO_ROOT", str(REPO_ROOT)) + config = RolloutConfig.from_json(str(config_path)) + + assert config.benchmark_name == "code_trajectory" + assert config.data_path == "benchmark/code_benchmark.jsonl" + assert config.available_tools == ["code-*"] + assert config.sandbox_config_path == "configs/sandbox-server/code_config.json" + assert config.sandbox_auto_start is False + assert config.resource_types == ["code"] + assert config.resource_init_configs == { + "code": { + "content": { + "source_dir": f"{REPO_ROOT}/seeds/code/seed/demo_repo" + } + } + } + assert config.evaluate_results is False + assert config.trajectory_only is True + assert config.save_trajectories is True + assert config.save_summary is False + + +def test_code_rollout_config_preserves_placeholder_when_repo_root_unset(monkeypatch): + config_path = REPO_ROOT / "configs" / "trajectory" / "code_trajectory.json" + monkeypatch.delenv("AGENTFLOW_REPO_ROOT", raising=False) + + config = RolloutConfig.from_json(str(config_path)) + + assert config.resource_init_configs == { + "code": { + "content": { + "source_dir": "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo" + } + } + } + + +def test_rollout_config_from_dict_expands_nested_env_values(monkeypatch): + monkeypatch.setenv("CODE_ROOT", "/tmp/demo") + monkeypatch.delenv("UNSET_VALUE", raising=False) + + config = RolloutConfig.from_dict( + { + "resource_init_configs": { + 
"code": { + "content": { + "source_dir": "${CODE_ROOT}/repo", + "fallback_dir": "${UNSET_VALUE:-/tmp/fallback}", + "preserved_dir": "${UNSET_VALUE}/repo", + "artifacts": [ + "${CODE_ROOT}/one", + "${UNSET_VALUE:-/tmp/two}", + "${UNSET_VALUE}/three", + ], + } + } + } + } + ) + + content = config.resource_init_configs["code"]["content"] + assert content["source_dir"] == "/tmp/demo/repo" + assert content["fallback_dir"] == "/tmp/fallback" + assert content["preserved_dir"] == "${UNSET_VALUE}/repo" + assert content["artifacts"] == [ + "/tmp/demo/one", + "/tmp/two", + "${UNSET_VALUE}/three", + ] + + +def test_code_seed_file_contract(): + rows = _read_jsonl(REPO_ROOT / "seeds" / "code" / "seeds.jsonl") + + assert len(rows) == 2 + assert all(set(row.keys()) == {"content", "kwargs"} for row in rows) + assert all(isinstance(row["content"], str) and row["content"].strip() for row in rows) + assert all(row["kwargs"] == {} for row in rows) + + +def test_code_benchmark_contract(): + rows = _read_jsonl(REPO_ROOT / "benchmark" / "code_benchmark.jsonl") + + assert len(rows) == 2 + assert all({"id", "question", "answer"} <= set(row.keys()) for row in rows) + assert rows[0]["id"] == "code_read_001" + assert "metadata" not in rows[0] + assert rows[1]["id"] == "code_edit_001" + assert "tests/smoke_test.py" in rows[1]["question"] + assert rows[1]["answer"] == "smoke test passed" + assert rows[1]["metadata"] == { + "target_files": ["app.py"], + "check_command": "python tests/smoke_test.py", + } + assert all("/home/" not in json.dumps(row, ensure_ascii=False) for row in rows) + assert all("DataFlow" not in json.dumps(row, ensure_ascii=False) for row in rows) + + +def test_code_demo_repo_contract(): + repo_root = REPO_ROOT / "seeds" / "code" / "seed" / "demo_repo" + + required_paths = [ + repo_root / "README.md", + repo_root / "app.py", + repo_root / "config" / "app_config.json", + repo_root / "lib" / "helpers.py", + repo_root / "tests" / "smoke_test.py", + ] + for path in required_paths: 
+ assert path.exists(), path + + smoke_test = (repo_root / "tests" / "smoke_test.py").read_text(encoding="utf-8") + assert "build_message" in smoke_test + assert "SMOKE_OK" in smoke_test + + +def test_code_demo_repo_smoke_test_runtime_contract(tmp_path): + source_repo = REPO_ROOT / "seeds" / "code" / "seed" / "demo_repo" + repo_copy = tmp_path / "demo_repo" + shutil.copytree(source_repo, repo_copy) + + pre_fix = subprocess.run( + [sys.executable, "tests/smoke_test.py"], + cwd=repo_copy, + capture_output=True, + text=True, + check=False, + ) + + assert pre_fix.returncode != 0 + assert "AssertionError: Hello, AgentFlow?" in pre_fix.stderr + assert "ModuleNotFoundError" not in pre_fix.stderr + + app_path = repo_copy / "app.py" + app_text = app_path.read_text(encoding="utf-8") + app_path.write_text( + app_text.replace('render_greeting(config["default_name"], "?")', 'render_greeting(config["default_name"], "!")'), + encoding="utf-8", + ) + + post_fix = subprocess.run( + [sys.executable, "tests/smoke_test.py"], + cwd=repo_copy, + capture_output=True, + text=True, + check=False, + ) + + assert post_fix.returncode == 0, post_fix.stderr + assert post_fix.stdout.strip() == "SMOKE_OK" diff --git a/rollout/tests/test_code_example_doc.py b/rollout/tests/test_code_example_doc.py new file mode 100644 index 0000000..b4b470d --- /dev/null +++ b/rollout/tests/test_code_example_doc.py @@ -0,0 +1,79 @@ +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def _get_section(content: str, heading: str, next_heading: str) -> str: + start = content.index(heading) + end = content.index(next_heading, start) + return content[start:end] + + +def test_coding_example_doc_has_required_sections_and_repo_root_contract(): + content = (REPO_ROOT / "examples" / "CodingAgent.md").read_text(encoding="utf-8") + + required_sections = [ + "## Overview", + "## Prerequisites", + "## Pipeline Overview", + "## Step 1: Start the Sandbox Server", + "## Step 2: Synthesize QA Data", + 
"## Step 3: Synthesize Trajectory Data", + "## Configuration Reference", + "## FAQ", + ] + for section in required_sections: + assert section in content + + required_strings = [ + "3-Step Example Guide", + "cd AgentFlow", + "export OPENAI_API_KEY=", + "export OPENAI_API_URL=", + "export AGENTFLOW_REPO_ROOT=$(pwd)", + "code-*", + "configs/sandbox-server/code_config.json", + "configs/synthesis/code_config.json", + "configs/trajectory/code_trajectory.json", + "benchmark/code_benchmark.jsonl", + "seeds/code/seeds.jsonl", + "seeds/code/seed/demo_repo", + "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo", + "source_dir", + "./start_sandbox_server.sh --config configs/sandbox-server/code_config.json", + "python tests/smoke_test.py", + "training / deployment / infer are not covered yet", + ] + for needle in required_strings: + assert needle in content + + step_2_content = _get_section( + content, + "## Step 2: Synthesize QA Data", + "## Step 3: Synthesize Trajectory Data", + ) + + step_2_requirements = [ + "configs/synthesis/code_config.json", + "results/code", + "results/ds_synthesized_qa/", + "results/ds_synthesized_qa/synthesized_qa.jsonl", + "results/ds_synthesized_qa/trajectories.jsonl", + "> Note: in this repo, synthesis currently writes to the fixed aggregation directory `results/ds_synthesized_qa/`, even if you provide a different `--output-dir`.", + ] + for needle in step_2_requirements: + assert needle in step_2_content + + expected_command = """python3 synthesis/pipeline.py \\ + --config configs/synthesis/code_config.json \\ + --seeds seeds/code/seeds.jsonl \\ + --output-dir results/code""" + assert expected_command in step_2_content + + assert "### Output files" not in step_2_content + + assert "/home/a1/sdb/dxd/DataFlow" not in content + assert "DataFlow" not in content + assert "Step 4" not in content + assert "Step 5" not in content diff --git a/rollout/tests/test_code_real_smoke.py b/rollout/tests/test_code_real_smoke.py new file mode 100644 index 
0000000..9634ec1 --- /dev/null +++ b/rollout/tests/test_code_real_smoke.py @@ -0,0 +1,95 @@ +import json + +import pytest + +from rollout import RolloutConfig, RolloutPipeline + + +pytestmark = pytest.mark.code_real + + +def _canonical_tool_name(name): + for separator in (".", "_", "-"): + if separator in name: + prefix, suffix = name.split(separator, 1) + return f"{prefix}:{suffix}" + return name + + +def test_code_real_smoke_reads_token_via_real_tools( + tmp_path, + real_api_key, + real_base_url, + real_model, +): + fixture_repo = tmp_path / "fixture_repo" + nested_dir = fixture_repo / "nested" + nested_dir.mkdir(parents=True) + + token = f"token-{tmp_path.name}" + (nested_dir / "TOKEN.txt").write_text(token + "\n", encoding="utf-8") + + benchmark_path = tmp_path / "benchmark.jsonl" + prompt = ( + "Use code tools to inspect the repository and read nested/TOKEN.txt. " + "Reply with only the exact token and nothing else." + ) + benchmark_path.write_text( + json.dumps( + { + "id": "code-real-smoke", + "question": prompt, + "answer": token, + } + ) + + "\n", + encoding="utf-8", + ) + + output_dir = tmp_path / "rollout_output" + config = RolloutConfig( + benchmark_name="code_real_smoke", + data_path=str(benchmark_path), + model_name=real_model, + api_key=real_api_key, + base_url=real_base_url, + available_tools=["code-*"], + resource_types=["code"], + resource_init_configs={ + "code": {"content": {"source_dir": str(fixture_repo)}} + }, + sandbox_config_path="configs/sandbox-server/code_config.json", + sandbox_auto_start=True, + evaluate_results=False, + save_trajectories=True, + max_turns=5, + number_of_tasks=1, + ) + + assert config.max_turns == 5 + + summary = RolloutPipeline(config, output_dir=str(output_dir)).run() + + assert summary.total_tasks == 1 + assert summary.successful_tasks == 1 + assert summary.failed_tasks == 0 + + result_files = sorted(output_dir.glob("results_code_real_smoke_*.jsonl")) + assert result_files + + payload = 
json.loads(result_files[-1].read_text(encoding="utf-8").strip()) + trajectory = payload["trajectory"] + tool_calls = trajectory["tool_calls"] + tool_messages = [ + message for message in trajectory["messages"] if message["role"] == "tool" + ] + + assert any( + _canonical_tool_name(call["tool_name"]).startswith("code:") + for call in tool_calls + ) + assert payload["predicted_answer"] == token + assert trajectory["final_answer"] == token + assert any( + token in json.dumps(message, ensure_ascii=False) for message in tool_messages + ) diff --git a/rollout/tests/test_mcp_example_assets.py b/rollout/tests/test_mcp_example_assets.py new file mode 100644 index 0000000..b330d6a --- /dev/null +++ b/rollout/tests/test_mcp_example_assets.py @@ -0,0 +1,153 @@ +import json +from pathlib import Path + +import pytest + +from rollout.core.config import RolloutConfig + +REPO_ROOT = Path(__file__).resolve().parents[2] +OPENAI_API_KEY = "secret" +OPENAI_API_URL = "https://example.test/v1" +EXPECTED = { + "canvas": { + "tools": ["mcp:canvas.*", "mcp:filesystem.*"], + "benchmark": "benchmark/mcp_canvas_benchmark.jsonl", + "benchmark_name": "mcp_canvas_trajectory", + }, + "snowflake": { + "tools": ["mcp:snowflake.*", "mcp:filesystem.*"], + "benchmark": "benchmark/mcp_snowflake_benchmark.jsonl", + "benchmark_name": "mcp_snowflake_trajectory", + }, + "woocommerce": { + "tools": ["mcp:woocommerce.*", "mcp:filesystem.*"], + "benchmark": "benchmark/mcp_woocommerce_benchmark.jsonl", + "benchmark_name": "mcp_woocommerce_trajectory", + }, + "yahoo_finance": { + "tools": ["mcp:yahoo-finance.*", "mcp:filesystem.*"], + "benchmark": "benchmark/mcp_yahoo_finance_benchmark.jsonl", + "benchmark_name": "mcp_yahoo_finance_trajectory", + }, + "youtube": { + "tools": [ + "mcp:youtube.*", + "mcp:youtube-transcript.*", + "mcp:filesystem.*", + ], + "benchmark": "benchmark/mcp_youtube_benchmark.jsonl", + "benchmark_name": "mcp_youtube_trajectory", + }, + "train": { + "tools": ["mcp:rail_12306.*", 
"mcp:filesystem.*"], + "benchmark": "benchmark/mcp_train_benchmark.jsonl", + "benchmark_name": "mcp_train_trajectory", + }, +} + + +def _read_jsonl(path: Path): + return [ + json.loads(line) + for line in path.read_text(encoding="utf-8").splitlines() + if line.strip() + ] + + +def _set_openai_env(monkeypatch: pytest.MonkeyPatch, enabled: bool) -> None: + if enabled: + monkeypatch.setenv("OPENAI_API_KEY", OPENAI_API_KEY) + monkeypatch.setenv("OPENAI_API_URL", OPENAI_API_URL) + return + + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + monkeypatch.delenv("OPENAI_API_URL", raising=False) + + +def _expected_openai_value(enabled: bool, env_value: str, placeholder: str) -> str: + if enabled: + return env_value + return placeholder + + +@pytest.mark.parametrize("env_enabled", [False, True], ids=["env-unset", "env-set"]) +@pytest.mark.parametrize("domain", sorted(EXPECTED)) +def test_mcp_rollout_config_contract(domain, env_enabled, monkeypatch): + expected = EXPECTED[domain] + _set_openai_env(monkeypatch, env_enabled) + config_path = REPO_ROOT / "configs" / "trajectory" / f"mcp_{domain}_trajectory.json" + raw = json.loads(config_path.read_text(encoding="utf-8")) + config = RolloutConfig.from_json(str(config_path)) + raw_init = raw.get("resource_init_configs", {}) + + assert config.benchmark_name == expected["benchmark_name"] + assert config.data_path == expected["benchmark"] + assert config.model_name == "openai/gpt-oss-120b" + assert config.api_key == _expected_openai_value( + env_enabled, OPENAI_API_KEY, "${OPENAI_API_KEY}" + ) + assert config.base_url == _expected_openai_value( + env_enabled, OPENAI_API_URL, "${OPENAI_API_URL}" + ) + assert config.max_turns == 20 + assert config.available_tools == expected["tools"] + assert config.sandbox_config_path == "configs/sandbox-server/mcp_config.json" + assert config.sandbox_auto_start is False + assert config.resource_types == ["mcp"] + assert raw_init in ({}, {"mcp": {"content": {}}}) + assert 
config.resource_init_configs in ({}, {"mcp": {"content": {}}}) + assert "MCP domain assistant" in config.system_prompt + assert "Use only the available MCP tools" in config.system_prompt + assert "Reply with the final answer only" in config.system_prompt + assert config.evaluate_results is False + assert config.output_dir == f"trajectory_results/mcp_{domain}" + assert config.save_results is True + assert config.trajectory_only is True + assert config.save_trajectories is True + + +@pytest.mark.parametrize("env_enabled", [False, True], ids=["env-unset", "env-set"]) +def test_mcp_rollout_config_ignores_unknown_env_placeholders( + caplog, monkeypatch, env_enabled +): + _set_openai_env(monkeypatch, env_enabled) + monkeypatch.delenv("IGNORED_ROLLOUT_VAR", raising=False) + + with caplog.at_level("WARNING", logger="ConfigLoader"): + config = RolloutConfig.from_dict( + { + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "unknown_field": "${IGNORED_ROLLOUT_VAR}", + } + ) + + assert config.api_key == _expected_openai_value( + env_enabled, OPENAI_API_KEY, "${OPENAI_API_KEY}" + ) + assert config.base_url == _expected_openai_value( + env_enabled, OPENAI_API_URL, "${OPENAI_API_URL}" + ) + assert "IGNORED_ROLLOUT_VAR" not in caplog.text + + +@pytest.mark.parametrize("domain", sorted(EXPECTED)) +def test_mcp_seed_files_are_two_row_jsonl(domain): + seed_path = REPO_ROOT / "seeds" / "mcp" / f"{domain}_seeds.jsonl" + rows = _read_jsonl(seed_path) + + assert len(rows) == 2 + assert all(set(row.keys()) == {"content", "kwargs"} for row in rows) + assert all(isinstance(row["content"], str) and row["content"].strip() for row in rows) + assert all(row["kwargs"] == {} for row in rows) + + +@pytest.mark.parametrize("domain", sorted(EXPECTED)) +def test_mcp_benchmark_files_have_two_row_jsonl_contract(domain): + benchmark_path = REPO_ROOT / "benchmark" / f"mcp_{domain}_benchmark.jsonl" + rows = _read_jsonl(benchmark_path) + + assert len(rows) == 2 + assert 
all(set(row.keys()) == {"id", "question", "answer"} for row in rows) + assert all(isinstance(row["question"], str) and row["question"].strip() for row in rows) + assert all(isinstance(row["answer"], str) for row in rows) diff --git a/rollout/tests/test_mcp_example_doc.py b/rollout/tests/test_mcp_example_doc.py new file mode 100644 index 0000000..fef80be --- /dev/null +++ b/rollout/tests/test_mcp_example_doc.py @@ -0,0 +1,118 @@ +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def _get_section(content: str, heading: str, next_heading: str) -> str: + start = content.index(heading) + end = content.index(next_heading, start) + return content[start:end] + + +def test_mcp_example_doc_has_required_sections_and_exact_prerequisite_contract(): + content = (REPO_ROOT / "examples" / "MCPAgent.md").read_text(encoding="utf-8") + + required_sections = [ + "## Overview", + "## Prerequisites", + "## Pipeline Overview", + "## Step 1: Start the Sandbox Server", + "## Step 2: Synthesize QA Data", + "## Step 3: Synthesize Trajectory Data", + "## Configuration Reference", + "## FAQ", + ] + for section in required_sections: + assert section in content + + required_strings = [ + "configs/sandbox-server/mcp_config.json", + "configs/synthesis/mcp_canvas_config.json", + "configs/synthesis/mcp_snowflake_config.json", + "configs/synthesis/mcp_woocommerce_config.json", + "configs/synthesis/mcp_yahoo_finance_config.json", + "configs/synthesis/mcp_youtube_config.json", + "configs/synthesis/mcp_train_config.json", + "configs/trajectory/mcp_canvas_trajectory.json", + "configs/trajectory/mcp_snowflake_trajectory.json", + "configs/trajectory/mcp_woocommerce_trajectory.json", + "configs/trajectory/mcp_yahoo_finance_trajectory.json", + "configs/trajectory/mcp_youtube_trajectory.json", + "configs/trajectory/mcp_train_trajectory.json", + "export TOOLATHLON_GYM_ROOT=", + "${TOOLATHLON_GYM_ROOT}/local_servers", + "./start_sandbox_server.sh --config 
configs/sandbox-server/mcp_config.json", + "node", + "uv", + "PGHOST", + "PGPORT", + "PGUSER", + "PGPASSWORD", + "PGDATABASE", + "CANVAS_DOMAIN", + "WORDPRESS_SITE_URL", + ] + for needle in required_strings: + assert needle in content + + step_2_content = _get_section( + content, + "## Step 2: Synthesize QA Data", + "## Step 3: Synthesize Trajectory Data", + ) + + step_2_requirements = [ + "configs/synthesis/mcp_canvas_config.json", + "configs/synthesis/mcp_snowflake_config.json", + "configs/synthesis/mcp_woocommerce_config.json", + "configs/synthesis/mcp_yahoo_finance_config.json", + "configs/synthesis/mcp_youtube_config.json", + "configs/synthesis/mcp_train_config.json", + "results/ds_synthesized_qa/", + "results/ds_synthesized_qa/synthesized_qa.jsonl", + "results/ds_synthesized_qa/trajectories.jsonl", + "> Note: in this repo, synthesis currently writes to the fixed aggregation directory `results/ds_synthesized_qa/`, even if you provide a different `--output-dir`.", + ] + for needle in step_2_requirements: + assert needle in step_2_content + + expected_commands = [ + """python3 synthesis/pipeline.py \\ + --config configs/synthesis/mcp_canvas_config.json \\ + --seeds seeds/mcp/canvas_seeds.jsonl \\ + --output-dir results/mcp_canvas""", + """python3 synthesis/pipeline.py \\ + --config configs/synthesis/mcp_snowflake_config.json \\ + --seeds seeds/mcp/snowflake_seeds.jsonl \\ + --output-dir results/mcp_snowflake""", + """python3 synthesis/pipeline.py \\ + --config configs/synthesis/mcp_woocommerce_config.json \\ + --seeds seeds/mcp/woocommerce_seeds.jsonl \\ + --output-dir results/mcp_woocommerce""", + """python3 synthesis/pipeline.py \\ + --config configs/synthesis/mcp_yahoo_finance_config.json \\ + --seeds seeds/mcp/yahoo_finance_seeds.jsonl \\ + --output-dir results/mcp_yahoo_finance""", + """python3 synthesis/pipeline.py \\ + --config configs/synthesis/mcp_youtube_config.json \\ + --seeds seeds/mcp/youtube_seeds.jsonl \\ + --output-dir results/mcp_youtube""", + 
"""python3 synthesis/pipeline.py \\ + --config configs/synthesis/mcp_train_config.json \\ + --seeds seeds/mcp/train_seeds.jsonl \\ + --output-dir results/mcp_train""", + ] + for command in expected_commands: + assert command in step_2_content + + assert "### Output files" not in step_2_content + + lowered = content.lower() + assert "/home/" not in content + assert "training" in lowered + assert "deployment" in lowered + assert "infer" in lowered + assert "not covered" in lowered + assert "Step 4" not in content + assert "Step 5" not in content diff --git a/sandbox/result_formatter.py b/sandbox/result_formatter.py index e134433..30b8dfe 100644 --- a/sandbox/result_formatter.py +++ b/sandbox/result_formatter.py @@ -251,6 +251,9 @@ def to_str(self, verbose: bool = False) -> str: error_msg = self.metadata.get("message", "Code execution failed") return f"[Error] {error_msg}" + if isinstance(self.raw_data, str): + return self.raw_data + stdout = self.raw_data.get("stdout", "") stderr = self.raw_data.get("stderr", "") return_code = self.raw_data.get("return_code", 0) @@ -359,6 +362,56 @@ def to_str(self, verbose: bool = False) -> str: return json.dumps(self.raw_data, indent=2, ensure_ascii=False) +# ============================================================================ +# MCP tool result. 
+# ============================================================================ + +class MCPResult(ToolResult): + """MCP tool result.""" + + def to_str(self, verbose: bool = False) -> str: + del verbose + + if not self.success: + error_msg = self.metadata.get("message", "MCP tool execution failed") + return f"[Error] {error_msg}" + + if isinstance(self.raw_data, str): + return self.raw_data + + content = self.raw_data.get("content", []) + if not isinstance(content, list): + return json.dumps(self.raw_data, indent=2, ensure_ascii=False) + + lines = [] + has_text_content = False + for item in content: + if isinstance(item, dict): + if item.get("type") == "text": + text = str(item.get("text", "")) + lines.append(text) + if text.strip(): + has_text_content = True + else: + item_type = item.get("type", "content") + lines.append(f"[{item_type} content]") + else: + text = str(item) + lines.append(text) + if text.strip(): + has_text_content = True + + rendered_content = "\n".join(lines) + if has_text_content: + return rendered_content + + structured_content = self.raw_data.get("structuredContent") + if structured_content is not None: + return json.dumps(structured_content, indent=2, ensure_ascii=False) + + return rendered_content + + # ============================================================================ # Web search tool result. 
# ============================================================================ @@ -724,6 +777,7 @@ class ResultFormatter: "vm": VMResult, "doc": DocResult, "ds": DocResult, + "mcp": MCPResult, } @classmethod diff --git a/sandbox/server/backends/resources/__init__.py b/sandbox/server/backends/resources/__init__.py index 0042aa9..8ac0433 100644 --- a/sandbox/server/backends/resources/__init__.py +++ b/sandbox/server/backends/resources/__init__.py @@ -27,7 +27,7 @@ ```python from sandbox.server import HTTPServiceServer from sandbox.server.backends.resources import ( - VMBackend, + VMBackend, RAGBackend ) @@ -59,18 +59,37 @@ ``` """ -from .vm import VMBackend, create_vm_backend -from .rag import RAGBackend, create_rag_backend +from .code import CodeBackend from .mcp import MCPBackend, ToolathlonGymBackend +from .rag import RAGBackend, create_rag_backend + +_VM_IMPORT_ERROR = None +try: + from .vm import VMBackend, create_vm_backend +except ImportError as exc: + if "cssselect" not in str(exc): + raise + _VM_IMPORT_ERROR = exc + + class VMBackend: # type: ignore[no-redef] + def __init__(self, *args, **kwargs): + del args, kwargs + raise ImportError( + "VMBackend requires the optional 'cssselect' dependency" + ) from _VM_IMPORT_ERROR + + def create_vm_backend(*args, **kwargs): # type: ignore[no-redef] + del args, kwargs + raise ImportError( + "VMBackend requires the optional 'cssselect' dependency" + ) from _VM_IMPORT_ERROR __all__ = [ - # Backend classes "VMBackend", "RAGBackend", "MCPBackend", + "CodeBackend", "ToolathlonGymBackend", - - # Convenience factories "create_vm_backend", "create_rag_backend", ] diff --git a/sandbox/server/backends/resources/code.py b/sandbox/server/backends/resources/code.py new file mode 100644 index 0000000..a4d3248 --- /dev/null +++ b/sandbox/server/backends/resources/code.py @@ -0,0 +1,342 @@ +""" +Code backend skeleton for lightweight coding workspace integration. 
+""" + +from __future__ import annotations + +import re +import shutil +import time +import uuid +from pathlib import Path +from types import SimpleNamespace +from typing import Any + +from sandbox.server.backends.base import Backend, BackendConfig +from sandbox.server.backends.error_codes import ErrorCode +from sandbox.server.backends.response_builder import ( + build_error_response, + build_success_response, +) +from sandbox.server.backends.resources.code_vendor.edit_tools import EditTool, WriteTool +from sandbox.server.backends.resources.code_vendor.file_tools import ( + BashTool, + GlobTool, + GrepTool, + ReadTool, +) + + +class CodeBackend(Backend): + name = "code" + description = "Code Backend - lightweight coding workspace integration" + version = "1.0.0" + + def __init__(self, config: BackendConfig | None = None): + if config is None: + config = BackendConfig( + enabled=True, + default_config={ + "workspace_root": "/tmp/agentflow_code", + }, + description="Code backend", + ) + super().__init__(config) + self._tool_instances: dict[str, Any] | None = None + + def bind_server(self, server) -> None: + super().bind_server(server) + for tool_name in ("read", "glob", "grep", "bash", "edit", "write"): + server.register_tool( + f"code:{tool_name}", + self._make_bridge_tool(tool_name), + resource_type="code", + ) + + async def initialize(self, worker_id: str, config: dict) -> dict: + source_dir = self._resolve_source_dir(config) + workspace, staged_workspace, previous_workspace = self._prepare_workspace(worker_id) + + try: + if source_dir: + self._copy_source_dir(source_dir, staged_workspace) + + self._load_code_tools() + self._commit_prepared_workspace(workspace, staged_workspace, previous_workspace) + except Exception: + if staged_workspace.exists(): + shutil.rmtree(staged_workspace) + if previous_workspace is not None and previous_workspace.exists() and not workspace.exists(): + previous_workspace.rename(workspace) + raise + + return { + "workspace": 
str(workspace), + "source_dir": str(source_dir) if source_dir else "", + } + + async def cleanup(self, worker_id: str, session_info: dict) -> None: + workspace_value = ((session_info or {}).get("data") or {}).get("workspace") + if not isinstance(workspace_value, str) or not workspace_value.strip(): + return None + + try: + workspace = Path(workspace_value).resolve() + workspace_root = self._get_workspace_root().resolve() + expected_workspace = (workspace_root / self._validate_worker_id(worker_id)).resolve( + strict=False + ) + workspace.relative_to(workspace_root) + except (OSError, RuntimeError, ValueError, TypeError): + return None + + if workspace != expected_workspace: + return None + if workspace.exists() and workspace.is_dir(): + shutil.rmtree(workspace) + return None + + def _get_workspace_root(self) -> Path: + value = self.get_default_config().get("workspace_root") or "/tmp/agentflow_code" + return Path(value) + + def _prepare_workspace(self, worker_id: str) -> tuple[Path, Path, Path | None]: + safe_worker_id = self._validate_worker_id(worker_id) + workspace_root = self._get_workspace_root() + workspace_root.mkdir(parents=True, exist_ok=True) + workspace = workspace_root / safe_worker_id + staged_workspace = workspace_root / f".{safe_worker_id}.staged-{uuid.uuid4().hex}" + previous_workspace = ( + workspace_root / f".{safe_worker_id}.previous-{uuid.uuid4().hex}" + if workspace.exists() + else None + ) + staged_workspace.mkdir(parents=True, exist_ok=False) + return workspace, staged_workspace, previous_workspace + + def _commit_prepared_workspace( + self, + workspace: Path, + staged_workspace: Path, + previous_workspace: Path | None, + ) -> None: + if previous_workspace is not None: + workspace.rename(previous_workspace) + staged_workspace.rename(workspace) + if previous_workspace is not None and previous_workspace.exists(): + shutil.rmtree(previous_workspace) + + def _validate_worker_id(self, worker_id: str) -> str: + if not isinstance(worker_id, str) or 
not worker_id: + raise ValueError("worker_id must be a non-empty string") + if worker_id in {".", ".."}: + raise ValueError("worker_id contains unsafe path traversal") + if worker_id != Path(worker_id).name: + raise ValueError("worker_id must be a single safe path component") + if not re.fullmatch(r"[A-Za-z0-9._-]+", worker_id): + raise ValueError("worker_id contains unsupported characters") + return worker_id + + def _resolve_source_dir(self, config: dict | None) -> Path | None: + config = config or {} + value = config.get("source_dir") + if not value: + return None + source_dir = Path(value) + if not source_dir.exists(): + raise ValueError(f"source_dir does not exist: {source_dir}") + if not source_dir.is_dir(): + raise ValueError(f"source_dir is not a directory: {source_dir}") + return source_dir + + def _copy_source_dir(self, source_dir: Path, workspace: Path) -> None: + if not source_dir.exists(): + return + for child in source_dir.iterdir(): + destination = workspace / child.name + if child.is_dir(): + shutil.copytree(child, destination, dirs_exist_ok=True) + else: + shutil.copy2(child, destination) + + def _load_code_tools(self) -> dict[str, Any]: + if self._tool_instances is None: + self._tool_instances = { + "read": ReadTool(), + "glob": GlobTool(), + "grep": GrepTool(), + "bash": BashTool(), + "edit": EditTool(), + "write": WriteTool(), + } + return self._tool_instances + + def _make_bridge_tool(self, tool_name: str): + async def bridge_tool(session_info: dict, **params): + return await self._dispatch(tool_name, session_info, params) + + bridge_tool.__name__ = f"code_{tool_name}" + return bridge_tool + + async def _dispatch( + self, + tool_name: str, + session_info: dict, + params: dict[str, Any], + ) -> dict[str, Any]: + start_time = time.time() + full_name = f"{self.name}:{tool_name}" + session_id = (session_info or {}).get("session_id") + runtime_params = dict(params or {}) + trace_id = runtime_params.pop("trace_id", None) + worker_id = 
runtime_params.pop("worker_id", None) + runtime_params.pop("session_id", None) + + tool = self._load_code_tools().get(tool_name) + if tool is None: + return build_error_response( + code=ErrorCode.INVALID_REQUEST_FORMAT, + message=f"Unknown code tool: {tool_name}", + tool=full_name, + execution_time_ms=(time.time() - start_time) * 1000, + resource_type=self.name, + session_id=session_id, + trace_id=trace_id, + ) + + workspace_value = ((session_info or {}).get("data") or {}).get("workspace") + if not isinstance(workspace_value, str) or not workspace_value.strip(): + return build_error_response( + code=ErrorCode.BUSINESS_FAILURE, + message="Invalid session workspace: missing or empty data.workspace", + tool=full_name, + execution_time_ms=(time.time() - start_time) * 1000, + resource_type=self.name, + session_id=session_id, + trace_id=trace_id, + ) + + try: + workspace = Path(workspace_value).resolve(strict=False) + workspace_root = self._get_workspace_root().resolve() + expected_workspace = (workspace_root / self._validate_worker_id(worker_id)).resolve( + strict=False + ) + workspace.relative_to(workspace_root) + except (OSError, RuntimeError, ValueError, TypeError): + return build_error_response( + code=ErrorCode.BUSINESS_FAILURE, + message="Invalid session workspace: must resolve inside workspace_root", + tool=full_name, + execution_time_ms=(time.time() - start_time) * 1000, + resource_type=self.name, + session_id=session_id, + trace_id=trace_id, + ) + + if workspace != expected_workspace or not workspace.exists() or not workspace.is_dir(): + return build_error_response( + code=ErrorCode.BUSINESS_FAILURE, + message="Invalid session workspace: must match existing worker workspace", + tool=full_name, + execution_time_ms=(time.time() - start_time) * 1000, + resource_type=self.name, + session_id=session_id, + trace_id=trace_id, + ) + + ctx = SimpleNamespace(cwd=str(workspace)) + try: + normalized_params = self._normalize_tool_params( + tool_name=tool_name, + 
params=runtime_params, + workspace=workspace, + ) + except ValueError as exc: + return build_error_response( + code=ErrorCode.BUSINESS_FAILURE, + message=str(exc), + tool=full_name, + execution_time_ms=(time.time() - start_time) * 1000, + resource_type=self.name, + session_id=session_id, + trace_id=trace_id, + ) + + try: + result = await tool.call(normalized_params, ctx) + except Exception as exc: + return build_error_response( + code=ErrorCode.EXECUTION_ERROR, + message=str(exc), + tool=full_name, + execution_time_ms=(time.time() - start_time) * 1000, + resource_type=self.name, + session_id=session_id, + trace_id=trace_id, + ) + + if isinstance(result, str) and result.startswith("Error:"): + return build_error_response( + code=ErrorCode.BUSINESS_FAILURE, + message=result, + tool=full_name, + execution_time_ms=(time.time() - start_time) * 1000, + resource_type=self.name, + session_id=session_id, + trace_id=trace_id, + ) + + return build_success_response( + data=result, + tool=full_name, + execution_time_ms=(time.time() - start_time) * 1000, + resource_type=self.name, + session_id=session_id, + trace_id=trace_id, + ) + + def _normalize_tool_params( + self, + tool_name: str, + params: dict[str, Any], + workspace: Path, + ) -> dict[str, Any]: + normalized = dict(params) + workspace_path = workspace.resolve(strict=False) + + path_keys: tuple[str, ...] 
= () + if tool_name in {"read", "edit", "write"}: + path_keys = ("file_path",) + elif tool_name in {"glob", "grep"}: + path_keys = ("path",) + + for key in path_keys: + raw_value = normalized.get(key) + if not isinstance(raw_value, str) or not raw_value: + continue + value_path = Path(raw_value) + if value_path.is_absolute(): + resolved = value_path.resolve(strict=False) + else: + resolved = (workspace_path / value_path).resolve(strict=False) + + try: + resolved.relative_to(workspace_path) + except ValueError as exc: + raise ValueError( + f"Path parameter '{key}' must stay inside workspace" + ) from exc + + normalized[key] = str(resolved) + + if tool_name == "glob": + pattern = normalized.get("pattern") + if ( + isinstance(pattern, str) + and pattern + and re.search(r"(^|[\\/])\.\.([\\/]|$)", pattern) + ): + raise ValueError("Glob pattern must not contain parent traversal segments") + + return normalized diff --git a/sandbox/server/backends/resources/code_vendor/__init__.py b/sandbox/server/backends/resources/code_vendor/__init__.py new file mode 100644 index 0000000..1fc9cfa --- /dev/null +++ b/sandbox/server/backends/resources/code_vendor/__init__.py @@ -0,0 +1,11 @@ +from .edit_tools import EditTool, WriteTool +from .file_tools import BashTool, GlobTool, GrepTool, ReadTool + +__all__ = [ + "BashTool", + "EditTool", + "GlobTool", + "GrepTool", + "ReadTool", + "WriteTool", +] diff --git a/sandbox/server/backends/resources/code_vendor/edit_tools.py b/sandbox/server/backends/resources/code_vendor/edit_tools.py new file mode 100644 index 0000000..622658d --- /dev/null +++ b/sandbox/server/backends/resources/code_vendor/edit_tools.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any + +from .tool import Tool + + +class EditTool(Tool): + name = "Edit" + description = ( + "Perform an exact string replacement in a file. " + "old_string must uniquely identify the target location unless replace_all=true." 
+ ) + + @property + def input_schema(self) -> dict[str, Any]: + return { + "type": "object", + "properties": { + "file_path": {"type": "string"}, + "old_string": {"type": "string"}, + "new_string": {"type": "string"}, + "replace_all": {"type": "boolean", "default": False}, + }, + "required": ["file_path", "old_string", "new_string"], + } + + async def call(self, args: dict[str, Any], ctx: Any) -> str: + del ctx + path = Path(args["file_path"]) + old_string = args["old_string"] + new_string = args["new_string"] + replace_all = args.get("replace_all", False) + + if not path.exists(): + return f"Error: file not found: {path}" + + content = path.read_text(encoding="utf-8") + count = content.count(old_string) + if count == 0: + return f"Error: old_string not found in {path}. Read the file first to verify the exact text." + if count > 1 and not replace_all: + return ( + f"Error: old_string appears {count} times in {path}. " + "Provide more surrounding context to make it unique, or set replace_all=true." + ) + + if replace_all: + updated = content.replace(old_string, new_string) + replacements = count + else: + updated = content.replace(old_string, new_string, 1) + replacements = 1 + + path.write_text(updated, encoding="utf-8") + return f"Replaced {replacements} occurrence(s) in {path}" + + +class WriteTool(Tool): + name = "Write" + description = "Write content to a file, creating parent directories if needed." 
+ + @property + def input_schema(self) -> dict[str, Any]: + return { + "type": "object", + "properties": { + "file_path": {"type": "string"}, + "content": {"type": "string"}, + }, + "required": ["file_path", "content"], + } + + async def call(self, args: dict[str, Any], ctx: Any) -> str: + del ctx + path = Path(args["file_path"]) + content = args["content"] + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content, encoding="utf-8") + + line_count = content.count("\n") + if content and not content.endswith("\n"): + line_count += 1 + return f"Wrote {len(content)} bytes ({line_count} lines) to {path}" diff --git a/sandbox/server/backends/resources/code_vendor/file_tools.py b/sandbox/server/backends/resources/code_vendor/file_tools.py new file mode 100644 index 0000000..dd47f0f --- /dev/null +++ b/sandbox/server/backends/resources/code_vendor/file_tools.py @@ -0,0 +1,189 @@ +from __future__ import annotations + +import asyncio +import io +import locale +import os +import signal +import subprocess +from pathlib import Path +from typing import Any + +from .tool import Tool + + +class BashTool(Tool): + name = "Bash" + description = "Execute a shell command and return stdout/stderr." 
+ + @property + def input_schema(self) -> dict[str, Any]: + return { + "type": "object", + "properties": { + "command": {"type": "string", "description": "Shell command to run"}, + }, + "required": ["command"], + } + + async def call(self, args: dict[str, Any], ctx: Any) -> str: + proc = await asyncio.create_subprocess_shell( + args["command"], + shell=True, + cwd=ctx.cwd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + start_new_session=True, + ) + + try: + stdout_bytes, stderr_bytes = await proc.communicate() + except asyncio.CancelledError: + if proc.returncode is None: + try: + os.killpg(proc.pid, signal.SIGKILL) + except (ProcessLookupError, PermissionError): + proc.kill() + await proc.communicate() + raise + + output = _decode_text_mode_output(stdout_bytes) + stderr = _decode_text_mode_output(stderr_bytes) + if proc.returncode: + return _format_command_error("bash", proc.returncode, output, stderr) + return _format_command_output(output, stderr) + + +def _decode_text_mode_output(data: bytes | None) -> str: + if not data: + return "" + + text_stream = io.TextIOWrapper( + io.BytesIO(data), + encoding=locale.getpreferredencoding(False), + newline=None, + ) + try: + return text_stream.read() + finally: + text_stream.detach() + + +def _format_command_output(stdout: str, stderr: str) -> str: + output = stdout + if stderr: + output += f"\n[stderr]:\n{stderr}" if output else f"[stderr]:\n{stderr}" + return output.strip() or "(no output)" + + +def _format_command_error(tool_name: str, returncode: int, stdout: str, stderr: str) -> str: + if returncode < 0: + status = f"signal {-returncode}" + else: + status = f"exit status {returncode}" + + summary = f"Error: {tool_name} command failed with {status}" + details = _format_command_output(stdout, stderr) + if details == "(no output)": + return summary + return f"{summary}\n{details}" + + +def _resolve_search_base(args: dict[str, Any], ctx: Any) -> Path: + return Path(args.get("path") or ctx.cwd) + + 
+class ReadTool(Tool): + name = "Read" + description = "Read a file and return its contents with line numbers." + + @property + def input_schema(self) -> dict[str, Any]: + return { + "type": "object", + "properties": { + "file_path": {"type": "string"}, + "offset": {"type": "integer", "description": "Start line (1-indexed)"}, + "limit": {"type": "integer", "description": "Maximum lines to return"}, + }, + "required": ["file_path"], + } + + async def call(self, args: dict[str, Any], ctx: Any) -> str: + del ctx + path = Path(args["file_path"]) + if not path.exists(): + return f"Error: file not found: {path}" + + lines = path.read_text(encoding="utf-8").splitlines() + offset = max(0, args.get("offset", 1) - 1) + limit = args.get("limit", 2000) + selected = lines[offset : offset + limit] + return "\n".join( + f"{line_number:4}→{line}" + for line_number, line in enumerate(selected, start=offset + 1) + ) + + def is_read_only(self, args: dict[str, Any]) -> bool: + del args + return True + + +class GlobTool(Tool): + name = "Glob" + description = "Find files matching a glob pattern." + + @property + def input_schema(self) -> dict[str, Any]: + return { + "type": "object", + "properties": { + "pattern": {"type": "string", "description": "Glob pattern"}, + "path": {"type": "string", "description": "Directory to search from"}, + }, + "required": ["pattern"], + } + + async def call(self, args: dict[str, Any], ctx: Any) -> str: + base = _resolve_search_base(args, ctx) + pattern = args["pattern"] + matches = sorted(base.glob(pattern)) + return "\n".join(str(match) for match in matches) or "(no matches)" + + def is_read_only(self, args: dict[str, Any]) -> bool: + del args + return True + + +class GrepTool(Tool): + name = "Grep" + description = "Search file contents with a regex pattern." 
+ + @property + def input_schema(self) -> dict[str, Any]: + return { + "type": "object", + "properties": { + "pattern": {"type": "string", "description": "Regex pattern"}, + "path": {"type": "string", "description": "Directory to search"}, + "glob": {"type": "string", "description": "Optional file glob filter"}, + }, + "required": ["pattern"], + } + + async def call(self, args: dict[str, Any], ctx: Any) -> str: + base = _resolve_search_base(args, ctx) + cmd = ["grep", "-r", "-n"] + if "glob" in args: + cmd += ["--include", args["glob"]] + cmd += ["--", args["pattern"], str(base)] + result = subprocess.run(cmd, capture_output=True, text=True) + if result.returncode == 0: + return result.stdout or "(no matches)" + if result.returncode == 1: + return "(no matches)" + return _format_command_error("grep", result.returncode, result.stdout, result.stderr) + + def is_read_only(self, args: dict[str, Any]) -> bool: + del args + return True diff --git a/sandbox/server/backends/resources/code_vendor/tool.py b/sandbox/server/backends/resources/code_vendor/tool.py new file mode 100644 index 0000000..bef7084 --- /dev/null +++ b/sandbox/server/backends/resources/code_vendor/tool.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Any + + +class Tool(ABC): + name: str + description: str + + @property + @abstractmethod + def input_schema(self) -> dict[str, Any]: + raise NotImplementedError + + @abstractmethod + async def call(self, args: dict[str, Any], ctx: Any) -> str: + raise NotImplementedError + + def is_read_only(self, args: dict[str, Any]) -> bool: + del args + return False + + def to_api_format(self) -> dict[str, Any]: + return { + "name": self.name, + "description": self.description, + "input_schema": self.input_schema, + } diff --git a/sandbox/server/backends/resources/mcp/client.py b/sandbox/server/backends/resources/mcp/client.py index da83acc..7171c3f 100644 --- 
a/sandbox/server/backends/resources/mcp/client.py +++ b/sandbox/server/backends/resources/mcp/client.py @@ -17,6 +17,10 @@ _PLACEHOLDER_PATTERN = re.compile(r"\$\{([^}]+)\}") _SUPPORTED_PLACEHOLDERS = {"local_servers_paths", "agent_workspace", "task_dir"} _BUNDLED_CONFIG_DIR = Path(__file__).parent / "configs" +_BUNDLED_PYTHON_SERVER_FALLBACKS = { + "yahoo-finance": ("yahoo-finance-mcp", "server.py"), + "youtube-transcript": ("mcp-youtube-transcript", "run_server.py"), +} @dataclass @@ -80,6 +84,65 @@ def build_server_env( return {k: str(v) for k, v in merged.items() if v is not None} +def discover_mcp_config_dir(mcp_servers_path: str | Path | None) -> Path | None: + if not mcp_servers_path: + return None + + servers_path = Path(mcp_servers_path) + if servers_path.name != "local_servers" or not servers_path.is_dir(): + return None + + candidate = servers_path.parent / "configs" / "mcp_servers" + if candidate.is_dir(): + return candidate + return None + + +def _is_usable_executable(path: str) -> bool: + if not path: + return False + + executable = Path(path) + return executable.is_file() and os.access(executable, os.X_OK) + + +def _is_toolathlon_config_dir(config_dir: Path, mcp_servers_path: str | Path | None) -> bool: + resolved_config_dir = config_dir.resolve() + if resolved_config_dir == _BUNDLED_CONFIG_DIR.resolve(): + return True + + discovered_config_dir = discover_mcp_config_dir(mcp_servers_path) + if discovered_config_dir is None: + return False + + return resolved_config_dir == discovered_config_dir.resolve() + + +def _apply_bundled_python_server_fallback( + *, + server_name: str, + config_dir: Path, + command: str, + args: list[str], + cwd: str, + local_servers_path: str, +) -> tuple[str, list[str], str]: + fallback = _BUNDLED_PYTHON_SERVER_FALLBACKS.get(server_name) + if fallback is None or not _is_toolathlon_config_dir(config_dir, local_servers_path): + return command, args, cwd + + if ".venv/bin/python3" not in command or _is_usable_executable(command): 
+ return command, args, cwd + + project_dir_name, entrypoint = fallback + project_dir = Path(local_servers_path) / project_dir_name if local_servers_path else Path(cwd) + if not (project_dir / entrypoint).is_file(): + return command, args, cwd + + project_dir_str = str(project_dir) + return "uv", ["--directory", project_dir_str, "run", "python", entrypoint], project_dir_str + + def load_mcp_process_config( *, server_name: str, @@ -99,6 +162,9 @@ def load_mcp_process_config( if mcp_servers_path is None: mcp_servers_path = str(toolathlon_root / "local_servers") + if config_dir is None: + config_dir = discover_mcp_config_dir(mcp_servers_path) + config_path = Path(config_dir) if config_dir else _BUNDLED_CONFIG_DIR if not config_path.exists(): raise FileNotFoundError(f"MCP config dir not found: {config_path}") @@ -155,7 +221,22 @@ def load_mcp_process_config( cwd_value = resolve(params.get("cwd", agent_workspace)) runtime_env = dict(os.environ) if process_env is None else dict(process_env) + command, args, cwd_value = _apply_bundled_python_server_fallback( + server_name=server_name, + config_dir=config_path, + command=command, + args=args, + cwd=cwd_value, + local_servers_path=local_servers_path, + ) + full_env = build_server_env(yaml_env=env_values, process_env=runtime_env) + if ( + command == "uv" + and "UV_CACHE_DIR" not in full_env + and _is_toolathlon_config_dir(config_path, mcp_servers_path) + ): + full_env["UV_CACHE_DIR"] = str(Path(workspace) / ".cache" / "uv") timeout_seconds = float(cfg.get("client_session_timeout_seconds", 60.0)) diff --git a/sandbox/server/backends/resources/mcp/configs/12306.yaml b/sandbox/server/backends/resources/mcp/configs/12306.yaml index a969de2..fcc9c16 100644 --- a/sandbox/server/backends/resources/mcp/configs/12306.yaml +++ b/sandbox/server/backends/resources/mcp/configs/12306.yaml @@ -1,19 +1,19 @@ -# 12306 China Railway Ticket Query System +# 12306 China Railway Ticket Query System (PostgreSQL-backed, no real API calls) # Source: 
https://github.com/Joooook/12306-mcp -# Toolathlon Version: https://github.com/lockon-n/12306-mcp -# Mock: PG-backed local server (replaces real kyfw.12306.cn API) +# Toolathlon Version: local pg-backed version in local_mcp_servers_copy/12306-mcp +# Schema: train (stations, trains, train_seats, train_routes) +# Toolathlon_GYM: ../../local_servers/12306-mcp type: stdio name: rail_12306 params: command: node args: - - "${local_servers_paths}/12306-mcp/environment/build/index.js" - cwd: "${agent_workspace}" + - "${local_servers_paths}/12306-mcp/build/index.js" env: - PG_HOST: "${config.pg_host}" - PG_PORT: "5434" - PG_DATABASE: "${config.pg_database}" - PG_USER: "eigent" - PG_PASSWORD: "camel" + PG_HOST: "postgres" + PG_PORT: "5432" + PG_DATABASE: "toolathlon" + PG_USER: "postgres" + PG_PASSWORD: "postgres" cache_tools_list: true -client_session_timeout_seconds: 60 +client_session_timeout_seconds: 20 diff --git a/sandbox/server/backends/resources/mcp/configs/canvas.yaml b/sandbox/server/backends/resources/mcp/configs/canvas.yaml index f3c2ffc..8d66f72 100644 --- a/sandbox/server/backends/resources/mcp/configs/canvas.yaml +++ b/sandbox/server/backends/resources/mcp/configs/canvas.yaml @@ -1,23 +1,20 @@ # canvas # Source: https://github.com/DMontgomery40/mcp-canvas-lms # Toolathlon Version: https://github.com/lockon-n/mcp-canvas-lms -# Mock: PG-backed local server +# Toolathlon_GYM: ../../local_servers/mcp-canvas-lms type: stdio name: canvas params: command: node args: - - "${local_servers_paths}/mcp-canvas-lms/environment/build/index.js" + - "${local_servers_paths}/mcp-canvas-lms/build/index.js" env: - CANVAS_API_TOKEN: "${token.canvas_api_token}" - CANVAS_STUDENT_EMAIL: "${token.canvas_student_email}" + CANVAS_API_TOKEN: "placeholder" CANVAS_DOMAIN: "localhost:8080" + # skip self-signed certificate detection, set NODE_TLS_REJECT_UNAUTHORIZED=0 NODE_TLS_REJECT_UNAUTHORIZED: "0" - PG_HOST: "${config.pg_host}" - PG_PORT: "5434" - PG_DATABASE: "${config.pg_database}" - 
PG_USER: "eigent" - PG_PASSWORD: "camel" + # if you don't need to load custom CA certificates, you can comment out the following line + # NODE_EXTRA_CA_CERTS: "deployment/canvas/logs/cert.pem" cwd: "${agent_workspace}" -client_session_timeout_seconds: 60 +client_session_timeout_seconds: 10 cache_tools_list: true diff --git a/sandbox/server/backends/resources/mcp/configs/filesystem.yaml b/sandbox/server/backends/resources/mcp/configs/filesystem.yaml index 077ffb9..f9d43b1 100644 --- a/sandbox/server/backends/resources/mcp/configs/filesystem.yaml +++ b/sandbox/server/backends/resources/mcp/configs/filesystem.yaml @@ -1,13 +1,14 @@ # file system server - for reading and writing files # Source: https://github.com/modelcontextprotocol/servers/tree/main/src/filesystem -# Mock: local server +# Toolathlon Version: The same +# Toolathlon_GYM: ../../local_servers/filesystem type: stdio name: filesystem params: command: node args: - - "${local_servers_paths}/filesystem/environment/dist/index.js" + - "${local_servers_paths}/filesystem/dist/index.js" - "${agent_workspace}" cwd: "${agent_workspace}" -client_session_timeout_seconds: 900 +client_session_timeout_seconds: 300 cache_tools_list: true diff --git a/sandbox/server/backends/resources/mcp/configs/snowflake.yaml b/sandbox/server/backends/resources/mcp/configs/snowflake.yaml index 2daa5f6..81f20ba 100644 --- a/sandbox/server/backends/resources/mcp/configs/snowflake.yaml +++ b/sandbox/server/backends/resources/mcp/configs/snowflake.yaml @@ -1,14 +1,14 @@ # snowflake # Source: https://github.com/isaacwasserman/mcp-snowflake-server # Toolathlon Version: https://github.com/lockon-n/mcp-snowflake-server -# Mock: PG-backed local server +# Toolathlon_GYM: ../../local_servers/mcp-snowflake-server type: stdio name: snowflake params: command: uv args: - "--directory" - - "${local_servers_paths}/mcp-snowflake-server/environment" + - "${local_servers_paths}/mcp-snowflake-server" - "run" - "mcp_snowflake_server" - "--account" @@ -19,20 
+19,20 @@ params: - "eigent" - "--private_key_path" - "" + - "--role" + - "PUBLIC" - "--database" - - "HR_ANALYTICS" + - "toolathlon_gym" - "--schema" - "sf" - - "--allowed_databases" - - "HR_ANALYTICS,SUPPORT_CENTER,SALES_DW" - "--allow_write" - "--exclude-json-results" env: - PG_HOST: "${config.pg_host}" - PG_PORT: "5434" - PG_DATABASE: "${config.pg_database}" PG_USER: "eigent" PG_PASSWORD: "camel" + PG_HOST: "toolathlon_pg" + PG_PORT: "5432" + PG_DATABASE: "toolathlon_gym" cwd: "${agent_workspace}" client_session_timeout_seconds: 120 cache_tools_list: true diff --git a/sandbox/server/backends/resources/mcp/configs/woocommerce.yaml b/sandbox/server/backends/resources/mcp/configs/woocommerce.yaml index eacd62c..883add3 100644 --- a/sandbox/server/backends/resources/mcp/configs/woocommerce.yaml +++ b/sandbox/server/backends/resources/mcp/configs/woocommerce.yaml @@ -1,21 +1,17 @@ # WooCommerce MCP Server # Source: https://github.com/lockon-n/woocommerce-mcp -# Mock: PG-backed local server +# Toolathlon Version: The same +# Toolathlon_GYM: ../../local_servers/woocommerce-mcp type: stdio name: woocommerce params: command: node args: - - "${local_servers_paths}/woocommerce-mcp/environment/dist/index.js" + - "${local_servers_paths}/woocommerce-mcp/dist/index.js" env: - WORDPRESS_SITE_URL: "http://localhost" + WORDPRESS_SITE_URL: "http://localhost:8081" WOOCOMMERCE_CONSUMER_KEY: "placeholder" WOOCOMMERCE_CONSUMER_SECRET: "placeholder" - PG_HOST: "${config.pg_host}" - PG_PORT: "5434" - PG_DATABASE: "${config.pg_database}" - PG_USER: "eigent" - PG_PASSWORD: "camel" cwd: "${agent_workspace}" -client_session_timeout_seconds: 60 +client_session_timeout_seconds: 10 cache_tools_list: true diff --git a/sandbox/server/backends/resources/mcp/configs/yahoo-finance.yaml b/sandbox/server/backends/resources/mcp/configs/yahoo-finance.yaml index 1dc350f..66f437e 100644 --- a/sandbox/server/backends/resources/mcp/configs/yahoo-finance.yaml +++ 
b/sandbox/server/backends/resources/mcp/configs/yahoo-finance.yaml @@ -1,19 +1,15 @@ # Yahoo Finance # Source: https://github.com/Alex2Yang97/yahoo-finance-mcp # Toolathlon Version: https://github.com/lockon-n/yahoo-finance-mcp +# Toolathlon_GYM: ../../local_servers/yahoo-finance-mcp type: stdio name: yahoo-finance params: - command: uv + # Use the server venv directly instead of `uv run` so stdio passes through to FastMCP + # and the server resolves dependencies from its own project environment. + command: "${local_servers_paths}/yahoo-finance-mcp/.venv/bin/python3" args: - - "run" - - "${local_servers_paths}/yahoo-finance-mcp/environment/server.py" - env: - PG_HOST: "${config.pg_host}" - PG_PORT: "5434" - PG_DATABASE: "${config.pg_database}" - PG_USER: "eigent" - PG_PASSWORD: "camel" - # cwd: "${agent_workspace}" # do not add this for compatibility -client_session_timeout_seconds: 180 -cache_tools_list: true \ No newline at end of file + - "server.py" + cwd: "${local_servers_paths}/yahoo-finance-mcp" +client_session_timeout_seconds: 60 +cache_tools_list: true diff --git a/sandbox/server/backends/resources/mcp/configs/youtube.yaml b/sandbox/server/backends/resources/mcp/configs/youtube.yaml index edbc164..9b7e299 100644 --- a/sandbox/server/backends/resources/mcp/configs/youtube.yaml +++ b/sandbox/server/backends/resources/mcp/configs/youtube.yaml @@ -1,19 +1,20 @@ -# mcp 4 youtube +# YouTube MCP Server (PostgreSQL-backed, no real API calls) # Source: https://github.com/ZubeidHendricks/youtube-mcp-server -# Toolathlon Version: https://github.com/lockon-n/youtube-mcp-server +# Toolathlon Version: local pg-backed version in local_mcp_servers_copy/youtube-mcp-server +# Schema: youtube (channels, videos, playlists, playlist_items, transcripts) +# Toolathlon_GYM: ../../local_servers/youtube-mcp-server type: stdio name: youtube params: command: node args: - - "${local_servers_paths}/youtube-mcp-server/environment/dist/index.js" + - 
"${local_servers_paths}/youtube-mcp-server/dist/index.js" env: - YOUTUBE_API_KEY: "${token.google_cloud_console_api_key}" - PG_HOST: "${config.pg_host}" - PG_PORT: "5434" - PG_DATABASE: "${config.pg_database}" - PG_USER: "eigent" - PG_PASSWORD: "camel" + PG_HOST: "postgres" + PG_PORT: "5432" + PG_DATABASE: "toolathlon" + PG_USER: "postgres" + PG_PASSWORD: "postgres" # cwd: "${agent_workspace}" # do not add this for compatibility -client_session_timeout_seconds: 60 -cache_tools_list: true \ No newline at end of file +client_session_timeout_seconds: 120 +cache_tools_list: true diff --git a/sandbox/server/backends/resources/mcp/configs/youtube_transcript.yaml b/sandbox/server/backends/resources/mcp/configs/youtube_transcript.yaml index 35539e6..a9e078f 100644 --- a/sandbox/server/backends/resources/mcp/configs/youtube_transcript.yaml +++ b/sandbox/server/backends/resources/mcp/configs/youtube_transcript.yaml @@ -1,21 +1,23 @@ -# YouTube Transcript +# YouTube Transcript MCP Server (PostgreSQL-backed, no real YouTube API calls) # Source: https://github.com/jkawamoto/mcp-youtube-transcript -# Mock: PG-backed local server +# Toolathlon Version: local pg-backed version in local_mcp_servers_copy/mcp-youtube-transcript +# Schema: youtube (transcripts, videos tables) +# Toolathlon_GYM: ../../local_servers/mcp-youtube-transcript type: stdio name: youtube-transcript params: - command: uv + # Use venv python directly instead of `uv run` to avoid stdin being consumed by uv + # (uv run reads stdin before passing to the child process, causing FastMCP to see EOF and exit) + command: "${local_servers_paths}/mcp-youtube-transcript/.venv/bin/python3" args: - - "--directory" - - "${local_servers_paths}/mcp-youtube-transcript/environment" - - "run" - - "mcp-youtube-transcript" + - "-c" + - "import sys,os; os.chdir('${local_servers_paths}/mcp-youtube-transcript'); from mcp_youtube_transcript import server; import anyio; anyio.run(server(50000).run_stdio_async)" env: - PG_HOST: 
"${config.pg_host}" - PG_PORT: "5434" - PG_DATABASE: "${config.pg_database}" - PG_USER: "eigent" - PG_PASSWORD: "camel" - cwd: "${agent_workspace}" -client_session_timeout_seconds: 60 + PG_HOST: "postgres" + PG_PORT: "5432" + PG_DATABASE: "toolathlon" + PG_USER: "postgres" + PG_PASSWORD: "postgres" + cwd: "${local_servers_paths}/mcp-youtube-transcript" +client_session_timeout_seconds: 20 cache_tools_list: true diff --git a/sandbox/tests/test_code_backend.py b/sandbox/tests/test_code_backend.py new file mode 100644 index 0000000..fb48625 --- /dev/null +++ b/sandbox/tests/test_code_backend.py @@ -0,0 +1,824 @@ +""" +Tests for the Code backend skeleton and bridge-tool registration. +""" + +import asyncio +import importlib +import os +import shlex +import sys +from pathlib import Path + +import pytest + +from sandbox.server.backends.base import BackendConfig +from sandbox.server.backends.error_codes import ErrorCode +from sandbox.server.config_loader import ConfigLoader +from sandbox.server.core.tool_executor import ToolExecutor + +MODULE_PATH = Path(__file__).resolve().parents[1] / "server" / "backends" / "resources" / "code.py" + + +def remove_resources_modules(): + package_name = "sandbox.server.backends.resources" + for module_name in list(sys.modules): + if module_name == package_name or module_name.startswith(f"{package_name}."): + sys.modules.pop(module_name, None) + + +def load_code_backend_module(): + remove_resources_modules() + return importlib.import_module("sandbox.server.backends.resources.code") + + +class FakeServer: + def __init__(self): + self._tools = {} + self._tool_resource_types = {} + + def register_tool(self, name, func, resource_type=None): + self._tools[name] = func + if resource_type is not None: + self._tool_resource_types[name] = resource_type + + +class FakeResourceRouter: + def __init__(self, session_info): + self._session_info = session_info + + async def get_session(self, worker_id, resource_type): + del worker_id, resource_type + 
return self._session_info + + async def get_or_create_session(self, worker_id, resource_type, auto_created=False): + del worker_id, resource_type, auto_created + raise AssertionError("unexpected temporary session creation") + + async def refresh_session(self, worker_id, resource_type): + del worker_id, resource_type + return True + + async def destroy_session(self, worker_id, resource_type): + del worker_id, resource_type + return True + + +def build_backend_config(tmp_path): + return BackendConfig( + enabled=True, + default_config={ + "workspace_root": str(tmp_path / "agentflow_code"), + }, + description="Code backend", + ) + + +def build_backend(tmp_path): + module = load_code_backend_module() + backend = module.CodeBackend(config=build_backend_config(tmp_path)) + return module, backend + + +def bind_backend_tools(backend): + fake_server = FakeServer() + backend.bind_server(fake_server) + return fake_server + + +def build_executor(fake_server, session_info): + return ToolExecutor( + tools=fake_server._tools, + tool_name_index={}, + tool_resource_types=fake_server._tool_resource_types, + resource_router=FakeResourceRouter(session_info), + ) + + +def execute_tool(executor, action, *, params, worker_id, trace_id): + return asyncio.run( + executor.execute( + action=action, + params=params, + worker_id=worker_id, + trace_id=trace_id, + ) + ) + + +def test_bind_server_registers_code_tools(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + + assert "code:read" in fake_server._tools + assert "code:bash" in fake_server._tools + assert fake_server._tool_resource_types["code:read"] == "code" + assert fake_server._tool_resource_types["code:bash"] == "code" + + +def test_resources_package_exports_eager_backends_without_lazy_machinery(): + remove_resources_modules() + + resources = importlib.import_module("sandbox.server.backends.resources") + code_module = importlib.import_module("sandbox.server.backends.resources.code") + 
mcp_module = importlib.import_module("sandbox.server.backends.resources.mcp") + + assert resources.CodeBackend is code_module.CodeBackend + assert resources.MCPBackend is mcp_module.MCPBackend + assert not hasattr(resources, "__getattr__") + + +def test_initialize_does_not_require_external_root(tmp_path): + _, backend = build_backend(tmp_path) + + session = asyncio.run(backend.initialize("runner_123", {})) + + assert session["workspace"].endswith("runner_123") + assert Path(session["workspace"]).exists() + + +def test_initialize_copies_source_dir(tmp_path): + _, backend = build_backend(tmp_path) + source_dir = tmp_path / "source" + source_dir.mkdir(parents=True) + (source_dir / "demo.py").write_text("print('hi')\n", encoding="utf-8") + + session = asyncio.run( + backend.initialize("runner_123", {"source_dir": str(source_dir)}) + ) + + copied = Path(session["workspace"]) / "demo.py" + assert copied.exists() + assert copied.read_text(encoding="utf-8") == "print('hi')\n" + + +def test_load_code_tools_uses_internal_vendor_package(tmp_path): + _, backend = build_backend(tmp_path) + + tools = backend._load_code_tools() + + assert set(tools.keys()) == {"read", "glob", "grep", "bash", "edit", "write"} + assert type(tools["read"]).__module__ == ( + "sandbox.server.backends.resources.code_vendor.file_tools" + ) + assert type(tools["edit"]).__module__ == ( + "sandbox.server.backends.resources.code_vendor.edit_tools" + ) + assert backend._load_code_tools() is tools + + +def test_tool_executor_code_dispatch_returns_standard_success_response(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + demo_file = runtime_workspace / "demo.py" + demo_file.write_text("hello from demo\n", encoding="utf-8") + executor = build_executor( + fake_server, + { + "session_id": "code-session-1", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + 
+ result = execute_tool( + executor, + "code:read", + params={"file_path": str(demo_file)}, + worker_id="worker-1", + trace_id="trace-1", + ) + + assert result["code"] == ErrorCode.SUCCESS + assert result["data"] == " 1→hello from demo" + + +def test_tool_executor_code_dispatch_preserves_trace_id(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + demo_file = runtime_workspace / "demo.py" + demo_file.write_text("hello from demo\n", encoding="utf-8") + executor = build_executor( + fake_server, + { + "session_id": "code-session-trace", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:read", + params={"file_path": str(demo_file)}, + worker_id="worker-1", + trace_id="trace-preserve-1", + ) + + assert result["code"] == ErrorCode.SUCCESS + assert result["meta"]["trace_id"] == "trace-preserve-1" + + +def test_tool_executor_runs_bash_via_vendored_tool(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + executor = build_executor( + fake_server, + { + "session_id": "code-session-bash", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:bash", + params={"command": "pwd"}, + worker_id="worker-1", + trace_id="trace-bash", + ) + + assert result["code"] == ErrorCode.SUCCESS + assert result["data"] == str(runtime_workspace.resolve(strict=False)) + + +def test_tool_executor_returns_business_failure_for_vendored_grep_error(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + sample_file = runtime_workspace / "sample.txt" + 
sample_file.write_text("alpha\nbeta\n", encoding="utf-8") + executor = build_executor( + fake_server, + { + "session_id": "code-session-grep-error", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:grep", + params={"pattern": "[", "path": str(runtime_workspace)}, + worker_id="worker-1", + trace_id="trace-grep-error", + ) + + assert result["code"] == ErrorCode.BUSINESS_FAILURE + assert result["message"].startswith("Error:") + assert "exit status 2" in result["message"] + assert "[stderr]:" in result["message"] + + +def test_tool_executor_returns_business_failure_for_vendored_bash_error(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + executor = build_executor( + fake_server, + { + "session_id": "code-session-bash-error", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:bash", + params={ + "command": ( + f"{shlex.quote(sys.executable)} -c " + "\"import sys; " + "print('out'); " + "print('err', file=sys.stderr); " + "raise SystemExit(7)\"" + ) + }, + worker_id="worker-1", + trace_id="trace-bash-error", + ) + + assert result["code"] == ErrorCode.BUSINESS_FAILURE + assert result["message"].startswith("Error:") + assert "exit status 7" in result["message"] + assert "out" in result["message"] + assert "[stderr]:" in result["message"] + assert "err" in result["message"] + + +def test_tool_executor_non_bash_timeout_uses_standard_error_handling(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + + class TimeoutReadTool: + async def call(self, params, ctx): + del params, ctx + raise asyncio.TimeoutError("read timeout") + + tools = backend._load_code_tools() + backend._tool_instances = dict(tools) + backend._tool_instances["read"] = TimeoutReadTool() + + runtime_workspace = 
tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + demo_file = runtime_workspace / "demo.py" + demo_file.write_text("hello from demo\n", encoding="utf-8") + executor = build_executor( + fake_server, + { + "session_id": "code-session-read-timeout", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:read", + params={"file_path": str(demo_file)}, + worker_id="worker-1", + trace_id="trace-read-timeout", + ) + + assert result["code"] == ErrorCode.EXECUTION_ERROR + assert result["message"] == "read timeout" + + +def test_code_write_relative_file_path_resolves_inside_session_workspace(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + process_cwd = tmp_path / "process-cwd" + process_cwd.mkdir(parents=True) + prev_cwd = Path.cwd() + os.chdir(process_cwd) + try: + executor = build_executor( + fake_server, + { + "session_id": "code-session-3", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:write", + params={"file_path": "nested/output.txt", "content": "from workspace\n"}, + worker_id="worker-1", + trace_id="trace-1", + ) + finally: + os.chdir(prev_cwd) + + assert result["code"] == ErrorCode.SUCCESS + assert (runtime_workspace / "nested" / "output.txt").read_text(encoding="utf-8") == ( + "from workspace\n" + ) + assert not (process_cwd / "nested" / "output.txt").exists() + + +def test_code_read_error_prefix_is_returned_as_agentflow_error_response(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + executor = build_executor( + fake_server, + { + "session_id": "code-session-4", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = 
execute_tool( + executor, + "code:read", + params={"file_path": "missing.txt"}, + worker_id="worker-1", + trace_id="trace-1", + ) + + assert result["code"] != ErrorCode.SUCCESS + assert result["message"].startswith("Error:") + + +def test_tool_executor_rejects_missing_session_workspace_without_fallback(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + workspace_root = tmp_path / "agentflow_code" + workspace_root.mkdir(parents=True, exist_ok=True) + fallback_file = workspace_root / "fallback.txt" + fallback_file.write_text("must-not-read\n", encoding="utf-8") + executor = build_executor( + fake_server, + { + "session_id": "code-session-missing-workspace", + "data": {}, + }, + ) + + result = execute_tool( + executor, + "code:read", + params={"file_path": "fallback.txt"}, + worker_id="worker-1", + trace_id="trace-missing-workspace", + ) + + assert result["code"] == ErrorCode.BUSINESS_FAILURE + assert "session workspace" in result["message"].lower() + + +def test_tool_executor_rejects_malformed_session_workspace_without_fallback(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + executor = build_executor( + fake_server, + { + "session_id": "code-session-malformed-workspace", + "data": {"workspace": 123}, + }, + ) + + result = execute_tool( + executor, + "code:read", + params={"file_path": "fallback.txt"}, + worker_id="worker-1", + trace_id="trace-malformed-workspace", + ) + + assert result["code"] == ErrorCode.BUSINESS_FAILURE + assert "session workspace" in result["message"].lower() + + +def test_tool_executor_rejects_nonexistent_session_workspace_under_workspace_root(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + nonexistent_workspace = tmp_path / "agentflow_code" / "worker-1" + executor = build_executor( + fake_server, + { + "session_id": "code-session-nonexistent-workspace", + "data": {"workspace": str(nonexistent_workspace)}, 
+ }, + ) + + result = execute_tool( + executor, + "code:read", + params={"file_path": "demo.py"}, + worker_id="worker-1", + trace_id="trace-nonexistent-workspace", + ) + + assert result["code"] == ErrorCode.BUSINESS_FAILURE + assert "session workspace" in result["message"].lower() + + +def test_tool_executor_rejects_mismatched_session_workspace_under_workspace_root(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + mismatched_workspace = tmp_path / "agentflow_code" / "other-worker" + mismatched_workspace.mkdir(parents=True) + demo_file = mismatched_workspace / "demo.py" + demo_file.write_text("should-not-read\n", encoding="utf-8") + executor = build_executor( + fake_server, + { + "session_id": "code-session-mismatched-workspace", + "data": {"workspace": str(mismatched_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:read", + params={"file_path": str(demo_file)}, + worker_id="worker-1", + trace_id="trace-mismatched-workspace", + ) + + assert result["code"] == ErrorCode.BUSINESS_FAILURE + assert "session workspace" in result["message"].lower() + + +def test_tool_executor_rejects_session_workspace_outside_workspace_root(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + outside_workspace = tmp_path / "outside-workspace" + outside_workspace.mkdir(parents=True) + demo_file = outside_workspace / "demo.py" + demo_file.write_text("outside\n", encoding="utf-8") + executor = build_executor( + fake_server, + { + "session_id": "code-session-outside-workspace", + "data": {"workspace": str(outside_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:read", + params={"file_path": str(demo_file)}, + worker_id="worker-1", + trace_id="trace-outside-workspace", + ) + + assert result["code"] == ErrorCode.BUSINESS_FAILURE + assert "session workspace" in result["message"].lower() + + +def test_initialize_recreates_worker_workspace_without_stale_files(tmp_path): + 
_, backend = build_backend(tmp_path) + first_source = tmp_path / "source-first" + second_source = tmp_path / "source-second" + first_source.mkdir(parents=True) + second_source.mkdir(parents=True) + (first_source / "stale.py").write_text("print('old')\n", encoding="utf-8") + (second_source / "fresh.py").write_text("print('new')\n", encoding="utf-8") + + first_session = asyncio.run( + backend.initialize("runner_123", {"source_dir": str(first_source)}) + ) + second_session = asyncio.run( + backend.initialize("runner_123", {"source_dir": str(second_source)}) + ) + + assert first_session["workspace"] == second_session["workspace"] + workspace = Path(second_session["workspace"]) + assert not (workspace / "stale.py").exists() + assert (workspace / "fresh.py").exists() + + +def test_code_read_rejects_absolute_path_outside_workspace(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + outside_file = tmp_path / "outside.txt" + outside_file.write_text("secret\n", encoding="utf-8") + executor = build_executor( + fake_server, + { + "session_id": "code-session-5", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:read", + params={"file_path": str(outside_file)}, + worker_id="worker-1", + trace_id="trace-1", + ) + + assert result["code"] != ErrorCode.SUCCESS + + +def test_code_write_rejects_parent_escape_outside_workspace(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + escaped_file = tmp_path / "escaped.txt" + executor = build_executor( + fake_server, + { + "session_id": "code-session-6", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:write", + params={"file_path": 
"../escaped.txt", "content": "escaped\n"}, + worker_id="worker-1", + trace_id="trace-1", + ) + + assert result["code"] != ErrorCode.SUCCESS + assert not escaped_file.exists() + + +def test_code_glob_rejects_parent_traversal_pattern(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + executor = build_executor( + fake_server, + { + "session_id": "code-session-glob-parent-traversal", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:glob", + params={"path": ".", "pattern": "../*"}, + worker_id="worker-1", + trace_id="trace-glob-parent-traversal", + ) + + assert result["code"] == ErrorCode.BUSINESS_FAILURE + assert "pattern" in result["message"].lower() + + +def test_code_glob_rejects_embedded_parent_traversal_pattern(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + (runtime_workspace / "nested").mkdir(parents=True) + (runtime_workspace / "nested" / "demo.py").write_text( + "print('safe')\n", + encoding="utf-8", + ) + executor = build_executor( + fake_server, + { + "session_id": "code-session-glob-embedded-traversal", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:glob", + params={"path": ".", "pattern": "**/../*"}, + worker_id="worker-1", + trace_id="trace-glob-embedded-traversal", + ) + + assert result["code"] == ErrorCode.BUSINESS_FAILURE + assert "pattern" in result["message"].lower() + + +def test_code_glob_allows_safe_workspace_pattern(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + safe_file = runtime_workspace 
/ "nested" / "demo.py" + safe_file.parent.mkdir(parents=True) + safe_file.write_text("print('ok')\n", encoding="utf-8") + executor = build_executor( + fake_server, + { + "session_id": "code-session-glob-safe", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:glob", + params={"path": ".", "pattern": "**/*.py"}, + worker_id="worker-1", + trace_id="trace-glob-safe", + ) + + assert result["code"] == ErrorCode.SUCCESS + assert result["data"] == str(safe_file.resolve(strict=False)) + + +def test_initialize_rejects_hostile_worker_id_without_deleting_outside_dir(tmp_path): + _, backend = build_backend(tmp_path) + outside_dir = tmp_path / "escaped" + outside_dir.mkdir(parents=True) + marker = outside_dir / "keep.txt" + marker.write_text("do-not-delete\n", encoding="utf-8") + + with pytest.raises(ValueError): + asyncio.run(backend.initialize("../escaped", {})) + + assert marker.exists() + + +def test_initialize_rejects_nonexistent_source_dir(tmp_path): + _, backend = build_backend(tmp_path) + missing_source = tmp_path / "missing-source" + + with pytest.raises(ValueError, match="source_dir"): + asyncio.run(backend.initialize("runner_123", {"source_dir": str(missing_source)})) + + +def test_initialize_invalid_source_dir_leaves_no_workspace(tmp_path): + _, backend = build_backend(tmp_path) + missing_source = tmp_path / "missing-source" + workspace = tmp_path / "agentflow_code" / "runner_123" + + with pytest.raises(ValueError, match="source_dir"): + asyncio.run(backend.initialize("runner_123", {"source_dir": str(missing_source)})) + + assert not workspace.exists() + + +def test_cleanup_removes_worker_workspace(tmp_path): + _, backend = build_backend(tmp_path) + session = asyncio.run(backend.initialize("runner_123", {})) + workspace = Path(session["workspace"]) + + assert workspace.exists() + asyncio.run(backend.cleanup("runner_123", {"data": {"workspace": str(workspace)}})) + + assert not workspace.exists() + + +def 
test_cleanup_does_not_delete_workspace_outside_root(tmp_path): + _, backend = build_backend(tmp_path) + outside_workspace = tmp_path / "outside-workspace" + outside_workspace.mkdir(parents=True) + + asyncio.run( + backend.cleanup("runner_123", {"data": {"workspace": str(outside_workspace)}}) + ) + + assert outside_workspace.exists() + + +def test_cleanup_does_not_delete_nested_under_root_non_worker_path(tmp_path): + _, backend = build_backend(tmp_path) + nested_workspace = tmp_path / "agentflow_code" / "shared" / "cache" + nested_workspace.mkdir(parents=True) + + asyncio.run( + backend.cleanup("runner_123", {"data": {"workspace": str(nested_workspace)}}) + ) + + assert nested_workspace.exists() + + +def test_code_config_template_parses(): + loader = ConfigLoader() + config_path = ( + Path(__file__).resolve().parents[2] + / "configs" + / "sandbox-server" + / "code_config.json" + ) + + config = loader.load(str(config_path)) + + assert "code" in config.resources + assert ( + config.resources["code"].backend_class + == "sandbox.server.backends.resources.code.CodeBackend" + ) + assert config.server.session_ttl == 300 + assert ( + config.resources["code"].description + == "Lightweight coding backend powered by vendored internal tools" + ) + assert config.resources["code"].config == {"workspace_root": "/tmp/agentflow_code"} + assert config.warmup.enabled is False + assert config.warmup.resources == [] + + +def test_create_server_loads_code_backend_via_config_loader(tmp_path): + workspace_root = tmp_path / "agentflow_code" + remove_resources_modules() + loader = ConfigLoader() + loader.load_from_dict( + { + "server": { + "title": "Code backend smoke", + "session_ttl": 300, + }, + "resources": { + "code": { + "enabled": True, + "description": "Code backend", + "backend_class": "sandbox.server.backends.resources.code.CodeBackend", + "config": { + "workspace_root": str(workspace_root), + }, + } + }, + "warmup": { + "enabled": False, + "resources": [], + }, + } + ) + + server 
= loader.create_server(host="127.0.0.1", port=0) + resources_package = sys.modules["sandbox.server.backends.resources"] + code_module = sys.modules["sandbox.server.backends.resources.code"] + + assert "code" in server._backends + assert "code:read" in server._tools + assert server._tool_resource_types["code:read"] == "code" + assert Path(resources_package.__file__).resolve() == (MODULE_PATH.parent / "__init__.py").resolve() + assert Path(code_module.__file__).resolve() == MODULE_PATH.resolve() diff --git a/sandbox/tests/test_code_tool_schemas.py b/sandbox/tests/test_code_tool_schemas.py new file mode 100644 index 0000000..6247ee0 --- /dev/null +++ b/sandbox/tests/test_code_tool_schemas.py @@ -0,0 +1,119 @@ +""" +Tests for code tool schemas integration. +""" + +from sandbox.tool_schemas import get_all_tool_names, get_tool_schemas, get_tools_by_resource + + +EXPECTED_CODE_TOOLS = { + "code-read", + "code-glob", + "code-grep", + "code-bash", + "code-edit", + "code-write", +} + + +def _code_schemas_by_name(): + schemas = get_tools_by_resource("code") + return {schema["name"]: schema for schema in schemas} + + +def test_code_tools_visible_in_global_catalog(): + """Code tools should appear in the global tool name catalog.""" + names = get_all_tool_names() + + assert "code-read" in names + assert "code-bash" in names + + +def test_code_wildcard_filtering(): + """Wildcard filtering should support code-* patterns.""" + schemas = get_tool_schemas(["code-*"]) + names = {schema["name"] for schema in schemas} + + assert names == EXPECTED_CODE_TOOLS + + +def test_get_tools_by_resource_code(): + """Resource filtering should return all code tools.""" + schemas = get_tools_by_resource("code") + names = {schema["name"] for schema in schemas} + + assert names == EXPECTED_CODE_TOOLS + + +def test_code_tool_parameter_contract(): + """Each code tool should expose the exact expected parameter contracts.""" + expected_params = { + "code-read": { + ("file_path", "string", True), + 
("offset", "integer", False), + ("limit", "integer", False), + }, + "code-glob": { + ("pattern", "string", True), + ("path", "string", False), + }, + "code-grep": { + ("pattern", "string", True), + ("path", "string", False), + ("glob", "string", False), + }, + "code-bash": { + ("command", "string", True), + }, + "code-edit": { + ("file_path", "string", True), + ("old_string", "string", True), + ("new_string", "string", True), + ("replace_all", "boolean", False), + }, + "code-write": { + ("file_path", "string", True), + ("content", "string", True), + }, + } + schemas = _code_schemas_by_name() + + for tool_name, expected in expected_params.items(): + actual = { + (param["name"], param["type"], param["required"]) + for param in schemas[tool_name]["parameters"] + } + assert actual == expected + + +def test_code_read_description_mentions_line_numbered_and_1_indexed_offset(): + """code-read docs should preserve line-numbered output and 1-indexed offset semantics.""" + schema = _code_schemas_by_name()["code-read"] + offset = next( + param for param in schema["parameters"] if param["name"] == "offset" + ) + + assert "line" in schema["description"].lower() + assert "number" in schema["description"].lower() + assert "1-indexed" in offset["description"].lower() + + +def test_code_bash_description_mentions_workspace_shell_execution(): + """code-bash docs should describe shell execution in the coding workspace.""" + schema = _code_schemas_by_name()["code-bash"] + description = schema["description"].lower() + + assert "workspace" in description + assert "shell command" in description + assert "backend config" not in description + + +def test_code_write_description_mentions_workspace_full_content_and_parent_dirs(): + """code-write docs should mention writing full content and creating parent directories.""" + schema = _code_schemas_by_name()["code-write"] + description = schema["description"].lower() + + assert "workspace" in description + assert "full file content" in description 
+ assert "parent" in description + assert "director" in description + assert "create" in description diff --git a/sandbox/tests/test_code_vendor_tools.py b/sandbox/tests/test_code_vendor_tools.py new file mode 100644 index 0000000..945d60d --- /dev/null +++ b/sandbox/tests/test_code_vendor_tools.py @@ -0,0 +1,378 @@ +import asyncio +import importlib.util +import shlex +import sys +import time +from types import SimpleNamespace +from pathlib import Path + +import pytest + +PACKAGE_DIR = ( + Path(__file__).resolve().parents[1] + / "server" + / "backends" + / "resources" + / "code_vendor" +) + + +def load_code_vendor_module(module_name): + package_name = "_test_code_vendor" + package_spec = importlib.util.spec_from_file_location( + package_name, + PACKAGE_DIR / "__init__.py", + submodule_search_locations=[str(PACKAGE_DIR)], + ) + package = importlib.util.module_from_spec(package_spec) + sys.modules[package_name] = package + assert package_spec is not None + assert package_spec.loader is not None + package_spec.loader.exec_module(package) + + module_spec = importlib.util.spec_from_file_location( + f"{package_name}.{module_name}", + PACKAGE_DIR / f"{module_name}.py", + ) + module = importlib.util.module_from_spec(module_spec) + sys.modules[f"{package_name}.{module_name}"] = module + assert module_spec is not None + assert module_spec.loader is not None + module_spec.loader.exec_module(module) + return module + + +file_tools = load_code_vendor_module("file_tools") +edit_tools = load_code_vendor_module("edit_tools") +tool_module = load_code_vendor_module("tool") + +ReadTool = file_tools.ReadTool +GlobTool = file_tools.GlobTool +GrepTool = file_tools.GrepTool +BashTool = file_tools.BashTool +EditTool = edit_tools.EditTool +WriteTool = edit_tools.WriteTool + + +def make_ctx(tmp_path): + return SimpleNamespace(cwd=str(tmp_path)) + + +def call_tool(tool, args, ctx): + return asyncio.run(tool.call(args, ctx)) + + +def test_read_tool_returns_line_numbered_content(tmp_path): + 
target = tmp_path / "sample.txt" + target.write_text("alpha\nbeta\ngamma\n", encoding="utf-8") + + result = call_tool(ReadTool(), {"file_path": str(target)}, make_ctx(tmp_path)) + + assert result == " 1→alpha\n 2→beta\n 3→gamma" + + +def test_read_tool_honors_offset_and_limit(tmp_path): + target = tmp_path / "sample.txt" + target.write_text("alpha\nbeta\ngamma\ndelta\n", encoding="utf-8") + + result = call_tool( + ReadTool(), + {"file_path": str(target), "offset": 1, "limit": 2}, + make_ctx(tmp_path), + ) + + assert result == " 1→alpha\n 2→beta" + + +def test_edit_tool_requires_unique_match_by_default(tmp_path): + target = tmp_path / "sample.txt" + target.write_text("alpha\nbeta\nbeta\n", encoding="utf-8") + + result = call_tool( + EditTool(), + {"file_path": str(target), "old_string": "beta", "new_string": "BETA"}, + make_ctx(tmp_path), + ) + + assert "appears 2 times" in result + assert "replace_all=true" in result + assert target.read_text(encoding="utf-8") == "alpha\nbeta\nbeta\n" + + +def test_edit_tool_replace_all_updates_each_match(tmp_path): + target = tmp_path / "sample.txt" + target.write_text("alpha\nbeta\nbeta\n", encoding="utf-8") + + result = call_tool( + EditTool(), + { + "file_path": str(target), + "old_string": "beta", + "new_string": "BETA", + "replace_all": True, + }, + make_ctx(tmp_path), + ) + + assert result == f"Replaced 2 occurrence(s) in {target}" + assert target.read_text(encoding="utf-8") == "alpha\nBETA\nBETA\n" + + +def test_write_tool_creates_parent_directories_and_overwrites_full_file(tmp_path): + target = tmp_path / "nested" / "dir" / "sample.txt" + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text("stale content that should disappear\n", encoding="utf-8") + + result = call_tool( + WriteTool(), + {"file_path": str(target), "content": "hello\nworld\n"}, + make_ctx(tmp_path), + ) + + assert result == f"Wrote 12 bytes (2 lines) to {target}" + assert target.read_text(encoding="utf-8") == "hello\nworld\n" + + +def 
test_glob_tool_returns_sorted_matches(tmp_path): + (tmp_path / "a.py").write_text("print('a')\n", encoding="utf-8") + pkg = tmp_path / "pkg" + pkg.mkdir() + (pkg / "b.py").write_text("print('b')\n", encoding="utf-8") + (pkg / "c.txt").write_text("ignore\n", encoding="utf-8") + + result = call_tool( + GlobTool(), + {"pattern": "**/*.py", "path": str(tmp_path)}, + make_ctx(tmp_path), + ) + + assert result == f"{tmp_path / 'a.py'}\n{tmp_path / 'pkg' / 'b.py'}" + + +def test_glob_tool_uses_ctx_cwd_when_path_is_empty_string(tmp_path): + target = tmp_path / "target.py" + target.write_text("print('target')\n", encoding="utf-8") + + result = call_tool( + GlobTool(), + {"pattern": "target.py", "path": ""}, + make_ctx(tmp_path), + ) + + assert result == str(target) + + +def test_glob_tool_uses_ctx_cwd_when_path_is_omitted(tmp_path): + target = tmp_path / "target.py" + target.write_text("print('target')\n", encoding="utf-8") + + result = call_tool( + GlobTool(), + {"pattern": "target.py"}, + make_ctx(tmp_path), + ) + + assert result == str(target) + + +def test_grep_tool_returns_matches_with_line_numbers_for_filtered_files(tmp_path): + first = tmp_path / "first.txt" + second = tmp_path / "second.txt" + first.write_text("alpha\nbeta\n", encoding="utf-8") + second.write_text("beta\ngamma\n", encoding="utf-8") + + result = call_tool( + GrepTool(), + {"pattern": "beta", "path": str(tmp_path), "glob": "*.txt"}, + make_ctx(tmp_path), + ) + + assert result.endswith("\n") + assert set(result.splitlines()) == { + f"{first}:2:beta", + f"{second}:1:beta", + } + + +def test_grep_tool_searches_recursively_without_glob_filter(tmp_path): + root_match = tmp_path / "root.txt" + nested_dir = tmp_path / "pkg" / "nested" + nested_dir.mkdir(parents=True) + nested_match = nested_dir / "deep.py" + root_match.write_text("needle at root\n", encoding="utf-8") + nested_match.write_text("first line\nneedle in nested file\n", encoding="utf-8") + + result = call_tool( + GrepTool(), + {"pattern": "needle", 
"path": str(tmp_path)}, + make_ctx(tmp_path), + ) + + assert result.endswith("\n") + assert set(result.splitlines()) == { + f"{nested_match}:2:needle in nested file", + f"{root_match}:1:needle at root", + } + + +def test_grep_tool_uses_ctx_cwd_when_path_is_empty_string(tmp_path): + target = tmp_path / "target.txt" + target.write_text("needle\n", encoding="utf-8") + + result = call_tool( + GrepTool(), + {"pattern": "needle", "path": ""}, + make_ctx(tmp_path), + ) + + assert result == f"{target}:1:needle\n" + + +def test_grep_tool_uses_ctx_cwd_when_path_is_omitted(tmp_path): + target = tmp_path / "target.txt" + target.write_text("needle\n", encoding="utf-8") + + result = call_tool( + GrepTool(), + {"pattern": "needle"}, + make_ctx(tmp_path), + ) + + assert result == f"{target}:1:needle\n" + + +def test_grep_tool_returns_no_matches_for_exit_code_one(tmp_path): + target = tmp_path / "sample.txt" + target.write_text("alpha\nbeta\n", encoding="utf-8") + + result = call_tool( + GrepTool(), + {"pattern": "missing", "path": str(tmp_path)}, + make_ctx(tmp_path), + ) + + assert result == "(no matches)" + + +def test_grep_tool_returns_error_prefix_for_invalid_pattern(tmp_path): + target = tmp_path / "sample.txt" + target.write_text("alpha\nbeta\n", encoding="utf-8") + + result = call_tool( + GrepTool(), + {"pattern": "[", "path": str(tmp_path)}, + make_ctx(tmp_path), + ) + + assert result.startswith("Error:") + assert "exit status 2" in result + assert "[stderr]:" in result + + +def test_grep_tool_treats_option_like_pattern_as_search_pattern(tmp_path): + target = tmp_path / "sample.txt" + target.write_text("--help\nalpha\n", encoding="utf-8") + + result = call_tool( + GrepTool(), + {"pattern": "--help", "path": str(tmp_path)}, + make_ctx(tmp_path), + ) + + assert result == f"{target}:1:--help\n" + + +def test_bash_tool_combines_stdout_and_stderr(tmp_path): + result = call_tool( + BashTool(), + { + "command": ( + "python -c \"import sys; " + "print('out'); " + "print('err', 
file=sys.stderr)\"" + ) + }, + make_ctx(tmp_path), + ) + + assert result == "out\n\n[stderr]:\nerr" + + +def test_bash_tool_matches_text_mode_newline_normalization(tmp_path): + result = call_tool( + BashTool(), + { + "command": ( + f"{shlex.quote(sys.executable)} -c " + "\"import sys; sys.stdout.buffer.write(b'a\\r\\nb\\r\\n')\"" + ) + }, + make_ctx(tmp_path), + ) + + assert result == "a\nb" + + +def test_bash_tool_returns_error_prefix_for_nonzero_exit_status(tmp_path): + result = call_tool( + BashTool(), + { + "command": ( + f"{shlex.quote(sys.executable)} -c " + "\"import sys; " + "print('out'); " + "print('err', file=sys.stderr); " + "raise SystemExit(7)\"" + ) + }, + make_ctx(tmp_path), + ) + + assert result.startswith("Error:") + assert "exit status 7" in result + assert "out" in result + assert "[stderr]:" in result + assert "err" in result + + +def test_bash_tool_cancellation_stops_background_command(tmp_path): + marker = tmp_path / "marker.txt" + + async def run_bash_with_timeout(): + timeout_start = time.monotonic() + with pytest.raises(asyncio.TimeoutError): + await asyncio.wait_for( + BashTool().call( + { + "command": ( + f"{shlex.quote(sys.executable)} -c " + "\"import pathlib, time; " + "time.sleep(0.3); " + "pathlib.Path('marker.txt').write_text('created', encoding='utf-8')\"" + ) + }, + make_ctx(tmp_path), + ), + timeout=0.1, + ) + timeout_elapsed = time.monotonic() - timeout_start + await asyncio.sleep(0.4) + return timeout_elapsed + + timeout_elapsed = asyncio.run(run_bash_with_timeout()) + + assert timeout_elapsed < 0.25 + assert not marker.exists() + + +def test_tool_api_format_and_read_only_flags(): + read_tool = ReadTool() + bash_tool = BashTool() + + api_format = read_tool.to_api_format() + + assert api_format["name"] == "Read" + assert isinstance(api_format["description"], str) + assert api_format["input_schema"] == read_tool.input_schema + assert read_tool.is_read_only({}) is True + assert bash_tool.is_read_only({}) is False diff --git 
a/sandbox/tests/test_mcp_backend.py b/sandbox/tests/test_mcp_backend.py index 77cb58f..5b6bdd7 100644 --- a/sandbox/tests/test_mcp_backend.py +++ b/sandbox/tests/test_mcp_backend.py @@ -3,6 +3,7 @@ """ import asyncio +import json import importlib.util import sys import types @@ -536,7 +537,17 @@ def fake_load_mcp_process_config(**kwargs): assert created_clients[0].closed is True -def test_mcp_config_template_parses(): +def test_mcp_config_template_declares_example_server_subset(monkeypatch): + monkeypatch.delenv("TOOLATHLON_GYM_ROOT", raising=False) + monkeypatch.delenv("TOOLATHLON_WORKSPACE_ROOT", raising=False) + monkeypatch.delenv("PGHOST", raising=False) + monkeypatch.delenv("PGPORT", raising=False) + monkeypatch.delenv("PGUSER", raising=False) + monkeypatch.delenv("PGPASSWORD", raising=False) + monkeypatch.delenv("PGDATABASE", raising=False) + monkeypatch.delenv("CANVAS_DOMAIN", raising=False) + monkeypatch.delenv("WORDPRESS_SITE_URL", raising=False) + loader = ConfigLoader() config_path = ( Path(__file__).resolve().parents[2] @@ -544,11 +555,48 @@ def test_mcp_config_template_parses(): / "sandbox-server" / "mcp_config.json" ) + raw_config = json.loads(config_path.read_text(encoding="utf-8")) + raw_mcp_config = raw_config["resources"]["mcp"]["config"] config = loader.load(str(config_path)) + mcp_resource = config.resources["mcp"] + mcp_config = mcp_resource.config - assert "mcp" in config.resources - assert ( - config.resources["mcp"].backend_class - == "sandbox.server.backends.resources.mcp.toolathlon_gym.ToolathlonGymBackend" + assert mcp_resource.backend_class == ( + "sandbox.server.backends.resources.mcp.toolathlon_gym.ToolathlonGymBackend" + ) + assert mcp_config["mcp_servers_path"] == "${TOOLATHLON_GYM_ROOT}/local_servers" + assert mcp_config["enabled_mcp_servers"] == [ + "canvas", + "snowflake", + "woocommerce", + "yahoo-finance", + "youtube", + "youtube-transcript", + "rail_12306", + "filesystem", + ] + assert raw_mcp_config["workspace_root"] == ( + 
"${TOOLATHLON_WORKSPACE_ROOT:-/tmp/agentflow_mcp}" ) + assert raw_mcp_config["env_overrides"] == { + "PGHOST": "${PGHOST:-localhost}", + "PGPORT": "${PGPORT:-5432}", + "PGUSER": "${PGUSER:-eigent}", + "PGPASSWORD": "${PGPASSWORD:-camel}", + "PGDATABASE": "${PGDATABASE:-toolathlon_gym}", + "CANVAS_DOMAIN": "${CANVAS_DOMAIN:-localhost:8080}", + "WORDPRESS_SITE_URL": "${WORDPRESS_SITE_URL:-http://localhost:8081}", + } + assert mcp_config["workspace_root"] == "/tmp/agentflow_mcp" + assert mcp_config["env_overrides"] == { + "PGHOST": "localhost", + "PGPORT": "5432", + "PGUSER": "eigent", + "PGPASSWORD": "camel", + "PGDATABASE": "toolathlon_gym", + "CANVAS_DOMAIN": "localhost:8080", + "WORDPRESS_SITE_URL": "http://localhost:8081", + } + assert config.warmup.enabled is True + assert config.warmup.resources == ["mcp"] diff --git a/sandbox/tests/test_mcp_client.py b/sandbox/tests/test_mcp_client.py index 0be42d6..ed79ca5 100644 --- a/sandbox/tests/test_mcp_client.py +++ b/sandbox/tests/test_mcp_client.py @@ -28,6 +28,12 @@ def load_mcp_client_module(): return module +def make_executable(path: Path) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("#!/bin/sh\nexit 0\n", encoding="utf-8") + path.chmod(0o755) + + def test_resolve_toolathlon_placeholders(tmp_path): module = load_mcp_client_module() @@ -419,6 +425,100 @@ def test_load_mcp_process_config_resolves_with_mcp_servers_path(tmp_path): assert config.timeout_seconds == 42 +def test_load_mcp_process_config_prefers_toolathlon_configs_next_to_local_servers(tmp_path): + module = load_mcp_client_module() + toolathlon_root = tmp_path / "toolathlon" + config_dir = toolathlon_root / "configs" / "mcp_servers" + local_servers_dir = toolathlon_root / "local_servers" + config_dir.mkdir(parents=True) + local_servers_dir.mkdir() + (config_dir / "canvas.yaml").write_text( + """ +type: stdio +name: canvas +params: + command: node + args: + - ${local_servers_paths}/mcp-canvas-lms/build/index.js + env: + 
CANVAS_API_TOKEN: placeholder + cwd: ${agent_workspace} +client_session_timeout_seconds: 10 + """.strip() + + "\n", + encoding="utf-8", + ) + + config = module.load_mcp_process_config( + server_name="canvas", + agent_workspace=str(tmp_path / "workspace"), + mcp_servers_path=str(local_servers_dir), + process_env={}, + ) + + assert config.command == "node" + assert config.args == [str(local_servers_dir / "mcp-canvas-lms" / "build" / "index.js")] + assert config.env["CANVAS_API_TOKEN"] == "placeholder" + assert config.cwd == str(tmp_path / "workspace") + assert config.timeout_seconds == 10 + + +def test_discover_mcp_config_dir_requires_toolathlon_local_servers_layout(tmp_path): + module = load_mcp_client_module() + toolathlon_root = tmp_path / "toolathlon" + config_dir = toolathlon_root / "configs" / "mcp_servers" + config_dir.mkdir(parents=True) + (toolathlon_root / "custom_servers").mkdir() + (toolathlon_root / "local_servers").mkdir() + + assert module.discover_mcp_config_dir(toolathlon_root / "custom_servers") is None + assert module.discover_mcp_config_dir(toolathlon_root / "local_servers") == config_dir + + +def test_discover_mcp_config_dir_requires_real_directory(tmp_path): + module = load_mcp_client_module() + toolathlon_root = tmp_path / "toolathlon" + local_servers_dir = toolathlon_root / "local_servers" + local_servers_dir.mkdir(parents=True) + config_path = toolathlon_root / "configs" / "mcp_servers" + config_path.parent.mkdir(parents=True) + config_path.write_text("not a directory\n", encoding="utf-8") + + assert module.discover_mcp_config_dir(local_servers_dir) is None + + +def test_load_mcp_process_config_resolves_toolathlon_local_servers_path(tmp_path): + module = load_mcp_client_module() + config_dir = tmp_path / "configs" / "mcp_servers" + config_dir.mkdir(parents=True) + (config_dir / "filesystem.yaml").write_text( + """ +type: stdio +name: filesystem +params: + command: node + args: + - ${local_servers_paths}/filesystem/environment/dist/index.js + 
- ${agent_workspace} + """.strip() + + "\n", + encoding="utf-8", + ) + + config = module.load_mcp_process_config( + server_name="filesystem", + agent_workspace="/tmp/agentflow-worker", + mcp_servers_path="/tmp/toolathlon/local_servers", + config_dir=config_dir, + ) + + assert config.command == "node" + assert config.args == [ + "/tmp/toolathlon/local_servers/filesystem/environment/dist/index.js", + "/tmp/agentflow-worker", + ] + + def test_load_mcp_process_config_backward_compat_toolathlon_root(tmp_path): module = load_mcp_client_module() toolathlon_root = tmp_path / "toolathlon" @@ -477,3 +577,394 @@ def test_load_mcp_process_config_uses_process_env_overrides(tmp_path): assert config.env["PG_HOST"] == "from_process" assert config.env["PG_PORT"] == "15432" + + +def test_load_mcp_process_config_sets_workspace_uv_cache_for_uv_servers(tmp_path): + module = load_mcp_client_module() + workspace = tmp_path / "workspace" + local_servers_dir = tmp_path / "toolathlon" / "local_servers" + (local_servers_dir / "mcp-snowflake-server").mkdir(parents=True) + + config = module.load_mcp_process_config( + server_name="snowflake", + agent_workspace=str(workspace), + mcp_servers_path=str(local_servers_dir), + process_env={}, + ) + + assert config.command == "uv" + assert config.env["UV_CACHE_DIR"] == str(workspace / ".cache" / "uv") + + +def test_load_mcp_process_config_preserves_existing_uv_cache_dir(tmp_path): + module = load_mcp_client_module() + workspace = tmp_path / "workspace" + local_servers_dir = tmp_path / "toolathlon" / "local_servers" + (local_servers_dir / "mcp-snowflake-server").mkdir(parents=True) + + config = module.load_mcp_process_config( + server_name="snowflake", + agent_workspace=str(workspace), + mcp_servers_path=str(local_servers_dir), + process_env={"UV_CACHE_DIR": "/tmp/custom-uv-cache"}, + ) + + assert config.env["UV_CACHE_DIR"] == "/tmp/custom-uv-cache" + + +def test_load_mcp_process_config_does_not_inject_uv_cache_for_custom_config_dir(tmp_path): + module 
= load_mcp_client_module() + config_dir = tmp_path / "custom-configs" + config_dir.mkdir(parents=True) + (config_dir / "custom-uv.yaml").write_text( + """ +type: stdio +name: custom-uv +params: + command: uv + args: + - run + - python + - server.py + """.strip() + + "\n", + encoding="utf-8", + ) + + config = module.load_mcp_process_config( + server_name="custom-uv", + agent_workspace=str(tmp_path / "workspace"), + config_dir=config_dir, + process_env={}, + ) + + assert config.command == "uv" + assert "UV_CACHE_DIR" not in config.env + + +def test_load_mcp_process_config_keeps_direct_python_fast_path_when_venv_exists(tmp_path): + module = load_mcp_client_module() + local_servers_dir = tmp_path / "toolathlon" / "local_servers" + project_dir = local_servers_dir / "yahoo-finance-mcp" + make_executable(project_dir / ".venv" / "bin" / "python3") + (project_dir / "server.py").write_text("print('ok')\n", encoding="utf-8") + + config = module.load_mcp_process_config( + server_name="yahoo-finance", + agent_workspace=str(tmp_path / "workspace"), + mcp_servers_path=str(local_servers_dir), + process_env={}, + ) + + assert config.command == str(project_dir / ".venv" / "bin" / "python3") + assert config.args == ["server.py"] + assert config.cwd == str(project_dir) + + +def test_load_mcp_process_config_falls_back_to_uv_for_yahoo_finance_without_venv_launcher(tmp_path): + module = load_mcp_client_module() + local_servers_dir = tmp_path / "toolathlon" / "local_servers" + project_dir = local_servers_dir / "yahoo-finance-mcp" + project_dir.mkdir(parents=True) + (project_dir / "server.py").write_text("print('ok')\n", encoding="utf-8") + + config = module.load_mcp_process_config( + server_name="yahoo-finance", + agent_workspace=str(tmp_path / "workspace"), + mcp_servers_path=str(local_servers_dir), + process_env={}, + ) + + assert config.command == "uv" + assert config.args == ["--directory", str(project_dir), "run", "python", "server.py"] + assert config.cwd == str(project_dir) + + 
+def test_load_mcp_process_config_does_not_fallback_when_bundled_entrypoint_is_missing(tmp_path): + module = load_mcp_client_module() + local_servers_dir = tmp_path / "toolathlon" / "local_servers" + project_dir = local_servers_dir / "yahoo-finance-mcp" + project_dir.mkdir(parents=True) + + config = module.load_mcp_process_config( + server_name="yahoo-finance", + agent_workspace=str(tmp_path / "workspace"), + mcp_servers_path=str(local_servers_dir), + process_env={}, + ) + + assert config.command == str(project_dir / ".venv" / "bin" / "python3") + assert config.args == ["server.py"] + assert config.cwd == str(project_dir) + + +def test_load_mcp_process_config_falls_back_to_uv_for_youtube_transcript_when_launcher_unusable( + tmp_path, +): + module = load_mcp_client_module() + local_servers_dir = tmp_path / "toolathlon" / "local_servers" + project_dir = local_servers_dir / "mcp-youtube-transcript" + launcher = project_dir / ".venv" / "bin" / "python3" + launcher.parent.mkdir(parents=True, exist_ok=True) + launcher.write_text("#!/bin/sh\nexit 0\n", encoding="utf-8") + launcher.chmod(0o644) + (project_dir / "run_server.py").write_text("print('ok')\n", encoding="utf-8") + + config = module.load_mcp_process_config( + server_name="youtube-transcript", + agent_workspace=str(tmp_path / "workspace"), + mcp_servers_path=str(local_servers_dir), + process_env={}, + ) + + assert config.command == "uv" + assert config.args == ["--directory", str(project_dir), "run", "python", "run_server.py"] + assert config.cwd == str(project_dir) + + +@pytest.mark.parametrize( + ("server_name", "project_subdir", "entrypoint"), + [ + ("yahoo-finance", "yahoo-finance-mcp", "server.py"), + ("youtube-transcript", "mcp-youtube-transcript", "run_server.py"), + ], +) +def test_load_mcp_process_config_falls_back_to_uv_for_discovered_toolathlon_python_servers( + tmp_path, + server_name, + project_subdir, + entrypoint, +): + module = load_mcp_client_module() + toolathlon_root = tmp_path / "toolathlon" + 
config_dir = toolathlon_root / "configs" / "mcp_servers" + local_servers_dir = toolathlon_root / "local_servers" + project_dir = local_servers_dir / project_subdir + config_dir.mkdir(parents=True) + project_dir.mkdir(parents=True) + (project_dir / entrypoint).write_text("print('ok')\n", encoding="utf-8") + (config_dir / f"{server_name}.yaml").write_text( + f""" +type: stdio +name: {server_name} +params: + command: ${{local_servers_paths}}/{project_subdir}/.venv/bin/python3 + args: + - {entrypoint} + cwd: ${{local_servers_paths}}/{project_subdir} + """.strip() + + "\n", + encoding="utf-8", + ) + + config = module.load_mcp_process_config( + server_name=server_name, + agent_workspace=str(tmp_path / "workspace"), + mcp_servers_path=str(local_servers_dir), + process_env={}, + ) + + assert config.command == "uv" + assert config.args == ["--directory", str(project_dir), "run", "python", entrypoint] + assert config.cwd == str(project_dir) + + +def assert_resolved_runtime_paths(config) -> None: + all_values = [config.command, config.cwd, *config.args] + for value in all_values: + assert "${local_servers_paths}" not in value + assert "/environment/" not in value + + +def assert_arg_contains_path(config, expected_path: str) -> None: + assert expected_path in config.args + + +def assert_python_server_launch(config, server_subdir: str, fallback_entrypoint: str) -> None: + expected_project_dir = f"/toolathlon/local_servers/{server_subdir}" + assert config.cwd == expected_project_dir + + if config.command.endswith("/.venv/bin/python3"): + assert config.command == f"{expected_project_dir}/.venv/bin/python3" + if fallback_entrypoint == "server.py": + assert config.args == ["server.py"] + else: + assert config.args[0] == "-c" + assert "mcp_youtube_transcript" in config.args[1] + assert expected_project_dir in config.args[1] + return + + assert "--directory" in config.args + assert_arg_contains_path(config, expected_project_dir) + assert "run" in config.args + assert 
fallback_entrypoint in config.args + + +@pytest.mark.parametrize( + ("server_name", "expected"), + [ + ( + "canvas", + { + "launch": "node", + "server_subdir": "mcp-canvas-lms", + "entrypoint_suffix": "/build/index.js", + "cwd": "/workspace", + "timeout_seconds": 10, + "env_subset": { + "CANVAS_API_TOKEN": "placeholder", + "CANVAS_DOMAIN": "localhost:8080", + "NODE_TLS_REJECT_UNAUTHORIZED": "0", + }, + }, + ), + ( + "snowflake", + { + "launch": "uv", + "server_subdir": "mcp-snowflake-server", + "entrypoint": "mcp_snowflake_server", + "cwd": "/workspace", + "timeout_seconds": 120, + "env_subset": { + "PG_HOST": "toolathlon_pg", + "PG_PORT": "5432", + "PG_DATABASE": "toolathlon_gym", + "PG_USER": "eigent", + "PG_PASSWORD": "camel", + }, + }, + ), + ( + "woocommerce", + { + "launch": "node", + "server_subdir": "woocommerce-mcp", + "entrypoint_suffix": "/dist/index.js", + "cwd": "/workspace", + "timeout_seconds": 10, + "env_subset": { + "WORDPRESS_SITE_URL": "http://localhost:8081", + "WOOCOMMERCE_CONSUMER_KEY": "placeholder", + "WOOCOMMERCE_CONSUMER_SECRET": "placeholder", + }, + }, + ), + ( + "yahoo-finance", + { + "launch": "python_or_uv", + "server_subdir": "yahoo-finance-mcp", + "fallback_entrypoint": "server.py", + "cwd": "/toolathlon/local_servers/yahoo-finance-mcp", + "timeout_seconds": 60, + "env_subset": {}, + }, + ), + ( + "youtube", + { + "launch": "node", + "server_subdir": "youtube-mcp-server", + "entrypoint_suffix": "/dist/index.js", + "cwd": "/workspace", + "timeout_seconds": 120, + "env_subset": { + "PG_HOST": "postgres", + "PG_PORT": "5432", + "PG_DATABASE": "toolathlon", + "PG_USER": "postgres", + "PG_PASSWORD": "postgres", + }, + }, + ), + ( + "youtube-transcript", + { + "launch": "python_or_uv", + "server_subdir": "mcp-youtube-transcript", + "fallback_entrypoint": "run_server.py", + "cwd": "/toolathlon/local_servers/mcp-youtube-transcript", + "timeout_seconds": 20, + "env_subset": { + "PG_HOST": "postgres", + "PG_PORT": "5432", + "PG_DATABASE": 
"toolathlon", + "PG_USER": "postgres", + "PG_PASSWORD": "postgres", + }, + }, + ), + ( + "rail_12306", + { + "launch": "node", + "server_subdir": "12306-mcp", + "entrypoint_suffix": "/build/index.js", + "cwd": "/workspace", + "timeout_seconds": 20, + "env_subset": { + "PG_HOST": "postgres", + "PG_PORT": "5432", + "PG_DATABASE": "toolathlon", + "PG_USER": "postgres", + "PG_PASSWORD": "postgres", + }, + }, + ), + ( + "filesystem", + { + "launch": "node_with_workspace_arg", + "server_subdir": "filesystem", + "entrypoint_suffix": "/dist/index.js", + "cwd": "/workspace", + "timeout_seconds": 300, + "env_subset": {}, + }, + ), + ], +) +def test_bundled_mcp_runtime_configs_match_current_toolathlon_layout(server_name, expected): + module = load_mcp_client_module() + + config = module.load_mcp_process_config( + server_name=server_name, + agent_workspace="/workspace", + mcp_servers_path="/toolathlon/local_servers", + process_env={}, + ) + + assert_resolved_runtime_paths(config) + assert config.cwd == expected["cwd"] + assert config.timeout_seconds == expected["timeout_seconds"] + + launch = expected["launch"] + if launch == "node": + assert_arg_contains_path( + config, + f"/toolathlon/local_servers/{expected['server_subdir']}{expected['entrypoint_suffix']}", + ) + elif launch == "node_with_workspace_arg": + assert_arg_contains_path( + config, + f"/toolathlon/local_servers/{expected['server_subdir']}{expected['entrypoint_suffix']}", + ) + assert_arg_contains_path(config, "/workspace") + elif launch == "uv": + expected_project_dir = f"/toolathlon/local_servers/{expected['server_subdir']}" + assert "--directory" in config.args + assert_arg_contains_path(config, expected_project_dir) + assert "run" in config.args + assert expected["entrypoint"] in config.args + elif launch == "python_or_uv": + assert_python_server_launch( + config, + server_subdir=expected["server_subdir"], + fallback_entrypoint=expected["fallback_entrypoint"], + ) + else: + raise AssertionError(f"Unknown launch 
mode: {launch}") + + for key, value in expected["env_subset"].items(): + assert config.env[key] == value diff --git a/sandbox/tests/test_result_formatter.py b/sandbox/tests/test_result_formatter.py new file mode 100644 index 0000000..8904585 --- /dev/null +++ b/sandbox/tests/test_result_formatter.py @@ -0,0 +1,166 @@ +from sandbox.result_formatter import format_tool_result + + +def test_format_tool_result_returns_plain_string_for_successful_code_response(): + response = { + "code": 0, + "message": "success", + "data": " 1→hello", + "meta": { + "tool": "code:read", + "resource_type": "code", + "execution_time_ms": 1.2, + }, + } + + assert format_tool_result(response) == " 1→hello" + + +def test_format_tool_result_preserves_whitespace_only_plain_string_for_successful_code_response(): + response = { + "code": 0, + "message": "success", + "data": " \n\t ", + "meta": { + "tool": "code:read", + "resource_type": "code", + "execution_time_ms": 1.2, + }, + } + + assert format_tool_result(response) == " \n\t " + + +def test_format_tool_result_keeps_dict_style_successful_code_response_behavior(): + response = { + "code": 0, + "message": "success", + "data": { + "stdout": "print('ok')\n", + "stderr": "", + "return_code": 0, + }, + "meta": { + "tool": "code:run", + "resource_type": "code", + "execution_time_ms": 1.2, + }, + } + + assert format_tool_result(response) == "print('ok')" + + +def test_format_tool_result_preserves_error_behavior_for_failed_code_response(): + response = { + "code": 1, + "message": "read failed", + "data": "ignored plain string payload", + "meta": { + "tool": "code:read", + "resource_type": "code", + }, + } + + assert format_tool_result(response) == "[Error] read failed" + + +def test_format_tool_result_returns_text_for_successful_mcp_response(): + response = { + "code": 0, + "message": "success", + "data": { + "content": [ + {"type": "text", "text": "ok"}, + ] + }, + "meta": { + "tool": "mcp:canvas.canvas_list_courses", + "resource_type": "mcp", + 
"execution_time_ms": 1.2, + }, + } + + assert format_tool_result(response) == "ok" + + +def test_format_tool_result_handles_mixed_mcp_content_without_crashing(): + response = { + "code": 0, + "message": "success", + "data": { + "content": [ + {"type": "text", "text": "first"}, + {"type": "image", "mimeType": "image/png"}, + {"type": "text", "text": "second"}, + ] + }, + "meta": { + "tool": "mcp:canvas.canvas_list_courses", + "resource_type": "mcp", + "execution_time_ms": 1.2, + }, + } + + assert format_tool_result(response) == "first\n[image content]\nsecond" + + +def test_format_tool_result_falls_back_to_structured_content_for_successful_mcp_response(): + response = { + "code": 0, + "message": "success", + "data": { + "content": [], + "structuredContent": { + "name": "Canvas", + "status": "ok", + }, + }, + "meta": { + "tool": "mcp:canvas.canvas_health_check", + "resource_type": "mcp", + "execution_time_ms": 1.2, + }, + } + + assert format_tool_result(response) == '{\n "name": "Canvas",\n "status": "ok"\n}' + + +def test_format_tool_result_prefers_structured_content_when_successful_mcp_content_has_no_text(): + response = { + "code": 0, + "message": "success", + "data": { + "content": [ + {"type": "image", "mimeType": "image/png"}, + ], + "structuredContent": { + "name": "Canvas", + "status": "ok", + }, + }, + "meta": { + "tool": "mcp:canvas.canvas_health_check", + "resource_type": "mcp", + "execution_time_ms": 1.2, + }, + } + + assert format_tool_result(response) == '{\n "name": "Canvas",\n "status": "ok"\n}' + + +def test_format_tool_result_preserves_error_behavior_for_failed_mcp_response(): + response = { + "code": 1, + "message": "mcp failed", + "data": { + "content": [ + {"type": "text", "text": "ignored"}, + ] + }, + "meta": { + "tool": "mcp:canvas.canvas_list_courses", + "resource_type": "mcp", + }, + } + + assert format_tool_result(response) == "[Error] mcp failed" diff --git a/sandbox/tests/test_sandbox_config_loading.py 
b/sandbox/tests/test_sandbox_config_loading.py index 16773a5..358b0c7 100644 --- a/sandbox/tests/test_sandbox_config_loading.py +++ b/sandbox/tests/test_sandbox_config_loading.py @@ -27,3 +27,51 @@ def test_load_server_config_expands_env_default_placeholders(tmp_path, monkeypat loaded["resources"]["mcp"]["config"]["workspace_root"] == "/tmp/agentflow_mcp" ) + + +def test_load_server_config_keeps_required_mcp_servers_path_placeholder_when_env_missing( + tmp_path, monkeypatch +): + monkeypatch.delenv("TOOLATHLON_GYM_ROOT", raising=False) + + config_path = tmp_path / "mcp_config.json" + raw_config = { + "resources": { + "mcp": { + "enabled": True, + "config": { + "mcp_servers_path": "${TOOLATHLON_GYM_ROOT}/local_servers" + }, + } + } + } + config_path.write_text(json.dumps(raw_config), encoding="utf-8") + + sandbox = Sandbox(config=SandboxConfig(server_config_path=str(config_path))) + loaded = sandbox._load_server_config() + + assert ( + loaded["resources"]["mcp"]["config"]["mcp_servers_path"] + == "${TOOLATHLON_GYM_ROOT}/local_servers" + ) + + +def test_load_server_config_keeps_workspace_root_for_code_backend(tmp_path): + config_path = tmp_path / "code_config.json" + raw_config = { + "resources": { + "code": { + "enabled": True, + "config": { + "workspace_root": "/tmp/agentflow_code" + }, + } + } + } + config_path.write_text(json.dumps(raw_config), encoding="utf-8") + + sandbox = Sandbox(config=SandboxConfig(server_config_path=str(config_path))) + + loaded = sandbox._load_server_config() + + assert loaded["resources"]["code"]["config"]["workspace_root"] == "/tmp/agentflow_code" diff --git a/sandbox/tool_schemas/__init__.py b/sandbox/tool_schemas/__init__.py index 9ba5139..36a1d16 100644 --- a/sandbox/tool_schemas/__init__.py +++ b/sandbox/tool_schemas/__init__.py @@ -13,6 +13,7 @@ from .doc_tools import get_doc_tool_schemas from .ds_tools import get_ds_tool_schemas from .sql_tools import get_sql_tool_schemas +from .code_tools import get_code_tool_schemas from .mcp 
import get_mcp_tool_schemas @@ -57,6 +58,7 @@ def get_tool_schemas(allowed_tools: Optional[List[str]] = None) -> List[Dict[str + get_doc_tool_schemas() + get_ds_tool_schemas() + get_sql_tool_schemas() + + get_code_tool_schemas() ) # MCP manifest (438 tools) is expensive to load into every prompt. @@ -118,7 +120,7 @@ def get_tools_by_resource(resource_type: str) -> List[Dict[str, Any]]: Get tools for a specific resource type. Args: - resource_type: Resource type like "vm", "rag", "web", "mcp" + resource_type: Resource type like "vm", "rag", "web", "mcp", "code" Returns: List of tool schemas for that resource @@ -144,5 +146,6 @@ def get_tools_by_resource(resource_type: str) -> List[Dict[str, Any]]: "get_doc_tool_schemas", "get_ds_tool_schemas", "get_sql_tool_schemas", + "get_code_tool_schemas", "get_mcp_tool_schemas", ] diff --git a/sandbox/tool_schemas/code_tools.py b/sandbox/tool_schemas/code_tools.py new file mode 100644 index 0000000..f281b14 --- /dev/null +++ b/sandbox/tool_schemas/code_tools.py @@ -0,0 +1,139 @@ +""" +Code Tool Schemas + +This module defines tool schemas for code workspace operations. 
+""" + +from typing import List, Dict, Any + + +def get_code_tool_schemas() -> List[Dict[str, Any]]: + """Get all code tool schemas.""" + return [ + { + "name": "code-read", + "description": "Read a text file from the current code workspace and return contents with line numbers.", + "parameters": [ + { + "name": "file_path", + "type": "string", + "description": "Path to the file to read.", + "required": True, + }, + { + "name": "offset", + "type": "integer", + "description": "Optional start line number for partial reads (1-indexed).", + "required": False, + }, + { + "name": "limit", + "type": "integer", + "description": "Optional maximum number of lines to return.", + "required": False, + }, + ], + }, + { + "name": "code-glob", + "description": "Find files in the coding workspace using a glob pattern.", + "parameters": [ + { + "name": "pattern", + "type": "string", + "description": "Glob pattern to match files, such as '**/*.py'.", + "required": True, + }, + { + "name": "path", + "type": "string", + "description": "Optional base directory to search from.", + "required": False, + }, + ], + }, + { + "name": "code-grep", + "description": "Search file contents in the coding workspace with a regex pattern.", + "parameters": [ + { + "name": "pattern", + "type": "string", + "description": "Regex pattern to search for.", + "required": True, + }, + { + "name": "path", + "type": "string", + "description": "Optional directory path to scope the search.", + "required": False, + }, + { + "name": "glob", + "type": "string", + "description": "Optional glob filter for file selection, such as '*.ts'.", + "required": False, + }, + ], + }, + { + "name": "code-bash", + "description": "Run a shell command in the current coding workspace.", + "parameters": [ + { + "name": "command", + "type": "string", + "description": "Shell command to execute.", + "required": True, + }, + ], + }, + { + "name": "code-edit", + "description": "Edit a file in the coding workspace by exact string 
replacement, expecting a unique match unless replace_all=true.", + "parameters": [ + { + "name": "file_path", + "type": "string", + "description": "Path to the file to edit.", + "required": True, + }, + { + "name": "old_string", + "type": "string", + "description": "Exact text to find in the file.", + "required": True, + }, + { + "name": "new_string", + "type": "string", + "description": "Text used to replace the matched string.", + "required": True, + }, + { + "name": "replace_all", + "type": "boolean", + "description": "When true, replace all exact matches; otherwise exactly one unique match is expected.", + "required": False, + }, + ], + }, + { + "name": "code-write", + "description": "Write full file content to a file in the coding workspace and create parent directories if needed.", + "parameters": [ + { + "name": "file_path", + "type": "string", + "description": "Path to the file to write.", + "required": True, + }, + { + "name": "content", + "type": "string", + "description": "Complete file content to write.", + "required": True, + }, + ], + }, + ] diff --git a/seeds/code/seed/demo_repo/README.md b/seeds/code/seed/demo_repo/README.md new file mode 100644 index 0000000..6bdffd1 --- /dev/null +++ b/seeds/code/seed/demo_repo/README.md @@ -0,0 +1,8 @@ +# Coding Example Demo Repo + +This tiny repository is bundled for AgentFlow's CodingAgent example. + +- `app.py` builds a greeting string. +- `config/app_config.json` stores the expected name and suffix. +- `lib/helpers.py` contains the formatting helper. +- `tests/smoke_test.py` is the verification command used by the rollout example. 
diff --git a/seeds/code/seed/demo_repo/app.py b/seeds/code/seed/demo_repo/app.py new file mode 100644 index 0000000..a172cac --- /dev/null +++ b/seeds/code/seed/demo_repo/app.py @@ -0,0 +1,20 @@ +import json +from pathlib import Path + +from lib.helpers import render_greeting + + +CONFIG_PATH = Path(__file__).parent / "config" / "app_config.json" + + +def load_config() -> dict: + return json.loads(CONFIG_PATH.read_text(encoding="utf-8")) + + +def build_message() -> str: + config = load_config() + return render_greeting(config["default_name"], "?") + + +if __name__ == "__main__": + print(build_message()) diff --git a/seeds/code/seed/demo_repo/config/app_config.json b/seeds/code/seed/demo_repo/config/app_config.json new file mode 100644 index 0000000..31f4249 --- /dev/null +++ b/seeds/code/seed/demo_repo/config/app_config.json @@ -0,0 +1,4 @@ +{ + "default_name": "AgentFlow", + "suffix": "!" +} diff --git a/seeds/code/seed/demo_repo/lib/helpers.py b/seeds/code/seed/demo_repo/lib/helpers.py new file mode 100644 index 0000000..099ce38 --- /dev/null +++ b/seeds/code/seed/demo_repo/lib/helpers.py @@ -0,0 +1,2 @@ +def render_greeting(name: str, suffix: str) -> str: + return f"Hello, {name}{suffix}" diff --git a/seeds/code/seed/demo_repo/tests/smoke_test.py b/seeds/code/seed/demo_repo/tests/smoke_test.py new file mode 100644 index 0000000..9f9c9eb --- /dev/null +++ b/seeds/code/seed/demo_repo/tests/smoke_test.py @@ -0,0 +1,16 @@ +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parents[1])) + +from app import build_message + + +def main() -> None: + message = build_message() + assert message == "Hello, AgentFlow!", message + print("SMOKE_OK") + + +if __name__ == "__main__": + main() diff --git a/seeds/code/seeds.jsonl b/seeds/code/seeds.jsonl new file mode 100644 index 0000000..ba6a9b3 --- /dev/null +++ b/seeds/code/seeds.jsonl @@ -0,0 +1,2 @@ +{"content": "Inspect the demo repository and trace how the greeting is assembled from 
config and helper code.", "kwargs": {}} +{"content": "Look for a small repository bug that can be fixed with a minimal edit and validated with the committed smoke test.", "kwargs": {}} diff --git a/seeds/mcp/canvas_seeds.jsonl b/seeds/mcp/canvas_seeds.jsonl new file mode 100644 index 0000000..bf984a3 --- /dev/null +++ b/seeds/mcp/canvas_seeds.jsonl @@ -0,0 +1,2 @@ +{"content": "Use the available Canvas MCP tools to inspect courses, assignments, and enrollment information.", "kwargs": {}} +{"content": "Find a small Canvas reporting task that can be answered from the mock data and save intermediate notes with filesystem tools if helpful.", "kwargs": {}} diff --git a/seeds/mcp/snowflake_seeds.jsonl b/seeds/mcp/snowflake_seeds.jsonl new file mode 100644 index 0000000..17076be --- /dev/null +++ b/seeds/mcp/snowflake_seeds.jsonl @@ -0,0 +1,2 @@ +{"content": "Use the available Snowflake MCP tools to inspect schemas, tables, and small analytical queries in the mock warehouse.", "kwargs": {}} +{"content": "Find one compact warehouse reporting question that can be answered from the available Snowflake tools.", "kwargs": {}} diff --git a/seeds/mcp/train_seeds.jsonl b/seeds/mcp/train_seeds.jsonl new file mode 100644 index 0000000..e02d780 --- /dev/null +++ b/seeds/mcp/train_seeds.jsonl @@ -0,0 +1,2 @@ +{"content": "Use the rail_12306 MCP tools to inspect mock train, station, and route information.", "kwargs": {}} +{"content": "Find one small travel-planning or route lookup question that can be answered from the available railway tools.", "kwargs": {}} diff --git a/seeds/mcp/woocommerce_seeds.jsonl b/seeds/mcp/woocommerce_seeds.jsonl new file mode 100644 index 0000000..3fbf365 --- /dev/null +++ b/seeds/mcp/woocommerce_seeds.jsonl @@ -0,0 +1,2 @@ +{"content": "Use the WooCommerce MCP tools to inspect customers, products, and orders in the mock store.", "kwargs": {}} +{"content": "Find one small sales or operations question that can be answered from the WooCommerce mock data.", 
"kwargs": {}} diff --git a/seeds/mcp/yahoo_finance_seeds.jsonl b/seeds/mcp/yahoo_finance_seeds.jsonl new file mode 100644 index 0000000..77f8e90 --- /dev/null +++ b/seeds/mcp/yahoo_finance_seeds.jsonl @@ -0,0 +1,2 @@ +{"content": "Use the Yahoo Finance MCP tools to inspect the mock ticker and market data available locally.", "kwargs": {}} +{"content": "Find one small finance lookup or comparison question that can be answered directly from the available tools.", "kwargs": {}} diff --git a/seeds/mcp/youtube_seeds.jsonl b/seeds/mcp/youtube_seeds.jsonl new file mode 100644 index 0000000..5575703 --- /dev/null +++ b/seeds/mcp/youtube_seeds.jsonl @@ -0,0 +1,2 @@ +{"content": "Use the YouTube and YouTube Transcript MCP tools to inspect mock video metadata and transcript data.", "kwargs": {}} +{"content": "Find one small content-discovery or transcript lookup question that can be answered from the local mock data.", "kwargs": {}} diff --git a/synthesis/core/config.py b/synthesis/core/config.py index b8527d9..eaa491b 100644 --- a/synthesis/core/config.py +++ b/synthesis/core/config.py @@ -7,6 +7,8 @@ from typing import Dict, List, Any, Optional from dataclasses import dataclass, field, fields +from sandbox.server.config_loader import expand_env_vars + @dataclass class SynthesisConfig: @@ -67,6 +69,7 @@ def from_dict(cls, config_dict: Dict[str, Any]) -> 'SynthesisConfig': valid_fields = {f.name for f in fields(cls)} filtered = {k: v for k, v in config_dict.items() if k in valid_fields} + filtered = expand_env_vars(filtered) # Normalize text fields (allow list[str] for easier editing) def _normalize_text_field(v: Any) -> str: diff --git a/synthesis/tests/conftest.py b/synthesis/tests/conftest.py new file mode 100644 index 0000000..f7d4e68 --- /dev/null +++ b/synthesis/tests/conftest.py @@ -0,0 +1,8 @@ +import sys +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parents[2] + +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) diff --git 
a/synthesis/tests/test_code_example_synthesis_config.py b/synthesis/tests/test_code_example_synthesis_config.py new file mode 100644 index 0000000..dddc9e6 --- /dev/null +++ b/synthesis/tests/test_code_example_synthesis_config.py @@ -0,0 +1,79 @@ +import json +from pathlib import Path + +from synthesis.core.config import SynthesisConfig + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def test_code_synthesis_config_contract_expands_repo_root_when_set(monkeypatch): + config_path = REPO_ROOT / "configs" / "synthesis" / "code_config.json" + raw = json.loads(config_path.read_text(encoding="utf-8")) + monkeypatch.setenv("AGENTFLOW_REPO_ROOT", str(REPO_ROOT)) + config = SynthesisConfig.from_json(str(config_path)) + + assert config.sandbox_config_path == "configs/sandbox-server/code_config.json" + assert config.sandbox_auto_start is False + assert config.resource_types == ["code"] + assert config.resource_init_configs == { + "code": { + "content": { + "source_dir": f"{REPO_ROOT}/seeds/code/seed/demo_repo" + } + } + } + assert config.available_tools == ["code-*"] + assert config.seeds_file == "seeds/code/seeds.jsonl" + assert raw["seed_description"] == "Coding demo repository prompts" + assert len(config.qa_examples) >= 2 + assert config.sampling_tips.strip() + assert config.synthesis_tips.strip() + + +def test_code_synthesis_config_preserves_placeholder_when_repo_root_unset(monkeypatch): + config_path = REPO_ROOT / "configs" / "synthesis" / "code_config.json" + monkeypatch.delenv("AGENTFLOW_REPO_ROOT", raising=False) + + config = SynthesisConfig.from_json(str(config_path)) + + assert config.resource_init_configs == { + "code": { + "content": { + "source_dir": "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo" + } + } + } + + +def test_synthesis_config_from_dict_expands_nested_env_values(monkeypatch): + monkeypatch.setenv("CODE_ROOT", "/tmp/demo") + monkeypatch.delenv("UNSET_VALUE", raising=False) + + config = SynthesisConfig.from_dict( + { + 
"resource_init_configs": { + "code": { + "content": { + "source_dir": "${CODE_ROOT}/repo", + "fallback_dir": "${UNSET_VALUE:-/tmp/fallback}", + "preserved_dir": "${UNSET_VALUE}/repo", + "artifacts": [ + "${CODE_ROOT}/one", + "${UNSET_VALUE:-/tmp/two}", + "${UNSET_VALUE}/three", + ], + } + } + } + } + ) + + content = config.resource_init_configs["code"]["content"] + assert content["source_dir"] == "/tmp/demo/repo" + assert content["fallback_dir"] == "/tmp/fallback" + assert content["preserved_dir"] == "${UNSET_VALUE}/repo" + assert content["artifacts"] == [ + "/tmp/demo/one", + "/tmp/two", + "${UNSET_VALUE}/three", + ] diff --git a/synthesis/tests/test_mcp_example_synthesis_configs.py b/synthesis/tests/test_mcp_example_synthesis_configs.py new file mode 100644 index 0000000..806edeb --- /dev/null +++ b/synthesis/tests/test_mcp_example_synthesis_configs.py @@ -0,0 +1,124 @@ +import json +from pathlib import Path + +import pytest + +from synthesis.core.config import SynthesisConfig + +REPO_ROOT = Path(__file__).resolve().parents[2] +OPENAI_API_KEY = "secret" +OPENAI_API_URL = "https://example.test/v1" +EXPECTED = { + "canvas": { + "tools": ["mcp:canvas.*", "mcp:filesystem.*"], + "seeds_file": "seeds/mcp/canvas_seeds.jsonl", + "seed_description": "Canvas MCP prompts", + }, + "snowflake": { + "tools": ["mcp:snowflake.*", "mcp:filesystem.*"], + "seeds_file": "seeds/mcp/snowflake_seeds.jsonl", + "seed_description": "Snowflake MCP prompts", + }, + "woocommerce": { + "tools": ["mcp:woocommerce.*", "mcp:filesystem.*"], + "seeds_file": "seeds/mcp/woocommerce_seeds.jsonl", + "seed_description": "WooCommerce MCP prompts", + }, + "yahoo_finance": { + "tools": ["mcp:yahoo-finance.*", "mcp:filesystem.*"], + "seeds_file": "seeds/mcp/yahoo_finance_seeds.jsonl", + "seed_description": "Yahoo Finance MCP prompts", + }, + "youtube": { + "tools": [ + "mcp:youtube.*", + "mcp:youtube-transcript.*", + "mcp:filesystem.*", + ], + "seeds_file": "seeds/mcp/youtube_seeds.jsonl", + 
"seed_description": "YouTube MCP prompts", + }, + "train": { + "tools": ["mcp:rail_12306.*", "mcp:filesystem.*"], + "seeds_file": "seeds/mcp/train_seeds.jsonl", + "seed_description": "Train MCP prompts", + }, +} + + +def _set_openai_env(monkeypatch: pytest.MonkeyPatch, enabled: bool) -> None: + if enabled: + monkeypatch.setenv("OPENAI_API_KEY", OPENAI_API_KEY) + monkeypatch.setenv("OPENAI_API_URL", OPENAI_API_URL) + return + + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + monkeypatch.delenv("OPENAI_API_URL", raising=False) + + +def _expected_openai_value(enabled: bool, env_value: str, placeholder: str) -> str: + if enabled: + return env_value + return placeholder + + +@pytest.mark.parametrize("env_enabled", [False, True], ids=["env-unset", "env-set"]) +@pytest.mark.parametrize("domain", sorted(EXPECTED)) +def test_mcp_synthesis_config_contract(domain, env_enabled, monkeypatch): + expected = EXPECTED[domain] + _set_openai_env(monkeypatch, env_enabled) + config_path = REPO_ROOT / "configs" / "synthesis" / f"mcp_{domain}_config.json" + raw = json.loads(config_path.read_text(encoding="utf-8")) + config = SynthesisConfig.from_json(str(config_path)) + raw_init = raw.get("resource_init_configs", {}) + + assert config.sandbox_config_path == "configs/sandbox-server/mcp_config.json" + assert config.sandbox_auto_start is False + assert config.resource_types == ["mcp"] + assert raw_init in ({}, {"mcp": {"content": {}}}) + assert config.resource_init_configs in ({}, {"mcp": {"content": {}}}) + assert config.model_name == "openai/gpt-oss-120b" + assert config.api_key == _expected_openai_value( + env_enabled, OPENAI_API_KEY, "${OPENAI_API_KEY}" + ) + assert config.base_url == _expected_openai_value( + env_enabled, OPENAI_API_URL, "${OPENAI_API_URL}" + ) + assert config.max_depth == 12 + assert config.branching_factor == 2 + assert config.depth_threshold == 2 + assert config.min_depth == 2 + assert config.max_selected_traj == 1 + assert config.path_similarity_threshold == 
0.7 + assert config.available_tools == expected["tools"] + assert config.seeds_file == expected["seeds_file"] + assert config.output_dir == f"results/mcp_{domain}" + assert raw["seed_description"] == expected["seed_description"] + assert len(config.qa_examples) >= 2 + assert config.sampling_tips.strip() + assert config.synthesis_tips.strip() + + +@pytest.mark.parametrize("env_enabled", [False, True], ids=["env-unset", "env-set"]) +def test_mcp_synthesis_config_ignores_unknown_env_placeholders( + caplog, monkeypatch, env_enabled +): + _set_openai_env(monkeypatch, env_enabled) + monkeypatch.delenv("IGNORED_SYNTHESIS_VAR", raising=False) + + with caplog.at_level("WARNING", logger="ConfigLoader"): + config = SynthesisConfig.from_dict( + { + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "unknown_field": "${IGNORED_SYNTHESIS_VAR}", + } + ) + + assert config.api_key == _expected_openai_value( + env_enabled, OPENAI_API_KEY, "${OPENAI_API_KEY}" + ) + assert config.base_url == _expected_openai_value( + env_enabled, OPENAI_API_URL, "${OPENAI_API_URL}" + ) + assert "IGNORED_SYNTHESIS_VAR" not in caplog.text