From 702999aee51bde9b7e15c03bfa5e11bc54ac7fc1 Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Wed, 15 Apr 2026 23:59:25 +0800 Subject: [PATCH 01/39] docs: add code backend single-repo vendoring design --- ...de-backend-single-repo-vendoring-design.md | 356 ++++++++++++++++++ 1 file changed, 356 insertions(+) create mode 100644 docs/superpowers/specs/2026-04-15-code-backend-single-repo-vendoring-design.md diff --git a/docs/superpowers/specs/2026-04-15-code-backend-single-repo-vendoring-design.md b/docs/superpowers/specs/2026-04-15-code-backend-single-repo-vendoring-design.md new file mode 100644 index 0000000..e142a91 --- /dev/null +++ b/docs/superpowers/specs/2026-04-15-code-backend-single-repo-vendoring-design.md @@ -0,0 +1,356 @@ +## Code Backend Single-Repository Vendoring Design + +Date: 2026-04-15 +Status: Approved for planning +Supersedes: `docs/superpowers/specs/2026-04-15-code-backend-design.md` + +## Summary + +AgentFlow's current `code` backend depends on an external `claude-code-py` source tree through `claude_code_root`. That makes the feature non-portable, couples runtime behavior to a closed or separately managed repository, and introduces an avoidable configuration requirement. + +This design replaces that approach with an internal vendored compatibility layer inside AgentFlow. AgentFlow will vendor the minimal upstream code-tool subset it actually needs, and the `code` backend will load those vendored classes directly from the AgentFlow repository. + +The backend remains a session-scoped resource backend with per-worker workspaces and AgentFlow-owned path-boundary enforcement. The six exposed coding tools remain: + +- `code:read` +- `code:glob` +- `code:grep` +- `code:bash` +- `code:edit` +- `code:write` + +Unlike the current implementation, all six tools, including `bash`, will execute through vendored upstream-style tool classes. AgentFlow will stop treating `bash` as a separately wrapped special case. 
+ +## Problem Statement + +The current design has three architectural problems: + +- It requires `claude_code_root` in sandbox config, which breaks single-repository portability. +- It relies on dynamic source loading from another tree, which is brittle and hard to reason about. +- It contains an internal inconsistency: `code:bash` is nominally part of the reused six-tool set, but in practice it bypasses the loaded upstream `BashTool` and runs through an AgentFlow-specific subprocess wrapper. + +These are not desirable "advanced configuration" choices. They are design mistakes for a feature that should ship as a self-contained AgentFlow capability. + +## Goals + +- Make the `code` backend runnable from the AgentFlow repository alone. +- Keep the six coding tools behaviorally aligned with the upstream lightweight tool implementations. +- Remove all dependency on external `claude-code-py` runtime paths and dynamic import plumbing. +- Keep `code` as a session-scoped backend with isolated worker workspaces. +- Preserve AgentFlow's existing rollout and sandbox abstractions. +- Add a clear test strategy that covers unit behavior, backend integration, and a real rollout smoke path. + +## Non-Goals + +- Do not vendor the full `claude-code-py` runtime. +- Do not vendor query loops, skills, tracing, memory loading, or sub-agent functionality. +- Do not add hard shell sandboxing. +- Do not redesign rollout configuration, sandbox protocols, or tool schema conventions. +- Do not keep backward compatibility for `claude_code_root`, `allow_bash`, or `bash_timeout_seconds`. + +## Core Decisions + +### 1. 
Vendor a minimal compatibility layer + +AgentFlow will vendor only the minimal code-tool slice needed for the `code` backend: + +- a minimal `Tool` base class +- `ReadTool` +- `GlobTool` +- `GrepTool` +- `BashTool` +- `EditTool` +- `WriteTool` + +The vendored code should be a small, clearly bounded package inside AgentFlow, with only the minimum import adjustments required to make it internal and self-contained. + +### 2. Remove the external-root model completely + +The new design deletes the idea that AgentFlow should discover coding tools from another source tree at runtime. + +Delete these concepts from implementation, config, tests, and docs: + +- `claude_code_root` +- dynamic import of upstream files +- root-local support-module loading +- compatibility tests that verify loading from an external tree + +This is an intentional removal, not a soft deprecation. + +### 3. Treat `bash` as a normal member of the six-tool set + +The vendored `BashTool` will be used the same way as the other five vendored tools: through the common tool-loading path and `tool.call(params, ctx)` execution model. + +Delete these concepts from implementation, config, tests, and docs: + +- `allow_bash` +- `bash_timeout_seconds` +- AgentFlow-specific `_run_bash_command()` behavior +- config-availability messaging for `code-bash` + +The `code` backend will expose all six tools all the time. + +### 4. Keep AgentFlow-owned environment boundaries + +Vendoring the tool classes does not move workspace safety into the vendored code. AgentFlow still owns: + +- per-worker workspace creation +- `source_dir` copying +- worker/session identity checks +- file-path normalization relative to workspace +- path-escape rejection for file-oriented tools + +This separation keeps the vendored code small and keeps environment policy at the backend boundary where AgentFlow already owns session state. 
## Architecture

### Vendored package layout

Add a dedicated internal package for the vendored code tools, for example:

- `sandbox/server/backends/resources/code_vendor/__init__.py`
- `sandbox/server/backends/resources/code_vendor/tool.py`
- `sandbox/server/backends/resources/code_vendor/file_tools.py`
- `sandbox/server/backends/resources/code_vendor/edit_tools.py`

The package name should make it obvious that this is a bounded internal compatibility layer, not a general-purpose reimplementation of `claude-code-py`.

### Backend responsibilities

`CodeBackend` remains responsible for:

- registering the six `code:*` bridge tools
- creating and cleaning per-worker workspaces
- copying optional `source_dir` contents into the session workspace
- validating the session workspace against the worker id
- enforcing file-path boundaries for file-oriented tools
- instantiating and caching the six vendored tool classes

`CodeBackend` no longer needs:

- `_get_claude_code_root()`
- `_validate_claude_code_root_prerequisites()`
- `_load_root_support_modules()`
- dynamic module alias installation
- `_run_bash_command()`
- any `bash`-only dispatch branch

### Runtime flow

The runtime flow becomes:

1. `initialize()` validates `worker_id`, prepares a staged workspace, and optionally copies `source_dir`.
2. The backend ensures vendored tool instances are loaded from the repository itself.
3. The staged workspace becomes the active workspace.
4. Bridge dispatch resolves the worker session workspace.
5. For file-oriented tools, AgentFlow normalizes and bounds path-like parameters to the workspace.
6. The backend creates a minimal context adapter with `cwd=str(workspace)`.
7. All six tools execute through the same vendored `tool.call(...)` path.
8. AgentFlow wraps results into standard backend success/error responses.
+ +### Minimal context adapter + +The vendored six-tool subset only needs a tiny runtime context: + +```python +SimpleNamespace(cwd=str(workspace)) +``` + +No full agent runtime model is needed. + +## Tool Behavior Contract + +The tool surface remains unchanged: + +- prompt-visible schemas stay `code-read`, `code-glob`, `code-grep`, `code-bash`, `code-edit`, `code-write` +- runtime names stay `code:read`, `code:glob`, `code:grep`, `code:bash`, `code:edit`, `code:write` +- parameter names remain aligned with the vendored upstream tool classes + +Behaviorally, the backend should preserve: + +- line-numbered `read` output +- recursive globbing behavior +- recursive grep behavior with optional file filter +- exact-match edit semantics with uniqueness checks +- full-file overwrite semantics for `write` +- upstream-style shell execution behavior for `bash` + +AgentFlow should not add new `bash`-specific runtime policy once vendoring is complete. + +## Configuration Design + +### Backend config + +After the redesign, `code` backend config should keep only what is still meaningfully owned by AgentFlow: + +- `workspace_root` + +The config example should therefore look like: + +```json +{ + "server": { + "url": "http://127.0.0.1:18890", + "port": 18890, + "session_ttl": 300 + }, + "resources": { + "code": { + "enabled": true, + "description": "Lightweight coding backend with vendored upstream-style tools", + "backend_class": "sandbox.server.backends.resources.code.CodeBackend", + "config": { + "workspace_root": "/tmp/agentflow_code" + } + } + }, + "warmup": { + "enabled": false, + "resources": [] + } +} +``` + +### Session init config + +Session init remains intentionally small: + +- `source_dir`: optional directory copied into the session workspace + +Rollout-facing use stays: + +- `resource_types=["code"]` +- `available_tools=["code-*"]` +- `resource_init_configs["code"]["content"]["source_dir"]` + +## Testing Strategy + +The testing changes must be explicit. 
This work is not only about adding tests; it also requires deleting tests and rewriting tests that lock in the old design mistake. + +### Delete old tests + +Delete tests that exist only to validate the old external-root or AgentFlow-specific bash wrapper model, including categories such as: + +- external `claude_code_root` requirement +- fake external upstream roots +- dynamic loading from another repository +- root-local support-module loading +- isolated-per-root loader behavior +- `allow_bash` gating +- `bash_timeout_seconds` +- AgentFlow-specific `bash` input validation that no longer exists in the vendored-upstream model +- config-template checks that still mention deleted fields +- env-var expansion tests whose only purpose was `CLAUDE_CODE_ROOT` + +### Modify existing tests + +Keep and adapt the tests that remain valid for the new architecture: + +- tool registration tests +- workspace initialization and `source_dir` copy tests +- workspace recreation tests +- cleanup safety tests +- `worker_id` validation tests +- session workspace identity and boundary tests +- file-path normalization and escape rejection tests +- successful bridge dispatch and standard response-shape tests +- tool schema presence/filtering/parameter-contract tests + +Schema tests must update descriptions so `code-bash` no longer claims backend-config-dependent availability. + +### Add new tests + +Add new focused tests for the vendored model: + +- vendored tool loading from the internal package +- all six tools executing through the same tool-call path +- vendored `BashTool` behavior contract +- vendored `EditTool`/`WriteTool` behavior contract where existing bridge tests do not already cover it + +### Add a rollout-facing smoke test + +Add one end-to-end rollout smoke that exercises the real rollout-to-sandbox-to-code-backend path. 
+ +The smoke should: + +- live under `rollout/tests/` +- be marked with a dedicated pytest marker, for example `code_real` +- not run in default pytest execution +- require explicit manual invocation +- use a real LLM response path and a real sandbox/code backend path + +This smoke should not mock sandbox components. It should really: + +- start sandbox +- create a `code` session +- copy a tiny fixture repo into the workspace +- expose `code-*` tools through rollout +- execute at least one real `code:*` tool call + +Recommended smoke structure: + +1. Create a temporary fixture repo with a uniquely identifiable file, for example `nested/TOKEN.txt`. +2. Write a hard-to-guess token into that file. +3. Create a one-task benchmark asking the agent to use code tools to read the file and return only the exact token. +4. Run `RolloutPipeline` with: + - `available_tools=["code-*"]` + - `resource_types=["code"]` + - `resource_init_configs["code"]["content"]["source_dir"]=fixture_repo` + - `sandbox_config_path="configs/sandbox-server/code_config.json"` + - `sandbox_auto_start=True` + - `number_of_tasks=1` + - `evaluate_results=False` + - `save_trajectories=True` +5. Assert that: + - the task succeeds + - the trajectory contains at least one `code:*` tool call + - the final answer equals the token + - the token appears in the observed tool-result chain + +Credential provisioning for that opt-in real smoke remains an execution-time concern and must not be hardcoded into repository defaults. + +## Documentation Changes + +Update all user-facing and internal docs that still describe the deleted design: + +- `configs/sandbox-server/code_config.json` +- `sandbox/tests/test_sandbox_config_loading.py` +- `sandbox/tests/test_code_tool_schemas.py` +- any code-backend README/tutorial snippets +- the prior `2026-04-15-code-backend-design.md` should be treated as superseded + +The resulting documentation should consistently present the `code` backend as a native AgentFlow capability. 
+ +## Risks + +- Vendored code can drift from future upstream changes. + Mitigation: treat the vendored subset as an intentionally frozen internal compatibility layer and cover it with explicit behavior tests. + +- Real rollout smoke tests can be flaky because they depend on live model behavior and external connectivity. + Mitigation: keep them opt-in and strongly constrain the task prompt and fixture. + +- `bash` remains powerful because it executes shell commands relative to the workspace but without OS-level isolation. + Mitigation: document this clearly as an inherent property of the `code` backend rather than disguising it behind partial configuration toggles. + +## Recommended Implementation Order + +1. Vendor the minimal upstream six-tool compatibility layer into AgentFlow. +2. Simplify `CodeBackend` to load vendored tools directly and remove all external-root logic. +3. Remove `bash` special handling so all six tools share one execution path. +4. Simplify `code` backend config to `workspace_root` only. +5. Update schema descriptions and sandbox config examples. +6. Delete old tests tied to the removed design. +7. Adapt retained backend and schema tests. +8. Add vendored-tool behavior coverage. +9. Add the opt-in real rollout smoke. + +## Decision + +AgentFlow should stop treating the `code` backend as a thin adapter over an external source tree and instead ship a self-contained, vendored upstream-style compatibility layer inside the repository. 
+ +This restores the intended product boundary: + +- AgentFlow owns the coding environment as a native feature +- all six code tools are internally available +- rollout and sandbox integration stay unchanged +- the repository becomes portable again From a49ce056ad483b45470093a930db884c61a049ab Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Thu, 16 Apr 2026 09:07:39 +0800 Subject: [PATCH 02/39] docs: align code real smoke plan with mcp --- ...4-16-code-backend-single-repo-vendoring.md | 727 ++++++++++++++++++ ...de-backend-single-repo-vendoring-design.md | 6 +- 2 files changed, 730 insertions(+), 3 deletions(-) create mode 100644 docs/superpowers/plans/2026-04-16-code-backend-single-repo-vendoring.md diff --git a/docs/superpowers/plans/2026-04-16-code-backend-single-repo-vendoring.md b/docs/superpowers/plans/2026-04-16-code-backend-single-repo-vendoring.md new file mode 100644 index 0000000..e282aed --- /dev/null +++ b/docs/superpowers/plans/2026-04-16-code-backend-single-repo-vendoring.md @@ -0,0 +1,727 @@ +# Code Backend Single-Repository Vendoring Implementation Plan + +> **For agentic workers:** REQUIRED: Use superpowers:subagent-driven-development (if subagents available) or superpowers:executing-plans to implement this plan. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Make AgentFlow's `code` backend self-contained by vendoring the six upstream-style code tools into this repository, removing `claude_code_root`/`allow_bash`/`bash_timeout_seconds`, and adding an opt-in real rollout smoke. + +**Architecture:** Keep `code` as a session-scoped sandbox backend with AgentFlow-owned workspace lifecycle and path-boundary enforcement. Replace all external-root loading with a small internal `code_vendor` package, route all six tools through the same vendored `tool.call(...)` path, and verify the result with updated sandbox tests plus an MCP-style env-gated real rollout smoke. 
+ +**Tech Stack:** Python 3.10, pytest, FastAPI sandbox server, pathlib/shutil, vendored upstream-style tool classes, RolloutPipeline, real LLM smoke via env-gated pytest collection + pytest CLI options + +--- + +**Known baseline:** In this worktree, `PYTHONPATH=. pytest -q sandbox/tests/test_code_backend.py sandbox/tests/test_code_tool_schemas.py sandbox/tests/test_sandbox_config_loading.py rollout/tests/test_config.py rollout/tests/test_integration.py` passes (`68 passed, 2 skipped`). `pip install -r requirements.txt` still hits the existing unrelated `pyxcursor` dependency-resolution issue from the VM stack; do not block this plan on that package. + +## File Map + +### New files + +- `sandbox/server/backends/resources/code_vendor/__init__.py` + Internal package export surface for the vendored six-tool compatibility layer. +- `sandbox/server/backends/resources/code_vendor/tool.py` + Minimal vendored `Tool` base class used by the six code tools. +- `sandbox/server/backends/resources/code_vendor/file_tools.py` + Vendored upstream-style `ReadTool`, `GlobTool`, `GrepTool`, and `BashTool`. +- `sandbox/server/backends/resources/code_vendor/edit_tools.py` + Vendored upstream-style `EditTool` and `WriteTool`. +- `sandbox/tests/test_code_vendor_tools.py` + Focused behavior-contract tests for the vendored tool package independent of `CodeBackend`. +- `rollout/tests/conftest.py` + Mirror the MCP real-smoke collection-gating pattern for `code` rollout tests and add pytest CLI options for real-smoke credentials. +- `rollout/tests/test_code_real_smoke.py` + Opt-in real rollout smoke that starts sandbox, uses the real LLM path, and proves at least one real `code:*` call happens against a temporary fixture repo. + +### Modified files + +- `sandbox/server/backends/resources/code.py` + Remove external-root logic and `bash` special casing; load vendored tools directly and execute all six through the same code path. 
+- `configs/sandbox-server/code_config.json` + Remove deleted config fields and present the `code` backend as a native AgentFlow capability with only `workspace_root`. +- `sandbox/tests/test_code_backend.py` + Delete obsolete external-root/bash-wrapper tests, keep valid workspace/boundary coverage, and rewrite tool-loading expectations around internal vendoring. +- `sandbox/tool_schemas/code_tools.py` + Update `code-bash` description to remove backend-config-dependent availability wording. +- `sandbox/tests/test_code_tool_schemas.py` + Update schema assertions to match the new `code-bash` description and keep the rest of the parameter contract coverage. +- `sandbox/tests/test_sandbox_config_loading.py` + Replace the `CLAUDE_CODE_ROOT` env-expansion test with a config-loading assertion for the simplified `code` backend template. + +### Intentionally unchanged files + +- `rollout/core/config.py` +- `rollout/core/runner.py` +- `rollout/pipeline.py` +- `sandbox/tool_schemas/__init__.py` +- `sandbox/server/backends/resources/__init__.py` + +The rollout and backend registration plumbing already supports the target design. Do not widen scope into rollout engine rewrites or unrelated backend refactors. 
+ +## Chunk 1: Vendor the Upstream-Style Tool Subset + +### Task 1: Add the internal `code_vendor` package and behavior-contract tests + +**Files:** +- Create: `sandbox/server/backends/resources/code_vendor/__init__.py` +- Create: `sandbox/server/backends/resources/code_vendor/tool.py` +- Create: `sandbox/server/backends/resources/code_vendor/file_tools.py` +- Create: `sandbox/server/backends/resources/code_vendor/edit_tools.py` +- Create: `sandbox/tests/test_code_vendor_tools.py` + +- [ ] **Step 1: Write the failing vendored-tool tests** + +Create `sandbox/tests/test_code_vendor_tools.py` with focused tests like: + +```python +import asyncio +from pathlib import Path +from types import SimpleNamespace + +from sandbox.server.backends.resources.code_vendor.file_tools import ( + BashTool, + GlobTool, + GrepTool, + ReadTool, +) +from sandbox.server.backends.resources.code_vendor.edit_tools import EditTool, WriteTool + + +def test_read_tool_returns_line_numbered_content(tmp_path): + target = tmp_path / "demo.py" + target.write_text("first\nsecond\n", encoding="utf-8") + ctx = SimpleNamespace(cwd=str(tmp_path)) + + result = asyncio.run(ReadTool().call({"file_path": str(target)}, ctx)) + + assert "1" in result + assert "first" in result + assert "second" in result + + +def test_edit_tool_requires_unique_match(tmp_path): + target = tmp_path / "demo.py" + target.write_text("x = 1\nx = 1\n", encoding="utf-8") + ctx = SimpleNamespace(cwd=str(tmp_path)) + + result = asyncio.run( + EditTool().call( + { + "file_path": str(target), + "old_string": "x = 1", + "new_string": "x = 2", + }, + ctx, + ) + ) + + assert result.startswith("Error:") + assert "appears" in result + + +def test_bash_tool_combines_stdout_and_stderr(tmp_path): + ctx = SimpleNamespace(cwd=str(tmp_path)) + + result = asyncio.run( + BashTool().call( + { + "command": "python -c \"import sys; print('out'); print('err', file=sys.stderr)\"" + }, + ctx, + ) + ) + + assert result == "out\n\n[stderr]:\nerr" +``` + +- [ ] 
**Step 2: Run the vendored-tool tests to verify they fail** + +Run: + +```bash +PYTHONPATH=. pytest -q sandbox/tests/test_code_vendor_tools.py +``` + +Expected: FAIL because the `code_vendor` package does not exist yet. + +- [ ] **Step 3: Add the vendored tool package** + +Create `sandbox/server/backends/resources/code_vendor/tool.py` with a minimal base: + +```python +from __future__ import annotations + +from abc import ABC, abstractmethod + + +class Tool(ABC): + name: str + description: str + + @property + @abstractmethod + def input_schema(self) -> dict: + ... + + @abstractmethod + async def call(self, args: dict, ctx) -> str: + ... + + def is_read_only(self, args: dict) -> bool: + return False +``` + +Create `sandbox/server/backends/resources/code_vendor/file_tools.py` with vendored upstream-style implementations: + +```python +from __future__ import annotations + +import subprocess +from pathlib import Path + +from .tool import Tool + + +class BashTool(Tool): + name = "Bash" + description = "Execute a shell command and return stdout/stderr." + + @property + def input_schema(self) -> dict: + return { + "type": "object", + "properties": {"command": {"type": "string", "description": "Shell command to run"}}, + "required": ["command"], + } + + async def call(self, args: dict, ctx) -> str: + result = subprocess.run( + args["command"], + shell=True, + capture_output=True, + text=True, + cwd=ctx.cwd, + ) + out = result.stdout + if result.stderr: + out += f"\n[stderr]:\n{result.stderr}" + return out.strip() or "(no output)" + + def is_read_only(self, args: dict) -> bool: + return False +``` + +Add matching vendored implementations for `ReadTool`, `GlobTool`, `GrepTool`, `EditTool`, and `WriteTool`, preserving the current upstream-style semantics already described in the approved spec. Keep imports package-local only; do not carry over `log.py`, `trace.py`, or a vendored tool executor. 
+ +Create `sandbox/server/backends/resources/code_vendor/__init__.py` to export the six tool classes. + +- [ ] **Step 4: Run the vendored-tool tests** + +Run: + +```bash +PYTHONPATH=. pytest -q sandbox/tests/test_code_vendor_tools.py +``` + +Expected: PASS. The vendored tool package exists and captures the expected upstream-style behavior. + +- [ ] **Step 5: Commit the vendored tool package** + +```bash +git add sandbox/server/backends/resources/code_vendor/__init__.py \ + sandbox/server/backends/resources/code_vendor/tool.py \ + sandbox/server/backends/resources/code_vendor/file_tools.py \ + sandbox/server/backends/resources/code_vendor/edit_tools.py \ + sandbox/tests/test_code_vendor_tools.py +git commit -m "feat: vendor code backend tool subset" +``` + +## Chunk 2: Simplify `CodeBackend` to Use Vendored Tools Only + +### Task 2: Rewrite `CodeBackend` around the internal tool package + +**Files:** +- Modify: `sandbox/server/backends/resources/code.py` +- Modify: `sandbox/tests/test_code_backend.py` +- Modify: `configs/sandbox-server/code_config.json` + +- [ ] **Step 1: Rewrite the backend tests around the new design** + +In `sandbox/tests/test_code_backend.py`: + +- delete external-root helper factories such as `create_fake_claude_code_root()` and `create_marker_claude_code_root()` +- delete all tests whose only purpose is external-root loading, root-local support modules, or per-root loader isolation +- delete all `allow_bash` and `bash_timeout_seconds` tests +- add/keep failing tests like: + +```python +def build_backend_config(tmp_path): + return BackendConfig( + enabled=True, + default_config={ + "workspace_root": str(tmp_path / "agentflow_code"), + }, + description="Code backend", + ) + + +def test_initialize_does_not_require_external_root(tmp_path): + module = load_code_backend_module() + backend = module.CodeBackend(config=build_backend_config(tmp_path)) + + session = asyncio.run(backend.initialize("runner_123", {})) + + assert 
Path(session["workspace"]).exists() + + +def test_load_code_tools_uses_internal_vendor_package(tmp_path): + module = load_code_backend_module() + backend = module.CodeBackend(config=build_backend_config(tmp_path)) + + tools = backend._load_code_tools() + + assert set(tools) == {"read", "glob", "grep", "bash", "edit", "write"} + assert tools["bash"].__class__.__module__.endswith("code_vendor.file_tools") + + +def test_tool_executor_runs_bash_via_vendored_tool(tmp_path): + module = load_code_backend_module() + backend = module.CodeBackend(config=build_backend_config(tmp_path)) + fake_server = FakeServer() + backend.bind_server(fake_server) + workspace = tmp_path / "agentflow_code" / "worker-1" + workspace.mkdir(parents=True) + + executor = ToolExecutor( + tools=fake_server._tools, + tool_name_index={}, + tool_resource_types=fake_server._tool_resource_types, + resource_router=FakeResourceRouter( + {"session_id": "sid", "data": {"workspace": str(workspace)}} + ), + ) + + result = asyncio.run( + executor.execute( + action="code:bash", + params={"command": "pwd"}, + worker_id="worker-1", + trace_id="trace-1", + ) + ) + + assert result["code"] == ErrorCode.SUCCESS + assert result["data"].strip() == str(workspace.resolve(strict=False)) +``` + +- [ ] **Step 2: Run the rewritten backend tests to verify they fail** + +Run: + +```bash +PYTHONPATH=. pytest -q sandbox/tests/test_code_backend.py -k "does_not_require_external_root or internal_vendor_package or runs_bash_via_vendored_tool" +``` + +Expected: FAIL because `CodeBackend` still depends on `claude_code_root` and still special-cases `bash`. 
+ +- [ ] **Step 3: Rewrite `sandbox/server/backends/resources/code.py`** + +Update `CodeBackend` to: + +- keep only `workspace_root` in its default config +- rename the internal tool loader to something neutral like `_load_code_tools()` +- import vendored classes directly, for example: + +```python +from sandbox.server.backends.resources.code_vendor.file_tools import ( + BashTool, + GlobTool, + GrepTool, + ReadTool, +) +from sandbox.server.backends.resources.code_vendor.edit_tools import EditTool, WriteTool +``` + +- cache vendored instances per backend instance: + +```python +self._tool_instances = { + "read": ReadTool(), + "glob": GlobTool(), + "grep": GrepTool(), + "bash": BashTool(), + "edit": EditTool(), + "write": WriteTool(), +} +``` + +- remove these methods entirely: + - `_get_claude_code_root()` + - `_validate_claude_code_root_prerequisites()` + - `_load_root_support_modules()` + - `_temporary_module_aliases()` + - `_load_module_from_path()` + - `_run_bash_command()` + +- remove any `tool_name == "bash"` branch in `_dispatch()` +- after session/workspace validation and path normalization, always run: + +```python +tool = self._load_code_tools()[tool_name] +ctx = SimpleNamespace(cwd=str(workspace)) +result = await tool.call(normalized_params, ctx) +``` + +- keep AgentFlow-owned path normalization and workspace identity enforcement exactly as the valid existing tests expect + +Update `configs/sandbox-server/code_config.json` so the `code` backend config becomes: + +```json +"config": { + "workspace_root": "/tmp/agentflow_code" +} +``` + +and update the description string to describe the backend as vendored/internal rather than powered by an external repository. + +- [ ] **Step 4: Run the backend regression subset** + +Run: + +```bash +PYTHONPATH=. pytest -q sandbox/tests/test_code_backend.py +``` + +Expected: PASS after the obsolete tests/helpers are removed and the remaining coverage is adapted to the internal vendored model. 
+ +- [ ] **Step 5: Commit the backend simplification** + +```bash +git add sandbox/server/backends/resources/code.py \ + sandbox/tests/test_code_backend.py \ + configs/sandbox-server/code_config.json +git commit -m "refactor: vendor code backend runtime" +``` + +## Chunk 3: Refresh Schema and Config Tests Around the New Contract + +### Task 3: Update schema docs, config-loading tests, and obsolete assertions + +**Files:** +- Modify: `sandbox/tool_schemas/code_tools.py` +- Modify: `sandbox/tests/test_code_tool_schemas.py` +- Modify: `sandbox/tests/test_sandbox_config_loading.py` + +- [ ] **Step 1: Write the failing schema/config assertions** + +Update the tests to the new expected contract: + +```python +def test_code_bash_description_mentions_workspace_shell_execution(): + schema = _code_schemas_by_name()["code-bash"] + description = schema["description"].lower() + + assert "workspace" in description + assert "shell command" in description + assert "backend config" not in description + + +def test_load_server_config_keeps_workspace_root_for_code_backend(tmp_path): + config_path = tmp_path / "code_config.json" + raw_config = { + "resources": { + "code": { + "enabled": True, + "config": { + "workspace_root": "/tmp/agentflow_code" + }, + } + } + } + config_path.write_text(json.dumps(raw_config), encoding="utf-8") + + sandbox = Sandbox(config=SandboxConfig(server_config_path=str(config_path))) + loaded = sandbox._load_server_config() + + assert loaded["resources"]["code"]["config"]["workspace_root"] == "/tmp/agentflow_code" +``` + +- [ ] **Step 2: Run the schema/config tests to verify they fail** + +Run: + +```bash +PYTHONPATH=. pytest -q sandbox/tests/test_code_tool_schemas.py sandbox/tests/test_sandbox_config_loading.py +``` + +Expected: FAIL because the current schema text still mentions backend-config-dependent availability and the config-loading test still asserts `CLAUDE_CODE_ROOT` expansion. 
+ +- [ ] **Step 3: Update schema text and config-loading coverage** + +In `sandbox/tool_schemas/code_tools.py`, change `code-bash` to something like: + +```python +{ + "name": "code-bash", + "description": "Run a shell command in the coding workspace using the current workspace as the working directory.", + "parameters": [ + { + "name": "command", + "type": "string", + "description": "Shell command to execute.", + "required": True, + } + ], +} +``` + +In `sandbox/tests/test_sandbox_config_loading.py`, replace the `CLAUDE_CODE_ROOT` test with the simplified `workspace_root` expectation and remove `monkeypatch.delenv("CLAUDE_CODE_ROOT", ...)`. + +- [ ] **Step 4: Run the updated schema/config regression suite** + +Run: + +```bash +PYTHONPATH=. pytest -q sandbox/tests/test_code_tool_schemas.py \ + sandbox/tests/test_sandbox_config_loading.py \ + rollout/tests/test_config.py +``` + +Expected: PASS. The code tool docs and config-loading tests now reflect the new single-repository contract. + +- [ ] **Step 5: Commit the schema/config cleanup** + +```bash +git add sandbox/tool_schemas/code_tools.py \ + sandbox/tests/test_code_tool_schemas.py \ + sandbox/tests/test_sandbox_config_loading.py +git commit -m "test: align code backend schema and config coverage" +``` + +## Chunk 4: Add the Opt-In Real Rollout Smoke + +### Task 4: Add MCP-style env-gated real `code` rollout smoke support + +**Files:** +- Create: `rollout/tests/conftest.py` +- Create: `rollout/tests/test_code_real_smoke.py` + +- [ ] **Step 1: Write the smoke test first** + +Create `rollout/tests/test_code_real_smoke.py` with a real-smoke shape like: + +```python +import json +from pathlib import Path + +from rollout import RolloutConfig, RolloutPipeline + + +def test_code_real_rollout_smoke(tmp_path, code_real_settings): + fixture_repo = tmp_path / "fixture_repo" + fixture_repo.mkdir() + nested = fixture_repo / "nested" + nested.mkdir() + token = "AF_CODE_SMOKE_TOKEN_7F3A91" + (nested / 
"TOKEN.txt").write_text(token + "\n", encoding="utf-8") + + benchmark_path = tmp_path / "benchmark.jsonl" + benchmark_path.write_text( + json.dumps( + { + "id": "code-real-smoke", + "question": "Use the available code tools to read nested/TOKEN.txt. Reply with only the exact token.", + "answer": token, + } + ) + + "\n", + encoding="utf-8", + ) + + config = RolloutConfig( + benchmark_name="code_real_smoke", + data_path=str(benchmark_path), + output_dir=str(tmp_path / "out"), + model_name=code_real_settings["model"], + api_key=code_real_settings["api_key"], + base_url=code_real_settings["base_url"], + max_turns=5, + available_tools=["code-*"], + resource_types=["code"], + resource_init_configs={"code": {"content": {"source_dir": str(fixture_repo)}}}, + sandbox_config_path="configs/sandbox-server/code_config.json", + sandbox_auto_start=True, + evaluate_results=False, + save_trajectories=True, + number_of_tasks=1, + ) + + summary = RolloutPipeline(config, output_dir=str(tmp_path / "out")).run() + + assert summary.total_tasks == 1 + assert summary.successful_tasks == 1 +``` + +- [ ] **Step 2: Run collection to verify the smoke currently fails** + +Run: + +```bash +PYTHONPATH=. pytest -q rollout/tests/test_code_real_smoke.py --collect-only +``` + +Expected: FAIL because the new smoke file or its `code_real_settings` fixture does not exist yet. 
+ +- [ ] **Step 3: Add MCP-style collection gating and credential CLI options** + +Create `rollout/tests/conftest.py`: + +```python +import os +from pathlib import Path + +import pytest + + +_REAL_CODE_TEST_FILES = { + "test_code_real_smoke.py", +} + + +def pytest_addoption(parser): + parser.addoption("--real-api-key", action="store", default="") + parser.addoption("--real-base-url", action="store", default="") + parser.addoption("--real-model", action="store", default="") + + +def pytest_ignore_collect(collection_path, config): + if os.environ.get("AGENTFLOW_RUN_CODE_REAL") == "1": + return False + + path = Path(str(collection_path)) + return path.name in _REAL_CODE_TEST_FILES + + +@pytest.fixture +def code_real_settings(request): + api_key = request.config.getoption("--real-api-key") + base_url = request.config.getoption("--real-base-url") + model = request.config.getoption("--real-model") + if not api_key or not base_url or not model: + pytest.skip( + "Provide --real-api-key, --real-base-url, and --real-model to run code_real smoke tests." + ) + return {"api_key": api_key, "base_url": base_url, "model": model} +``` + +Then complete `rollout/tests/test_code_real_smoke.py` so it also: + +- locates the results file written by `RolloutPipeline` +- loads the single saved result +- asserts there is at least one `code:*` tool call in the trajectory +- asserts the final answer equals the unique token +- asserts the token appears in the tool-result chain + +- [ ] **Step 4: Verify collection gating behavior** + +Run: + +```bash +PYTHONPATH=. pytest -q rollout/tests/test_code_real_smoke.py --collect-only +``` + +Expected: no tests collected from this file unless opt-in is enabled, matching the MCP real-smoke pattern. + +Run: + +```bash +AGENTFLOW_RUN_CODE_REAL=1 PYTHONPATH=. pytest -q rollout/tests/test_code_real_smoke.py --collect-only +``` + +Expected: PASS. The real-smoke file is now collected explicitly, matching MCP's env-gated behavior. 
+ +- [ ] **Step 5: Commit the real-smoke scaffolding** + +```bash +git add rollout/tests/conftest.py \ + rollout/tests/test_code_real_smoke.py +git commit -m "test: add opt-in code rollout real smoke" +``` + +## Chunk 5: Final Verification and Live Smoke Run + +### Task 5: Run the full targeted regression suite and the real smoke with supplied credentials + +**Files:** +- No code changes expected + +- [ ] **Step 1: Run the full targeted regression suite** + +Run: + +```bash +PYTHONPATH=. pytest -q sandbox/tests/test_code_vendor_tools.py \ + sandbox/tests/test_code_backend.py \ + sandbox/tests/test_code_tool_schemas.py \ + sandbox/tests/test_sandbox_config_loading.py \ + rollout/tests/test_config.py \ + rollout/tests/test_integration.py +``` + +Expected: PASS. The vendored tool package, the simplified backend, schema/config coverage, and rollout baseline all pass together. + +- [ ] **Step 2: Run the real rollout smoke with explicit credentials** + +Run: + +```bash +AGENTFLOW_RUN_CODE_REAL=1 PYTHONPATH=. pytest -q rollout/tests/test_code_real_smoke.py \ + --real-api-key '' \ + --real-base-url '' \ + --real-model '' \ + -s +``` + +Expected: PASS. Sandbox starts, the `code` session is created, at least one real `code:*` tool call occurs, and the final answer matches the unique token from the fixture repo. 
+ +- [ ] **Step 3: Inspect the real-smoke output** + +Verify in the saved trajectory/result payload that: + +- the trajectory contains at least one `code:*` tool call +- at least one `code:read` or `code:glob` appears +- the token from `nested/TOKEN.txt` is present in tool-result observations +- the final predicted answer equals the token exactly + +- [ ] **Step 4: Commit the integrated result** + +```bash +git status --short +git add sandbox/server/backends/resources/code_vendor \ + sandbox/server/backends/resources/code.py \ + configs/sandbox-server/code_config.json \ + sandbox/tests/test_code_vendor_tools.py \ + sandbox/tests/test_code_backend.py \ + sandbox/tool_schemas/code_tools.py \ + sandbox/tests/test_code_tool_schemas.py \ + sandbox/tests/test_sandbox_config_loading.py \ + rollout/tests/conftest.py \ + rollout/tests/test_code_real_smoke.py +git commit -m "refactor: vendor code backend tools into agentflow" +``` + +- [ ] **Step 5: Record final verification notes** + +Capture: + +- exact targeted pytest command and pass result +- exact `AGENTFLOW_RUN_CODE_REAL=1 ... pytest ...` command used +- whether the real smoke passed +- any residual risk, especially around powerful `bash` behavior and live-model variability + +## Execution Notes + +- Use `PYTHONPATH=.` for pytest commands in this repository unless the execution harness already injects the repo root. +- Keep path-boundary enforcement in `CodeBackend`; do not push it into the vendored tool files. +- Do not leave any `claude_code_root`, `allow_bash`, or `bash_timeout_seconds` references behind in tests or config templates. +- Do not widen the real-smoke scope into general rollout refactoring. + +Plan complete and saved to `docs/superpowers/plans/2026-04-16-code-backend-single-repo-vendoring.md`. Ready to execute? 
diff --git a/docs/superpowers/specs/2026-04-15-code-backend-single-repo-vendoring-design.md b/docs/superpowers/specs/2026-04-15-code-backend-single-repo-vendoring-design.md index e142a91..7085867 100644 --- a/docs/superpowers/specs/2026-04-15-code-backend-single-repo-vendoring-design.md +++ b/docs/superpowers/specs/2026-04-15-code-backend-single-repo-vendoring-design.md @@ -274,9 +274,9 @@ Add one end-to-end rollout smoke that exercises the real rollout-to-sandbox-to-c The smoke should: - live under `rollout/tests/` -- be marked with a dedicated pytest marker, for example `code_real` -- not run in default pytest execution -- require explicit manual invocation +- follow the MCP real-smoke opt-in pattern +- not be collected in default pytest execution +- require explicit manual invocation, for example by setting `AGENTFLOW_RUN_CODE_REAL=1` - use a real LLM response path and a real sandbox/code backend path This smoke should not mock sandbox components. It should really: From 1672c2dd93a190399b906a57dee71ac79d1f0a3a Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Thu, 16 Apr 2026 09:34:08 +0800 Subject: [PATCH 03/39] Vendor code backend tools --- .../resources/code_vendor/__init__.py | 11 ++ .../resources/code_vendor/edit_tools.py | 85 ++++++++ .../resources/code_vendor/file_tools.py | 139 +++++++++++++ .../backends/resources/code_vendor/tool.py | 29 +++ sandbox/tests/test_code_vendor_tools.py | 187 ++++++++++++++++++ 5 files changed, 451 insertions(+) create mode 100644 sandbox/server/backends/resources/code_vendor/__init__.py create mode 100644 sandbox/server/backends/resources/code_vendor/edit_tools.py create mode 100644 sandbox/server/backends/resources/code_vendor/file_tools.py create mode 100644 sandbox/server/backends/resources/code_vendor/tool.py create mode 100644 sandbox/tests/test_code_vendor_tools.py diff --git a/sandbox/server/backends/resources/code_vendor/__init__.py b/sandbox/server/backends/resources/code_vendor/__init__.py new file 
mode 100644 index 0000000..1fc9cfa --- /dev/null +++ b/sandbox/server/backends/resources/code_vendor/__init__.py @@ -0,0 +1,11 @@ +from .edit_tools import EditTool, WriteTool +from .file_tools import BashTool, GlobTool, GrepTool, ReadTool + +__all__ = [ + "BashTool", + "EditTool", + "GlobTool", + "GrepTool", + "ReadTool", + "WriteTool", +] diff --git a/sandbox/server/backends/resources/code_vendor/edit_tools.py b/sandbox/server/backends/resources/code_vendor/edit_tools.py new file mode 100644 index 0000000..622658d --- /dev/null +++ b/sandbox/server/backends/resources/code_vendor/edit_tools.py @@ -0,0 +1,85 @@ +from __future__ import annotations + +from pathlib import Path +from typing import Any + +from .tool import Tool + + +class EditTool(Tool): + name = "Edit" + description = ( + "Perform an exact string replacement in a file. " + "old_string must uniquely identify the target location unless replace_all=true." + ) + + @property + def input_schema(self) -> dict[str, Any]: + return { + "type": "object", + "properties": { + "file_path": {"type": "string"}, + "old_string": {"type": "string"}, + "new_string": {"type": "string"}, + "replace_all": {"type": "boolean", "default": False}, + }, + "required": ["file_path", "old_string", "new_string"], + } + + async def call(self, args: dict[str, Any], ctx: Any) -> str: + del ctx + path = Path(args["file_path"]) + old_string = args["old_string"] + new_string = args["new_string"] + replace_all = args.get("replace_all", False) + + if not path.exists(): + return f"Error: file not found: {path}" + + content = path.read_text(encoding="utf-8") + count = content.count(old_string) + if count == 0: + return f"Error: old_string not found in {path}. Read the file first to verify the exact text." + if count > 1 and not replace_all: + return ( + f"Error: old_string appears {count} times in {path}. " + "Provide more surrounding context to make it unique, or set replace_all=true." 
+ ) + + if replace_all: + updated = content.replace(old_string, new_string) + replacements = count + else: + updated = content.replace(old_string, new_string, 1) + replacements = 1 + + path.write_text(updated, encoding="utf-8") + return f"Replaced {replacements} occurrence(s) in {path}" + + +class WriteTool(Tool): + name = "Write" + description = "Write content to a file, creating parent directories if needed." + + @property + def input_schema(self) -> dict[str, Any]: + return { + "type": "object", + "properties": { + "file_path": {"type": "string"}, + "content": {"type": "string"}, + }, + "required": ["file_path", "content"], + } + + async def call(self, args: dict[str, Any], ctx: Any) -> str: + del ctx + path = Path(args["file_path"]) + content = args["content"] + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text(content, encoding="utf-8") + + line_count = content.count("\n") + if content and not content.endswith("\n"): + line_count += 1 + return f"Wrote {len(content)} bytes ({line_count} lines) to {path}" diff --git a/sandbox/server/backends/resources/code_vendor/file_tools.py b/sandbox/server/backends/resources/code_vendor/file_tools.py new file mode 100644 index 0000000..325747d --- /dev/null +++ b/sandbox/server/backends/resources/code_vendor/file_tools.py @@ -0,0 +1,139 @@ +from __future__ import annotations + +import glob +import re +import subprocess +from pathlib import Path +from typing import Any + +from .tool import Tool + + +class BashTool(Tool): + name = "Bash" + description = "Execute a shell command and return stdout/stderr." 
+ + @property + def input_schema(self) -> dict[str, Any]: + return { + "type": "object", + "properties": { + "command": {"type": "string", "description": "Shell command to run"}, + }, + "required": ["command"], + } + + async def call(self, args: dict[str, Any], ctx: Any) -> str: + result = subprocess.run( + args["command"], + shell=True, + capture_output=True, + text=True, + cwd=ctx.cwd, + ) + output = result.stdout.rstrip("\n") + stderr = result.stderr.rstrip("\n") + if stderr: + output = f"{output}\n[stderr]:\n{stderr}" if output else f"[stderr]:\n{stderr}" + return output or "(no output)" + + +class ReadTool(Tool): + name = "Read" + description = "Read a file and return its contents with line numbers." + + @property + def input_schema(self) -> dict[str, Any]: + return { + "type": "object", + "properties": { + "file_path": {"type": "string"}, + "offset": {"type": "integer", "description": "Number of lines to skip"}, + "limit": {"type": "integer", "description": "Maximum lines to return"}, + }, + "required": ["file_path"], + } + + async def call(self, args: dict[str, Any], ctx: Any) -> str: + del ctx + path = Path(args["file_path"]) + if not path.exists(): + return f"Error: file not found: {path}" + + lines = path.read_text(encoding="utf-8").splitlines() + offset = max(args.get("offset", 0), 0) + limit = args.get("limit", 2000) + selected = lines[offset : offset + limit] + return "\n".join( + f"{line_number}: {line}" + for line_number, line in enumerate(selected, start=offset + 1) + ) + + def is_read_only(self, args: dict[str, Any]) -> bool: + del args + return True + + +class GlobTool(Tool): + name = "Glob" + description = "Find files matching a glob pattern." 
+ + @property + def input_schema(self) -> dict[str, Any]: + return { + "type": "object", + "properties": { + "pattern": {"type": "string", "description": "Glob pattern"}, + "path": {"type": "string", "description": "Directory to search from"}, + }, + "required": ["pattern"], + } + + async def call(self, args: dict[str, Any], ctx: Any) -> str: + base = Path(args.get("path", ctx.cwd)) + pattern = args["pattern"] + matches = sorted(glob.glob(pattern, root_dir=base, recursive=True)) + resolved = [str((base / match).resolve(strict=False)) for match in matches] + return "\n".join(resolved) or "(no matches)" + + def is_read_only(self, args: dict[str, Any]) -> bool: + del args + return True + + +class GrepTool(Tool): + name = "Grep" + description = "Search file contents with a regex pattern." + + @property + def input_schema(self) -> dict[str, Any]: + return { + "type": "object", + "properties": { + "pattern": {"type": "string", "description": "Regex pattern"}, + "path": {"type": "string", "description": "Directory to search"}, + "glob": {"type": "string", "description": "Optional file glob filter"}, + }, + "required": ["pattern"], + } + + async def call(self, args: dict[str, Any], ctx: Any) -> str: + base = Path(args.get("path", ctx.cwd)) + pattern = re.compile(args["pattern"]) + file_glob = args.get("glob") + + if file_glob: + paths = sorted(path for path in base.rglob(file_glob) if path.is_file()) + else: + paths = sorted(path for path in base.rglob("*") if path.is_file()) + + matches: list[str] = [] + for path in paths: + for line_number, line in enumerate(path.read_text(encoding="utf-8").splitlines(), start=1): + if pattern.search(line): + matches.append(f"{path}:{line_number}:{line}") + return "\n".join(matches) or "(no matches)" + + def is_read_only(self, args: dict[str, Any]) -> bool: + del args + return True diff --git a/sandbox/server/backends/resources/code_vendor/tool.py b/sandbox/server/backends/resources/code_vendor/tool.py new file mode 100644 index 
0000000..bef7084 --- /dev/null +++ b/sandbox/server/backends/resources/code_vendor/tool.py @@ -0,0 +1,29 @@ +from __future__ import annotations + +from abc import ABC, abstractmethod +from typing import Any + + +class Tool(ABC): + name: str + description: str + + @property + @abstractmethod + def input_schema(self) -> dict[str, Any]: + raise NotImplementedError + + @abstractmethod + async def call(self, args: dict[str, Any], ctx: Any) -> str: + raise NotImplementedError + + def is_read_only(self, args: dict[str, Any]) -> bool: + del args + return False + + def to_api_format(self) -> dict[str, Any]: + return { + "name": self.name, + "description": self.description, + "input_schema": self.input_schema, + } diff --git a/sandbox/tests/test_code_vendor_tools.py b/sandbox/tests/test_code_vendor_tools.py new file mode 100644 index 0000000..be76c15 --- /dev/null +++ b/sandbox/tests/test_code_vendor_tools.py @@ -0,0 +1,187 @@ +import asyncio +import importlib.util +import sys +from types import SimpleNamespace +from pathlib import Path + +PACKAGE_DIR = ( + Path(__file__).resolve().parents[1] + / "server" + / "backends" + / "resources" + / "code_vendor" +) + + +def load_code_vendor_module(module_name): + package_name = "_test_code_vendor" + package_spec = importlib.util.spec_from_file_location( + package_name, + PACKAGE_DIR / "__init__.py", + submodule_search_locations=[str(PACKAGE_DIR)], + ) + package = importlib.util.module_from_spec(package_spec) + sys.modules[package_name] = package + assert package_spec is not None + assert package_spec.loader is not None + package_spec.loader.exec_module(package) + + module_spec = importlib.util.spec_from_file_location( + f"{package_name}.{module_name}", + PACKAGE_DIR / f"{module_name}.py", + ) + module = importlib.util.module_from_spec(module_spec) + sys.modules[f"{package_name}.{module_name}"] = module + assert module_spec is not None + assert module_spec.loader is not None + module_spec.loader.exec_module(module) + return module + + 
+file_tools = load_code_vendor_module("file_tools") +edit_tools = load_code_vendor_module("edit_tools") +tool_module = load_code_vendor_module("tool") + +ReadTool = file_tools.ReadTool +GlobTool = file_tools.GlobTool +GrepTool = file_tools.GrepTool +BashTool = file_tools.BashTool +EditTool = edit_tools.EditTool +WriteTool = edit_tools.WriteTool + + +def make_ctx(tmp_path): + return SimpleNamespace(cwd=str(tmp_path)) + + +def call_tool(tool, args, ctx): + return asyncio.run(tool.call(args, ctx)) + + +def test_read_tool_returns_line_numbered_content(tmp_path): + target = tmp_path / "sample.txt" + target.write_text("alpha\nbeta\ngamma\n", encoding="utf-8") + + result = call_tool(ReadTool(), {"file_path": str(target)}, make_ctx(tmp_path)) + + assert result == "1: alpha\n2: beta\n3: gamma" + + +def test_read_tool_honors_offset_and_limit(tmp_path): + target = tmp_path / "sample.txt" + target.write_text("alpha\nbeta\ngamma\ndelta\n", encoding="utf-8") + + result = call_tool( + ReadTool(), + {"file_path": str(target), "offset": 1, "limit": 2}, + make_ctx(tmp_path), + ) + + assert result == "2: beta\n3: gamma" + + +def test_edit_tool_requires_unique_match_by_default(tmp_path): + target = tmp_path / "sample.txt" + target.write_text("alpha\nbeta\nbeta\n", encoding="utf-8") + + result = call_tool( + EditTool(), + {"file_path": str(target), "old_string": "beta", "new_string": "BETA"}, + make_ctx(tmp_path), + ) + + assert "appears 2 times" in result + assert "replace_all=true" in result + assert target.read_text(encoding="utf-8") == "alpha\nbeta\nbeta\n" + + +def test_edit_tool_replace_all_updates_each_match(tmp_path): + target = tmp_path / "sample.txt" + target.write_text("alpha\nbeta\nbeta\n", encoding="utf-8") + + result = call_tool( + EditTool(), + { + "file_path": str(target), + "old_string": "beta", + "new_string": "BETA", + "replace_all": True, + }, + make_ctx(tmp_path), + ) + + assert result == f"Replaced 2 occurrence(s) in {target}" + assert 
target.read_text(encoding="utf-8") == "alpha\nBETA\nBETA\n" + + +def test_write_tool_creates_parent_directories_and_writes_content(tmp_path): + target = tmp_path / "nested" / "dir" / "sample.txt" + + result = call_tool( + WriteTool(), + {"file_path": str(target), "content": "hello\nworld\n"}, + make_ctx(tmp_path), + ) + + assert result == f"Wrote 12 bytes (2 lines) to {target}" + assert target.read_text(encoding="utf-8") == "hello\nworld\n" + + +def test_glob_tool_returns_sorted_matches(tmp_path): + (tmp_path / "a.py").write_text("print('a')\n", encoding="utf-8") + pkg = tmp_path / "pkg" + pkg.mkdir() + (pkg / "b.py").write_text("print('b')\n", encoding="utf-8") + (pkg / "c.txt").write_text("ignore\n", encoding="utf-8") + + result = call_tool( + GlobTool(), + {"pattern": "**/*.py", "path": str(tmp_path)}, + make_ctx(tmp_path), + ) + + assert result == f"{tmp_path / 'a.py'}\n{tmp_path / 'pkg' / 'b.py'}" + + +def test_grep_tool_returns_matches_with_line_numbers(tmp_path): + first = tmp_path / "first.txt" + second = tmp_path / "second.txt" + first.write_text("alpha\nbeta\n", encoding="utf-8") + second.write_text("beta\ngamma\n", encoding="utf-8") + + result = call_tool( + GrepTool(), + {"pattern": "beta", "path": str(tmp_path), "glob": "*.txt"}, + make_ctx(tmp_path), + ) + + assert result == f"{first}:2:beta\n{second}:1:beta" + + +def test_bash_tool_combines_stdout_and_stderr(tmp_path): + result = call_tool( + BashTool(), + { + "command": ( + "python -c \"import sys; " + "print('out'); " + "print('err', file=sys.stderr)\"" + ) + }, + make_ctx(tmp_path), + ) + + assert result == "out\n[stderr]:\nerr" + + +def test_tool_api_format_and_read_only_flags(): + read_tool = ReadTool() + bash_tool = BashTool() + + api_format = read_tool.to_api_format() + + assert api_format["name"] == "Read" + assert isinstance(api_format["description"], str) + assert api_format["input_schema"] == read_tool.input_schema + assert read_tool.is_read_only({}) is True + assert 
bash_tool.is_read_only({}) is False From a2c0bd108a53cde1588297264994ad857b5b58de Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Thu, 16 Apr 2026 09:40:26 +0800 Subject: [PATCH 04/39] Fix vendored tool upstream contracts --- .../resources/code_vendor/file_tools.py | 42 +++++++------------ sandbox/tests/test_code_vendor_tools.py | 39 ++++++++++++++--- 2 files changed, 48 insertions(+), 33 deletions(-) diff --git a/sandbox/server/backends/resources/code_vendor/file_tools.py b/sandbox/server/backends/resources/code_vendor/file_tools.py index 325747d..a42e670 100644 --- a/sandbox/server/backends/resources/code_vendor/file_tools.py +++ b/sandbox/server/backends/resources/code_vendor/file_tools.py @@ -1,7 +1,5 @@ from __future__ import annotations -import glob -import re import subprocess from pathlib import Path from typing import Any @@ -31,11 +29,10 @@ async def call(self, args: dict[str, Any], ctx: Any) -> str: text=True, cwd=ctx.cwd, ) - output = result.stdout.rstrip("\n") - stderr = result.stderr.rstrip("\n") - if stderr: - output = f"{output}\n[stderr]:\n{stderr}" if output else f"[stderr]:\n{stderr}" - return output or "(no output)" + output = result.stdout + if result.stderr: + output += f"\n[stderr]:\n{result.stderr}" + return output.strip() or "(no output)" class ReadTool(Tool): @@ -48,7 +45,7 @@ def input_schema(self) -> dict[str, Any]: "type": "object", "properties": { "file_path": {"type": "string"}, - "offset": {"type": "integer", "description": "Number of lines to skip"}, + "offset": {"type": "integer", "description": "Start line (1-indexed)"}, "limit": {"type": "integer", "description": "Maximum lines to return"}, }, "required": ["file_path"], @@ -61,11 +58,11 @@ async def call(self, args: dict[str, Any], ctx: Any) -> str: return f"Error: file not found: {path}" lines = path.read_text(encoding="utf-8").splitlines() - offset = max(args.get("offset", 0), 0) + offset = max(0, args.get("offset", 1) - 1) limit = args.get("limit", 2000) 
selected = lines[offset : offset + limit] return "\n".join( - f"{line_number}: {line}" + f"{line_number:4}→{line}" for line_number, line in enumerate(selected, start=offset + 1) ) @@ -92,9 +89,8 @@ def input_schema(self) -> dict[str, Any]: async def call(self, args: dict[str, Any], ctx: Any) -> str: base = Path(args.get("path", ctx.cwd)) pattern = args["pattern"] - matches = sorted(glob.glob(pattern, root_dir=base, recursive=True)) - resolved = [str((base / match).resolve(strict=False)) for match in matches] - return "\n".join(resolved) or "(no matches)" + matches = sorted(base.glob(pattern)) + return "\n".join(str(match) for match in matches) or "(no matches)" def is_read_only(self, args: dict[str, Any]) -> bool: del args @@ -119,20 +115,12 @@ def input_schema(self) -> dict[str, Any]: async def call(self, args: dict[str, Any], ctx: Any) -> str: base = Path(args.get("path", ctx.cwd)) - pattern = re.compile(args["pattern"]) - file_glob = args.get("glob") - - if file_glob: - paths = sorted(path for path in base.rglob(file_glob) if path.is_file()) - else: - paths = sorted(path for path in base.rglob("*") if path.is_file()) - - matches: list[str] = [] - for path in paths: - for line_number, line in enumerate(path.read_text(encoding="utf-8").splitlines(), start=1): - if pattern.search(line): - matches.append(f"{path}:{line_number}:{line}") - return "\n".join(matches) or "(no matches)" + cmd = ["grep", "-r", "-n", args["pattern"]] + if "glob" in args: + cmd += ["--include", args["glob"]] + cmd.append(str(base)) + result = subprocess.run(cmd, capture_output=True, text=True) + return result.stdout or "(no matches)" def is_read_only(self, args: dict[str, Any]) -> bool: del args diff --git a/sandbox/tests/test_code_vendor_tools.py b/sandbox/tests/test_code_vendor_tools.py index be76c15..b8f7efc 100644 --- a/sandbox/tests/test_code_vendor_tools.py +++ b/sandbox/tests/test_code_vendor_tools.py @@ -64,7 +64,7 @@ def test_read_tool_returns_line_numbered_content(tmp_path): result 
= call_tool(ReadTool(), {"file_path": str(target)}, make_ctx(tmp_path)) - assert result == "1: alpha\n2: beta\n3: gamma" + assert result == " 1→alpha\n 2→beta\n 3→gamma" def test_read_tool_honors_offset_and_limit(tmp_path): @@ -77,7 +77,7 @@ def test_read_tool_honors_offset_and_limit(tmp_path): make_ctx(tmp_path), ) - assert result == "2: beta\n3: gamma" + assert result == " 1→alpha\n 2→beta" def test_edit_tool_requires_unique_match_by_default(tmp_path): @@ -114,8 +114,10 @@ def test_edit_tool_replace_all_updates_each_match(tmp_path): assert target.read_text(encoding="utf-8") == "alpha\nBETA\nBETA\n" -def test_write_tool_creates_parent_directories_and_writes_content(tmp_path): +def test_write_tool_creates_parent_directories_and_overwrites_full_file(tmp_path): target = tmp_path / "nested" / "dir" / "sample.txt" + target.parent.mkdir(parents=True, exist_ok=True) + target.write_text("stale content that should disappear\n", encoding="utf-8") result = call_tool( WriteTool(), @@ -143,7 +145,7 @@ def test_glob_tool_returns_sorted_matches(tmp_path): assert result == f"{tmp_path / 'a.py'}\n{tmp_path / 'pkg' / 'b.py'}" -def test_grep_tool_returns_matches_with_line_numbers(tmp_path): +def test_grep_tool_returns_matches_with_line_numbers_for_filtered_files(tmp_path): first = tmp_path / "first.txt" second = tmp_path / "second.txt" first.write_text("alpha\nbeta\n", encoding="utf-8") @@ -155,7 +157,32 @@ def test_grep_tool_returns_matches_with_line_numbers(tmp_path): make_ctx(tmp_path), ) - assert result == f"{first}:2:beta\n{second}:1:beta" + assert result.endswith("\n") + assert set(result.splitlines()) == { + f"{first}:2:beta", + f"{second}:1:beta", + } + + +def test_grep_tool_searches_recursively_without_glob_filter(tmp_path): + root_match = tmp_path / "root.txt" + nested_dir = tmp_path / "pkg" / "nested" + nested_dir.mkdir(parents=True) + nested_match = nested_dir / "deep.py" + root_match.write_text("needle at root\n", encoding="utf-8") + nested_match.write_text("first 
line\nneedle in nested file\n", encoding="utf-8") + + result = call_tool( + GrepTool(), + {"pattern": "needle", "path": str(tmp_path)}, + make_ctx(tmp_path), + ) + + assert result.endswith("\n") + assert set(result.splitlines()) == { + f"{nested_match}:2:needle in nested file", + f"{root_match}:1:needle at root", + } def test_bash_tool_combines_stdout_and_stderr(tmp_path): @@ -171,7 +198,7 @@ def test_bash_tool_combines_stdout_and_stderr(tmp_path): make_ctx(tmp_path), ) - assert result == "out\n[stderr]:\nerr" + assert result == "out\n\n[stderr]:\nerr" def test_tool_api_format_and_read_only_flags(): From 96dc16e6e42767a3add0016cabbb111a523703cb Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Thu, 16 Apr 2026 09:57:15 +0800 Subject: [PATCH 05/39] Rewrite code backend around vendored tools --- configs/sandbox-server/code_config.json | 7 +- sandbox/server/backends/resources/code.py | 232 +--- sandbox/tests/test_code_backend.py | 1249 ++++++--------------- 3 files changed, 364 insertions(+), 1124 deletions(-) diff --git a/configs/sandbox-server/code_config.json b/configs/sandbox-server/code_config.json index dfc42ee..22bd72a 100644 --- a/configs/sandbox-server/code_config.json +++ b/configs/sandbox-server/code_config.json @@ -7,13 +7,10 @@ "resources": { "code": { "enabled": true, - "description": "Lightweight coding backend powered by claude-code-py tools", + "description": "Lightweight coding backend powered by vendored internal tools", "backend_class": "sandbox.server.backends.resources.code.CodeBackend", "config": { - "claude_code_root": "${CLAUDE_CODE_ROOT:-/home/a1/sdb/dxd/claude-code-py}", - "workspace_root": "/tmp/agentflow_code", - "allow_bash": true, - "bash_timeout_seconds": 30 + "workspace_root": "/tmp/agentflow_code" } } }, diff --git a/sandbox/server/backends/resources/code.py b/sandbox/server/backends/resources/code.py index e97c032..a4d3248 100644 --- a/sandbox/server/backends/resources/code.py +++ 
b/sandbox/server/backends/resources/code.py @@ -4,14 +4,8 @@ from __future__ import annotations -import asyncio -from contextlib import contextmanager -import importlib.util -import os import re -import signal import shutil -import sys import time import uuid from pathlib import Path @@ -24,6 +18,13 @@ build_error_response, build_success_response, ) +from sandbox.server.backends.resources.code_vendor.edit_tools import EditTool, WriteTool +from sandbox.server.backends.resources.code_vendor.file_tools import ( + BashTool, + GlobTool, + GrepTool, + ReadTool, +) class CodeBackend(Backend): @@ -36,16 +37,12 @@ def __init__(self, config: BackendConfig | None = None): config = BackendConfig( enabled=True, default_config={ - "claude_code_root": "", "workspace_root": "/tmp/agentflow_code", - "allow_bash": False, - "bash_timeout_seconds": 30, }, description="Code backend", ) super().__init__(config) self._tool_instances: dict[str, Any] | None = None - self._module_namespace = f"_agentflow_code_backend_{id(self)}" def bind_server(self, server) -> None: super().bind_server(server) @@ -58,14 +55,13 @@ def bind_server(self, server) -> None: async def initialize(self, worker_id: str, config: dict) -> dict: source_dir = self._resolve_source_dir(config) - self._validate_claude_code_root_prerequisites() workspace, staged_workspace, previous_workspace = self._prepare_workspace(worker_id) try: if source_dir: self._copy_source_dir(source_dir, staged_workspace) - self._load_claude_code_tools() + self._load_code_tools() self._commit_prepared_workspace(workspace, staged_workspace, previous_workspace) except Exception: if staged_workspace.exists(): @@ -100,12 +96,6 @@ async def cleanup(self, worker_id: str, session_info: dict) -> None: shutil.rmtree(workspace) return None - def _get_claude_code_root(self) -> Path | None: - value = self.get_default_config().get("claude_code_root") - if not isinstance(value, str) or not value.strip(): - return None - return Path(value) - def 
_get_workspace_root(self) -> Path: value = self.get_default_config().get("workspace_root") or "/tmp/agentflow_code" return Path(value) @@ -169,109 +159,18 @@ def _copy_source_dir(self, source_dir: Path, workspace: Path) -> None: else: shutil.copy2(child, destination) - def _load_claude_code_tools(self) -> dict[str, Any]: - if self._tool_instances is not None: - return self._tool_instances - - root_path = self._get_claude_code_root() - if root_path is None: - raise ValueError("claude_code_root is not configured") - - support_modules = self._load_root_support_modules(root_path) - with self._temporary_module_aliases(support_modules): - tool_module = self._load_module_from_path( - f"{self._module_namespace}.tool", - root_path / "tool.py", - ) - with self._temporary_module_aliases({"tool": tool_module}): - file_tools = self._load_module_from_path( - f"{self._module_namespace}.file_tools", - root_path / "tools" / "file_tools.py", - ) - edit_tools = self._load_module_from_path( - f"{self._module_namespace}.edit_tools", - root_path / "tools" / "edit_tools.py", - ) - - self._tool_instances = { - "read": file_tools.ReadTool(), - "glob": file_tools.GlobTool(), - "grep": file_tools.GrepTool(), - "bash": file_tools.BashTool(), - "edit": edit_tools.EditTool(), - "write": edit_tools.WriteTool(), - } + def _load_code_tools(self) -> dict[str, Any]: + if self._tool_instances is None: + self._tool_instances = { + "read": ReadTool(), + "glob": GlobTool(), + "grep": GrepTool(), + "bash": BashTool(), + "edit": EditTool(), + "write": WriteTool(), + } return self._tool_instances - def _validate_claude_code_root_prerequisites(self) -> None: - root_path = self._get_claude_code_root() - if root_path is None: - raise ValueError("claude_code_root is not configured") - - required_paths = ( - root_path / "tool.py", - root_path / "tools" / "file_tools.py", - root_path / "tools" / "edit_tools.py", - ) - for required_path in required_paths: - if not required_path.exists(): - raise 
ValueError(f"claude_code_root is missing required file: {required_path}") - - def _load_root_support_modules(self, root_path: Path) -> dict[str, Any]: - support_modules: dict[str, Any] = {} - pending_modules = { - module_name: root_path / f"{module_name}.py" - for module_name in ("log", "trace") - if (root_path / f"{module_name}.py").exists() - } - - while pending_modules: - made_progress = False - for module_name, module_path in list(pending_modules.items()): - try: - with self._temporary_module_aliases(support_modules): - support_modules[module_name] = self._load_module_from_path( - f"{self._module_namespace}.{module_name}", - module_path, - ) - except ModuleNotFoundError as exc: - if exc.name in pending_modules: - continue - raise - else: - del pending_modules[module_name] - made_progress = True - - if not made_progress: - unresolved = ", ".join(sorted(pending_modules)) - raise ImportError(f"Unable to resolve root support modules: {unresolved}") - - return support_modules - - @contextmanager - def _temporary_module_aliases(self, aliases: dict[str, Any]): - previous_modules: dict[str, Any] = {} - for module_name, module in aliases.items(): - previous_modules[module_name] = sys.modules.get(module_name) - sys.modules[module_name] = module - try: - yield - finally: - for module_name, previous_module in previous_modules.items(): - if previous_module is None: - sys.modules.pop(module_name, None) - else: - sys.modules[module_name] = previous_module - - def _load_module_from_path(self, module_name: str, path: Path): - spec = importlib.util.spec_from_file_location(module_name, str(path)) - if spec is None or spec.loader is None: - raise ImportError(f"Unable to load module from {path}") - module = importlib.util.module_from_spec(spec) - sys.modules[module_name] = module - spec.loader.exec_module(module) - return module - def _make_bridge_tool(self, tool_name: str): async def bridge_tool(session_info: dict, **params): return await self._dispatch(tool_name, session_info, 
params) @@ -293,10 +192,11 @@ async def _dispatch( worker_id = runtime_params.pop("worker_id", None) runtime_params.pop("session_id", None) - if tool_name == "bash" and not self.get_default_config().get("allow_bash", False): + tool = self._load_code_tools().get(tool_name) + if tool is None: return build_error_response( - code=ErrorCode.BUSINESS_FAILURE, - message="code:bash is disabled by backend config (allow_bash=False)", + code=ErrorCode.INVALID_REQUEST_FORMAT, + message=f"Unknown code tool: {tool_name}", tool=full_name, execution_time_ms=(time.time() - start_time) * 1000, resource_type=self.name, @@ -304,21 +204,6 @@ async def _dispatch( trace_id=trace_id, ) - tool = None - if tool_name != "bash": - tools = self._load_claude_code_tools() - tool = tools.get(tool_name) - if tool is None: - return build_error_response( - code=ErrorCode.INVALID_REQUEST_FORMAT, - message=f"Unknown code tool: {tool_name}", - tool=full_name, - execution_time_ms=(time.time() - start_time) * 1000, - resource_type=self.name, - session_id=session_id, - trace_id=trace_id, - ) - workspace_value = ((session_info or {}).get("data") or {}).get("workspace") if not isinstance(workspace_value, str) or not workspace_value.strip(): return build_error_response( @@ -368,11 +253,8 @@ async def _dispatch( workspace=workspace, ) except ValueError as exc: - error_code = ErrorCode.BUSINESS_FAILURE - if tool_name == "bash": - error_code = ErrorCode.INVALID_INPUT return build_error_response( - code=error_code, + code=ErrorCode.BUSINESS_FAILURE, message=str(exc), tool=full_name, execution_time_ms=(time.time() - start_time) * 1000, @@ -380,29 +262,9 @@ async def _dispatch( session_id=session_id, trace_id=trace_id, ) + try: - if tool_name == "bash": - bash_timeout_seconds = float( - self.get_default_config().get("bash_timeout_seconds", 30) - ) - try: - result = await self._run_bash_command( - command=normalized_params["command"], - workspace=workspace, - timeout_seconds=bash_timeout_seconds, - ) - except 
asyncio.TimeoutError: - return build_error_response( - code=ErrorCode.TIMEOUT_ERROR, - message="code:bash execution timeout", - tool=full_name, - execution_time_ms=(time.time() - start_time) * 1000, - resource_type=self.name, - session_id=session_id, - trace_id=trace_id, - ) - else: - result = await tool.call(normalized_params, ctx) + result = await tool.call(normalized_params, ctx) except Exception as exc: return build_error_response( code=ErrorCode.EXECUTION_ERROR, @@ -434,41 +296,6 @@ async def _dispatch( trace_id=trace_id, ) - async def _run_bash_command( - self, - command: str, - workspace: Path, - timeout_seconds: float, - ) -> str: - proc = await asyncio.create_subprocess_shell( - command, - cwd=str(workspace), - stdout=asyncio.subprocess.PIPE, - stderr=asyncio.subprocess.PIPE, - start_new_session=True, - ) - try: - stdout_bytes, stderr_bytes = await asyncio.wait_for( - proc.communicate(), - timeout=timeout_seconds, - ) - except asyncio.TimeoutError: - if proc.returncode is None: - try: - os.killpg(proc.pid, signal.SIGKILL) - except ProcessLookupError: - pass - await proc.communicate() - raise - - stdout_text = stdout_bytes.decode("utf-8", errors="replace") if stdout_bytes else "" - stderr_text = stderr_bytes.decode("utf-8", errors="replace") if stderr_bytes else "" - - out = stdout_text - if stderr_text: - out += f"\n[stderr]:\n{stderr_text}" - return out.strip() or "(no output)" - def _normalize_tool_params( self, tool_name: str, @@ -512,13 +339,4 @@ def _normalize_tool_params( ): raise ValueError("Glob pattern must not contain parent traversal segments") - if tool_name == "bash": - if "command" not in normalized: - raise ValueError("Parameter 'command' is required") - command = normalized.get("command") - if not isinstance(command, str): - raise ValueError("Parameter 'command' must be a string") - if not command.strip(): - raise ValueError("Parameter 'command' must not be empty") - return normalized diff --git a/sandbox/tests/test_code_backend.py 
b/sandbox/tests/test_code_backend.py index 3cf93ca..aa3ae7f 100644 --- a/sandbox/tests/test_code_backend.py +++ b/sandbox/tests/test_code_backend.py @@ -6,10 +6,9 @@ import importlib.util import itertools import os -import time -import shlex import sys from pathlib import Path +import types import pytest @@ -25,15 +24,49 @@ / "resources" / "code.py" ) +VENDOR_PACKAGE_DIR = MODULE_PATH.parent / "code_vendor" + + +def install_resources_package_stub(): + package_name = "sandbox.server.backends.resources" + if package_name not in sys.modules: + package = types.ModuleType(package_name) + package.__path__ = [str(MODULE_PATH.parent)] + sys.modules[package_name] = package + + vendor_package_name = f"{package_name}.code_vendor" + if vendor_package_name not in sys.modules: + package_spec = importlib.util.spec_from_file_location( + vendor_package_name, + VENDOR_PACKAGE_DIR / "__init__.py", + submodule_search_locations=[str(VENDOR_PACKAGE_DIR)], + ) + package = importlib.util.module_from_spec(package_spec) + assert package_spec is not None + assert package_spec.loader is not None + sys.modules[vendor_package_name] = package + package_spec.loader.exec_module(package) + + for module_name in ("tool", "file_tools", "edit_tools"): + full_name = f"{vendor_package_name}.{module_name}" + if full_name in sys.modules: + continue + module_spec = importlib.util.spec_from_file_location( + full_name, + VENDOR_PACKAGE_DIR / f"{module_name}.py", + ) + module = importlib.util.module_from_spec(module_spec) + assert module_spec is not None + assert module_spec.loader is not None + sys.modules[full_name] = module + module_spec.loader.exec_module(module) def load_code_backend_module(): + install_resources_package_stub() unique_id = next(_MODULE_LOAD_COUNTER) module_name = f"_test_code_backend_{unique_id}" - spec = importlib.util.spec_from_file_location( - module_name, - MODULE_PATH, - ) + spec = importlib.util.spec_from_file_location(module_name, MODULE_PATH) module = 
importlib.util.module_from_spec(spec) assert spec is not None assert spec.loader is not None @@ -55,162 +88,6 @@ def register_tool(self, name, func, resource_type=None): self._tool_resource_types[name] = resource_type -def build_backend_config(tmp_path): - return BackendConfig( - enabled=True, - default_config={ - "claude_code_root": str(tmp_path / "claude-code-py"), - "workspace_root": str(tmp_path / "agentflow_code"), - "allow_bash": True, - "bash_timeout_seconds": 30, - }, - description="Code backend", - ) - - -def create_fake_claude_code_root(tmp_path): - root = tmp_path / "claude-code-py" - tools_dir = root / "tools" - tools_dir.mkdir(parents=True, exist_ok=True) - - (root / "log.py").write_text( - "ROOT_LOG_MARKER = 'log-helper'\n", - encoding="utf-8", - ) - - (root / "trace.py").write_text( - "ROOT_TRACE_MARKER = 'trace-helper'\n", - encoding="utf-8", - ) - - (root / "tool.py").write_text( - "from log import ROOT_LOG_MARKER\n" - "from trace import ROOT_TRACE_MARKER\n" - "\n" - "class Tool:\n" - " ROOT_LOG_MARKER = ROOT_LOG_MARKER\n" - " ROOT_TRACE_MARKER = ROOT_TRACE_MARKER\n" - "\n" - " async def call(self, params, ctx):\n" - " raise NotImplementedError\n", - encoding="utf-8", - ) - - (tools_dir / "__init__.py").write_text( - "raise RuntimeError('tools package import should not happen')\n", - encoding="utf-8", - ) - - (tools_dir / "file_tools.py").write_text( - "from tool import Tool\n" - "import asyncio\n" - "from pathlib import Path\n" - "import glob\n" - "\n" - "class ReadTool(Tool):\n" - " def __init__(self):\n" - " self.loaded_log_marker = self.ROOT_LOG_MARKER\n" - " self.loaded_trace_marker = self.ROOT_TRACE_MARKER\n" - "\n" - " async def call(self, params, ctx):\n" - " file_path = Path(params['file_path'])\n" - " if not file_path.exists():\n" - " return f'Error: File not found: {file_path}'\n" - " return file_path.read_text(encoding='utf-8')\n" - "\n" - "class GlobTool(Tool):\n" - " async def call(self, params, ctx):\n" - " base = 
Path(params.get('path', '.'))\n" - " pattern = params.get('pattern', '*')\n" - " matches = sorted(glob.glob(pattern, root_dir=str(base), recursive=True))\n" - " return [str((base / match).resolve(strict=False)) for match in matches]\n" - "\n" - "class GrepTool(Tool):\n" - " async def call(self, params, ctx):\n" - " return f\"Found 0 matches in {Path(params.get('path', '.'))}\"\n" - "\n" - "class BashTool(Tool):\n" - " async def call(self, params, ctx):\n" - " command = params.get('command', '')\n" - " if command == 'pwd':\n" - " return ctx.cwd\n" - " if command.startswith('sleep '):\n" - " await asyncio.sleep(float(command.split(' ', 1)[1]))\n" - " return 'slept'\n" - " return f\"ran: {command}\"\n", - encoding="utf-8", - ) - - (tools_dir / "edit_tools.py").write_text( - "from tool import Tool\n" - "from pathlib import Path\n" - "\n" - "class EditTool(Tool):\n" - " async def call(self, params, ctx):\n" - " file_path = Path(params['file_path'])\n" - " if not file_path.exists():\n" - " return f'Error: File does not exist: {file_path}'\n" - " text = file_path.read_text(encoding='utf-8')\n" - " text = text.replace(params.get('old_string', ''), params.get('new_string', ''))\n" - " file_path.write_text(text, encoding='utf-8')\n" - " return f\"Updated {file_path}\"\n" - "\n" - "class WriteTool(Tool):\n" - " async def call(self, params, ctx):\n" - " file_path = Path(params['file_path'])\n" - " file_path.parent.mkdir(parents=True, exist_ok=True)\n" - " file_path.write_text(params.get('content', ''), encoding='utf-8')\n" - " return f\"Wrote {file_path}\"\n", - encoding="utf-8", - ) - return root - - -def create_marker_claude_code_root(tmp_path, root_name: str, marker: str): - root = tmp_path / root_name - tools_dir = root / "tools" - tools_dir.mkdir(parents=True, exist_ok=True) - - (root / "tool.py").write_text( - f"class Tool:\n" - f" ROOT_MARKER = {marker!r}\n" - " async def call(self, params, ctx):\n" - " return self.ROOT_MARKER\n", - encoding="utf-8", - ) - - (tools_dir 
/ "file_tools.py").write_text( - "from tool import Tool\n" - "\n" - "class ReadTool(Tool):\n" - " def __init__(self):\n" - " self.loaded_marker = self.ROOT_MARKER\n" - "\n" - "class GlobTool(Tool):\n" - " pass\n" - "\n" - "class GrepTool(Tool):\n" - " pass\n" - "\n" - "class BashTool(Tool):\n" - " pass\n", - encoding="utf-8", - ) - - (tools_dir / "edit_tools.py").write_text( - "from tool import Tool\n" - "\n" - "class EditTool(Tool):\n" - " pass\n" - "\n" - "class WriteTool(Tool):\n" - " pass\n", - encoding="utf-8", - ) - - return root - - class FakeResourceRouter: def __init__(self, session_info): self._session_info = session_info @@ -232,12 +109,51 @@ async def destroy_session(self, worker_id, resource_type): return True -def test_bind_server_registers_code_tools(tmp_path): +def build_backend_config(tmp_path): + return BackendConfig( + enabled=True, + default_config={ + "workspace_root": str(tmp_path / "agentflow_code"), + }, + description="Code backend", + ) + + +def build_backend(tmp_path): module = load_code_backend_module() backend = module.CodeBackend(config=build_backend_config(tmp_path)) - fake_server = FakeServer() + return module, backend + +def bind_backend_tools(backend): + fake_server = FakeServer() backend.bind_server(fake_server) + return fake_server + + +def build_executor(fake_server, session_info): + return ToolExecutor( + tools=fake_server._tools, + tool_name_index={}, + tool_resource_types=fake_server._tool_resource_types, + resource_router=FakeResourceRouter(session_info), + ) + + +def execute_tool(executor, action, *, params, worker_id, trace_id): + return asyncio.run( + executor.execute( + action=action, + params=params, + worker_id=worker_id, + trace_id=trace_id, + ) + ) + + +def test_bind_server_registers_code_tools(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) assert "code:read" in fake_server._tools assert "code:bash" in fake_server._tools @@ -245,10 +161,8 @@ def 
test_bind_server_registers_code_tools(tmp_path): assert fake_server._tool_resource_types["code:bash"] == "code" -def test_initialize_creates_worker_workspace(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) +def test_initialize_does_not_require_external_root(tmp_path): + _, backend = build_backend(tmp_path) session = asyncio.run(backend.initialize("runner_123", {})) @@ -257,9 +171,7 @@ def test_initialize_creates_worker_workspace(tmp_path): def test_initialize_copies_source_dir(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) + _, backend = build_backend(tmp_path) source_dir = tmp_path / "source" source_dir.mkdir(parents=True) (source_dir / "demo.py").write_text("print('hi')\n", encoding="utf-8") @@ -273,431 +185,131 @@ def test_initialize_copies_source_dir(tmp_path): assert copied.read_text(encoding="utf-8") == "print('hi')\n" -def test_load_claude_code_tools_uses_direct_file_loading(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) +def test_load_code_tools_uses_internal_vendor_package(tmp_path): + _, backend = build_backend(tmp_path) - tools = backend._load_claude_code_tools() + tools = backend._load_code_tools() assert set(tools.keys()) == {"read", "glob", "grep", "bash", "edit", "write"} - - -def test_load_claude_code_tools_supports_root_local_tool_dependencies(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) - - tools = backend._load_claude_code_tools() - - assert tools["read"].loaded_log_marker == "log-helper" - assert tools["read"].loaded_trace_marker == "trace-helper" - - -def 
test_load_code_backend_module_does_not_install_resources_package_in_sys_modules(): - package_name = "sandbox.server.backends.resources" - previous = sys.modules.pop(package_name, None) - - try: - module = load_code_backend_module() - assert hasattr(module, "CodeBackend") - assert package_name not in sys.modules - finally: - if previous is not None: - sys.modules[package_name] = previous - - -def test_load_claude_code_tools_is_isolated_per_backend_root(tmp_path): - module = load_code_backend_module() - root_a = create_marker_claude_code_root(tmp_path, "claude-code-a", "root-a") - root_b = create_marker_claude_code_root(tmp_path, "claude-code-b", "root-b") - - config_a = build_backend_config(tmp_path) - config_a.default_config["claude_code_root"] = str(root_a) - backend_a = module.CodeBackend(config=config_a) - - config_b = build_backend_config(tmp_path) - config_b.default_config["claude_code_root"] = str(root_b) - backend_b = module.CodeBackend(config=config_b) - - tools_a = backend_a._load_claude_code_tools() - tools_b = backend_b._load_claude_code_tools() - - assert tools_a["read"].loaded_marker == "root-a" - assert tools_b["read"].loaded_marker == "root-b" + assert type(tools["read"]).__module__ == ( + "sandbox.server.backends.resources.code_vendor.file_tools" + ) + assert type(tools["edit"]).__module__ == ( + "sandbox.server.backends.resources.code_vendor.edit_tools" + ) + assert backend._load_code_tools() is tools def test_tool_executor_code_dispatch_returns_standard_success_response(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) - fake_server = FakeServer() - backend.bind_server(fake_server) + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) runtime_workspace = tmp_path / "agentflow_code" / "worker-1" runtime_workspace.mkdir(parents=True) demo_file = runtime_workspace / "demo.py" demo_file.write_text("hello from 
demo\n", encoding="utf-8") - - executor = ToolExecutor( - tools=fake_server._tools, - tool_name_index={}, - tool_resource_types=fake_server._tool_resource_types, - resource_router=FakeResourceRouter( - { - "session_id": "code-session-1", - "data": {"workspace": str(runtime_workspace)}, - } - ), + executor = build_executor( + fake_server, + { + "session_id": "code-session-1", + "data": {"workspace": str(runtime_workspace)}, + }, ) - result = asyncio.run( - executor.execute( - action="code:read", - params={"file_path": str(demo_file)}, - worker_id="worker-1", - trace_id="trace-1", - ) + result = execute_tool( + executor, + "code:read", + params={"file_path": str(demo_file)}, + worker_id="worker-1", + trace_id="trace-1", ) assert result["code"] == ErrorCode.SUCCESS - assert result["data"] == "hello from demo\n" + assert result["data"] == " 1→hello from demo" def test_tool_executor_code_dispatch_preserves_trace_id(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) - fake_server = FakeServer() - backend.bind_server(fake_server) + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) runtime_workspace = tmp_path / "agentflow_code" / "worker-1" runtime_workspace.mkdir(parents=True) demo_file = runtime_workspace / "demo.py" demo_file.write_text("hello from demo\n", encoding="utf-8") - - executor = ToolExecutor( - tools=fake_server._tools, - tool_name_index={}, - tool_resource_types=fake_server._tool_resource_types, - resource_router=FakeResourceRouter( - { - "session_id": "code-session-trace", - "data": {"workspace": str(runtime_workspace)}, - } - ), + executor = build_executor( + fake_server, + { + "session_id": "code-session-trace", + "data": {"workspace": str(runtime_workspace)}, + }, ) - result = asyncio.run( - executor.execute( - action="code:read", - params={"file_path": str(demo_file)}, - worker_id="worker-1", - 
trace_id="trace-preserve-1", - ) + result = execute_tool( + executor, + "code:read", + params={"file_path": str(demo_file)}, + worker_id="worker-1", + trace_id="trace-preserve-1", ) assert result["code"] == ErrorCode.SUCCESS assert result["meta"]["trace_id"] == "trace-preserve-1" -def test_tool_executor_blocks_bash_when_allow_bash_false(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - config = build_backend_config(tmp_path) - config.default_config["allow_bash"] = False - backend = module.CodeBackend(config=config) - fake_server = FakeServer() - backend.bind_server(fake_server) - - executor = ToolExecutor( - tools=fake_server._tools, - tool_name_index={}, - tool_resource_types=fake_server._tool_resource_types, - resource_router=FakeResourceRouter( - { - "session_id": "code-session-2", - "data": { - "workspace": str(tmp_path / "agentflow_code" / "runtime-workspace") - }, - } - ), - ) - - result = asyncio.run( - executor.execute( - action="code:bash", - params={"command": "echo hi"}, - worker_id="worker-1", - trace_id="trace-1", - ) - ) - - assert result["code"] == ErrorCode.BUSINESS_FAILURE - assert "disabled" in result["message"].lower() - - -def test_tool_executor_runs_bash_in_session_workspace_when_enabled(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) - fake_server = FakeServer() - backend.bind_server(fake_server) - +def test_tool_executor_runs_bash_via_vendored_tool(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) runtime_workspace = tmp_path / "agentflow_code" / "worker-1" runtime_workspace.mkdir(parents=True) - executor = ToolExecutor( - tools=fake_server._tools, - tool_name_index={}, - tool_resource_types=fake_server._tool_resource_types, - resource_router=FakeResourceRouter( - { - "session_id": "code-session-bash-enabled", - "data": {"workspace": 
str(runtime_workspace)}, - } - ), - ) - - result = asyncio.run( - executor.execute( - action="code:bash", - params={"command": "pwd"}, - worker_id="worker-1", - trace_id="trace-bash-enabled", - ) - ) - - assert result["code"] == ErrorCode.SUCCESS - assert result["data"].strip() == str(runtime_workspace.resolve(strict=False)) - - -def test_tool_executor_bash_success_formats_stdout_and_stderr_like_upstream(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) - fake_server = FakeServer() - backend.bind_server(fake_server) - - runtime_workspace = tmp_path / "agentflow_code" / "worker-1" - runtime_workspace.mkdir(parents=True) - executor = ToolExecutor( - tools=fake_server._tools, - tool_name_index={}, - tool_resource_types=fake_server._tool_resource_types, - resource_router=FakeResourceRouter( - { - "session_id": "code-session-bash-format", - "data": {"workspace": str(runtime_workspace)}, - } - ), + executor = build_executor( + fake_server, + { + "session_id": "code-session-bash", + "data": {"workspace": str(runtime_workspace)}, + }, ) - result = asyncio.run( - executor.execute( - action="code:bash", - params={ - "command": ( - f"{shlex.quote(sys.executable)} -c " - "\"import sys; print('stdout-line'); print('stderr-line', file=sys.stderr)\"" - ) - }, - worker_id="worker-1", - trace_id="trace-bash-format", - ) + result = execute_tool( + executor, + "code:bash", + params={"command": "pwd"}, + worker_id="worker-1", + trace_id="trace-bash", ) assert result["code"] == ErrorCode.SUCCESS - assert result["data"] == "stdout-line\n\n[stderr]:\nstderr-line" - - -def test_tool_executor_returns_timeout_error_when_bash_exceeds_limit(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - config = build_backend_config(tmp_path) - config.default_config["bash_timeout_seconds"] = 0.01 - backend = module.CodeBackend(config=config) - fake_server = 
FakeServer() - backend.bind_server(fake_server) - - runtime_workspace = tmp_path / "agentflow_code" / "worker-1" - runtime_workspace.mkdir(parents=True) - executor = ToolExecutor( - tools=fake_server._tools, - tool_name_index={}, - tool_resource_types=fake_server._tool_resource_types, - resource_router=FakeResourceRouter( - { - "session_id": "code-session-bash-timeout", - "data": {"workspace": str(runtime_workspace)}, - } - ), - ) - - result = asyncio.run( - executor.execute( - action="code:bash", - params={ - "command": ( - f"{shlex.quote(sys.executable)} -c " - "\"import time; time.sleep(2)\"" - ) - }, - worker_id="worker-1", - trace_id="trace-bash-timeout", - ) - ) - - assert result["code"] == ErrorCode.TIMEOUT_ERROR - assert "timeout" in result["message"].lower() - - -def test_tool_executor_bash_timeout_returns_promptly_for_blocking_command(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - config = build_backend_config(tmp_path) - config.default_config["bash_timeout_seconds"] = 0.1 - backend = module.CodeBackend(config=config) - fake_server = FakeServer() - backend.bind_server(fake_server) - - runtime_workspace = tmp_path / "agentflow_code" / "worker-1" - runtime_workspace.mkdir(parents=True) - executor = ToolExecutor( - tools=fake_server._tools, - tool_name_index={}, - tool_resource_types=fake_server._tool_resource_types, - resource_router=FakeResourceRouter( - { - "session_id": "code-session-bash-real-timeout", - "data": {"workspace": str(runtime_workspace)}, - } - ), - ) - - start = time.monotonic() - result = asyncio.run( - executor.execute( - action="code:bash", - params={ - "command": ( - f"{shlex.quote(sys.executable)} -c " - "\"import time; time.sleep(5)\"" - ) - }, - worker_id="worker-1", - trace_id="trace-bash-real-timeout", - ) - ) - elapsed = time.monotonic() - start - - assert result["code"] == ErrorCode.TIMEOUT_ERROR - assert elapsed < 2.0 - - -def test_tool_executor_bash_rejects_missing_command(tmp_path): 
- module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) - fake_server = FakeServer() - backend.bind_server(fake_server) - - runtime_workspace = tmp_path / "agentflow_code" / "worker-1" - runtime_workspace.mkdir(parents=True) - executor = ToolExecutor( - tools=fake_server._tools, - tool_name_index={}, - tool_resource_types=fake_server._tool_resource_types, - resource_router=FakeResourceRouter( - { - "session_id": "code-session-bash-missing-command", - "data": {"workspace": str(runtime_workspace)}, - } - ), - ) - - result = asyncio.run( - executor.execute( - action="code:bash", - params={}, - worker_id="worker-1", - trace_id="trace-bash-missing-command", - ) - ) - - assert result["code"] == ErrorCode.INVALID_INPUT - assert "command" in result["message"].lower() - - -def test_tool_executor_bash_rejects_non_string_command(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) - fake_server = FakeServer() - backend.bind_server(fake_server) - - runtime_workspace = tmp_path / "agentflow_code" / "worker-1" - runtime_workspace.mkdir(parents=True) - executor = ToolExecutor( - tools=fake_server._tools, - tool_name_index={}, - tool_resource_types=fake_server._tool_resource_types, - resource_router=FakeResourceRouter( - { - "session_id": "code-session-bash-non-string-command", - "data": {"workspace": str(runtime_workspace)}, - } - ), - ) - - result = asyncio.run( - executor.execute( - action="code:bash", - params={"command": 123}, - worker_id="worker-1", - trace_id="trace-bash-non-string-command", - ) - ) - - assert result["code"] == ErrorCode.INVALID_INPUT - assert "command" in result["message"].lower() + assert result["data"] == str(runtime_workspace.resolve(strict=False)) def test_tool_executor_non_bash_timeout_uses_standard_error_handling(tmp_path): - module = 
load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) - fake_server = FakeServer() - backend.bind_server(fake_server) + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) class TimeoutReadTool: async def call(self, params, ctx): del params, ctx raise asyncio.TimeoutError("read timeout") - backend._load_claude_code_tools = lambda: {"read": TimeoutReadTool()} + tools = backend._load_code_tools() + backend._tool_instances = dict(tools) + backend._tool_instances["read"] = TimeoutReadTool() runtime_workspace = tmp_path / "agentflow_code" / "worker-1" runtime_workspace.mkdir(parents=True) demo_file = runtime_workspace / "demo.py" demo_file.write_text("hello from demo\n", encoding="utf-8") - - executor = ToolExecutor( - tools=fake_server._tools, - tool_name_index={}, - tool_resource_types=fake_server._tool_resource_types, - resource_router=FakeResourceRouter( - { - "session_id": "code-session-read-timeout", - "data": {"workspace": str(runtime_workspace)}, - } - ), + executor = build_executor( + fake_server, + { + "session_id": "code-session-read-timeout", + "data": {"workspace": str(runtime_workspace)}, + }, ) - result = asyncio.run( - executor.execute( - action="code:read", - params={"file_path": str(demo_file)}, - worker_id="worker-1", - trace_id="trace-read-timeout", - ) + result = execute_tool( + executor, + "code:read", + params={"file_path": str(demo_file)}, + worker_id="worker-1", + trace_id="trace-read-timeout", ) assert result["code"] == ErrorCode.EXECUTION_ERROR @@ -705,12 +317,8 @@ async def call(self, params, ctx): def test_code_write_relative_file_path_resolves_inside_session_workspace(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) - fake_server = FakeServer() - backend.bind_server(fake_server) - + _, backend = build_backend(tmp_path) + 
fake_server = bind_backend_tools(backend) runtime_workspace = tmp_path / "agentflow_code" / "worker-1" runtime_workspace.mkdir(parents=True) process_cwd = tmp_path / "process-cwd" @@ -718,25 +326,20 @@ def test_code_write_relative_file_path_resolves_inside_session_workspace(tmp_pat prev_cwd = Path.cwd() os.chdir(process_cwd) try: - executor = ToolExecutor( - tools=fake_server._tools, - tool_name_index={}, - tool_resource_types=fake_server._tool_resource_types, - resource_router=FakeResourceRouter( - { - "session_id": "code-session-3", - "data": {"workspace": str(runtime_workspace)}, - } - ), + executor = build_executor( + fake_server, + { + "session_id": "code-session-3", + "data": {"workspace": str(runtime_workspace)}, + }, ) - result = asyncio.run( - executor.execute( - action="code:write", - params={"file_path": "nested/output.txt", "content": "from workspace\n"}, - worker_id="worker-1", - trace_id="trace-1", - ) + result = execute_tool( + executor, + "code:write", + params={"file_path": "nested/output.txt", "content": "from workspace\n"}, + worker_id="worker-1", + trace_id="trace-1", ) finally: os.chdir(prev_cwd) @@ -749,33 +352,24 @@ def test_code_write_relative_file_path_resolves_inside_session_workspace(tmp_pat def test_code_read_error_prefix_is_returned_as_agentflow_error_response(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) - fake_server = FakeServer() - backend.bind_server(fake_server) - + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) runtime_workspace = tmp_path / "agentflow_code" / "worker-1" runtime_workspace.mkdir(parents=True) - executor = ToolExecutor( - tools=fake_server._tools, - tool_name_index={}, - tool_resource_types=fake_server._tool_resource_types, - resource_router=FakeResourceRouter( - { - "session_id": "code-session-4", - "data": {"workspace": str(runtime_workspace)}, - } - ), + executor = 
build_executor( + fake_server, + { + "session_id": "code-session-4", + "data": {"workspace": str(runtime_workspace)}, + }, ) - result = asyncio.run( - executor.execute( - action="code:read", - params={"file_path": "missing.txt"}, - worker_id="worker-1", - trace_id="trace-1", - ) + result = execute_tool( + executor, + "code:read", + params={"file_path": "missing.txt"}, + worker_id="worker-1", + trace_id="trace-1", ) assert result["code"] != ErrorCode.SUCCESS @@ -783,36 +377,26 @@ def test_code_read_error_prefix_is_returned_as_agentflow_error_response(tmp_path def test_tool_executor_rejects_missing_session_workspace_without_fallback(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) - fake_server = FakeServer() - backend.bind_server(fake_server) - + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) workspace_root = tmp_path / "agentflow_code" workspace_root.mkdir(parents=True, exist_ok=True) fallback_file = workspace_root / "fallback.txt" fallback_file.write_text("must-not-read\n", encoding="utf-8") - - executor = ToolExecutor( - tools=fake_server._tools, - tool_name_index={}, - tool_resource_types=fake_server._tool_resource_types, - resource_router=FakeResourceRouter( - { - "session_id": "code-session-missing-workspace", - "data": {}, - } - ), + executor = build_executor( + fake_server, + { + "session_id": "code-session-missing-workspace", + "data": {}, + }, ) - result = asyncio.run( - executor.execute( - action="code:read", - params={"file_path": "fallback.txt"}, - worker_id="worker-1", - trace_id="trace-missing-workspace", - ) + result = execute_tool( + executor, + "code:read", + params={"file_path": "fallback.txt"}, + worker_id="worker-1", + trace_id="trace-missing-workspace", ) assert result["code"] == ErrorCode.BUSINESS_FAILURE @@ -820,31 +404,22 @@ def 
test_tool_executor_rejects_missing_session_workspace_without_fallback(tmp_pa def test_tool_executor_rejects_malformed_session_workspace_without_fallback(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) - fake_server = FakeServer() - backend.bind_server(fake_server) - - executor = ToolExecutor( - tools=fake_server._tools, - tool_name_index={}, - tool_resource_types=fake_server._tool_resource_types, - resource_router=FakeResourceRouter( - { - "session_id": "code-session-malformed-workspace", - "data": {"workspace": 123}, - } - ), + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + executor = build_executor( + fake_server, + { + "session_id": "code-session-malformed-workspace", + "data": {"workspace": 123}, + }, ) - result = asyncio.run( - executor.execute( - action="code:read", - params={"file_path": "fallback.txt"}, - worker_id="worker-1", - trace_id="trace-malformed-workspace", - ) + result = execute_tool( + executor, + "code:read", + params={"file_path": "fallback.txt"}, + worker_id="worker-1", + trace_id="trace-malformed-workspace", ) assert result["code"] == ErrorCode.BUSINESS_FAILURE @@ -852,33 +427,23 @@ def test_tool_executor_rejects_malformed_session_workspace_without_fallback(tmp_ def test_tool_executor_rejects_nonexistent_session_workspace_under_workspace_root(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) - fake_server = FakeServer() - backend.bind_server(fake_server) - + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) nonexistent_workspace = tmp_path / "agentflow_code" / "worker-1" - - executor = ToolExecutor( - tools=fake_server._tools, - tool_name_index={}, - tool_resource_types=fake_server._tool_resource_types, - resource_router=FakeResourceRouter( - { - 
"session_id": "code-session-nonexistent-workspace", - "data": {"workspace": str(nonexistent_workspace)}, - } - ), + executor = build_executor( + fake_server, + { + "session_id": "code-session-nonexistent-workspace", + "data": {"workspace": str(nonexistent_workspace)}, + }, ) - result = asyncio.run( - executor.execute( - action="code:read", - params={"file_path": "demo.py"}, - worker_id="worker-1", - trace_id="trace-nonexistent-workspace", - ) + result = execute_tool( + executor, + "code:read", + params={"file_path": "demo.py"}, + worker_id="worker-1", + trace_id="trace-nonexistent-workspace", ) assert result["code"] == ErrorCode.BUSINESS_FAILURE @@ -886,36 +451,26 @@ def test_tool_executor_rejects_nonexistent_session_workspace_under_workspace_roo def test_tool_executor_rejects_mismatched_session_workspace_under_workspace_root(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) - fake_server = FakeServer() - backend.bind_server(fake_server) - + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) mismatched_workspace = tmp_path / "agentflow_code" / "other-worker" mismatched_workspace.mkdir(parents=True) demo_file = mismatched_workspace / "demo.py" demo_file.write_text("should-not-read\n", encoding="utf-8") - - executor = ToolExecutor( - tools=fake_server._tools, - tool_name_index={}, - tool_resource_types=fake_server._tool_resource_types, - resource_router=FakeResourceRouter( - { - "session_id": "code-session-mismatched-workspace", - "data": {"workspace": str(mismatched_workspace)}, - } - ), + executor = build_executor( + fake_server, + { + "session_id": "code-session-mismatched-workspace", + "data": {"workspace": str(mismatched_workspace)}, + }, ) - result = asyncio.run( - executor.execute( - action="code:read", - params={"file_path": str(demo_file)}, - worker_id="worker-1", - trace_id="trace-mismatched-workspace", - ) + result = 
execute_tool( + executor, + "code:read", + params={"file_path": str(demo_file)}, + worker_id="worker-1", + trace_id="trace-mismatched-workspace", ) assert result["code"] == ErrorCode.BUSINESS_FAILURE @@ -923,36 +478,26 @@ def test_tool_executor_rejects_mismatched_session_workspace_under_workspace_root def test_tool_executor_rejects_session_workspace_outside_workspace_root(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) - fake_server = FakeServer() - backend.bind_server(fake_server) - + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) outside_workspace = tmp_path / "outside-workspace" outside_workspace.mkdir(parents=True) demo_file = outside_workspace / "demo.py" demo_file.write_text("outside\n", encoding="utf-8") - - executor = ToolExecutor( - tools=fake_server._tools, - tool_name_index={}, - tool_resource_types=fake_server._tool_resource_types, - resource_router=FakeResourceRouter( - { - "session_id": "code-session-outside-workspace", - "data": {"workspace": str(outside_workspace)}, - } - ), + executor = build_executor( + fake_server, + { + "session_id": "code-session-outside-workspace", + "data": {"workspace": str(outside_workspace)}, + }, ) - result = asyncio.run( - executor.execute( - action="code:read", - params={"file_path": str(demo_file)}, - worker_id="worker-1", - trace_id="trace-outside-workspace", - ) + result = execute_tool( + executor, + "code:read", + params={"file_path": str(demo_file)}, + worker_id="worker-1", + trace_id="trace-outside-workspace", ) assert result["code"] == ErrorCode.BUSINESS_FAILURE @@ -960,9 +505,7 @@ def test_tool_executor_rejects_session_workspace_outside_workspace_root(tmp_path def test_initialize_recreates_worker_workspace_without_stale_files(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = 
module.CodeBackend(config=build_backend_config(tmp_path)) + _, backend = build_backend(tmp_path) first_source = tmp_path / "source-first" second_source = tmp_path / "source-second" first_source.mkdir(parents=True) @@ -984,69 +527,51 @@ def test_initialize_recreates_worker_workspace_without_stale_files(tmp_path): def test_code_read_rejects_absolute_path_outside_workspace(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) - fake_server = FakeServer() - backend.bind_server(fake_server) - + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) runtime_workspace = tmp_path / "agentflow_code" / "worker-1" runtime_workspace.mkdir(parents=True) outside_file = tmp_path / "outside.txt" outside_file.write_text("secret\n", encoding="utf-8") - executor = ToolExecutor( - tools=fake_server._tools, - tool_name_index={}, - tool_resource_types=fake_server._tool_resource_types, - resource_router=FakeResourceRouter( - { - "session_id": "code-session-5", - "data": {"workspace": str(runtime_workspace)}, - } - ), + executor = build_executor( + fake_server, + { + "session_id": "code-session-5", + "data": {"workspace": str(runtime_workspace)}, + }, ) - result = asyncio.run( - executor.execute( - action="code:read", - params={"file_path": str(outside_file)}, - worker_id="worker-1", - trace_id="trace-1", - ) + result = execute_tool( + executor, + "code:read", + params={"file_path": str(outside_file)}, + worker_id="worker-1", + trace_id="trace-1", ) assert result["code"] != ErrorCode.SUCCESS def test_code_write_rejects_parent_escape_outside_workspace(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) - fake_server = FakeServer() - backend.bind_server(fake_server) - + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) 
runtime_workspace = tmp_path / "agentflow_code" / "worker-1" runtime_workspace.mkdir(parents=True) escaped_file = tmp_path / "escaped.txt" - executor = ToolExecutor( - tools=fake_server._tools, - tool_name_index={}, - tool_resource_types=fake_server._tool_resource_types, - resource_router=FakeResourceRouter( - { - "session_id": "code-session-6", - "data": {"workspace": str(runtime_workspace)}, - } - ), + executor = build_executor( + fake_server, + { + "session_id": "code-session-6", + "data": {"workspace": str(runtime_workspace)}, + }, ) - result = asyncio.run( - executor.execute( - action="code:write", - params={"file_path": "../escaped.txt", "content": "escaped\n"}, - worker_id="worker-1", - trace_id="trace-1", - ) + result = execute_tool( + executor, + "code:write", + params={"file_path": "../escaped.txt", "content": "escaped\n"}, + worker_id="worker-1", + trace_id="trace-1", ) assert result["code"] != ErrorCode.SUCCESS @@ -1054,33 +579,24 @@ def test_code_write_rejects_parent_escape_outside_workspace(tmp_path): def test_code_glob_rejects_parent_traversal_pattern(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) - fake_server = FakeServer() - backend.bind_server(fake_server) - + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) runtime_workspace = tmp_path / "agentflow_code" / "worker-1" runtime_workspace.mkdir(parents=True) - executor = ToolExecutor( - tools=fake_server._tools, - tool_name_index={}, - tool_resource_types=fake_server._tool_resource_types, - resource_router=FakeResourceRouter( - { - "session_id": "code-session-glob-parent-traversal", - "data": {"workspace": str(runtime_workspace)}, - } - ), + executor = build_executor( + fake_server, + { + "session_id": "code-session-glob-parent-traversal", + "data": {"workspace": str(runtime_workspace)}, + }, ) - result = asyncio.run( - executor.execute( - 
action="code:glob", - params={"path": ".", "pattern": "../*"}, - worker_id="worker-1", - trace_id="trace-glob-parent-traversal", - ) + result = execute_tool( + executor, + "code:glob", + params={"path": ".", "pattern": "../*"}, + worker_id="worker-1", + trace_id="trace-glob-parent-traversal", ) assert result["code"] == ErrorCode.BUSINESS_FAILURE @@ -1088,35 +604,29 @@ def test_code_glob_rejects_parent_traversal_pattern(tmp_path): def test_code_glob_rejects_embedded_parent_traversal_pattern(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) - fake_server = FakeServer() - backend.bind_server(fake_server) - + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) runtime_workspace = tmp_path / "agentflow_code" / "worker-1" runtime_workspace.mkdir(parents=True) (runtime_workspace / "nested").mkdir(parents=True) - (runtime_workspace / "nested" / "demo.py").write_text("print('safe')\n", encoding="utf-8") - executor = ToolExecutor( - tools=fake_server._tools, - tool_name_index={}, - tool_resource_types=fake_server._tool_resource_types, - resource_router=FakeResourceRouter( - { - "session_id": "code-session-glob-embedded-traversal", - "data": {"workspace": str(runtime_workspace)}, - } - ), + (runtime_workspace / "nested" / "demo.py").write_text( + "print('safe')\n", + encoding="utf-8", + ) + executor = build_executor( + fake_server, + { + "session_id": "code-session-glob-embedded-traversal", + "data": {"workspace": str(runtime_workspace)}, + }, ) - result = asyncio.run( - executor.execute( - action="code:glob", - params={"path": ".", "pattern": "**/../*"}, - worker_id="worker-1", - trace_id="trace-glob-embedded-traversal", - ) + result = execute_tool( + executor, + "code:glob", + params={"path": ".", "pattern": "**/../*"}, + worker_id="worker-1", + trace_id="trace-glob-embedded-traversal", ) assert result["code"] == 
ErrorCode.BUSINESS_FAILURE @@ -1124,47 +634,35 @@ def test_code_glob_rejects_embedded_parent_traversal_pattern(tmp_path): def test_code_glob_allows_safe_workspace_pattern(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) - fake_server = FakeServer() - backend.bind_server(fake_server) - + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) runtime_workspace = tmp_path / "agentflow_code" / "worker-1" runtime_workspace.mkdir(parents=True) safe_file = runtime_workspace / "nested" / "demo.py" safe_file.parent.mkdir(parents=True) safe_file.write_text("print('ok')\n", encoding="utf-8") - executor = ToolExecutor( - tools=fake_server._tools, - tool_name_index={}, - tool_resource_types=fake_server._tool_resource_types, - resource_router=FakeResourceRouter( - { - "session_id": "code-session-glob-safe", - "data": {"workspace": str(runtime_workspace)}, - } - ), + executor = build_executor( + fake_server, + { + "session_id": "code-session-glob-safe", + "data": {"workspace": str(runtime_workspace)}, + }, ) - result = asyncio.run( - executor.execute( - action="code:glob", - params={"path": ".", "pattern": "**/*.py"}, - worker_id="worker-1", - trace_id="trace-glob-safe", - ) + result = execute_tool( + executor, + "code:glob", + params={"path": ".", "pattern": "**/*.py"}, + worker_id="worker-1", + trace_id="trace-glob-safe", ) assert result["code"] == ErrorCode.SUCCESS - assert result["data"] == [str(safe_file.resolve(strict=False))] + assert result["data"] == str(safe_file.resolve(strict=False)) def test_initialize_rejects_hostile_worker_id_without_deleting_outside_dir(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) - + _, backend = build_backend(tmp_path) outside_dir = tmp_path / "escaped" outside_dir.mkdir(parents=True) marker = 
outside_dir / "keep.txt" @@ -1176,27 +674,8 @@ def test_initialize_rejects_hostile_worker_id_without_deleting_outside_dir(tmp_p assert marker.exists() -def test_initialize_fails_when_claude_code_root_not_configured(tmp_path): - module = load_code_backend_module() - config = BackendConfig( - enabled=True, - default_config={ - "claude_code_root": "", - "workspace_root": str(tmp_path / "agentflow_code"), - "allow_bash": True, - }, - description="Code backend", - ) - backend = module.CodeBackend(config=config) - - with pytest.raises(ValueError, match="claude_code_root"): - asyncio.run(backend.initialize("runner_123", {})) - - def test_initialize_rejects_nonexistent_source_dir(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) + _, backend = build_backend(tmp_path) missing_source = tmp_path / "missing-source" with pytest.raises(ValueError, match="source_dir"): @@ -1204,9 +683,7 @@ def test_initialize_rejects_nonexistent_source_dir(tmp_path): def test_initialize_invalid_source_dir_leaves_no_workspace(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) + _, backend = build_backend(tmp_path) missing_source = tmp_path / "missing-source" workspace = tmp_path / "agentflow_code" / "runner_123" @@ -1216,56 +693,8 @@ def test_initialize_invalid_source_dir_leaves_no_workspace(tmp_path): assert not workspace.exists() -def test_initialize_unconfigured_claude_root_leaves_no_workspace(tmp_path): - module = load_code_backend_module() - config = BackendConfig( - enabled=True, - default_config={ - "claude_code_root": "", - "workspace_root": str(tmp_path / "agentflow_code"), - "allow_bash": True, - }, - description="Code backend", - ) - backend = module.CodeBackend(config=config) - workspace = tmp_path / "agentflow_code" / "runner_123" - - with pytest.raises(ValueError, 
match="claude_code_root"): - asyncio.run(backend.initialize("runner_123", {})) - - assert not workspace.exists() - - -def test_initialize_unconfigured_claude_root_preserves_existing_workspace(tmp_path): - module = load_code_backend_module() - workspace_root = tmp_path / "agentflow_code" - workspace = workspace_root / "runner_123" - workspace.mkdir(parents=True) - marker = workspace / "keep.txt" - marker.write_text("preserve-me\n", encoding="utf-8") - - config = BackendConfig( - enabled=True, - default_config={ - "claude_code_root": "", - "workspace_root": str(workspace_root), - "allow_bash": True, - }, - description="Code backend", - ) - backend = module.CodeBackend(config=config) - - with pytest.raises(ValueError, match="claude_code_root"): - asyncio.run(backend.initialize("runner_123", {})) - - assert marker.exists() - assert marker.read_text(encoding="utf-8") == "preserve-me\n" - - def test_cleanup_removes_worker_workspace(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) + _, backend = build_backend(tmp_path) session = asyncio.run(backend.initialize("runner_123", {})) workspace = Path(session["workspace"]) @@ -1276,9 +705,7 @@ def test_cleanup_removes_worker_workspace(tmp_path): def test_cleanup_does_not_delete_workspace_outside_root(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) + _, backend = build_backend(tmp_path) outside_workspace = tmp_path / "outside-workspace" outside_workspace.mkdir(parents=True) @@ -1290,9 +717,7 @@ def test_cleanup_does_not_delete_workspace_outside_root(tmp_path): def test_cleanup_does_not_delete_nested_under_root_non_worker_path(tmp_path): - module = load_code_backend_module() - create_fake_claude_code_root(tmp_path) - backend = module.CodeBackend(config=build_backend_config(tmp_path)) + _, backend = 
build_backend(tmp_path) nested_workspace = tmp_path / "agentflow_code" / "shared" / "cache" nested_workspace.mkdir(parents=True) @@ -1322,8 +747,8 @@ def test_code_config_template_parses(): assert config.server.session_ttl == 300 assert ( config.resources["code"].description - == "Lightweight coding backend powered by claude-code-py tools" + == "Lightweight coding backend powered by vendored internal tools" ) - assert config.resources["code"].config["bash_timeout_seconds"] == 30 + assert config.resources["code"].config == {"workspace_root": "/tmp/agentflow_code"} assert config.warmup.enabled is False assert config.warmup.resources == [] From 32559262db88c9794ecb26f001c18b99442e794a Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Thu, 16 Apr 2026 10:10:21 +0800 Subject: [PATCH 06/39] Make vendored bash tool non-blocking --- .../resources/code_vendor/file_tools.py | 4 ++- sandbox/tests/test_code_vendor_tools.py | 27 +++++++++++++++++++ 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/sandbox/server/backends/resources/code_vendor/file_tools.py b/sandbox/server/backends/resources/code_vendor/file_tools.py index a42e670..968e5e0 100644 --- a/sandbox/server/backends/resources/code_vendor/file_tools.py +++ b/sandbox/server/backends/resources/code_vendor/file_tools.py @@ -1,5 +1,6 @@ from __future__ import annotations +import asyncio import subprocess from pathlib import Path from typing import Any @@ -22,7 +23,8 @@ def input_schema(self) -> dict[str, Any]: } async def call(self, args: dict[str, Any], ctx: Any) -> str: - result = subprocess.run( + result = await asyncio.to_thread( + subprocess.run, args["command"], shell=True, capture_output=True, diff --git a/sandbox/tests/test_code_vendor_tools.py b/sandbox/tests/test_code_vendor_tools.py index b8f7efc..550c2aa 100644 --- a/sandbox/tests/test_code_vendor_tools.py +++ b/sandbox/tests/test_code_vendor_tools.py @@ -1,9 +1,13 @@ import asyncio import importlib.util +import shlex import sys 
+import time from types import SimpleNamespace from pathlib import Path +import pytest + PACKAGE_DIR = ( Path(__file__).resolve().parents[1] / "server" @@ -201,6 +205,29 @@ def test_bash_tool_combines_stdout_and_stderr(tmp_path): assert result == "out\n\n[stderr]:\nerr" +def test_bash_tool_does_not_block_event_loop_during_long_command(tmp_path): + async def run_bash_with_timeout(): + start = time.monotonic() + with pytest.raises(asyncio.TimeoutError): + await asyncio.wait_for( + BashTool().call( + { + "command": ( + f"{shlex.quote(sys.executable)} -c " + "\"import time; time.sleep(1)\"" + ) + }, + make_ctx(tmp_path), + ), + timeout=0.1, + ) + return time.monotonic() - start + + elapsed = asyncio.run(run_bash_with_timeout()) + + assert elapsed < 0.5 + + def test_tool_api_format_and_read_only_flags(): read_tool = ReadTool() bash_tool = BashTool() From 652cf7bf7c3b5fbf6e36d61bd4b88bef1ac47469 Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Thu, 16 Apr 2026 10:19:49 +0800 Subject: [PATCH 07/39] Clean up canceled vendored bash subprocesses --- .../resources/code_vendor/file_tools.py | 29 ++++++++++++++----- sandbox/tests/test_code_vendor_tools.py | 19 ++++++++---- 2 files changed, 35 insertions(+), 13 deletions(-) diff --git a/sandbox/server/backends/resources/code_vendor/file_tools.py b/sandbox/server/backends/resources/code_vendor/file_tools.py index 968e5e0..65acc2b 100644 --- a/sandbox/server/backends/resources/code_vendor/file_tools.py +++ b/sandbox/server/backends/resources/code_vendor/file_tools.py @@ -1,6 +1,8 @@ from __future__ import annotations import asyncio +import os +import signal import subprocess from pathlib import Path from typing import Any @@ -23,17 +25,30 @@ def input_schema(self) -> dict[str, Any]: } async def call(self, args: dict[str, Any], ctx: Any) -> str: - result = await asyncio.to_thread( - subprocess.run, + proc = await asyncio.create_subprocess_shell( args["command"], shell=True, - capture_output=True, - text=True, 
cwd=ctx.cwd, + stdout=asyncio.subprocess.PIPE, + stderr=asyncio.subprocess.PIPE, + start_new_session=True, ) - output = result.stdout - if result.stderr: - output += f"\n[stderr]:\n{result.stderr}" + + try: + stdout_bytes, stderr_bytes = await proc.communicate() + except asyncio.CancelledError: + if proc.returncode is None: + try: + os.killpg(proc.pid, signal.SIGKILL) + except (ProcessLookupError, PermissionError): + proc.kill() + await proc.communicate() + raise + + output = stdout_bytes.decode("utf-8", errors="replace") if stdout_bytes else "" + stderr = stderr_bytes.decode("utf-8", errors="replace") if stderr_bytes else "" + if stderr: + output += f"\n[stderr]:\n{stderr}" return output.strip() or "(no output)" diff --git a/sandbox/tests/test_code_vendor_tools.py b/sandbox/tests/test_code_vendor_tools.py index 550c2aa..270ce72 100644 --- a/sandbox/tests/test_code_vendor_tools.py +++ b/sandbox/tests/test_code_vendor_tools.py @@ -205,27 +205,34 @@ def test_bash_tool_combines_stdout_and_stderr(tmp_path): assert result == "out\n\n[stderr]:\nerr" -def test_bash_tool_does_not_block_event_loop_during_long_command(tmp_path): +def test_bash_tool_cancellation_stops_background_command(tmp_path): + marker = tmp_path / "marker.txt" + async def run_bash_with_timeout(): - start = time.monotonic() + timeout_start = time.monotonic() with pytest.raises(asyncio.TimeoutError): await asyncio.wait_for( BashTool().call( { "command": ( f"{shlex.quote(sys.executable)} -c " - "\"import time; time.sleep(1)\"" + "\"import pathlib, time; " + "time.sleep(0.3); " + "pathlib.Path('marker.txt').write_text('created', encoding='utf-8')\"" ) }, make_ctx(tmp_path), ), timeout=0.1, ) - return time.monotonic() - start + timeout_elapsed = time.monotonic() - timeout_start + await asyncio.sleep(0.4) + return timeout_elapsed - elapsed = asyncio.run(run_bash_with_timeout()) + timeout_elapsed = asyncio.run(run_bash_with_timeout()) - assert elapsed < 0.5 + assert timeout_elapsed < 0.25 + assert not 
marker.exists() def test_tool_api_format_and_read_only_flags(): From 255a377099b3a2067d515d6c0a3b389763ae012d Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Thu, 16 Apr 2026 10:27:15 +0800 Subject: [PATCH 08/39] Restore vendored bash text-mode decoding --- .../resources/code_vendor/file_tools.py | 21 +++++++++++++++++-- sandbox/tests/test_code_vendor_tools.py | 15 +++++++++++++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/sandbox/server/backends/resources/code_vendor/file_tools.py b/sandbox/server/backends/resources/code_vendor/file_tools.py index 65acc2b..4396089 100644 --- a/sandbox/server/backends/resources/code_vendor/file_tools.py +++ b/sandbox/server/backends/resources/code_vendor/file_tools.py @@ -1,6 +1,8 @@ from __future__ import annotations import asyncio +import io +import locale import os import signal import subprocess @@ -45,13 +47,28 @@ async def call(self, args: dict[str, Any], ctx: Any) -> str: await proc.communicate() raise - output = stdout_bytes.decode("utf-8", errors="replace") if stdout_bytes else "" - stderr = stderr_bytes.decode("utf-8", errors="replace") if stderr_bytes else "" + output = _decode_text_mode_output(stdout_bytes) + stderr = _decode_text_mode_output(stderr_bytes) if stderr: output += f"\n[stderr]:\n{stderr}" return output.strip() or "(no output)" +def _decode_text_mode_output(data: bytes | None) -> str: + if not data: + return "" + + text_stream = io.TextIOWrapper( + io.BytesIO(data), + encoding=locale.getpreferredencoding(False), + newline=None, + ) + try: + return text_stream.read() + finally: + text_stream.detach() + + class ReadTool(Tool): name = "Read" description = "Read a file and return its contents with line numbers." 
diff --git a/sandbox/tests/test_code_vendor_tools.py b/sandbox/tests/test_code_vendor_tools.py index 270ce72..940f936 100644 --- a/sandbox/tests/test_code_vendor_tools.py +++ b/sandbox/tests/test_code_vendor_tools.py @@ -205,6 +205,21 @@ def test_bash_tool_combines_stdout_and_stderr(tmp_path): assert result == "out\n\n[stderr]:\nerr" +def test_bash_tool_matches_text_mode_newline_normalization(tmp_path): + result = call_tool( + BashTool(), + { + "command": ( + f"{shlex.quote(sys.executable)} -c " + "\"import sys; sys.stdout.buffer.write(b'a\\r\\nb\\r\\n')\"" + ) + }, + make_ctx(tmp_path), + ) + + assert result == "a\nb" + + def test_bash_tool_cancellation_stops_background_command(tmp_path): marker = tmp_path / "marker.txt" From c20cc1932b1e13f96a1dcb49612bc3a493641f2d Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Thu, 16 Apr 2026 10:32:57 +0800 Subject: [PATCH 09/39] Align code backend schema docs and config tests --- sandbox/tests/test_code_tool_schemas.py | 10 +++++----- sandbox/tests/test_sandbox_config_loading.py | 11 +++-------- sandbox/tool_schemas/code_tools.py | 2 +- 3 files changed, 9 insertions(+), 14 deletions(-) diff --git a/sandbox/tests/test_code_tool_schemas.py b/sandbox/tests/test_code_tool_schemas.py index 1cc342e..6247ee0 100644 --- a/sandbox/tests/test_code_tool_schemas.py +++ b/sandbox/tests/test_code_tool_schemas.py @@ -97,14 +97,14 @@ def test_code_read_description_mentions_line_numbered_and_1_indexed_offset(): assert "1-indexed" in offset["description"].lower() -def test_code_bash_description_mentions_backend_config_availability(): - """code-bash docs should mention backend-config-dependent availability.""" +def test_code_bash_description_mentions_workspace_shell_execution(): + """code-bash docs should describe shell execution in the coding workspace.""" schema = _code_schemas_by_name()["code-bash"] description = schema["description"].lower() - assert "backend" in description - assert "config" in description - assert 
"depend" in description + assert "workspace" in description + assert "shell command" in description + assert "backend config" not in description def test_code_write_description_mentions_workspace_full_content_and_parent_dirs(): diff --git a/sandbox/tests/test_sandbox_config_loading.py b/sandbox/tests/test_sandbox_config_loading.py index 74ea0c9..46621aa 100644 --- a/sandbox/tests/test_sandbox_config_loading.py +++ b/sandbox/tests/test_sandbox_config_loading.py @@ -3,16 +3,14 @@ from sandbox.sandbox import Sandbox, SandboxConfig -def test_load_server_config_expands_env_default_placeholders(tmp_path, monkeypatch): - monkeypatch.delenv("CLAUDE_CODE_ROOT", raising=False) - +def test_load_server_config_keeps_workspace_root_for_code_backend(tmp_path): config_path = tmp_path / "code_config.json" raw_config = { "resources": { "code": { "enabled": True, "config": { - "claude_code_root": "${CLAUDE_CODE_ROOT:-/home/a1/sdb/dxd/claude-code-py}" + "workspace_root": "/tmp/agentflow_code" }, } } @@ -23,7 +21,4 @@ def test_load_server_config_expands_env_default_placeholders(tmp_path, monkeypat loaded = sandbox._load_server_config() - assert ( - loaded["resources"]["code"]["config"]["claude_code_root"] - == "/home/a1/sdb/dxd/claude-code-py" - ) + assert loaded["resources"]["code"]["config"]["workspace_root"] == "/tmp/agentflow_code" diff --git a/sandbox/tool_schemas/code_tools.py b/sandbox/tool_schemas/code_tools.py index de334b3..f281b14 100644 --- a/sandbox/tool_schemas/code_tools.py +++ b/sandbox/tool_schemas/code_tools.py @@ -78,7 +78,7 @@ def get_code_tool_schemas() -> List[Dict[str, Any]]: }, { "name": "code-bash", - "description": "Run a shell command in the coding workspace (availability depends on backend config).", + "description": "Run a shell command in the current coding workspace.", "parameters": [ { "name": "command", From 5d38fda52089dd3461f9377b0cbb71b805cf52a9 Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Thu, 16 Apr 2026 10:43:44 +0800 
Subject: [PATCH 10/39] Add code rollout real smoke gating --- rollout/tests/conftest.py | 116 ++++++++++++++++++++++++++ rollout/tests/test_code_real_smoke.py | 92 ++++++++++++++++++++ 2 files changed, 208 insertions(+) create mode 100644 rollout/tests/conftest.py create mode 100644 rollout/tests/test_code_real_smoke.py diff --git a/rollout/tests/conftest.py b/rollout/tests/conftest.py new file mode 100644 index 0000000..89cff6f --- /dev/null +++ b/rollout/tests/conftest.py @@ -0,0 +1,116 @@ +import os +from pathlib import Path + +import pytest + + +_REAL_CODE_TEST_FILES = { + "test_code_real_smoke.py", +} + + +def pytest_ignore_collect(collection_path, config): + del config + if os.environ.get("AGENTFLOW_RUN_CODE_REAL") == "1": + return False + + path = Path(str(collection_path)) + return path.name in _REAL_CODE_TEST_FILES + + +def pytest_addoption(parser): + group = parser.getgroup("agentflow-code-real") + group.addoption( + "--real-api-key", + action="store", + default=None, + help="API key for opt-in real code rollout smoke tests.", + ) + group.addoption( + "--real-base-url", + action="store", + default=None, + help="Base URL for opt-in real code rollout smoke tests.", + ) + group.addoption( + "--real-model", + action="store", + default=None, + help="Model name for opt-in real code rollout smoke tests.", + ) + + +def pytest_configure(config): + config.addinivalue_line( + "markers", + "code_real: opt-in real code rollout smoke tests", + ) + + +def require_code_real_enabled(): + return [ + pytest.mark.code_real, + pytest.mark.skipif( + os.environ.get("AGENTFLOW_RUN_CODE_REAL") != "1", + reason="set AGENTFLOW_RUN_CODE_REAL=1 to run real code rollout smoke tests", + ), + ] + + +def _get_real_credentials(config): + return { + "api_key": config.getoption("--real-api-key"), + "base_url": config.getoption("--real-base-url"), + "model": config.getoption("--real-model"), + } + + +def _missing_real_credential_options(config): + credentials = _get_real_credentials(config) 
+ return [ + option_name + for option_name, value in ( + ("--real-api-key", credentials["api_key"]), + ("--real-base-url", credentials["base_url"]), + ("--real-model", credentials["model"]), + ) + if not value + ] + + +def pytest_runtest_setup(item): + if item.get_closest_marker("code_real") is None: + return + + missing = _missing_real_credential_options(item.config) + if missing: + pytest.skip( + "code_real tests require all of " + "--real-api-key, --real-base-url, and --real-model" + ) + + +@pytest.fixture +def real_llm_credentials(request): + credentials = _get_real_credentials(request.config) + if _missing_real_credential_options(request.config): + pytest.skip( + "code_real tests require all of " + "--real-api-key, --real-base-url, and --real-model" + ) + return credentials + + +@pytest.fixture +def real_api_key(real_llm_credentials): + return real_llm_credentials["api_key"] + + +@pytest.fixture +def real_base_url(real_llm_credentials): + return real_llm_credentials["base_url"] + + +@pytest.fixture +def real_model(real_llm_credentials): + return real_llm_credentials["model"] diff --git a/rollout/tests/test_code_real_smoke.py b/rollout/tests/test_code_real_smoke.py new file mode 100644 index 0000000..b8147b3 --- /dev/null +++ b/rollout/tests/test_code_real_smoke.py @@ -0,0 +1,92 @@ +import os +import json + +from rollout import RolloutConfig, RolloutPipeline + +def _canonical_tool_name(name): + for separator in (".", "_", "-"): + if separator in name: + prefix, suffix = name.split(separator, 1) + return f"{prefix}:{suffix}" + return name + + +if os.environ.get("AGENTFLOW_RUN_CODE_REAL") == "1": + from .conftest import require_code_real_enabled + + pytestmark = require_code_real_enabled() + + def test_code_real_smoke_reads_token_via_real_tools( + tmp_path, + real_api_key, + real_base_url, + real_model, + ): + fixture_repo = tmp_path / "fixture_repo" + nested_dir = fixture_repo / "nested" + nested_dir.mkdir(parents=True) + + token = f"token-{tmp_path.name}" + 
(nested_dir / "TOKEN.txt").write_text(token + "\n", encoding="utf-8") + + benchmark_path = tmp_path / "benchmark.jsonl" + prompt = ( + "Use code tools to inspect the repository and read nested/TOKEN.txt. " + "Reply with only the exact token and nothing else." + ) + benchmark_path.write_text( + json.dumps( + { + "id": "code-real-smoke", + "question": prompt, + "answer": token, + } + ) + + "\n", + encoding="utf-8", + ) + + output_dir = tmp_path / "rollout_output" + config = RolloutConfig( + benchmark_name="code_real_smoke", + data_path=str(benchmark_path), + model_name=real_model, + api_key=real_api_key, + base_url=real_base_url, + available_tools=["code-*"], + resource_types=["code"], + resource_init_configs={ + "code": {"content": {"source_dir": str(fixture_repo)}} + }, + sandbox_config_path="configs/sandbox-server/code_config.json", + sandbox_auto_start=True, + evaluate_results=False, + save_trajectories=True, + number_of_tasks=1, + ) + + summary = RolloutPipeline(config, output_dir=str(output_dir)).run() + + assert summary.total_tasks == 1 + assert summary.successful_tasks == 1 + assert summary.failed_tasks == 0 + + result_files = sorted(output_dir.glob("results_code_real_smoke_*.jsonl")) + assert result_files + + payload = json.loads(result_files[-1].read_text(encoding="utf-8").strip()) + trajectory = payload["trajectory"] + tool_calls = trajectory["tool_calls"] + tool_messages = [ + message for message in trajectory["messages"] if message["role"] == "tool" + ] + + assert any( + _canonical_tool_name(call["tool_name"]).startswith("code:") + for call in tool_calls + ) + assert payload["predicted_answer"] == token + assert trajectory["final_answer"] == token + assert any( + token in json.dumps(message, ensure_ascii=False) for message in tool_messages + ) From 2d52b2977744cfd2607e766791116d451e454076 Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Thu, 16 Apr 2026 10:49:40 +0800 Subject: [PATCH 11/39] Fix code real smoke collection gating --- 
rollout/tests/conftest.py | 36 +++++-- rollout/tests/test_code_real_smoke.py | 142 +++++++++++++------------- 2 files changed, 98 insertions(+), 80 deletions(-) diff --git a/rollout/tests/conftest.py b/rollout/tests/conftest.py index 89cff6f..8c6c13d 100644 --- a/rollout/tests/conftest.py +++ b/rollout/tests/conftest.py @@ -9,9 +9,13 @@ } +def _code_real_enabled(): + return os.environ.get("AGENTFLOW_RUN_CODE_REAL") == "1" + + def pytest_ignore_collect(collection_path, config): del config - if os.environ.get("AGENTFLOW_RUN_CODE_REAL") == "1": + if _code_real_enabled(): return False path = Path(str(collection_path)) @@ -47,14 +51,25 @@ def pytest_configure(config): ) -def require_code_real_enabled(): - return [ - pytest.mark.code_real, - pytest.mark.skipif( - os.environ.get("AGENTFLOW_RUN_CODE_REAL") != "1", - reason="set AGENTFLOW_RUN_CODE_REAL=1 to run real code rollout smoke tests", - ), - ] +def pytest_collection_modifyitems(config, items): + if _code_real_enabled(): + return + + deselected = [] + kept = [] + for item in items: + if item.get_closest_marker("code_real") is None: + kept.append(item) + continue + + if Path(str(item.fspath)).name in _REAL_CODE_TEST_FILES: + deselected.append(item) + else: + kept.append(item) + + if deselected: + items[:] = kept + config.hook.pytest_deselected(items=deselected) def _get_real_credentials(config): @@ -82,6 +97,9 @@ def pytest_runtest_setup(item): if item.get_closest_marker("code_real") is None: return + if not _code_real_enabled(): + pytest.skip("set AGENTFLOW_RUN_CODE_REAL=1 to run real code rollout smoke tests") + missing = _missing_real_credential_options(item.config) if missing: pytest.skip( diff --git a/rollout/tests/test_code_real_smoke.py b/rollout/tests/test_code_real_smoke.py index b8147b3..63664a4 100644 --- a/rollout/tests/test_code_real_smoke.py +++ b/rollout/tests/test_code_real_smoke.py @@ -1,8 +1,13 @@ -import os import json +import pytest + from rollout import RolloutConfig, RolloutPipeline + +pytestmark 
= pytest.mark.code_real + + def _canonical_tool_name(name): for separator in (".", "_", "-"): if separator in name: @@ -11,82 +16,77 @@ def _canonical_tool_name(name): return name -if os.environ.get("AGENTFLOW_RUN_CODE_REAL") == "1": - from .conftest import require_code_real_enabled - - pytestmark = require_code_real_enabled() +def test_code_real_smoke_reads_token_via_real_tools( + tmp_path, + real_api_key, + real_base_url, + real_model, +): + fixture_repo = tmp_path / "fixture_repo" + nested_dir = fixture_repo / "nested" + nested_dir.mkdir(parents=True) - def test_code_real_smoke_reads_token_via_real_tools( - tmp_path, - real_api_key, - real_base_url, - real_model, - ): - fixture_repo = tmp_path / "fixture_repo" - nested_dir = fixture_repo / "nested" - nested_dir.mkdir(parents=True) + token = f"token-{tmp_path.name}" + (nested_dir / "TOKEN.txt").write_text(token + "\n", encoding="utf-8") - token = f"token-{tmp_path.name}" - (nested_dir / "TOKEN.txt").write_text(token + "\n", encoding="utf-8") - - benchmark_path = tmp_path / "benchmark.jsonl" - prompt = ( - "Use code tools to inspect the repository and read nested/TOKEN.txt. " - "Reply with only the exact token and nothing else." - ) - benchmark_path.write_text( - json.dumps( - { - "id": "code-real-smoke", - "question": prompt, - "answer": token, - } - ) - + "\n", - encoding="utf-8", + benchmark_path = tmp_path / "benchmark.jsonl" + prompt = ( + "Use code tools to inspect the repository and read nested/TOKEN.txt. " + "Reply with only the exact token and nothing else." 
+ ) + benchmark_path.write_text( + json.dumps( + { + "id": "code-real-smoke", + "question": prompt, + "answer": token, + } ) + + "\n", + encoding="utf-8", + ) - output_dir = tmp_path / "rollout_output" - config = RolloutConfig( - benchmark_name="code_real_smoke", - data_path=str(benchmark_path), - model_name=real_model, - api_key=real_api_key, - base_url=real_base_url, - available_tools=["code-*"], - resource_types=["code"], - resource_init_configs={ - "code": {"content": {"source_dir": str(fixture_repo)}} - }, - sandbox_config_path="configs/sandbox-server/code_config.json", - sandbox_auto_start=True, - evaluate_results=False, - save_trajectories=True, - number_of_tasks=1, - ) + output_dir = tmp_path / "rollout_output" + config = RolloutConfig( + benchmark_name="code_real_smoke", + data_path=str(benchmark_path), + model_name=real_model, + api_key=real_api_key, + base_url=real_base_url, + available_tools=["code-*"], + resource_types=["code"], + resource_init_configs={ + "code": {"content": {"source_dir": str(fixture_repo)}} + }, + sandbox_config_path="configs/sandbox-server/code_config.json", + sandbox_auto_start=True, + evaluate_results=False, + save_trajectories=True, + number_of_tasks=1, + ) - summary = RolloutPipeline(config, output_dir=str(output_dir)).run() + summary = RolloutPipeline(config, output_dir=str(output_dir)).run() - assert summary.total_tasks == 1 - assert summary.successful_tasks == 1 - assert summary.failed_tasks == 0 + assert summary.total_tasks == 1 + assert summary.successful_tasks == 1 + assert summary.failed_tasks == 0 - result_files = sorted(output_dir.glob("results_code_real_smoke_*.jsonl")) - assert result_files + result_files = sorted(output_dir.glob("results_code_real_smoke_*.jsonl")) + assert result_files - payload = json.loads(result_files[-1].read_text(encoding="utf-8").strip()) - trajectory = payload["trajectory"] - tool_calls = trajectory["tool_calls"] - tool_messages = [ - message for message in trajectory["messages"] if 
message["role"] == "tool" - ] + payload = json.loads(result_files[-1].read_text(encoding="utf-8").strip()) + trajectory = payload["trajectory"] + tool_calls = trajectory["tool_calls"] + tool_messages = [ + message for message in trajectory["messages"] if message["role"] == "tool" + ] - assert any( - _canonical_tool_name(call["tool_name"]).startswith("code:") - for call in tool_calls - ) - assert payload["predicted_answer"] == token - assert trajectory["final_answer"] == token - assert any( - token in json.dumps(message, ensure_ascii=False) for message in tool_messages - ) + assert any( + _canonical_tool_name(call["tool_name"]).startswith("code:") + for call in tool_calls + ) + assert payload["predicted_answer"] == token + assert trajectory["final_answer"] == token + assert any( + token in json.dumps(message, ensure_ascii=False) for message in tool_messages + ) From 7937314869c7ea39795100a1dc8e71ea5ed3be80 Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Thu, 16 Apr 2026 10:57:19 +0800 Subject: [PATCH 12/39] Set code real smoke turn budget --- rollout/tests/test_code_real_smoke.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/rollout/tests/test_code_real_smoke.py b/rollout/tests/test_code_real_smoke.py index 63664a4..9634ec1 100644 --- a/rollout/tests/test_code_real_smoke.py +++ b/rollout/tests/test_code_real_smoke.py @@ -62,9 +62,12 @@ def test_code_real_smoke_reads_token_via_real_tools( sandbox_auto_start=True, evaluate_results=False, save_trajectories=True, + max_turns=5, number_of_tasks=1, ) + assert config.max_turns == 5 + summary = RolloutPipeline(config, output_dir=str(output_dir)).run() assert summary.total_tasks == 1 From 9be169a11bbbefcc5486fab3d97e5a2209c73443 Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Thu, 16 Apr 2026 11:24:22 +0800 Subject: [PATCH 13/39] Fix lazy loading for code backend imports --- sandbox/server/backends/resources/__init__.py | 90 ++++++------------- sandbox/tests/test_code_backend.py | 
37 ++++++++ 2 files changed, 66 insertions(+), 61 deletions(-) diff --git a/sandbox/server/backends/resources/__init__.py b/sandbox/server/backends/resources/__init__.py index 85f5415..29eea4b 100644 --- a/sandbox/server/backends/resources/__init__.py +++ b/sandbox/server/backends/resources/__init__.py @@ -1,77 +1,45 @@ # sandbox/server/backends/resources/__init__.py """ -Stateful resource backend module. +Stateful resource backend exports. -Provides heavyweight backends (mock implementations) that require session -management. These backends initialize resources, maintain state, and clean up -when finished. - -Backend types: -- VMBackend - VM interaction (stateful, uses initialize/cleanup) -- RAGBackend - Document retrieval (shared resource, uses warmup/shutdown) - -Directory layout: -``` -backends/ -├── resources/ # Stateful backends (heavyweight, require sessions) -│ ├── __init__.py -│ ├── vm.py -│ └── rag.py -│ -└── tools/ # Stateless tools (lightweight, no sessions) - ├── __init__.py - └── websearch.py -``` - -Usage example: -```python -from sandbox.server import HTTPServiceServer -from sandbox.server.backends.resources import ( - VMBackend, - RAGBackend -) - -server = HTTPServiceServer() +Backends are loaded lazily so importing one backend module does not force +unrelated heavyweight dependencies for the others. +""" -# Load stateful backends. 
-server.load_backend(VMBackend()) -server.load_backend(RAGBackend()) +from __future__ import annotations -server.run() -``` +import importlib +from typing import Any -Config example: -```json -{ - "resources": { - "vm": { - "enabled": true, - "backend_class": "sandbox.server.backends.resources.vm.VMBackend", - "config": {"screen_size": [1920, 1080]} - }, - "rag": { - "enabled": true, - "backend_class": "sandbox.server.backends.resources.rag.RAGBackend", - "config": {"model_name": "e5-base", "index_type": "faiss"} - } - } +_EXPORTS = { + "VMBackend": (".vm", "VMBackend"), + "create_vm_backend": (".vm", "create_vm_backend"), + "RAGBackend": (".rag", "RAGBackend"), + "create_rag_backend": (".rag", "create_rag_backend"), + "MCPBackend": (".mcp", "MCPBackend"), + "CodeBackend": (".code", "CodeBackend"), } -``` -""" - -from .vm import VMBackend, create_vm_backend -from .rag import RAGBackend, create_rag_backend -from .mcp import MCPBackend -from .code import CodeBackend __all__ = [ - # Backend classes "VMBackend", "RAGBackend", "MCPBackend", "CodeBackend", - - # Convenience factories "create_vm_backend", "create_rag_backend", ] + + +def __getattr__(name: str) -> Any: + if name not in _EXPORTS: + raise AttributeError(f"module {__name__!r} has no attribute {name!r}") + + module_name, attr_name = _EXPORTS[name] + module = importlib.import_module(module_name, __name__) + value = getattr(module, attr_name) + globals()[name] = value + return value + + +def __dir__() -> list[str]: + return sorted(set(globals()) | set(__all__)) diff --git a/sandbox/tests/test_code_backend.py b/sandbox/tests/test_code_backend.py index aa3ae7f..ac0b6f1 100644 --- a/sandbox/tests/test_code_backend.py +++ b/sandbox/tests/test_code_backend.py @@ -752,3 +752,40 @@ def test_code_config_template_parses(): assert config.resources["code"].config == {"workspace_root": "/tmp/agentflow_code"} assert config.warmup.enabled is False assert config.warmup.resources == [] + + +def 
test_create_server_loads_code_backend_via_config_loader(tmp_path): + claude_code_root = create_fake_claude_code_root(tmp_path) + workspace_root = tmp_path / "agentflow_code" + loader = ConfigLoader() + loader.load_from_dict( + { + "server": { + "title": "Code backend smoke", + "session_ttl": 300, + }, + "resources": { + "code": { + "enabled": True, + "description": "Code backend", + "backend_class": "sandbox.server.backends.resources.code.CodeBackend", + "config": { + "claude_code_root": str(claude_code_root), + "workspace_root": str(workspace_root), + "allow_bash": True, + "bash_timeout_seconds": 30, + }, + } + }, + "warmup": { + "enabled": False, + "resources": [], + }, + } + ) + + server = loader.create_server(host="127.0.0.1", port=0) + + assert "code" in server._backends + assert "code:read" in server._tools + assert server._tool_resource_types["code:read"] == "code" From 3108a59a78c50a55b8dd51a584b6ea090a992dc1 Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Thu, 16 Apr 2026 11:28:21 +0800 Subject: [PATCH 14/39] Adapt code backend smoke test to vendored config --- sandbox/tests/test_code_backend.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/sandbox/tests/test_code_backend.py b/sandbox/tests/test_code_backend.py index ac0b6f1..602f7a3 100644 --- a/sandbox/tests/test_code_backend.py +++ b/sandbox/tests/test_code_backend.py @@ -755,7 +755,6 @@ def test_code_config_template_parses(): def test_create_server_loads_code_backend_via_config_loader(tmp_path): - claude_code_root = create_fake_claude_code_root(tmp_path) workspace_root = tmp_path / "agentflow_code" loader = ConfigLoader() loader.load_from_dict( @@ -770,10 +769,7 @@ def test_create_server_loads_code_backend_via_config_loader(tmp_path): "description": "Code backend", "backend_class": "sandbox.server.backends.resources.code.CodeBackend", "config": { - "claude_code_root": str(claude_code_root), "workspace_root": str(workspace_root), - "allow_bash": True, - "bash_timeout_seconds": 
30, }, } }, From b7fc32d40ad9f8b03c7ac4eea6b030ec07d34e32 Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Thu, 16 Apr 2026 11:33:15 +0800 Subject: [PATCH 15/39] Tighten code backend loader smoke test --- sandbox/tests/test_code_backend.py | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/sandbox/tests/test_code_backend.py b/sandbox/tests/test_code_backend.py index 602f7a3..91150b2 100644 --- a/sandbox/tests/test_code_backend.py +++ b/sandbox/tests/test_code_backend.py @@ -62,6 +62,13 @@ def install_resources_package_stub(): module_spec.loader.exec_module(module) +def remove_resources_modules(): + package_name = "sandbox.server.backends.resources" + for module_name in list(sys.modules): + if module_name == package_name or module_name.startswith(f"{package_name}."): + sys.modules.pop(module_name, None) + + def load_code_backend_module(): install_resources_package_stub() unique_id = next(_MODULE_LOAD_COUNTER) @@ -756,6 +763,7 @@ def test_code_config_template_parses(): def test_create_server_loads_code_backend_via_config_loader(tmp_path): workspace_root = tmp_path / "agentflow_code" + remove_resources_modules() loader = ConfigLoader() loader.load_from_dict( { @@ -781,7 +789,11 @@ def test_create_server_loads_code_backend_via_config_loader(tmp_path): ) server = loader.create_server(host="127.0.0.1", port=0) + resources_package = sys.modules["sandbox.server.backends.resources"] + code_module = sys.modules["sandbox.server.backends.resources.code"] assert "code" in server._backends assert "code:read" in server._tools assert server._tool_resource_types["code:read"] == "code" + assert Path(resources_package.__file__).resolve() == (MODULE_PATH.parent / "__init__.py").resolve() + assert Path(code_module.__file__).resolve() == MODULE_PATH.resolve() From 0a5baac6ad60e943c5e2a693265c00b2fde841cc Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Thu, 16 Apr 2026 13:56:40 +0800 Subject: [PATCH 16/39] Restore eager resource exports 
--- requirements.txt | 1 + sandbox/server/backends/resources/__init__.py | 88 ++++++++++++------- .../utils/desktop_env/server/requirements.txt | 2 +- sandbox/tests/test_code_backend.py | 74 ++++------------ 4 files changed, 76 insertions(+), 89 deletions(-) diff --git a/requirements.txt b/requirements.txt index 149e921..6046460 100644 --- a/requirements.txt +++ b/requirements.txt @@ -26,6 +26,7 @@ Pillow==10.1.0 numpy opencv-python lxml +cssselect pygame pywinauto pyxcursor diff --git a/sandbox/server/backends/resources/__init__.py b/sandbox/server/backends/resources/__init__.py index 29eea4b..55cff8c 100644 --- a/sandbox/server/backends/resources/__init__.py +++ b/sandbox/server/backends/resources/__init__.py @@ -1,24 +1,67 @@ -# sandbox/server/backends/resources/__init__.py """ -Stateful resource backend exports. +Stateful resource backend module. -Backends are loaded lazily so importing one backend module does not force -unrelated heavyweight dependencies for the others. -""" +Provides heavyweight backends (mock implementations) that require session +management. These backends initialize resources, maintain state, and clean up +when finished. 
+ +Backend types: +- VMBackend - VM interaction (stateful, uses initialize/cleanup) +- RAGBackend - Document retrieval (shared resource, uses warmup/shutdown) + +Directory layout: +``` +backends/ +├── resources/ # Stateful backends (heavyweight, require sessions) +│ ├── __init__.py +│ ├── vm.py +│ └── rag.py +│ +└── tools/ # Stateless tools (lightweight, no sessions) + ├── __init__.py + └── websearch.py +``` -from __future__ import annotations +Usage example: +```python +from sandbox.server import HTTPServiceServer +from sandbox.server.backends.resources import ( + VMBackend, + RAGBackend +) -import importlib -from typing import Any +server = HTTPServiceServer() -_EXPORTS = { - "VMBackend": (".vm", "VMBackend"), - "create_vm_backend": (".vm", "create_vm_backend"), - "RAGBackend": (".rag", "RAGBackend"), - "create_rag_backend": (".rag", "create_rag_backend"), - "MCPBackend": (".mcp", "MCPBackend"), - "CodeBackend": (".code", "CodeBackend"), +# Load stateful backends. +server.load_backend(VMBackend()) +server.load_backend(RAGBackend()) + +server.run() +``` + +Config example: +```json +{ + "resources": { + "vm": { + "enabled": true, + "backend_class": "sandbox.server.backends.resources.vm.VMBackend", + "config": {"screen_size": [1920, 1080]} + }, + "rag": { + "enabled": true, + "backend_class": "sandbox.server.backends.resources.rag.RAGBackend", + "config": {"model_name": "e5-base", "index_type": "faiss"} + } + } } +``` +""" + +from .code import CodeBackend +from .mcp import MCPBackend +from .rag import RAGBackend, create_rag_backend +from .vm import VMBackend, create_vm_backend __all__ = [ "VMBackend", @@ -28,18 +71,3 @@ "create_vm_backend", "create_rag_backend", ] - - -def __getattr__(name: str) -> Any: - if name not in _EXPORTS: - raise AttributeError(f"module {__name__!r} has no attribute {name!r}") - - module_name, attr_name = _EXPORTS[name] - module = importlib.import_module(module_name, __name__) - value = getattr(module, attr_name) - globals()[name] = value - 
return value - - -def __dir__() -> list[str]: - return sorted(set(globals()) | set(__all__)) diff --git a/sandbox/server/backends/resources/utils/desktop_env/server/requirements.txt b/sandbox/server/backends/resources/utils/desktop_env/server/requirements.txt index da9a7de..65240d9 100644 --- a/sandbox/server/backends/resources/utils/desktop_env/server/requirements.txt +++ b/sandbox/server/backends/resources/utils/desktop_env/server/requirements.txt @@ -7,6 +7,6 @@ requests-toolbelt flask numpy lxml +cssselect pygame pywinauto - diff --git a/sandbox/tests/test_code_backend.py b/sandbox/tests/test_code_backend.py index 91150b2..65f50cf 100644 --- a/sandbox/tests/test_code_backend.py +++ b/sandbox/tests/test_code_backend.py @@ -3,12 +3,10 @@ """ import asyncio -import importlib.util -import itertools +import importlib import os import sys from pathlib import Path -import types import pytest @@ -17,49 +15,7 @@ from sandbox.server.config_loader import ConfigLoader from sandbox.server.core.tool_executor import ToolExecutor -MODULE_PATH = ( - Path(__file__).resolve().parents[1] - / "server" - / "backends" - / "resources" - / "code.py" -) -VENDOR_PACKAGE_DIR = MODULE_PATH.parent / "code_vendor" - - -def install_resources_package_stub(): - package_name = "sandbox.server.backends.resources" - if package_name not in sys.modules: - package = types.ModuleType(package_name) - package.__path__ = [str(MODULE_PATH.parent)] - sys.modules[package_name] = package - - vendor_package_name = f"{package_name}.code_vendor" - if vendor_package_name not in sys.modules: - package_spec = importlib.util.spec_from_file_location( - vendor_package_name, - VENDOR_PACKAGE_DIR / "__init__.py", - submodule_search_locations=[str(VENDOR_PACKAGE_DIR)], - ) - package = importlib.util.module_from_spec(package_spec) - assert package_spec is not None - assert package_spec.loader is not None - sys.modules[vendor_package_name] = package - package_spec.loader.exec_module(package) - - for module_name in 
("tool", "file_tools", "edit_tools"): - full_name = f"{vendor_package_name}.{module_name}" - if full_name in sys.modules: - continue - module_spec = importlib.util.spec_from_file_location( - full_name, - VENDOR_PACKAGE_DIR / f"{module_name}.py", - ) - module = importlib.util.module_from_spec(module_spec) - assert module_spec is not None - assert module_spec.loader is not None - sys.modules[full_name] = module - module_spec.loader.exec_module(module) +MODULE_PATH = Path(__file__).resolve().parents[1] / "server" / "backends" / "resources" / "code.py" def remove_resources_modules(): @@ -70,18 +26,8 @@ def remove_resources_modules(): def load_code_backend_module(): - install_resources_package_stub() - unique_id = next(_MODULE_LOAD_COUNTER) - module_name = f"_test_code_backend_{unique_id}" - spec = importlib.util.spec_from_file_location(module_name, MODULE_PATH) - module = importlib.util.module_from_spec(spec) - assert spec is not None - assert spec.loader is not None - spec.loader.exec_module(module) - return module - - -_MODULE_LOAD_COUNTER = itertools.count() + remove_resources_modules() + return importlib.import_module("sandbox.server.backends.resources.code") class FakeServer: @@ -168,6 +114,18 @@ def test_bind_server_registers_code_tools(tmp_path): assert fake_server._tool_resource_types["code:bash"] == "code" +def test_resources_package_exports_eager_backends_without_lazy_machinery(): + remove_resources_modules() + + resources = importlib.import_module("sandbox.server.backends.resources") + code_module = importlib.import_module("sandbox.server.backends.resources.code") + mcp_module = importlib.import_module("sandbox.server.backends.resources.mcp") + + assert resources.CodeBackend is code_module.CodeBackend + assert resources.MCPBackend is mcp_module.MCPBackend + assert not hasattr(resources, "__getattr__") + + def test_initialize_does_not_require_external_root(tmp_path): _, backend = build_backend(tmp_path) From 38adac87e55651964d3d4c535b8c49376e778a63 Mon Sep 17 
00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Thu, 16 Apr 2026 14:10:16 +0800 Subject: [PATCH 17/39] Align MCP tests with eager imports --- sandbox/tests/test_mcp_backend.py | 45 +++++++++++++++---------------- 1 file changed, 21 insertions(+), 24 deletions(-) diff --git a/sandbox/tests/test_mcp_backend.py b/sandbox/tests/test_mcp_backend.py index e09e2e3..db233ee 100644 --- a/sandbox/tests/test_mcp_backend.py +++ b/sandbox/tests/test_mcp_backend.py @@ -3,7 +3,7 @@ """ import asyncio -import importlib.util +import importlib import sys import types from pathlib import Path @@ -14,32 +14,19 @@ from sandbox.server.config_loader import ConfigLoader from sandbox.server.core.tool_executor import ToolExecutor -MODULE_PATH = ( - Path(__file__).resolve().parents[1] - / "server" - / "backends" - / "resources" - / "mcp.py" -) +MODULE_PATH = Path(__file__).resolve().parents[1] / "server" / "backends" / "resources" / "mcp.py" -def load_mcp_backend_module(): +def remove_resources_modules(): package_name = "sandbox.server.backends.resources" - if package_name not in sys.modules: - package = types.ModuleType(package_name) - package.__path__ = [str(MODULE_PATH.parent)] - sys.modules[package_name] = package - - spec = importlib.util.spec_from_file_location( - f"{package_name}.mcp", - MODULE_PATH, - ) - module = importlib.util.module_from_spec(spec) - assert spec is not None - assert spec.loader is not None - sys.modules[spec.name] = module - spec.loader.exec_module(module) - return module + for module_name in list(sys.modules): + if module_name == package_name or module_name.startswith(f"{package_name}."): + sys.modules.pop(module_name, None) + + +def load_mcp_backend_module(): + remove_resources_modules() + return importlib.import_module("sandbox.server.backends.resources.mcp") class FakeServer: @@ -77,6 +64,16 @@ def test_bind_server_registers_manifest_tools(tmp_path): assert fake_server._tool_resource_types["mcp:filesystem.list_directory"] == "mcp" +def 
test_resources_package_exports_mcp_backend(): + remove_resources_modules() + + resources = importlib.import_module("sandbox.server.backends.resources") + mcp_module = importlib.import_module("sandbox.server.backends.resources.mcp") + + assert resources.MCPBackend is mcp_module.MCPBackend + assert Path(resources.__file__).resolve() == (MODULE_PATH.parent / "__init__.py").resolve() + + def test_initialize_creates_worker_workspace(tmp_path, monkeypatch): module = load_mcp_backend_module() backend = module.MCPBackend(config=build_backend_config(tmp_path)) From f5e8ce7bd42beb0a715fa77d50886811f1f779d0 Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Thu, 16 Apr 2026 15:56:20 +0800 Subject: [PATCH 18/39] Align MCP test and desktop env requirements with main --- .../utils/desktop_env/server/requirements.txt | 2 +- sandbox/tests/test_mcp_backend.py | 37 ++++--------------- 2 files changed, 9 insertions(+), 30 deletions(-) diff --git a/sandbox/server/backends/resources/utils/desktop_env/server/requirements.txt b/sandbox/server/backends/resources/utils/desktop_env/server/requirements.txt index 65240d9..da9a7de 100644 --- a/sandbox/server/backends/resources/utils/desktop_env/server/requirements.txt +++ b/sandbox/server/backends/resources/utils/desktop_env/server/requirements.txt @@ -7,6 +7,6 @@ requests-toolbelt flask numpy lxml -cssselect pygame pywinauto + diff --git a/sandbox/tests/test_mcp_backend.py b/sandbox/tests/test_mcp_backend.py index 3b5fd8a..af0c1aa 100644 --- a/sandbox/tests/test_mcp_backend.py +++ b/sandbox/tests/test_mcp_backend.py @@ -3,7 +3,6 @@ """ import asyncio -import importlib import importlib.util import sys import types @@ -23,27 +22,18 @@ / "mcp" / "toolathlon_gym.py" ) - - -def remove_resources_modules(): - package_name = "sandbox.server.backends.resources" - for module_name in list(sys.modules): - if module_name == package_name or module_name.startswith(f"{package_name}."): - sys.modules.pop(module_name, None) - - def 
load_mcp_backend_module(): - remove_resources_modules() - package_name = "sandbox.server.backends.resources" - package = types.ModuleType(package_name) - package.__path__ = [str(MODULE_PATH.parent.parent)] - sys.modules[package_name] = package + if package_name not in sys.modules: + package = types.ModuleType(package_name) + package.__path__ = [str(MODULE_PATH.parent.parent)] + sys.modules[package_name] = package mcp_package_name = f"{package_name}.mcp" - mcp_package = types.ModuleType(mcp_package_name) - mcp_package.__path__ = [str(MODULE_PATH.parent)] - sys.modules[mcp_package_name] = mcp_package + if mcp_package_name not in sys.modules: + mcp_package = types.ModuleType(mcp_package_name) + mcp_package.__path__ = [str(MODULE_PATH.parent)] + sys.modules[mcp_package_name] = mcp_package spec = importlib.util.spec_from_file_location( f"{mcp_package_name}.toolathlon_gym", @@ -91,17 +81,6 @@ def test_bind_server_registers_manifest_tools(tmp_path): assert fake_server._tool_resource_types["mcp:filesystem.list_directory"] == "mcp" -def test_resources_package_exports_mcp_backend(): - remove_resources_modules() - - resources = importlib.import_module("sandbox.server.backends.resources") - mcp_module = importlib.import_module("sandbox.server.backends.resources.mcp") - - assert resources.MCPBackend is mcp_module.MCPBackend - assert resources.ToolathlonGymBackend is mcp_module.ToolathlonGymBackend - assert Path(resources.__file__).resolve() == (MODULE_PATH.parent.parent / "__init__.py").resolve() - - def test_initialize_creates_worker_workspace(tmp_path, monkeypatch): module = load_mcp_backend_module() backend = module.ToolathlonGymBackend(config=build_backend_config(tmp_path)) From 6876b069338a0364cc7218890227d8319da14be3 Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Thu, 16 Apr 2026 15:58:19 +0800 Subject: [PATCH 19/39] Restore MCP backend test formatting from main --- sandbox/tests/test_mcp_backend.py | 3 +++ 1 file changed, 3 insertions(+) diff --git 
a/sandbox/tests/test_mcp_backend.py b/sandbox/tests/test_mcp_backend.py index af0c1aa..77cb58f 100644 --- a/sandbox/tests/test_mcp_backend.py +++ b/sandbox/tests/test_mcp_backend.py @@ -22,6 +22,8 @@ / "mcp" / "toolathlon_gym.py" ) + + def load_mcp_backend_module(): package_name = "sandbox.server.backends.resources" if package_name not in sys.modules: @@ -29,6 +31,7 @@ def load_mcp_backend_module(): package.__path__ = [str(MODULE_PATH.parent.parent)] sys.modules[package_name] = package + # Register the mcp sub-package so ``from .client import ...`` resolves. mcp_package_name = f"{package_name}.mcp" if mcp_package_name not in sys.modules: mcp_package = types.ModuleType(mcp_package_name) From c1f6bf6473a1f1f37b509c0e41e89fbe0d44aab0 Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Thu, 16 Apr 2026 16:29:09 +0800 Subject: [PATCH 20/39] Fix vendored code bash and grep failures --- .../resources/code_vendor/file_tools.py | 36 ++++++++++--- sandbox/tests/test_code_vendor_tools.py | 50 +++++++++++++++++++ 2 files changed, 80 insertions(+), 6 deletions(-) diff --git a/sandbox/server/backends/resources/code_vendor/file_tools.py b/sandbox/server/backends/resources/code_vendor/file_tools.py index 4396089..1a61ff2 100644 --- a/sandbox/server/backends/resources/code_vendor/file_tools.py +++ b/sandbox/server/backends/resources/code_vendor/file_tools.py @@ -49,9 +49,9 @@ async def call(self, args: dict[str, Any], ctx: Any) -> str: output = _decode_text_mode_output(stdout_bytes) stderr = _decode_text_mode_output(stderr_bytes) - if stderr: - output += f"\n[stderr]:\n{stderr}" - return output.strip() or "(no output)" + if proc.returncode: + return _format_command_error("bash", proc.returncode, output, stderr) + return _format_command_output(output, stderr) def _decode_text_mode_output(data: bytes | None) -> str: @@ -69,6 +69,26 @@ def _decode_text_mode_output(data: bytes | None) -> str: text_stream.detach() +def _format_command_output(stdout: str, stderr: str) -> 
str: + output = stdout + if stderr: + output += f"\n[stderr]:\n{stderr}" if output else f"[stderr]:\n{stderr}" + return output.strip() or "(no output)" + + +def _format_command_error(tool_name: str, returncode: int, stdout: str, stderr: str) -> str: + if returncode < 0: + status = f"signal {-returncode}" + else: + status = f"exit status {returncode}" + + summary = f"Error: {tool_name} command failed with {status}" + details = _format_command_output(stdout, stderr) + if details == "(no output)": + return summary + return f"{summary}\n{details}" + + class ReadTool(Tool): name = "Read" description = "Read a file and return its contents with line numbers." @@ -149,12 +169,16 @@ def input_schema(self) -> dict[str, Any]: async def call(self, args: dict[str, Any], ctx: Any) -> str: base = Path(args.get("path", ctx.cwd)) - cmd = ["grep", "-r", "-n", args["pattern"]] + cmd = ["grep", "-r", "-n"] if "glob" in args: cmd += ["--include", args["glob"]] - cmd.append(str(base)) + cmd += ["--", args["pattern"], str(base)] result = subprocess.run(cmd, capture_output=True, text=True) - return result.stdout or "(no matches)" + if result.returncode == 0: + return result.stdout or "(no matches)" + if result.returncode == 1: + return "(no matches)" + return _format_command_error("grep", result.returncode, result.stdout, result.stderr) def is_read_only(self, args: dict[str, Any]) -> bool: del args diff --git a/sandbox/tests/test_code_vendor_tools.py b/sandbox/tests/test_code_vendor_tools.py index 940f936..060252a 100644 --- a/sandbox/tests/test_code_vendor_tools.py +++ b/sandbox/tests/test_code_vendor_tools.py @@ -189,6 +189,34 @@ def test_grep_tool_searches_recursively_without_glob_filter(tmp_path): } +def test_grep_tool_returns_error_prefix_for_invalid_pattern(tmp_path): + target = tmp_path / "sample.txt" + target.write_text("alpha\nbeta\n", encoding="utf-8") + + result = call_tool( + GrepTool(), + {"pattern": "[", "path": str(tmp_path)}, + make_ctx(tmp_path), + ) + + assert 
result.startswith("Error:") + assert "exit status 2" in result + assert "[stderr]:" in result + + +def test_grep_tool_treats_option_like_pattern_as_search_pattern(tmp_path): + target = tmp_path / "sample.txt" + target.write_text("--help\nalpha\n", encoding="utf-8") + + result = call_tool( + GrepTool(), + {"pattern": "--help", "path": str(tmp_path)}, + make_ctx(tmp_path), + ) + + assert result == f"{target}:1:--help\n" + + def test_bash_tool_combines_stdout_and_stderr(tmp_path): result = call_tool( BashTool(), @@ -220,6 +248,28 @@ def test_bash_tool_matches_text_mode_newline_normalization(tmp_path): assert result == "a\nb" +def test_bash_tool_returns_error_prefix_for_nonzero_exit_status(tmp_path): + result = call_tool( + BashTool(), + { + "command": ( + f"{shlex.quote(sys.executable)} -c " + "\"import sys; " + "print('out'); " + "print('err', file=sys.stderr); " + "raise SystemExit(7)\"" + ) + }, + make_ctx(tmp_path), + ) + + assert result.startswith("Error:") + assert "exit status 7" in result + assert "out" in result + assert "[stderr]:" in result + assert "err" in result + + def test_bash_tool_cancellation_stops_background_command(tmp_path): marker = tmp_path / "marker.txt" From b6d8d7b7425f8fccd55f7ce8a1369d588b91ebd1 Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Thu, 16 Apr 2026 16:39:27 +0800 Subject: [PATCH 21/39] Add code backend failure regressions --- sandbox/tests/test_code_backend.py | 66 +++++++++++++++++++++++++ sandbox/tests/test_code_vendor_tools.py | 13 +++++ 2 files changed, 79 insertions(+) diff --git a/sandbox/tests/test_code_backend.py b/sandbox/tests/test_code_backend.py index 65f50cf..cff5e4c 100644 --- a/sandbox/tests/test_code_backend.py +++ b/sandbox/tests/test_code_backend.py @@ -244,6 +244,72 @@ def test_tool_executor_runs_bash_via_vendored_tool(tmp_path): assert result["data"] == str(runtime_workspace.resolve(strict=False)) +def test_tool_executor_returns_business_failure_for_vendored_grep_error(tmp_path): + _, 
backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + sample_file = runtime_workspace / "sample.txt" + sample_file.write_text("alpha\nbeta\n", encoding="utf-8") + executor = build_executor( + fake_server, + { + "session_id": "code-session-grep-error", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:grep", + params={"pattern": "[", "path": str(runtime_workspace)}, + worker_id="worker-1", + trace_id="trace-grep-error", + ) + + assert result["code"] == ErrorCode.BUSINESS_FAILURE + assert result["message"].startswith("Error:") + assert "exit status 2" in result["message"] + assert "[stderr]:" in result["message"] + + +def test_tool_executor_returns_business_failure_for_vendored_bash_error(tmp_path): + _, backend = build_backend(tmp_path) + fake_server = bind_backend_tools(backend) + runtime_workspace = tmp_path / "agentflow_code" / "worker-1" + runtime_workspace.mkdir(parents=True) + executor = build_executor( + fake_server, + { + "session_id": "code-session-bash-error", + "data": {"workspace": str(runtime_workspace)}, + }, + ) + + result = execute_tool( + executor, + "code:bash", + params={ + "command": ( + f"{sys.executable} -c " + "\"import sys; " + "print('out'); " + "print('err', file=sys.stderr); " + "raise SystemExit(7)\"" + ) + }, + worker_id="worker-1", + trace_id="trace-bash-error", + ) + + assert result["code"] == ErrorCode.BUSINESS_FAILURE + assert result["message"].startswith("Error:") + assert "exit status 7" in result["message"] + assert "out" in result["message"] + assert "[stderr]:" in result["message"] + assert "err" in result["message"] + + def test_tool_executor_non_bash_timeout_uses_standard_error_handling(tmp_path): _, backend = build_backend(tmp_path) fake_server = bind_backend_tools(backend) diff --git a/sandbox/tests/test_code_vendor_tools.py 
b/sandbox/tests/test_code_vendor_tools.py index 060252a..4bd5893 100644 --- a/sandbox/tests/test_code_vendor_tools.py +++ b/sandbox/tests/test_code_vendor_tools.py @@ -189,6 +189,19 @@ def test_grep_tool_searches_recursively_without_glob_filter(tmp_path): } +def test_grep_tool_returns_no_matches_for_exit_code_one(tmp_path): + target = tmp_path / "sample.txt" + target.write_text("alpha\nbeta\n", encoding="utf-8") + + result = call_tool( + GrepTool(), + {"pattern": "missing", "path": str(tmp_path)}, + make_ctx(tmp_path), + ) + + assert result == "(no matches)" + + def test_grep_tool_returns_error_prefix_for_invalid_pattern(tmp_path): target = tmp_path / "sample.txt" target.write_text("alpha\nbeta\n", encoding="utf-8") From 35149a5ecbf94ec628da62cefbedf673dba488a3 Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Thu, 16 Apr 2026 16:47:12 +0800 Subject: [PATCH 22/39] Quote interpreter in bash backend test --- sandbox/tests/test_code_backend.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sandbox/tests/test_code_backend.py b/sandbox/tests/test_code_backend.py index cff5e4c..fb48625 100644 --- a/sandbox/tests/test_code_backend.py +++ b/sandbox/tests/test_code_backend.py @@ -5,6 +5,7 @@ import asyncio import importlib import os +import shlex import sys from pathlib import Path @@ -291,7 +292,7 @@ def test_tool_executor_returns_business_failure_for_vendored_bash_error(tmp_path "code:bash", params={ "command": ( - f"{sys.executable} -c " + f"{shlex.quote(sys.executable)} -c " "\"import sys; " "print('out'); " "print('err', file=sys.stderr); " From 4f42a5f995b44dd8e12f845ed413aeb0280f100c Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Mon, 20 Apr 2026 19:40:14 +0800 Subject: [PATCH 23/39] docs: add MCP and Coding examples design spec --- ...26-04-20-mcp-and-coding-examples-design.md | 431 ++++++++++++++++++ 1 file changed, 431 insertions(+) create mode 100644 
docs/superpowers/specs/2026-04-20-mcp-and-coding-examples-design.md diff --git a/docs/superpowers/specs/2026-04-20-mcp-and-coding-examples-design.md b/docs/superpowers/specs/2026-04-20-mcp-and-coding-examples-design.md new file mode 100644 index 0000000..31cfc0b --- /dev/null +++ b/docs/superpowers/specs/2026-04-20-mcp-and-coding-examples-design.md @@ -0,0 +1,431 @@ +## MCP and Coding Examples Design + +Date: 2026-04-20 +Status: Proposed for planning + +## Summary + +AgentFlow now has MCP and `code` backends, but the repository still lacks official example guides and matching example assets for them. This design adds two new examples that stay aligned with the current example set in structure, tone, and scope: + +- `examples/MCPAgent.md` +- `examples/CodingAgent.md` + +Both examples will follow the lighter three-step pattern already used by `DSAgent`: + +- Step 1: Start the Sandbox Server +- Step 2: Synthesize QA Data +- Step 3: Synthesize Trajectory Data + +They will ship with the configs, seeds, and benchmark files needed to make those steps runnable. + +## Problem Statement + +The repository already contains: + +- a working MCP backend with 25 Toolathlon-GYM MCP servers +- a working `code` backend with six coding tools +- sandbox config templates for both backends +- tests that prove both backends work at the backend level + +What is missing is the user-facing example layer: + +- no official `MCPAgent` example document +- no official `CodingAgent` example document +- no matching synthesis configs +- no matching trajectory configs +- no seeds for either example +- no benchmark data for MCP rollout +- no committed benchmark data for coding rollout + +Without these assets, the new backends are discoverable in code but not presented as first-class AgentFlow example workflows. + +## Goals + +- Add official MCP and Coding example guides under `examples/`. +- Keep both guides stylistically aligned with the current example set. 
+- Keep document granularity aligned with current examples rather than exposing design or orchestration internals. +- Limit both examples to the currently practical scope: sandbox startup, QA synthesis, and trajectory rollout. +- Add the configs and data assets needed to support those examples. +- Use demo-scale seeds and benchmarks that are easy to run and verify. +- Make MCP example tasks operate against the initialized Toolathlon-GYM mock database. +- Make Coding example tasks operate against a known demo repository copied into the coding workspace. + +## Non-Goals + +- Do not turn either example into a full Toolathlon task replay framework. +- Do not document or expose internal validation strategy, subagent orchestration, or workspace isolation rationale in the example docs. +- Do not add Step 4 and Step 5 sections for training, deployment, or infer/eval. +- Do not introduce extra registry files or config-generation layers just to organize domains. +- Do not require MCP `task_dir`, `initial_workspace`, or `preprocess` flows for the example path. +- Do not make CodingAgent depend on an arbitrary external repository whose contents are unknown to the repository. + +## User-Facing Outcome + +After this work, the repository will present MCP and Coding the same way it already presents RAG, Doc, DS, and Text2SQL: + +- a dedicated example document +- a dedicated sandbox config entry point +- a synthesis config +- a trajectory config +- seed data +- benchmark data where rollout needs it + +The examples will read like the existing examples and will not require readers to understand internal backend architecture. + +## Core Decisions + +### 1. Both new examples use the three-step DS-style structure + +`MCPAgent.md` and `CodingAgent.md` will both mirror the scope of `examples/DSAgent.md` rather than the five-step examples. 
+ +Each document will include: + +- Overview +- Prerequisites +- Pipeline Overview +- Step 1: Start the Sandbox Server +- Step 2: Synthesize QA Data +- Step 3: Synthesize Trajectory Data +- Configuration Reference +- FAQ + +Each document will explicitly note that the repository currently covers data synthesis and trajectory rollout for that example, but not the later training/deployment/infer stages as an official example workflow. + +### 2. MCPAgent is a domain-level demo over the initialized Toolathlon-GYM database + +The MCP example will use Toolathlon-GYM as the backing environment, but it will not replay full Toolathlon task directories. + +Instead, it will use: + +- the initialized Toolathlon-GYM mock PostgreSQL database +- the MCP servers exposed through AgentFlow +- small domain-level seeds +- small domain-level rollout benchmarks + +This keeps the example aligned with the rest of the repository's example style while still using the real MCP domain data. + +### 3. MCPAgent reuses `configs/sandbox-server/mcp_config.json` + +The existing MCP sandbox config path remains the canonical entry point: + +- `configs/sandbox-server/mcp_config.json` + +This file will be updated so its `enabled_mcp_servers` covers the full MCP server set used by the backend and Toolathlon-GYM integration path. + +No separate `mcp_all_config.json` or metadata registry file will be introduced. + +### 4. MCP tool exposure is defined by server wildcard, not hand-picked tool names + +For each MCP domain config, `available_tools` will expose tools via server wildcard patterns such as: + +- `mcp:canvas.*` +- `mcp:snowflake.*` +- `mcp:filesystem.*` + +This avoids silent omission of tools from an included MCP server namespace and keeps the config surface simple. + +### 5. 
MCP resource init config stays minimal + +The MCP backend supports session init fields such as: + +- `task_dir` +- `copy_initial_workspace` +- `run_preprocess` +- `launch_time` + +Those fields are only needed when running task-directory-style Toolathlon tasks. + +For the MCP example path in this design, the session only needs a normal MCP workspace plus access to the initialized mock database through the configured MCP servers. Therefore: + +- `resource_types` will include `["mcp"]` +- `resource_init_configs.mcp.content` will be omitted or empty in example configs + +### 6. CodingAgent uses a repository-local demo repo via `source_dir` + +The coding backend always creates its own workspace, but a meaningful coding task needs actual repository contents inside that workspace. + +Therefore the official Coding example will use: + +- a small demo repository committed inside AgentFlow +- `resource_types=["code"]` +- `resource_init_configs["code"]["content"]["source_dir"]` pointing to that demo repository + +Users can later replace `source_dir` with their own repository path, but the official example will ship with a known default so its seeds and benchmark remain correct. + +### 7. CodingAgent uses demo-scale mixed tasks + +CodingAgent will use a small mixed task set: + +- read-only repository inspection tasks +- one or more controlled edit tasks with straightforward verification + +This preserves the intended coding flavor without making the example depend on large or fragile repository setups. + +### 8. Keep the docs at current example granularity + +The example docs should not explain design trade-offs, internal isolation, subagent strategy, or backend reasoning unless the current example set already does so. 
+ +They should look and read like the existing repository examples: + +- concrete commands +- config file references +- short explanations of required inputs +- key config field summaries +- brief FAQs + +## MCPAgent Design + +### Covered domains + +The MCP example will cover the six data-rich Toolathlon-GYM domains already reflected in current MCP integration smoke tests: + +- `canvas` +- `snowflake` +- `woocommerce` +- `yahoo_finance` +- `youtube` +- `train` + +The docs will cover all six domains, but each domain is still a small demo workflow rather than a long end-to-end enterprise task. + +### Files to add + +Add: + +- `examples/MCPAgent.md` +- `configs/synthesis/mcp_canvas_config.json` +- `configs/synthesis/mcp_snowflake_config.json` +- `configs/synthesis/mcp_woocommerce_config.json` +- `configs/synthesis/mcp_yahoo_finance_config.json` +- `configs/synthesis/mcp_youtube_config.json` +- `configs/synthesis/mcp_train_config.json` +- `configs/trajectory/mcp_canvas_trajectory.json` +- `configs/trajectory/mcp_snowflake_trajectory.json` +- `configs/trajectory/mcp_woocommerce_trajectory.json` +- `configs/trajectory/mcp_yahoo_finance_trajectory.json` +- `configs/trajectory/mcp_youtube_trajectory.json` +- `configs/trajectory/mcp_train_trajectory.json` +- `seeds/mcp/canvas_seeds.jsonl` +- `seeds/mcp/snowflake_seeds.jsonl` +- `seeds/mcp/woocommerce_seeds.jsonl` +- `seeds/mcp/yahoo_finance_seeds.jsonl` +- `seeds/mcp/youtube_seeds.jsonl` +- `seeds/mcp/train_seeds.jsonl` +- `benchmark/mcp_canvas_benchmark.jsonl` +- `benchmark/mcp_snowflake_benchmark.jsonl` +- `benchmark/mcp_woocommerce_benchmark.jsonl` +- `benchmark/mcp_yahoo_finance_benchmark.jsonl` +- `benchmark/mcp_youtube_benchmark.jsonl` +- `benchmark/mcp_train_benchmark.jsonl` + +Modify: + +- `configs/sandbox-server/mcp_config.json` + +### Synthesis config shape + +Each `configs/synthesis/mcp_<domain>_config.json` will follow the same structure as existing synthesis configs and will include: + +- model settings +- sandbox 
settings +- `resource_types: ["mcp"]` +- `available_tools` using MCP server wildcards +- domain-specific `sampling_tips` +- domain-specific `synthesis_tips` +- small `qa_examples` +- `seeds_file` +- `output_dir` + +### Trajectory config shape + +Each `configs/trajectory/mcp_<domain>_trajectory.json` will follow the same shape as existing rollout trajectory configs and will include: + +- `benchmark_name` +- model settings +- sandbox settings +- `resource_types: ["mcp"]` +- `available_tools` using MCP server wildcards +- `system_prompt` +- `data_path` +- `output_dir` +- `save_results` +- `save_trajectories` +- `trajectory_only: true` +- `evaluate_results: false` + +### MCP domain tool exposure + +Planned MCP wildcard exposure: + +- `canvas` + - `mcp:canvas.*` + - `mcp:filesystem.*` + +- `snowflake` + - `mcp:snowflake.*` + - `mcp:filesystem.*` + +- `woocommerce` + - `mcp:woocommerce.*` + - `mcp:filesystem.*` + +- `yahoo_finance` + - `mcp:yahoo-finance.*` + - `mcp:filesystem.*` + +- `youtube` + - `mcp:youtube.*` + - `mcp:youtube-transcript.*` + - `mcp:filesystem.*` + +- `train` + - `mcp:rail_12306.*` + - `mcp:filesystem.*` + +### Seeds and benchmark style + +MCP seeds and benchmarks will be demo-scale and domain-focused. + +They should validate that the MCP backend and domain server set work cleanly inside AgentFlow, not replicate the full complexity of Toolathlon task packs. 
+ +Expected MCP task style: + +- query real mock data from the target domain +- optionally save a result artifact into the workspace through filesystem tools +- produce answers that are easy to verify in a small benchmark + +Examples of target task shape: + +- list a small set of course or user information from Canvas +- query a small Snowflake-backed table result +- inspect WooCommerce customer or order data +- fetch Yahoo Finance stock information +- search YouTube content or transcript metadata +- look up railway station or route information + +### MCP example document shape + +`examples/MCPAgent.md` will stay at the same granularity as current examples: + +- one shared sandbox setup section +- one synthesis step covering all six domain configs +- one trajectory step covering all six domain configs +- a compact configuration reference +- a short FAQ + +It will not explain internal domain orchestration, workspace strategy, or design rationale. + +## CodingAgent Design + +### Files to add + +Add: + +- `examples/CodingAgent.md` +- `configs/synthesis/code_config.json` +- `configs/trajectory/code_trajectory.json` +- `seeds/code/seeds.jsonl` +- `seeds/code/seed/demo_repo/...` +- `benchmark/code_benchmark.jsonl` + +The demo repository under `seeds/code/seed/demo_repo/` should be small, stable, and easy to understand. 
+ +### Synthesis config shape + +`configs/synthesis/code_config.json` will follow existing synthesis config structure and include: + +- model settings +- sandbox settings +- `resource_types: ["code"]` +- `resource_init_configs.code.content.source_dir` +- `available_tools: ["code-*"]` +- coding-specific `sampling_tips` +- coding-specific `synthesis_tips` +- small `qa_examples` +- `seeds_file` +- `output_dir` + +### Trajectory config shape + +`configs/trajectory/code_trajectory.json` will follow existing rollout config structure and include: + +- `benchmark_name` +- model settings +- sandbox settings +- `resource_types: ["code"]` +- `resource_init_configs.code.content.source_dir` +- `available_tools: ["code-*"]` +- coding-specific `system_prompt` +- `data_path` +- `output_dir` +- `save_results` +- `save_trajectories` +- `trajectory_only: true` +- `evaluate_results: false` + +### Demo repository shape + +The demo repository should be intentionally small and support both task types: + +- repository inspection +- controlled edit and verification + +The repo should include a few files such as: + +- a small entry file +- one config file +- one or two helper modules +- at least one easily verifiable placeholder or token + +The goal is not realism through size. The goal is stable, example-quality coding tasks. + +### Seeds and benchmark style + +Coding seeds should focus on repository understanding prompts. + +Coding benchmark tasks should be few and simple, mixing: + +- read-only tasks such as locating files, reading configuration values, or identifying relationships +- edit tasks such as replacing a placeholder string or updating a simple setting + +The benchmark should be authored against the committed demo repository so expected answers remain stable. 
+ +### Coding example document shape + +`examples/CodingAgent.md` will mirror the style and scope of `examples/DSAgent.md`: + +- Overview +- Prerequisites +- Pipeline Overview +- Step 1: Start the Sandbox Server +- Step 2: Synthesize QA Data +- Step 3: Synthesize Trajectory Data +- Configuration Reference +- FAQ + +It will explicitly state that the repository currently covers data synthesis and trajectory rollout for this coding example, but not later training/deployment/infer steps as part of the official example. + +## Validation Expectations + +The implementation should be considered correct only if: + +- the new example docs match the style and granularity of current examples +- the new configs parse successfully +- MCP synthesis and rollout configs align with the real MCP backend surface +- Coding synthesis and rollout configs align with the real code backend surface +- the demo seeds and benchmarks are internally consistent with the assets they target +- representative runs can be executed by following the example documents + +## Open Questions Resolved + +- Use `configs/sandbox-server/mcp_config.json` directly: yes +- Add a separate MCP registry file: no +- Use server wildcards instead of hand-picked MCP tool names: yes +- Use Toolathlon task-directory initialization for the example path: no +- Use Toolathlon-GYM mock database as the MCP data source: yes +- Add official infer/eval steps for MCP or Coding examples: no +- Use a repository-local demo repo for CodingAgent: yes + +## Implementation Readiness + +This design is ready for implementation planning. The work is focused, bounded, and does not require redesigning backend behavior. The main deliverables are user-facing docs, example configs, and small example data assets. 
From e758ef226f28f118b250aaef92f60b40b1f0a975 Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Mon, 20 Apr 2026 19:57:28 +0800 Subject: [PATCH 24/39] docs: refine MCP and Coding examples spec --- ...26-04-20-mcp-and-coding-examples-design.md | 96 ++++++++++++++++--- 1 file changed, 84 insertions(+), 12 deletions(-) diff --git a/docs/superpowers/specs/2026-04-20-mcp-and-coding-examples-design.md b/docs/superpowers/specs/2026-04-20-mcp-and-coding-examples-design.md index 31cfc0b..2bdda4f 100644 --- a/docs/superpowers/specs/2026-04-20-mcp-and-coding-examples-design.md +++ b/docs/superpowers/specs/2026-04-20-mcp-and-coding-examples-design.md @@ -1,7 +1,7 @@ ## MCP and Coding Examples Design Date: 2026-04-20 -Status: Proposed for planning +Status: Approved for planning ## Summary @@ -64,7 +64,7 @@ Without these assets, the new backends are discoverable in code but not presente After this work, the repository will present MCP and Coding the same way it already presents RAG, Doc, DS, and Text2SQL: - a dedicated example document -- a dedicated sandbox config entry point +- a sandbox config entry point - a synthesis config - a trajectory config - seed data @@ -110,7 +110,26 @@ The existing MCP sandbox config path remains the canonical entry point: - `configs/sandbox-server/mcp_config.json` -This file will be updated so its `enabled_mcp_servers` covers the full MCP server set used by the backend and Toolathlon-GYM integration path. +This file will be updated so its default `enabled_mcp_servers` matches the exact server subset needed by the official example domains, rather than the full 25-server backend surface: + +- `canvas` +- `snowflake` +- `woocommerce` +- `yahoo-finance` +- `youtube` +- `youtube-transcript` +- `rail_12306` +- `filesystem` + +This keeps Step 1 aligned with the example scope while still allowing all six documented domains to run from the shared MCP sandbox entry point. 
+ +This is an intentional example-oriented default, not a removal of backend capability. The current checked-in `mcp_config.json` already enables only a subset of servers today, and full-surface MCP usage will remain available by expanding `enabled_mcp_servers` in the same file or in a user-local copy outside the official examples. + +The same config will also define an explicit MCP server path contract so the checked-in MCP YAMLs can resolve `${local_servers_paths}` at runtime. The planned default is an environment-backed path such as: + +- `mcp_servers_path: "${TOOLATHLON_GYM_ROOT}/mcp_servers"` + +The implementation will rely on the existing MCP backend translation layer: `ToolathlonGymBackend` passes `mcp_servers_path` into the MCP YAML loader, and that loader substitutes the value into `${local_servers_paths}` when resolving each bundled server YAML. No separate `mcp_all_config.json` or metadata registry file will be introduced. @@ -148,9 +167,9 @@ Therefore the official Coding example will use: - a small demo repository committed inside AgentFlow - `resource_types=["code"]` -- `resource_init_configs["code"]["content"]["source_dir"]` pointing to that demo repository +- `resource_init_configs["code"]["content"]["source_dir"]` pointing to that demo repository through the explicit repo-root contract `${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo` -Users can later replace `source_dir` with their own repository path, but the official example will ship with a known default so its seeds and benchmark remain correct. +AgentFlow's config loader already expands `${VAR}` placeholders before backend initialization, so no new path resolver is needed for this contract. The official docs will require exporting `AGENTFLOW_REPO_ROOT` after `cd AgentFlow`, and Step 2 / Step 3 will use that variable consistently. 
Users can later replace `source_dir` with their own repository path, but the official example will ship with a known default so its seeds and benchmark remain correct. ### 7. CodingAgent uses demo-scale mixed tasks @@ -188,6 +207,32 @@ The MCP example will cover the six data-rich Toolathlon-GYM domains already refl The docs will cover all six domains, but each domain is still a small demo workflow rather than a long end-to-end enterprise task. +Server name mapping will follow the current MCP backend naming: + +- `yahoo_finance` uses MCP server `yahoo-finance` +- `train` uses MCP server `rail_12306` + +### Sandbox prerequisites and server subset + +`examples/MCPAgent.md` will document the minimum local prerequisites needed for Step 1 to be runnable: + +- a local `toolathlon_gym` checkout that has already completed its own setup and is running before AgentFlow starts +- `TOOLATHLON_GYM_ROOT` pointing to that checkout +- the MCP server bundle reachable at `${TOOLATHLON_GYM_ROOT}/mcp_servers` +- required local runtimes such as `node` and `uv` +- the following checked-in local defaults in `configs/sandbox-server/mcp_config.json`: + - `PGHOST=localhost` + - `PGPORT=5432` + - `PGUSER=eigent` + - `PGPASSWORD=camel` + - `PGDATABASE=toolathlon_gym` + - `CANVAS_DOMAIN=localhost:8080` + - `WORDPRESS_SITE_URL=http://localhost:8081` + +AgentFlow will not bootstrap the Toolathlon-GYM services itself in the official example. If a local setup differs from those defaults, the doc will show them as explicit override points in `mcp_config.json`. + +The checked-in MCP sandbox config will enable only the shared example subset listed in Core Decision 3, so warmup behavior matches the domains covered by the example doc. 
+ ### Files to add Add: @@ -236,6 +281,8 @@ Each `configs/synthesis/mcp__config.json` will follow the same structure - `seeds_file` - `output_dir` +These configs will rely on the shared MCP sandbox startup path above rather than redefining server startup details per domain. + ### Trajectory config shape Each `configs/trajectory/mcp__trajectory.json` will follow the same shape as existing rollout trajectory configs and will include: @@ -294,6 +341,8 @@ Expected MCP task style: - optionally save a result artifact into the workspace through filesystem tools - produce answers that are easy to verify in a small benchmark +Benchmark correctness will be defined by the final textual answer in each benchmark row. Workspace artifact creation is allowed as an illustrative side effect, but it is not required for benchmark success and will not be treated as a scoring criterion in the official example data. + Examples of target task shape: - list a small set of course or user information from Canvas @@ -317,6 +366,14 @@ It will not explain internal domain orchestration, workspace strategy, or design ## CodingAgent Design +### Sandbox entry point + +`examples/CodingAgent.md` will reuse the existing coding sandbox config entry point: + +- `configs/sandbox-server/code_config.json` + +Step 1 in the Coding example will start that config directly, matching the current repository pattern of reusing a checked-in sandbox config rather than introducing a second coding sandbox file. 
+ ### Files to add Add: @@ -325,7 +382,11 @@ Add: - `configs/synthesis/code_config.json` - `configs/trajectory/code_trajectory.json` - `seeds/code/seeds.jsonl` -- `seeds/code/seed/demo_repo/...` +- `seeds/code/seed/demo_repo/README.md` +- `seeds/code/seed/demo_repo/app.py` +- `seeds/code/seed/demo_repo/config/app_config.json` +- `seeds/code/seed/demo_repo/lib/helpers.py` +- `seeds/code/seed/demo_repo/tests/smoke_test.py` - `benchmark/code_benchmark.jsonl` The demo repository under `seeds/code/seed/demo_repo/` should be small, stable, and easy to understand. @@ -337,7 +398,7 @@ The demo repository under `seeds/code/seed/demo_repo/` should be small, stable, - model settings - sandbox settings - `resource_types: ["code"]` -- `resource_init_configs.code.content.source_dir` +- `resource_init_configs.code.content.source_dir` using the explicit repo-root contract `${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo` - `available_tools: ["code-*"]` - coding-specific `sampling_tips` - coding-specific `synthesis_tips` @@ -353,7 +414,7 @@ The demo repository under `seeds/code/seed/demo_repo/` should be small, stable, - model settings - sandbox settings - `resource_types: ["code"]` -- `resource_init_configs.code.content.source_dir` +- `resource_init_configs.code.content.source_dir` using the same explicit repo-root contract `${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo` - `available_tools: ["code-*"]` - coding-specific `system_prompt` - `data_path` @@ -372,10 +433,11 @@ The demo repository should be intentionally small and support both task types: The repo should include a few files such as: -- a small entry file -- one config file -- one or two helper modules -- at least one easily verifiable placeholder or token +- `README.md` describing the tiny app +- `app.py` as the main entry file +- `config/app_config.json` with one or two settings used by the app +- `lib/helpers.py` with at least one helper imported by `app.py` +- `tests/smoke_test.py` for a minimal verification path 
The goal is not realism through size. The goal is stable, example-quality coding tasks. @@ -390,6 +452,14 @@ Coding benchmark tasks should be few and simple, mixing: The benchmark should be authored against the committed demo repository so expected answers remain stable. +Benchmark contract: + +- read-only tasks will use the standard `id` + `question` + `answer` shape +- edit tasks will still run under `trajectory_only: true` and `evaluate_results: false`, so they are for trajectory capture rather than auto-grading +- edit-task rows will include a short expected completion statement in `answer` plus metadata such as `target_files` and `check_command` to document the intended post-run verification path +- those extra verification fields will live under benchmark `metadata`, so existing rollout loaders can safely ignore them +- the recommended verification path for edit tasks will be the committed `tests/smoke_test.py`, not rollout-time automatic scoring + ### Coding example document shape `examples/CodingAgent.md` will mirror the style and scope of `examples/DSAgent.md`: @@ -411,10 +481,12 @@ The implementation should be considered correct only if: - the new example docs match the style and granularity of current examples - the new configs parse successfully +- `configs/sandbox-server/mcp_config.json` resolves MCP server executables through the documented `TOOLATHLON_GYM_ROOT` contract - MCP synthesis and rollout configs align with the real MCP backend surface - Coding synthesis and rollout configs align with the real code backend surface - the demo seeds and benchmarks are internally consistent with the assets they target - representative runs can be executed by following the example documents +- Coding Step 1, QA synthesis, and rollout remain runnable when the documented `cd AgentFlow` plus `export AGENTFLOW_REPO_ROOT=$(pwd)` prerequisite is followed ## Open Questions Resolved From 85d449640124c5aa3384f5a9f8fba74edcaf56cb Mon Sep 17 00:00:00 2001 From: Dingxingdi 
<784318034@qq.com> Date: Mon, 20 Apr 2026 20:26:53 +0800 Subject: [PATCH 25/39] docs: correct MCP local_servers path in spec --- .../specs/2026-04-20-mcp-and-coding-examples-design.md | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/docs/superpowers/specs/2026-04-20-mcp-and-coding-examples-design.md b/docs/superpowers/specs/2026-04-20-mcp-and-coding-examples-design.md index 2bdda4f..e34502c 100644 --- a/docs/superpowers/specs/2026-04-20-mcp-and-coding-examples-design.md +++ b/docs/superpowers/specs/2026-04-20-mcp-and-coding-examples-design.md @@ -127,9 +127,12 @@ This is an intentional example-oriented default, not a removal of backend capabi The same config will also define an explicit MCP server path contract so the checked-in MCP YAMLs can resolve `${local_servers_paths}` at runtime. The planned default is an environment-backed path such as: -- `mcp_servers_path: "${TOOLATHLON_GYM_ROOT}/mcp_servers"` +- `mcp_servers_path: "${TOOLATHLON_GYM_ROOT}/local_servers"` The implementation will rely on the existing MCP backend translation layer: `ToolathlonGymBackend` passes `mcp_servers_path` into the MCP YAML loader, and that loader substitutes the value into `${local_servers_paths}` when resolving each bundled server YAML. +In other words, `mcp_servers_path` is the JSON config field name, while `${local_servers_paths}` is the existing MCP YAML placeholder name for the same `local_servers/` directory. + +The current repository version of `configs/sandbox-server/mcp_config.json` does not yet define `mcp_servers_path`; adding that field is part of this example work. No separate `mcp_all_config.json` or metadata registry file will be introduced. 
@@ -218,9 +221,9 @@ Server name mapping will follow the current MCP backend naming: - a local `toolathlon_gym` checkout that has already completed its own setup and is running before AgentFlow starts - `TOOLATHLON_GYM_ROOT` pointing to that checkout -- the MCP server bundle reachable at `${TOOLATHLON_GYM_ROOT}/mcp_servers` +- the MCP server bundle reachable at `${TOOLATHLON_GYM_ROOT}/local_servers` - required local runtimes such as `node` and `uv` -- the following checked-in local defaults in `configs/sandbox-server/mcp_config.json`: +- the following planned example defaults in `configs/sandbox-server/mcp_config.json`: - `PGHOST=localhost` - `PGPORT=5432` - `PGUSER=eigent` From e58bc0d5b69f5ad821ce79b76c8ee6cefff50ea8 Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Tue, 21 Apr 2026 09:35:15 +0800 Subject: [PATCH 26/39] test: lock MCP example sandbox contract --- configs/sandbox-server/mcp_config.json | 18 +++++++-- sandbox/tests/test_mcp_backend.py | 41 +++++++++++++++++--- sandbox/tests/test_mcp_client.py | 32 +++++++++++++++ sandbox/tests/test_sandbox_config_loading.py | 27 +++++++++++++ 4 files changed, 110 insertions(+), 8 deletions(-) diff --git a/configs/sandbox-server/mcp_config.json b/configs/sandbox-server/mcp_config.json index 18baf9c..639597b 100644 --- a/configs/sandbox-server/mcp_config.json +++ b/configs/sandbox-server/mcp_config.json @@ -10,14 +10,26 @@ "description": "Toolathlon-GYM MCP backend", "backend_class": "sandbox.server.backends.resources.mcp.toolathlon_gym.ToolathlonGymBackend", "config": { - "enabled_mcp_servers": ["filesystem", "terminal", "snowflake"], + "mcp_servers_path": "${TOOLATHLON_GYM_ROOT}/local_servers", + "enabled_mcp_servers": [ + "canvas", + "snowflake", + "woocommerce", + "yahoo-finance", + "youtube", + "youtube-transcript", + "rail_12306", + "filesystem" + ], "workspace_root": "${TOOLATHLON_WORKSPACE_ROOT:-/tmp/agentflow_mcp}", "env_overrides": { - "PGHOST": "${PGHOST:-toolathlon_pg}", + "PGHOST": 
"${PGHOST:-localhost}", "PGPORT": "${PGPORT:-5432}", "PGUSER": "${PGUSER:-eigent}", "PGPASSWORD": "${PGPASSWORD:-camel}", - "PGDATABASE": "${PGDATABASE:-toolathlon_gym}" + "PGDATABASE": "${PGDATABASE:-toolathlon_gym}", + "CANVAS_DOMAIN": "${CANVAS_DOMAIN:-localhost:8080}", + "WORDPRESS_SITE_URL": "${WORDPRESS_SITE_URL:-http://localhost:8081}" } } } diff --git a/sandbox/tests/test_mcp_backend.py b/sandbox/tests/test_mcp_backend.py index 77cb58f..2b3fd4a 100644 --- a/sandbox/tests/test_mcp_backend.py +++ b/sandbox/tests/test_mcp_backend.py @@ -536,7 +536,16 @@ def fake_load_mcp_process_config(**kwargs): assert created_clients[0].closed is True -def test_mcp_config_template_parses(): +def test_mcp_config_template_declares_example_server_subset(monkeypatch): + monkeypatch.delenv("TOOLATHLON_GYM_ROOT", raising=False) + monkeypatch.delenv("PGHOST", raising=False) + monkeypatch.delenv("PGPORT", raising=False) + monkeypatch.delenv("PGUSER", raising=False) + monkeypatch.delenv("PGPASSWORD", raising=False) + monkeypatch.delenv("PGDATABASE", raising=False) + monkeypatch.delenv("CANVAS_DOMAIN", raising=False) + monkeypatch.delenv("WORDPRESS_SITE_URL", raising=False) + loader = ConfigLoader() config_path = ( Path(__file__).resolve().parents[2] @@ -546,9 +555,31 @@ def test_mcp_config_template_parses(): ) config = loader.load(str(config_path)) + mcp_resource = config.resources["mcp"] + mcp_config = mcp_resource.config - assert "mcp" in config.resources - assert ( - config.resources["mcp"].backend_class - == "sandbox.server.backends.resources.mcp.toolathlon_gym.ToolathlonGymBackend" + assert mcp_resource.backend_class == ( + "sandbox.server.backends.resources.mcp.toolathlon_gym.ToolathlonGymBackend" ) + assert mcp_config["mcp_servers_path"] == "${TOOLATHLON_GYM_ROOT}/local_servers" + assert mcp_config["enabled_mcp_servers"] == [ + "canvas", + "snowflake", + "woocommerce", + "yahoo-finance", + "youtube", + "youtube-transcript", + "rail_12306", + "filesystem", + ] + assert 
mcp_config["env_overrides"] == { + "PGHOST": "localhost", + "PGPORT": "5432", + "PGUSER": "eigent", + "PGPASSWORD": "camel", + "PGDATABASE": "toolathlon_gym", + "CANVAS_DOMAIN": "localhost:8080", + "WORDPRESS_SITE_URL": "http://localhost:8081", + } + assert config.warmup.enabled is True + assert config.warmup.resources == ["mcp"] diff --git a/sandbox/tests/test_mcp_client.py b/sandbox/tests/test_mcp_client.py index 0be42d6..6861ec0 100644 --- a/sandbox/tests/test_mcp_client.py +++ b/sandbox/tests/test_mcp_client.py @@ -419,6 +419,38 @@ def test_load_mcp_process_config_resolves_with_mcp_servers_path(tmp_path): assert config.timeout_seconds == 42 +def test_load_mcp_process_config_resolves_toolathlon_local_servers_path(tmp_path): + module = load_mcp_client_module() + config_dir = tmp_path / "configs" / "mcp_servers" + config_dir.mkdir(parents=True) + (config_dir / "filesystem.yaml").write_text( + """ +type: stdio +name: filesystem +params: + command: node + args: + - ${local_servers_paths}/filesystem/environment/dist/index.js + - ${agent_workspace} + """.strip() + + "\n", + encoding="utf-8", + ) + + config = module.load_mcp_process_config( + server_name="filesystem", + agent_workspace="/tmp/agentflow-worker", + mcp_servers_path="/tmp/toolathlon/local_servers", + config_dir=config_dir, + ) + + assert config.command == "node" + assert config.args == [ + "/tmp/toolathlon/local_servers/filesystem/environment/dist/index.js", + "/tmp/agentflow-worker", + ] + + def test_load_mcp_process_config_backward_compat_toolathlon_root(tmp_path): module = load_mcp_client_module() toolathlon_root = tmp_path / "toolathlon" diff --git a/sandbox/tests/test_sandbox_config_loading.py b/sandbox/tests/test_sandbox_config_loading.py index d57f5cb..358b0c7 100644 --- a/sandbox/tests/test_sandbox_config_loading.py +++ b/sandbox/tests/test_sandbox_config_loading.py @@ -29,6 +29,33 @@ def test_load_server_config_expands_env_default_placeholders(tmp_path, monkeypat ) +def 
test_load_server_config_keeps_required_mcp_servers_path_placeholder_when_env_missing( + tmp_path, monkeypatch +): + monkeypatch.delenv("TOOLATHLON_GYM_ROOT", raising=False) + + config_path = tmp_path / "mcp_config.json" + raw_config = { + "resources": { + "mcp": { + "enabled": True, + "config": { + "mcp_servers_path": "${TOOLATHLON_GYM_ROOT}/local_servers" + }, + } + } + } + config_path.write_text(json.dumps(raw_config), encoding="utf-8") + + sandbox = Sandbox(config=SandboxConfig(server_config_path=str(config_path))) + loaded = sandbox._load_server_config() + + assert ( + loaded["resources"]["mcp"]["config"]["mcp_servers_path"] + == "${TOOLATHLON_GYM_ROOT}/local_servers" + ) + + def test_load_server_config_keeps_workspace_root_for_code_backend(tmp_path): config_path = tmp_path / "code_config.json" raw_config = { From cbd1ad4e012015704ec11cc30f6fc6739365fbe9 Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Tue, 21 Apr 2026 09:41:54 +0800 Subject: [PATCH 27/39] Tighten MCP backend config template assertions --- sandbox/tests/test_mcp_backend.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/sandbox/tests/test_mcp_backend.py b/sandbox/tests/test_mcp_backend.py index 2b3fd4a..5b6bdd7 100644 --- a/sandbox/tests/test_mcp_backend.py +++ b/sandbox/tests/test_mcp_backend.py @@ -3,6 +3,7 @@ """ import asyncio +import json import importlib.util import sys import types @@ -538,6 +539,7 @@ def fake_load_mcp_process_config(**kwargs): def test_mcp_config_template_declares_example_server_subset(monkeypatch): monkeypatch.delenv("TOOLATHLON_GYM_ROOT", raising=False) + monkeypatch.delenv("TOOLATHLON_WORKSPACE_ROOT", raising=False) monkeypatch.delenv("PGHOST", raising=False) monkeypatch.delenv("PGPORT", raising=False) monkeypatch.delenv("PGUSER", raising=False) @@ -553,6 +555,8 @@ def test_mcp_config_template_declares_example_server_subset(monkeypatch): / "sandbox-server" / "mcp_config.json" ) + raw_config = 
json.loads(config_path.read_text(encoding="utf-8")) + raw_mcp_config = raw_config["resources"]["mcp"]["config"] config = loader.load(str(config_path)) mcp_resource = config.resources["mcp"] @@ -572,6 +576,19 @@ def test_mcp_config_template_declares_example_server_subset(monkeypatch): "rail_12306", "filesystem", ] + assert raw_mcp_config["workspace_root"] == ( + "${TOOLATHLON_WORKSPACE_ROOT:-/tmp/agentflow_mcp}" + ) + assert raw_mcp_config["env_overrides"] == { + "PGHOST": "${PGHOST:-localhost}", + "PGPORT": "${PGPORT:-5432}", + "PGUSER": "${PGUSER:-eigent}", + "PGPASSWORD": "${PGPASSWORD:-camel}", + "PGDATABASE": "${PGDATABASE:-toolathlon_gym}", + "CANVAS_DOMAIN": "${CANVAS_DOMAIN:-localhost:8080}", + "WORDPRESS_SITE_URL": "${WORDPRESS_SITE_URL:-http://localhost:8081}", + } + assert mcp_config["workspace_root"] == "/tmp/agentflow_mcp" assert mcp_config["env_overrides"] == { "PGHOST": "localhost", "PGPORT": "5432", From 54e712a13c0e3d524ace146b381914cdeb858c21 Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Tue, 21 Apr 2026 09:46:55 +0800 Subject: [PATCH 28/39] Add failing MCP example config and asset tests --- rollout/tests/test_mcp_example_assets.py | 104 ++++++++++++++++++ .../test_mcp_example_synthesis_configs.py | 75 +++++++++++++ 2 files changed, 179 insertions(+) create mode 100644 rollout/tests/test_mcp_example_assets.py create mode 100644 synthesis/tests/test_mcp_example_synthesis_configs.py diff --git a/rollout/tests/test_mcp_example_assets.py b/rollout/tests/test_mcp_example_assets.py new file mode 100644 index 0000000..9308fbc --- /dev/null +++ b/rollout/tests/test_mcp_example_assets.py @@ -0,0 +1,104 @@ +import json +from pathlib import Path + +import pytest + +from rollout.core.config import RolloutConfig + +REPO_ROOT = Path(__file__).resolve().parents[2] +EXPECTED = { + "canvas": { + "tools": ["mcp:canvas.*", "mcp:filesystem.*"], + "benchmark": "benchmark/mcp_canvas_benchmark.jsonl", + "benchmark_name": "mcp_canvas_trajectory", + }, 
+ "snowflake": { + "tools": ["mcp:snowflake.*", "mcp:filesystem.*"], + "benchmark": "benchmark/mcp_snowflake_benchmark.jsonl", + "benchmark_name": "mcp_snowflake_trajectory", + }, + "woocommerce": { + "tools": ["mcp:woocommerce.*", "mcp:filesystem.*"], + "benchmark": "benchmark/mcp_woocommerce_benchmark.jsonl", + "benchmark_name": "mcp_woocommerce_trajectory", + }, + "yahoo_finance": { + "tools": ["mcp:yahoo-finance.*", "mcp:filesystem.*"], + "benchmark": "benchmark/mcp_yahoo_finance_benchmark.jsonl", + "benchmark_name": "mcp_yahoo_finance_trajectory", + }, + "youtube": { + "tools": [ + "mcp:youtube.*", + "mcp:youtube-transcript.*", + "mcp:filesystem.*", + ], + "benchmark": "benchmark/mcp_youtube_benchmark.jsonl", + "benchmark_name": "mcp_youtube_trajectory", + }, + "train": { + "tools": ["mcp:rail_12306.*", "mcp:filesystem.*"], + "benchmark": "benchmark/mcp_train_benchmark.jsonl", + "benchmark_name": "mcp_train_trajectory", + }, +} + + +def _read_jsonl(path: Path): + return [ + json.loads(line) + for line in path.read_text(encoding="utf-8").splitlines() + if line.strip() + ] + + +@pytest.mark.parametrize("domain", sorted(EXPECTED)) +def test_mcp_rollout_config_contract(domain): + expected = EXPECTED[domain] + config_path = REPO_ROOT / "configs" / "trajectory" / f"mcp_{domain}_trajectory.json" + raw = json.loads(config_path.read_text(encoding="utf-8")) + config = RolloutConfig.from_json(str(config_path)) + raw_init = raw.get("resource_init_configs", {}) + + assert config.benchmark_name == expected["benchmark_name"] + assert config.data_path == expected["benchmark"] + assert config.model_name == "openai/gpt-oss-120b" + assert config.api_key == "${OPENAI_API_KEY}" + assert config.base_url == "${OPENAI_API_URL}" + assert config.max_turns == 20 + assert config.available_tools == expected["tools"] + assert config.sandbox_config_path == "configs/sandbox-server/mcp_config.json" + assert config.sandbox_auto_start is False + assert config.resource_types == ["mcp"] + assert 
raw_init in ({}, {"mcp": {"content": {}}}) + assert config.resource_init_configs in ({}, {"mcp": {"content": {}}}) + assert "MCP domain assistant" in config.system_prompt + assert "Use only the available MCP tools" in config.system_prompt + assert "Reply with the final answer only" in config.system_prompt + assert config.evaluate_results is False + assert config.output_dir == f"trajectory_results/mcp_{domain}" + assert config.save_results is True + assert config.trajectory_only is True + assert config.save_trajectories is True + + +@pytest.mark.parametrize("domain", sorted(EXPECTED)) +def test_mcp_seed_files_are_two_row_jsonl(domain): + seed_path = REPO_ROOT / "seeds" / "mcp" / f"{domain}_seeds.jsonl" + rows = _read_jsonl(seed_path) + + assert len(rows) == 2 + assert all(set(row.keys()) == {"content", "kwargs"} for row in rows) + assert all(isinstance(row["content"], str) and row["content"].strip() for row in rows) + assert all(row["kwargs"] == {} for row in rows) + + +@pytest.mark.parametrize("domain", sorted(EXPECTED)) +def test_mcp_benchmark_files_have_two_row_jsonl_contract(domain): + benchmark_path = REPO_ROOT / "benchmark" / f"mcp_{domain}_benchmark.jsonl" + rows = _read_jsonl(benchmark_path) + + assert len(rows) == 2 + assert all(set(row.keys()) == {"id", "question", "answer"} for row in rows) + assert all(isinstance(row["question"], str) and row["question"].strip() for row in rows) + assert all(isinstance(row["answer"], str) for row in rows) diff --git a/synthesis/tests/test_mcp_example_synthesis_configs.py b/synthesis/tests/test_mcp_example_synthesis_configs.py new file mode 100644 index 0000000..253879a --- /dev/null +++ b/synthesis/tests/test_mcp_example_synthesis_configs.py @@ -0,0 +1,75 @@ +import json +from pathlib import Path + +import pytest + +from synthesis.core.config import SynthesisConfig + +REPO_ROOT = Path(__file__).resolve().parents[2] +EXPECTED = { + "canvas": { + "tools": ["mcp:canvas.*", "mcp:filesystem.*"], + "seeds_file": 
"seeds/mcp/canvas_seeds.jsonl", + "seed_description": "Canvas MCP prompts", + }, + "snowflake": { + "tools": ["mcp:snowflake.*", "mcp:filesystem.*"], + "seeds_file": "seeds/mcp/snowflake_seeds.jsonl", + "seed_description": "Snowflake MCP prompts", + }, + "woocommerce": { + "tools": ["mcp:woocommerce.*", "mcp:filesystem.*"], + "seeds_file": "seeds/mcp/woocommerce_seeds.jsonl", + "seed_description": "WooCommerce MCP prompts", + }, + "yahoo_finance": { + "tools": ["mcp:yahoo-finance.*", "mcp:filesystem.*"], + "seeds_file": "seeds/mcp/yahoo_finance_seeds.jsonl", + "seed_description": "Yahoo Finance MCP prompts", + }, + "youtube": { + "tools": [ + "mcp:youtube.*", + "mcp:youtube-transcript.*", + "mcp:filesystem.*", + ], + "seeds_file": "seeds/mcp/youtube_seeds.jsonl", + "seed_description": "YouTube MCP prompts", + }, + "train": { + "tools": ["mcp:rail_12306.*", "mcp:filesystem.*"], + "seeds_file": "seeds/mcp/train_seeds.jsonl", + "seed_description": "Train MCP prompts", + }, +} + + +@pytest.mark.parametrize("domain", sorted(EXPECTED)) +def test_mcp_synthesis_config_contract(domain): + expected = EXPECTED[domain] + config_path = REPO_ROOT / "configs" / "synthesis" / f"mcp_{domain}_config.json" + raw = json.loads(config_path.read_text(encoding="utf-8")) + config = SynthesisConfig.from_json(str(config_path)) + raw_init = raw.get("resource_init_configs", {}) + + assert config.sandbox_config_path == "configs/sandbox-server/mcp_config.json" + assert config.sandbox_auto_start is False + assert config.resource_types == ["mcp"] + assert raw_init in ({}, {"mcp": {"content": {}}}) + assert config.resource_init_configs in ({}, {"mcp": {"content": {}}}) + assert config.model_name == "openai/gpt-oss-120b" + assert config.api_key == "${OPENAI_API_KEY}" + assert config.base_url == "${OPENAI_API_URL}" + assert config.max_depth == 12 + assert config.branching_factor == 2 + assert config.depth_threshold == 2 + assert config.min_depth == 2 + assert config.max_selected_traj == 1 + assert 
config.path_similarity_threshold == 0.7 + assert config.available_tools == expected["tools"] + assert config.seeds_file == expected["seeds_file"] + assert config.output_dir == f"results/mcp_{domain}" + assert raw["seed_description"] == expected["seed_description"] + assert len(config.qa_examples) >= 2 + assert config.sampling_tips.strip() + assert config.synthesis_tips.strip() From ad33358d630da61f8e5e6b4cf0559d6fab017854 Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Tue, 21 Apr 2026 09:57:38 +0800 Subject: [PATCH 29/39] feat: add MCP example assets --- benchmark/mcp_canvas_benchmark.jsonl | 2 + benchmark/mcp_snowflake_benchmark.jsonl | 2 + benchmark/mcp_train_benchmark.jsonl | 2 + benchmark/mcp_woocommerce_benchmark.jsonl | 2 + benchmark/mcp_yahoo_finance_benchmark.jsonl | 2 + benchmark/mcp_youtube_benchmark.jsonl | 2 + configs/synthesis/mcp_canvas_config.json | 38 +++++++++++++++++++ configs/synthesis/mcp_snowflake_config.json | 38 +++++++++++++++++++ configs/synthesis/mcp_train_config.json | 38 +++++++++++++++++++ configs/synthesis/mcp_woocommerce_config.json | 38 +++++++++++++++++++ .../synthesis/mcp_yahoo_finance_config.json | 38 +++++++++++++++++++ configs/synthesis/mcp_youtube_config.json | 38 +++++++++++++++++++ configs/trajectory/mcp_canvas_trajectory.json | 24 ++++++++++++ .../trajectory/mcp_snowflake_trajectory.json | 24 ++++++++++++ configs/trajectory/mcp_train_trajectory.json | 24 ++++++++++++ .../mcp_woocommerce_trajectory.json | 24 ++++++++++++ .../mcp_yahoo_finance_trajectory.json | 24 ++++++++++++ .../trajectory/mcp_youtube_trajectory.json | 24 ++++++++++++ seeds/mcp/canvas_seeds.jsonl | 2 + seeds/mcp/snowflake_seeds.jsonl | 2 + seeds/mcp/train_seeds.jsonl | 2 + seeds/mcp/woocommerce_seeds.jsonl | 2 + seeds/mcp/yahoo_finance_seeds.jsonl | 2 + seeds/mcp/youtube_seeds.jsonl | 2 + 24 files changed, 396 insertions(+) create mode 100644 benchmark/mcp_canvas_benchmark.jsonl create mode 100644 benchmark/mcp_snowflake_benchmark.jsonl 
create mode 100644 benchmark/mcp_train_benchmark.jsonl create mode 100644 benchmark/mcp_woocommerce_benchmark.jsonl create mode 100644 benchmark/mcp_yahoo_finance_benchmark.jsonl create mode 100644 benchmark/mcp_youtube_benchmark.jsonl create mode 100644 configs/synthesis/mcp_canvas_config.json create mode 100644 configs/synthesis/mcp_snowflake_config.json create mode 100644 configs/synthesis/mcp_train_config.json create mode 100644 configs/synthesis/mcp_woocommerce_config.json create mode 100644 configs/synthesis/mcp_yahoo_finance_config.json create mode 100644 configs/synthesis/mcp_youtube_config.json create mode 100644 configs/trajectory/mcp_canvas_trajectory.json create mode 100644 configs/trajectory/mcp_snowflake_trajectory.json create mode 100644 configs/trajectory/mcp_train_trajectory.json create mode 100644 configs/trajectory/mcp_woocommerce_trajectory.json create mode 100644 configs/trajectory/mcp_yahoo_finance_trajectory.json create mode 100644 configs/trajectory/mcp_youtube_trajectory.json create mode 100644 seeds/mcp/canvas_seeds.jsonl create mode 100644 seeds/mcp/snowflake_seeds.jsonl create mode 100644 seeds/mcp/train_seeds.jsonl create mode 100644 seeds/mcp/woocommerce_seeds.jsonl create mode 100644 seeds/mcp/yahoo_finance_seeds.jsonl create mode 100644 seeds/mcp/youtube_seeds.jsonl diff --git a/benchmark/mcp_canvas_benchmark.jsonl b/benchmark/mcp_canvas_benchmark.jsonl new file mode 100644 index 0000000..91076fd --- /dev/null +++ b/benchmark/mcp_canvas_benchmark.jsonl @@ -0,0 +1,2 @@ +{"id": "mcp_canvas_001", "question": "Use Canvas MCP tools to list the first three course names in alphabetical order. 
Reply as a comma-separated list only.", "answer": ""} +{"id": "mcp_canvas_002", "question": "Use Canvas MCP tools to find one course and report its course code plus enrollment count as code=, enrolled=.", "answer": ""} diff --git a/benchmark/mcp_snowflake_benchmark.jsonl b/benchmark/mcp_snowflake_benchmark.jsonl new file mode 100644 index 0000000..5612f4e --- /dev/null +++ b/benchmark/mcp_snowflake_benchmark.jsonl @@ -0,0 +1,2 @@ +{"id": "mcp_snowflake_001", "question": "Use Snowflake MCP tools to list the first three tables visible in the default schema in alphabetical order. Reply as a comma-separated list only.", "answer": ""} +{"id": "mcp_snowflake_002", "question": "Use Snowflake MCP tools to compute one small aggregate from a mock table and reply as key=value.", "answer": ""} diff --git a/benchmark/mcp_train_benchmark.jsonl b/benchmark/mcp_train_benchmark.jsonl new file mode 100644 index 0000000..40897a7 --- /dev/null +++ b/benchmark/mcp_train_benchmark.jsonl @@ -0,0 +1,2 @@ +{"id": "mcp_train_001", "question": "Use rail_12306 MCP tools to list the first three station names available in the local mock dataset in alphabetical order. Reply as a comma-separated list only.", "answer": ""} +{"id": "mcp_train_002", "question": "Use rail_12306 MCP tools to find one route and reply with departure=, arrival=.", "answer": ""} diff --git a/benchmark/mcp_woocommerce_benchmark.jsonl b/benchmark/mcp_woocommerce_benchmark.jsonl new file mode 100644 index 0000000..5af2943 --- /dev/null +++ b/benchmark/mcp_woocommerce_benchmark.jsonl @@ -0,0 +1,2 @@ +{"id": "mcp_woocommerce_001", "question": "Use WooCommerce MCP tools to list the first three product names in alphabetical order. Reply as a comma-separated list only.", "answer": ""} +{"id": "mcp_woocommerce_002", "question": "Use WooCommerce MCP tools to identify one customer email and that customer's order count. 
Reply as email=, orders=.", "answer": ""} diff --git a/benchmark/mcp_yahoo_finance_benchmark.jsonl b/benchmark/mcp_yahoo_finance_benchmark.jsonl new file mode 100644 index 0000000..66fb80e --- /dev/null +++ b/benchmark/mcp_yahoo_finance_benchmark.jsonl @@ -0,0 +1,2 @@ +{"id": "mcp_yahoo_finance_001", "question": "Use Yahoo Finance MCP tools to list the first three ticker symbols available in the mock dataset in alphabetical order. Reply as a comma-separated list only.", "answer": ""} +{"id": "mcp_yahoo_finance_002", "question": "Use Yahoo Finance MCP tools to compare two available mock tickers and reply with the one that has the larger price as symbol=.", "answer": ""} diff --git a/benchmark/mcp_youtube_benchmark.jsonl b/benchmark/mcp_youtube_benchmark.jsonl new file mode 100644 index 0000000..7528cc2 --- /dev/null +++ b/benchmark/mcp_youtube_benchmark.jsonl @@ -0,0 +1,2 @@ +{"id": "mcp_youtube_001", "question": "Use YouTube MCP tools to list the first three video titles returned by the local mock dataset in alphabetical order. 
Reply as a comma-separated list only.", "answer": ""} +{"id": "mcp_youtube_002", "question": "Use YouTube Transcript MCP tools to find one video and report the video id plus transcript language as video=, language=.", "answer": ""} diff --git a/configs/synthesis/mcp_canvas_config.json b/configs/synthesis/mcp_canvas_config.json new file mode 100644 index 0000000..efd7d85 --- /dev/null +++ b/configs/synthesis/mcp_canvas_config.json @@ -0,0 +1,38 @@ +{ + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_depth": 12, + "branching_factor": 2, + "depth_threshold": 2, + "min_depth": 2, + "max_selected_traj": 1, + "path_similarity_threshold": 0.7, + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/mcp_config.json", + "resource_types": ["mcp"], + "resource_init_configs": {}, + "available_tools": ["mcp:canvas.*", "mcp:filesystem.*"], + "sampling_tips": [ + "Inspect courses, assignments, and enrollments before drafting any question.", + "Prefer filesystem tools only for scratch notes or short saved artifacts." + ], + "synthesis_tips": [ + "Generate domain-grounded factual QA only.", + "Keep answers short and directly verifiable from Canvas MCP tool outputs." 
+ ], + "qa_examples": [ + { + "question": "If a Canvas tool result shows a course with code HIST-201 and 28 enrolled students, how should the answer be formatted?", + "answer": "code=HIST-201, enrolled=28" + }, + { + "question": "If the first three course names in alphabetical order are Biology 101, Chemistry Lab, and World History, how should the answer be returned?", + "answer": "Biology 101, Chemistry Lab, World History" + } + ], + "seed_description": "Canvas MCP prompts", + "seeds_file": "seeds/mcp/canvas_seeds.jsonl", + "output_dir": "results/mcp_canvas" +} diff --git a/configs/synthesis/mcp_snowflake_config.json b/configs/synthesis/mcp_snowflake_config.json new file mode 100644 index 0000000..063dc38 --- /dev/null +++ b/configs/synthesis/mcp_snowflake_config.json @@ -0,0 +1,38 @@ +{ + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_depth": 12, + "branching_factor": 2, + "depth_threshold": 2, + "min_depth": 2, + "max_selected_traj": 1, + "path_similarity_threshold": 0.7, + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/mcp_config.json", + "resource_types": ["mcp"], + "resource_init_configs": {}, + "available_tools": ["mcp:snowflake.*", "mcp:filesystem.*"], + "sampling_tips": [ + "Inspect schemas and table names before choosing a reporting question.", + "Prefer filesystem tools only for scratch notes or short saved artifacts." + ], + "synthesis_tips": [ + "Generate domain-grounded factual QA only.", + "Keep answers short and directly verifiable from Snowflake MCP query outputs." 
+ ], + "qa_examples": [ + { + "question": "If the first three visible tables are CUSTOMERS, LINE_ITEMS, and ORDERS, how should the answer be returned?", + "answer": "CUSTOMERS, LINE_ITEMS, ORDERS" + }, + { + "question": "If a Snowflake aggregate query returns total_orders=125, how should the answer be formatted?", + "answer": "total_orders=125" + } + ], + "seed_description": "Snowflake MCP prompts", + "seeds_file": "seeds/mcp/snowflake_seeds.jsonl", + "output_dir": "results/mcp_snowflake" +} diff --git a/configs/synthesis/mcp_train_config.json b/configs/synthesis/mcp_train_config.json new file mode 100644 index 0000000..5bd67b8 --- /dev/null +++ b/configs/synthesis/mcp_train_config.json @@ -0,0 +1,38 @@ +{ + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_depth": 12, + "branching_factor": 2, + "depth_threshold": 2, + "min_depth": 2, + "max_selected_traj": 1, + "path_similarity_threshold": 0.7, + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/mcp_config.json", + "resource_types": ["mcp"], + "resource_init_configs": {}, + "available_tools": ["mcp:rail_12306.*", "mcp:filesystem.*"], + "sampling_tips": [ + "Inspect stations, routes, and train options before drafting a travel lookup question.", + "Prefer filesystem tools only for scratch notes or short saved artifacts." + ], + "synthesis_tips": [ + "Generate domain-grounded factual QA only.", + "Keep answers short and directly verifiable from rail_12306 MCP tool outputs." 
+ ], + "qa_examples": [ + { + "question": "If the first three station names alphabetically are Beijing, Hangzhou, and Shanghai, how should the answer be returned?", + "answer": "Beijing, Hangzhou, Shanghai" + }, + { + "question": "If a route lookup shows departure Shanghai and arrival Nanjing, how should the answer be formatted?", + "answer": "departure=Shanghai, arrival=Nanjing" + } + ], + "seed_description": "Train MCP prompts", + "seeds_file": "seeds/mcp/train_seeds.jsonl", + "output_dir": "results/mcp_train" +} diff --git a/configs/synthesis/mcp_woocommerce_config.json b/configs/synthesis/mcp_woocommerce_config.json new file mode 100644 index 0000000..926b9fc --- /dev/null +++ b/configs/synthesis/mcp_woocommerce_config.json @@ -0,0 +1,38 @@ +{ + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_depth": 12, + "branching_factor": 2, + "depth_threshold": 2, + "min_depth": 2, + "max_selected_traj": 1, + "path_similarity_threshold": 0.7, + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/mcp_config.json", + "resource_types": ["mcp"], + "resource_init_configs": {}, + "available_tools": ["mcp:woocommerce.*", "mcp:filesystem.*"], + "sampling_tips": [ + "Inspect customers, products, and orders before selecting a small store question.", + "Prefer filesystem tools only for scratch notes or short saved artifacts." + ], + "synthesis_tips": [ + "Generate domain-grounded factual QA only.", + "Keep answers short and directly verifiable from WooCommerce MCP tool outputs." 
+ ], + "qa_examples": [ + { + "question": "If the first three product names alphabetically are Backpack, Coffee Mug, and Notebook, how should the answer be returned?", + "answer": "Backpack, Coffee Mug, Notebook" + }, + { + "question": "If a customer email is alex@example.com and that customer has 3 orders, how should the answer be formatted?", + "answer": "email=alex@example.com, orders=3" + } + ], + "seed_description": "WooCommerce MCP prompts", + "seeds_file": "seeds/mcp/woocommerce_seeds.jsonl", + "output_dir": "results/mcp_woocommerce" +} diff --git a/configs/synthesis/mcp_yahoo_finance_config.json b/configs/synthesis/mcp_yahoo_finance_config.json new file mode 100644 index 0000000..3eb2f4d --- /dev/null +++ b/configs/synthesis/mcp_yahoo_finance_config.json @@ -0,0 +1,38 @@ +{ + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_depth": 12, + "branching_factor": 2, + "depth_threshold": 2, + "min_depth": 2, + "max_selected_traj": 1, + "path_similarity_threshold": 0.7, + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/mcp_config.json", + "resource_types": ["mcp"], + "resource_init_configs": {}, + "available_tools": ["mcp:yahoo-finance.*", "mcp:filesystem.*"], + "sampling_tips": [ + "Inspect available tickers and quote fields before drafting a finance lookup question.", + "Prefer filesystem tools only for scratch notes or short saved artifacts." + ], + "synthesis_tips": [ + "Generate domain-grounded factual QA only.", + "Keep answers short and directly verifiable from Yahoo Finance MCP tool outputs." 
+ ], + "qa_examples": [ + { + "question": "If the available tickers sorted alphabetically begin with AAPL, MSFT, and NVDA, how should the answer be returned?", + "answer": "AAPL, MSFT, NVDA" + }, + { + "question": "If one comparison shows MSFT has the larger price, how should the answer be formatted?", + "answer": "symbol=MSFT" + } + ], + "seed_description": "Yahoo Finance MCP prompts", + "seeds_file": "seeds/mcp/yahoo_finance_seeds.jsonl", + "output_dir": "results/mcp_yahoo_finance" +} diff --git a/configs/synthesis/mcp_youtube_config.json b/configs/synthesis/mcp_youtube_config.json new file mode 100644 index 0000000..dfd987f --- /dev/null +++ b/configs/synthesis/mcp_youtube_config.json @@ -0,0 +1,38 @@ +{ + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_depth": 12, + "branching_factor": 2, + "depth_threshold": 2, + "min_depth": 2, + "max_selected_traj": 1, + "path_similarity_threshold": 0.7, + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/mcp_config.json", + "resource_types": ["mcp"], + "resource_init_configs": {}, + "available_tools": ["mcp:youtube.*", "mcp:youtube-transcript.*", "mcp:filesystem.*"], + "sampling_tips": [ + "Inspect video metadata first, then use transcript tools only when language or transcript details matter.", + "Prefer filesystem tools only for scratch notes or short saved artifacts." + ], + "synthesis_tips": [ + "Generate domain-grounded factual QA only.", + "Keep answers short and directly verifiable from YouTube MCP tool outputs." 
+ ], + "qa_examples": [ + { + "question": "If the first three video titles alphabetically are Intro to Databases, MCP Demo, and Testing Walkthrough, how should the answer be returned?", + "answer": "Intro to Databases, MCP Demo, Testing Walkthrough" + }, + { + "question": "If a transcript lookup shows video id abc123 with language en, how should the answer be formatted?", + "answer": "video=abc123, language=en" + } + ], + "seed_description": "YouTube MCP prompts", + "seeds_file": "seeds/mcp/youtube_seeds.jsonl", + "output_dir": "results/mcp_youtube" +} diff --git a/configs/trajectory/mcp_canvas_trajectory.json b/configs/trajectory/mcp_canvas_trajectory.json new file mode 100644 index 0000000..b6458ed --- /dev/null +++ b/configs/trajectory/mcp_canvas_trajectory.json @@ -0,0 +1,24 @@ +{ + "benchmark_name": "mcp_canvas_trajectory", + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_turns": 20, + "available_tools": ["mcp:canvas.*", "mcp:filesystem.*"], + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/mcp_config.json", + "resource_types": ["mcp"], + "resource_init_configs": {}, + "system_prompt": [ + "You are an MCP domain assistant.", + "Use only the available MCP tools to answer the question.", + "Reply with the final answer only unless the question explicitly asks for another format." 
+ ], + "evaluate_results": false, + "data_path": "benchmark/mcp_canvas_benchmark.jsonl", + "output_dir": "trajectory_results/mcp_canvas", + "save_results": true, + "save_trajectories": true, + "trajectory_only": true +} diff --git a/configs/trajectory/mcp_snowflake_trajectory.json b/configs/trajectory/mcp_snowflake_trajectory.json new file mode 100644 index 0000000..a087b61 --- /dev/null +++ b/configs/trajectory/mcp_snowflake_trajectory.json @@ -0,0 +1,24 @@ +{ + "benchmark_name": "mcp_snowflake_trajectory", + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_turns": 20, + "available_tools": ["mcp:snowflake.*", "mcp:filesystem.*"], + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/mcp_config.json", + "resource_types": ["mcp"], + "resource_init_configs": {}, + "system_prompt": [ + "You are an MCP domain assistant.", + "Use only the available MCP tools to answer the question.", + "Reply with the final answer only unless the question explicitly asks for another format." 
+ ], + "evaluate_results": false, + "data_path": "benchmark/mcp_snowflake_benchmark.jsonl", + "output_dir": "trajectory_results/mcp_snowflake", + "save_results": true, + "save_trajectories": true, + "trajectory_only": true +} diff --git a/configs/trajectory/mcp_train_trajectory.json b/configs/trajectory/mcp_train_trajectory.json new file mode 100644 index 0000000..d29e844 --- /dev/null +++ b/configs/trajectory/mcp_train_trajectory.json @@ -0,0 +1,24 @@ +{ + "benchmark_name": "mcp_train_trajectory", + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_turns": 20, + "available_tools": ["mcp:rail_12306.*", "mcp:filesystem.*"], + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/mcp_config.json", + "resource_types": ["mcp"], + "resource_init_configs": {}, + "system_prompt": [ + "You are an MCP domain assistant.", + "Use only the available MCP tools to answer the question.", + "Reply with the final answer only unless the question explicitly asks for another format." 
+ ], + "evaluate_results": false, + "data_path": "benchmark/mcp_train_benchmark.jsonl", + "output_dir": "trajectory_results/mcp_train", + "save_results": true, + "save_trajectories": true, + "trajectory_only": true +} diff --git a/configs/trajectory/mcp_woocommerce_trajectory.json b/configs/trajectory/mcp_woocommerce_trajectory.json new file mode 100644 index 0000000..0f17d05 --- /dev/null +++ b/configs/trajectory/mcp_woocommerce_trajectory.json @@ -0,0 +1,24 @@ +{ + "benchmark_name": "mcp_woocommerce_trajectory", + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_turns": 20, + "available_tools": ["mcp:woocommerce.*", "mcp:filesystem.*"], + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/mcp_config.json", + "resource_types": ["mcp"], + "resource_init_configs": {}, + "system_prompt": [ + "You are an MCP domain assistant.", + "Use only the available MCP tools to answer the question.", + "Reply with the final answer only unless the question explicitly asks for another format." 
+ ], + "evaluate_results": false, + "data_path": "benchmark/mcp_woocommerce_benchmark.jsonl", + "output_dir": "trajectory_results/mcp_woocommerce", + "save_results": true, + "save_trajectories": true, + "trajectory_only": true +} diff --git a/configs/trajectory/mcp_yahoo_finance_trajectory.json b/configs/trajectory/mcp_yahoo_finance_trajectory.json new file mode 100644 index 0000000..e4ad8b2 --- /dev/null +++ b/configs/trajectory/mcp_yahoo_finance_trajectory.json @@ -0,0 +1,24 @@ +{ + "benchmark_name": "mcp_yahoo_finance_trajectory", + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_turns": 20, + "available_tools": ["mcp:yahoo-finance.*", "mcp:filesystem.*"], + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/mcp_config.json", + "resource_types": ["mcp"], + "resource_init_configs": {}, + "system_prompt": [ + "You are an MCP domain assistant.", + "Use only the available MCP tools to answer the question.", + "Reply with the final answer only unless the question explicitly asks for another format." 
+ ], + "evaluate_results": false, + "data_path": "benchmark/mcp_yahoo_finance_benchmark.jsonl", + "output_dir": "trajectory_results/mcp_yahoo_finance", + "save_results": true, + "save_trajectories": true, + "trajectory_only": true +} diff --git a/configs/trajectory/mcp_youtube_trajectory.json b/configs/trajectory/mcp_youtube_trajectory.json new file mode 100644 index 0000000..17bed8c --- /dev/null +++ b/configs/trajectory/mcp_youtube_trajectory.json @@ -0,0 +1,24 @@ +{ + "benchmark_name": "mcp_youtube_trajectory", + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_turns": 20, + "available_tools": ["mcp:youtube.*", "mcp:youtube-transcript.*", "mcp:filesystem.*"], + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/mcp_config.json", + "resource_types": ["mcp"], + "resource_init_configs": {}, + "system_prompt": [ + "You are an MCP domain assistant.", + "Use only the available MCP tools to answer the question.", + "Reply with the final answer only unless the question explicitly asks for another format." 
+ ], + "evaluate_results": false, + "data_path": "benchmark/mcp_youtube_benchmark.jsonl", + "output_dir": "trajectory_results/mcp_youtube", + "save_results": true, + "save_trajectories": true, + "trajectory_only": true +} diff --git a/seeds/mcp/canvas_seeds.jsonl b/seeds/mcp/canvas_seeds.jsonl new file mode 100644 index 0000000..bf984a3 --- /dev/null +++ b/seeds/mcp/canvas_seeds.jsonl @@ -0,0 +1,2 @@ +{"content": "Use the available Canvas MCP tools to inspect courses, assignments, and enrollment information.", "kwargs": {}} +{"content": "Find a small Canvas reporting task that can be answered from the mock data and save intermediate notes with filesystem tools if helpful.", "kwargs": {}} diff --git a/seeds/mcp/snowflake_seeds.jsonl b/seeds/mcp/snowflake_seeds.jsonl new file mode 100644 index 0000000..17076be --- /dev/null +++ b/seeds/mcp/snowflake_seeds.jsonl @@ -0,0 +1,2 @@ +{"content": "Use the available Snowflake MCP tools to inspect schemas, tables, and small analytical queries in the mock warehouse.", "kwargs": {}} +{"content": "Find one compact warehouse reporting question that can be answered from the available Snowflake tools.", "kwargs": {}} diff --git a/seeds/mcp/train_seeds.jsonl b/seeds/mcp/train_seeds.jsonl new file mode 100644 index 0000000..e02d780 --- /dev/null +++ b/seeds/mcp/train_seeds.jsonl @@ -0,0 +1,2 @@ +{"content": "Use the rail_12306 MCP tools to inspect mock train, station, and route information.", "kwargs": {}} +{"content": "Find one small travel-planning or route lookup question that can be answered from the available railway tools.", "kwargs": {}} diff --git a/seeds/mcp/woocommerce_seeds.jsonl b/seeds/mcp/woocommerce_seeds.jsonl new file mode 100644 index 0000000..3fbf365 --- /dev/null +++ b/seeds/mcp/woocommerce_seeds.jsonl @@ -0,0 +1,2 @@ +{"content": "Use the WooCommerce MCP tools to inspect customers, products, and orders in the mock store.", "kwargs": {}} +{"content": "Find one small sales or operations question that can be 
answered from the WooCommerce mock data.", "kwargs": {}} diff --git a/seeds/mcp/yahoo_finance_seeds.jsonl b/seeds/mcp/yahoo_finance_seeds.jsonl new file mode 100644 index 0000000..77f8e90 --- /dev/null +++ b/seeds/mcp/yahoo_finance_seeds.jsonl @@ -0,0 +1,2 @@ +{"content": "Use the Yahoo Finance MCP tools to inspect the mock ticker and market data available locally.", "kwargs": {}} +{"content": "Find one small finance lookup or comparison question that can be answered directly from the available tools.", "kwargs": {}} diff --git a/seeds/mcp/youtube_seeds.jsonl b/seeds/mcp/youtube_seeds.jsonl new file mode 100644 index 0000000..5575703 --- /dev/null +++ b/seeds/mcp/youtube_seeds.jsonl @@ -0,0 +1,2 @@ +{"content": "Use the YouTube and YouTube Transcript MCP tools to inspect mock video metadata and transcript data.", "kwargs": {}} +{"content": "Find one small content-discovery or transcript lookup question that can be answered from the local mock data.", "kwargs": {}} From 8f736e78d3176fcf0340bc0bab85c6d03972c20a Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Tue, 21 Apr 2026 10:06:53 +0800 Subject: [PATCH 30/39] docs: add MCP example guide --- examples/MCPAgent.md | 182 ++++++++++++++++++++++++++ rollout/tests/test_mcp_example_doc.py | 60 +++++++++ 2 files changed, 242 insertions(+) create mode 100644 examples/MCPAgent.md create mode 100644 rollout/tests/test_mcp_example_doc.py diff --git a/examples/MCPAgent.md b/examples/MCPAgent.md new file mode 100644 index 0000000..1f2b29b --- /dev/null +++ b/examples/MCPAgent.md @@ -0,0 +1,182 @@ +# MCPAgent: MCP Tool Agent — 3-Step Example Guide (No Training/Deployment/Inference) + +This guide explains how to use AgentFlow's MCP example pipeline to generate QA data and trajectory data for six domains: `canvas`, `snowflake`, `woocommerce`, `yahoo_finance`, `youtube`, and `train`. + +This example is intentionally limited to sandbox startup, QA synthesis, and trajectory rollout. 
Later training, deployment, and infer workflows are not covered yet. + +## Table of Contents + +- [Overview](#overview) +- [Prerequisites](#prerequisites) +- [Pipeline Overview](#pipeline-overview) +- [Step 1: Start the Sandbox Server](#step-1-start-the-sandbox-server) +- [Step 2: Synthesize QA Data](#step-2-synthesize-qa-data) +- [Step 3: Synthesize Trajectory Data](#step-3-synthesize-trajectory-data) +- [Configuration Reference](#configuration-reference) +- [FAQ](#faq) + +--- + +## Overview + +MCPAgent is an example agent that talks to Toolathlon-GYM MCP servers through AgentFlow's sandbox server. The shared sandbox config is `configs/sandbox-server/mcp_config.json`, and the MCP server bundle is resolved from `${TOOLATHLON_GYM_ROOT}/local_servers`. + +The current example scope covers six domains: + +- `canvas` +- `snowflake` +- `woocommerce` +- `yahoo_finance` +- `youtube` +- `train` + +## Prerequisites + +Before running the example: + +- `cd AgentFlow` +- `export OPENAI_API_KEY=...` +- `export OPENAI_API_URL=...` +- `export TOOLATHLON_GYM_ROOT=/path/to/toolathlon_gym` +- Have an already prepared and already running Toolathlon-GYM environment +- Ensure `node` and `uv` are installed + +## Pipeline Overview + +The verified MCP example pipeline in this repo is: + +```text +Sandbox Setup -> QA Synthesis -> Trajectory Rollout +``` + +All synthesis configs use the shared sandbox config `configs/sandbox-server/mcp_config.json`. The example stops after QA synthesis and trajectory rollout, so training, deployment, and infer/evaluation flows are not covered. 
+ +## Step 1: Start the Sandbox Server + +Start the sandbox server once before running synthesis or rollout: + +```bash +./start_sandbox_server.sh --config configs/sandbox-server/mcp_config.json +``` + +## Step 2: Synthesize QA Data + +Run the synthesis pipeline once per domain: + +```bash +python3 synthesis/pipeline.py \ + --config configs/synthesis/mcp_canvas_config.json \ + --seeds seeds/mcp/canvas_seeds.jsonl \ + --output-dir results/mcp_canvas + +python3 synthesis/pipeline.py \ + --config configs/synthesis/mcp_snowflake_config.json \ + --seeds seeds/mcp/snowflake_seeds.jsonl \ + --output-dir results/mcp_snowflake + +python3 synthesis/pipeline.py \ + --config configs/synthesis/mcp_woocommerce_config.json \ + --seeds seeds/mcp/woocommerce_seeds.jsonl \ + --output-dir results/mcp_woocommerce + +python3 synthesis/pipeline.py \ + --config configs/synthesis/mcp_yahoo_finance_config.json \ + --seeds seeds/mcp/yahoo_finance_seeds.jsonl \ + --output-dir results/mcp_yahoo_finance + +python3 synthesis/pipeline.py \ + --config configs/synthesis/mcp_youtube_config.json \ + --seeds seeds/mcp/youtube_seeds.jsonl \ + --output-dir results/mcp_youtube + +python3 synthesis/pipeline.py \ + --config configs/synthesis/mcp_train_config.json \ + --seeds seeds/mcp/train_seeds.jsonl \ + --output-dir results/mcp_train +``` + +These runs synthesize QA pairs and save the corresponding tool-use traces for the selected MCP domain. 
+ +## Step 3: Synthesize Trajectory Data + +Run the rollout pipeline for trajectory-only data: + +```bash +python -m rollout.pipeline \ + --config configs/trajectory/mcp_canvas_trajectory.json \ + --output-dir trajectory_results/mcp_canvas + +python -m rollout.pipeline \ + --config configs/trajectory/mcp_snowflake_trajectory.json \ + --output-dir trajectory_results/mcp_snowflake + +python -m rollout.pipeline \ + --config configs/trajectory/mcp_woocommerce_trajectory.json \ + --output-dir trajectory_results/mcp_woocommerce + +python -m rollout.pipeline \ + --config configs/trajectory/mcp_yahoo_finance_trajectory.json \ + --output-dir trajectory_results/mcp_yahoo_finance + +python -m rollout.pipeline \ + --config configs/trajectory/mcp_youtube_trajectory.json \ + --output-dir trajectory_results/mcp_youtube + +python -m rollout.pipeline \ + --config configs/trajectory/mcp_train_trajectory.json \ + --output-dir trajectory_results/mcp_train +``` + +This stage produces rollout trajectories only. Training, deployment, and infer-oriented serving flows are not covered in this example yet. + +## Configuration Reference + +### Shared Sandbox Config + +File: `configs/sandbox-server/mcp_config.json` + +Important fields: + +- `mcp_servers_path`: `${TOOLATHLON_GYM_ROOT}/local_servers` +- `enabled_mcp_servers`: includes the six example domains plus shared helpers such as `filesystem` and the YouTube transcript server +- `env_overrides`: `PGHOST`, `PGPORT`, `PGUSER`, `PGPASSWORD`, `PGDATABASE`, `CANVAS_DOMAIN`, `WORDPRESS_SITE_URL` + +### Synthesis Configs + +Files: + +- `configs/synthesis/mcp_canvas_config.json` +- `configs/synthesis/mcp_snowflake_config.json` +- `configs/synthesis/mcp_woocommerce_config.json` +- `configs/synthesis/mcp_yahoo_finance_config.json` +- `configs/synthesis/mcp_youtube_config.json` +- `configs/synthesis/mcp_train_config.json` + +These configs point to the shared MCP sandbox and the domain-specific seeds for QA synthesis. 
+ +### Trajectory Configs + +Files: + +- `configs/trajectory/mcp_canvas_trajectory.json` +- `configs/trajectory/mcp_snowflake_trajectory.json` +- `configs/trajectory/mcp_woocommerce_trajectory.json` +- `configs/trajectory/mcp_yahoo_finance_trajectory.json` +- `configs/trajectory/mcp_youtube_trajectory.json` +- `configs/trajectory/mcp_train_trajectory.json` + +These configs run trajectory-only rollout for each MCP domain. + +## FAQ + +### What does this example cover? + +It covers sandbox startup, QA synthesis, and trajectory rollout for the six MCP domains in this repo. + +### Which external tools do I need ready first? + +You need a prepared Toolathlon-GYM environment, plus `node` and `uv`, because the MCP backend launches Toolathlon-GYM local servers from `${TOOLATHLON_GYM_ROOT}/local_servers`. + +### Why are training and deployment missing? + +This example is scoped to data generation only. Later training, deployment, and infer/evaluation workflows are not covered yet. diff --git a/rollout/tests/test_mcp_example_doc.py b/rollout/tests/test_mcp_example_doc.py new file mode 100644 index 0000000..f7efa4b --- /dev/null +++ b/rollout/tests/test_mcp_example_doc.py @@ -0,0 +1,60 @@ +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def test_mcp_example_doc_has_required_sections_and_exact_prerequisite_contract(): + content = (REPO_ROOT / "examples" / "MCPAgent.md").read_text(encoding="utf-8") + + required_sections = [ + "## Overview", + "## Prerequisites", + "## Pipeline Overview", + "## Step 1: Start the Sandbox Server", + "## Step 2: Synthesize QA Data", + "## Step 3: Synthesize Trajectory Data", + "## Configuration Reference", + "## FAQ", + ] + for section in required_sections: + assert section in content + + required_strings = [ + "configs/sandbox-server/mcp_config.json", + "configs/synthesis/mcp_canvas_config.json", + "configs/synthesis/mcp_snowflake_config.json", + "configs/synthesis/mcp_woocommerce_config.json", + 
"configs/synthesis/mcp_yahoo_finance_config.json", + "configs/synthesis/mcp_youtube_config.json", + "configs/synthesis/mcp_train_config.json", + "configs/trajectory/mcp_canvas_trajectory.json", + "configs/trajectory/mcp_snowflake_trajectory.json", + "configs/trajectory/mcp_woocommerce_trajectory.json", + "configs/trajectory/mcp_yahoo_finance_trajectory.json", + "configs/trajectory/mcp_youtube_trajectory.json", + "configs/trajectory/mcp_train_trajectory.json", + "export TOOLATHLON_GYM_ROOT=", + "${TOOLATHLON_GYM_ROOT}/local_servers", + "./start_sandbox_server.sh --config configs/sandbox-server/mcp_config.json", + "node", + "uv", + "PGHOST", + "PGPORT", + "PGUSER", + "PGPASSWORD", + "PGDATABASE", + "CANVAS_DOMAIN", + "WORDPRESS_SITE_URL", + ] + for needle in required_strings: + assert needle in content + + lowered = content.lower() + assert "/home/" not in content + assert "training" in lowered + assert "deployment" in lowered + assert "infer" in lowered + assert "not covered" in lowered + assert "Step 4" not in content + assert "Step 5" not in content From 22d3fc0ecdb7b47ad3ba667beffc8a545a19954c Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Tue, 21 Apr 2026 10:13:20 +0800 Subject: [PATCH 31/39] Clarify MCP server name mapping in MCPAgent doc --- examples/MCPAgent.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/MCPAgent.md b/examples/MCPAgent.md index 1f2b29b..62c42cd 100644 --- a/examples/MCPAgent.md +++ b/examples/MCPAgent.md @@ -138,7 +138,7 @@ File: `configs/sandbox-server/mcp_config.json` Important fields: - `mcp_servers_path`: `${TOOLATHLON_GYM_ROOT}/local_servers` -- `enabled_mcp_servers`: includes the six example domains plus shared helpers such as `filesystem` and the YouTube transcript server +- `enabled_mcp_servers`: includes the concrete MCP server identifiers from `configs/sandbox-server/mcp_config.json`, plus shared helpers such as `filesystem` and the YouTube transcript server. 
Most example domains use the same name as the server, but some differ: `yahoo_finance -> yahoo-finance` and `train -> rail_12306`. - `env_overrides`: `PGHOST`, `PGPORT`, `PGUSER`, `PGPASSWORD`, `PGDATABASE`, `CANVAS_DOMAIN`, `WORDPRESS_SITE_URL` ### Synthesis Configs From 6628e1e4192f6528951f17e8fe847288d0e1c759 Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Tue, 21 Apr 2026 10:17:30 +0800 Subject: [PATCH 32/39] test: add coding example contract tests --- rollout/tests/test_code_example_assets.py | 82 +++++++++++++++++++ .../test_code_example_synthesis_config.py | 29 +++++++ 2 files changed, 111 insertions(+) create mode 100644 rollout/tests/test_code_example_assets.py create mode 100644 synthesis/tests/test_code_example_synthesis_config.py diff --git a/rollout/tests/test_code_example_assets.py b/rollout/tests/test_code_example_assets.py new file mode 100644 index 0000000..416620f --- /dev/null +++ b/rollout/tests/test_code_example_assets.py @@ -0,0 +1,82 @@ +import json +from pathlib import Path + +from rollout.core.config import RolloutConfig + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def _read_jsonl(path: Path): + return [ + json.loads(line) + for line in path.read_text(encoding="utf-8").splitlines() + if line.strip() + ] + + +def test_code_rollout_config_contract(): + config_path = REPO_ROOT / "configs" / "trajectory" / "code_trajectory.json" + config = RolloutConfig.from_json(str(config_path)) + + assert config.benchmark_name == "code_trajectory" + assert config.data_path == "benchmark/code_benchmark.jsonl" + assert config.available_tools == ["code-*"] + assert config.sandbox_config_path == "configs/sandbox-server/code_config.json" + assert config.sandbox_auto_start is False + assert config.resource_types == ["code"] + assert config.resource_init_configs == { + "code": { + "content": { + "source_dir": "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo" + } + } + } + assert config.evaluate_results is False + assert 
config.trajectory_only is True + assert config.save_trajectories is True + assert config.save_summary is False + + +def test_code_seed_file_contract(): + rows = _read_jsonl(REPO_ROOT / "seeds" / "code" / "seeds.jsonl") + + assert len(rows) == 2 + assert all(set(row.keys()) == {"content", "kwargs"} for row in rows) + assert all(isinstance(row["content"], str) and row["content"].strip() for row in rows) + assert all(row["kwargs"] == {} for row in rows) + + +def test_code_benchmark_contract(): + rows = _read_jsonl(REPO_ROOT / "benchmark" / "code_benchmark.jsonl") + + assert len(rows) == 2 + assert all({"id", "question", "answer"} <= set(row.keys()) for row in rows) + assert rows[0]["id"] == "code_read_001" + assert "metadata" not in rows[0] + assert rows[1]["id"] == "code_edit_001" + assert "tests/smoke_test.py" in rows[1]["question"] + assert rows[1]["answer"] == "smoke test passed" + assert rows[1]["metadata"] == { + "target_files": ["app.py"], + "check_command": "python tests/smoke_test.py", + } + assert all("/home/" not in json.dumps(row, ensure_ascii=False) for row in rows) + assert all("DataFlow" not in json.dumps(row, ensure_ascii=False) for row in rows) + + +def test_code_demo_repo_contract(): + repo_root = REPO_ROOT / "seeds" / "code" / "seed" / "demo_repo" + + required_paths = [ + repo_root / "README.md", + repo_root / "app.py", + repo_root / "config" / "app_config.json", + repo_root / "lib" / "helpers.py", + repo_root / "tests" / "smoke_test.py", + ] + for path in required_paths: + assert path.exists(), path + + smoke_test = (repo_root / "tests" / "smoke_test.py").read_text(encoding="utf-8") + assert "build_message" in smoke_test + assert "SMOKE_OK" in smoke_test diff --git a/synthesis/tests/test_code_example_synthesis_config.py b/synthesis/tests/test_code_example_synthesis_config.py new file mode 100644 index 0000000..c3ed52c --- /dev/null +++ b/synthesis/tests/test_code_example_synthesis_config.py @@ -0,0 +1,29 @@ +import json +from pathlib import Path + 
+from synthesis.core.config import SynthesisConfig + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def test_code_synthesis_config_contract(): + config_path = REPO_ROOT / "configs" / "synthesis" / "code_config.json" + raw = json.loads(config_path.read_text(encoding="utf-8")) + config = SynthesisConfig.from_json(str(config_path)) + + assert config.sandbox_config_path == "configs/sandbox-server/code_config.json" + assert config.sandbox_auto_start is False + assert config.resource_types == ["code"] + assert config.resource_init_configs == { + "code": { + "content": { + "source_dir": "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo" + } + } + } + assert config.available_tools == ["code-*"] + assert config.seeds_file == "seeds/code/seeds.jsonl" + assert raw["seed_description"] == "Coding demo repository prompts" + assert len(config.qa_examples) >= 2 + assert config.sampling_tips.strip() + assert config.synthesis_tips.strip() From f092925c917723307c5a2b0e7ccae42e4ed47c95 Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Tue, 21 Apr 2026 10:26:20 +0800 Subject: [PATCH 33/39] Fix synthesis pytest import path --- synthesis/tests/conftest.py | 8 ++++++++ 1 file changed, 8 insertions(+) create mode 100644 synthesis/tests/conftest.py diff --git a/synthesis/tests/conftest.py b/synthesis/tests/conftest.py new file mode 100644 index 0000000..f7d4e68 --- /dev/null +++ b/synthesis/tests/conftest.py @@ -0,0 +1,8 @@ +import sys +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parents[2] + +if str(REPO_ROOT) not in sys.path: + sys.path.insert(0, str(REPO_ROOT)) From 4d4e6705e737fdc6b1b130c8258b945bfa37a9e2 Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Tue, 21 Apr 2026 10:33:24 +0800 Subject: [PATCH 34/39] feat: add Coding example assets --- benchmark/code_benchmark.jsonl | 2 + configs/synthesis/code_config.json | 44 +++++++++++++++++++ configs/trajectory/code_trajectory.json | 31 +++++++++++++ 
sandbox/server/backends/resources/__init__.py | 22 +++++++++- seeds/code/seed/demo_repo/README.md | 8 ++++ seeds/code/seed/demo_repo/app.py | 20 +++++++++ .../seed/demo_repo/config/app_config.json | 4 ++ seeds/code/seed/demo_repo/tests/smoke_test.py | 11 +++++ seeds/code/seeds.jsonl | 2 + 9 files changed, 143 insertions(+), 1 deletion(-) create mode 100644 benchmark/code_benchmark.jsonl create mode 100644 configs/synthesis/code_config.json create mode 100644 configs/trajectory/code_trajectory.json create mode 100644 seeds/code/seed/demo_repo/README.md create mode 100644 seeds/code/seed/demo_repo/app.py create mode 100644 seeds/code/seed/demo_repo/config/app_config.json create mode 100644 seeds/code/seed/demo_repo/tests/smoke_test.py create mode 100644 seeds/code/seeds.jsonl diff --git a/benchmark/code_benchmark.jsonl b/benchmark/code_benchmark.jsonl new file mode 100644 index 0000000..2d6c7b2 --- /dev/null +++ b/benchmark/code_benchmark.jsonl @@ -0,0 +1,2 @@ +{"id": "code_read_001", "question": "Use code tools to inspect the demo repository. What default name does the app greet? Reply with the name only.", "answer": "AgentFlow"} +{"id": "code_edit_001", "question": "Update the demo repository so `python tests/smoke_test.py` succeeds. 
Preserve the config-driven greeting behavior, verify the fix with that command, then reply with exactly `smoke test passed`.", "answer": "smoke test passed", "metadata": {"target_files": ["app.py"], "check_command": "python tests/smoke_test.py"}} diff --git a/configs/synthesis/code_config.json b/configs/synthesis/code_config.json new file mode 100644 index 0000000..eb8bb98 --- /dev/null +++ b/configs/synthesis/code_config.json @@ -0,0 +1,44 @@ +{ + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_depth": 10, + "branching_factor": 2, + "depth_threshold": 2, + "min_depth": 2, + "max_selected_traj": 1, + "path_similarity_threshold": 0.7, + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/code_config.json", + "resource_types": ["code"], + "resource_init_configs": { + "code": { + "content": { + "source_dir": "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo" + } + } + }, + "available_tools": ["code-*"], + "sampling_tips": [ + "Inspect the repository before proposing edits.", + "Use code-bash only for lightweight checks that fit the bundled demo repo." + ], + "synthesis_tips": [ + "Generate repo-grounded QA only.", + "Prefer file-path, function-behavior, and small edit-validation questions over open-ended design prompts." + ], + "qa_examples": [ + { + "question": "Which file stores the greeting suffix used by the demo app? Reply with the relative file path only.", + "answer": "config/app_config.json" + }, + { + "question": "What string does `build_message()` return before any edits? Reply with the exact string only.", + "answer": "Hello, AgentFlow?" 
+ } + ], + "seed_description": "Coding demo repository prompts", + "seeds_file": "seeds/code/seeds.jsonl", + "output_dir": "results/code" +} diff --git a/configs/trajectory/code_trajectory.json b/configs/trajectory/code_trajectory.json new file mode 100644 index 0000000..2265f0a --- /dev/null +++ b/configs/trajectory/code_trajectory.json @@ -0,0 +1,31 @@ +{ + "benchmark_name": "code_trajectory", + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_turns": 12, + "available_tools": ["code-*"], + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/code_config.json", + "resource_types": ["code"], + "resource_init_configs": { + "code": { + "content": { + "source_dir": "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo" + } + } + }, + "system_prompt": [ + "You are a coding assistant working inside a small repository.", + "Inspect files before editing them.", + "When a task asks for verification, run the requested command inside the coding workspace before giving the final answer." 
+ ], + "evaluate_results": false, + "data_path": "benchmark/code_benchmark.jsonl", + "output_dir": "trajectory_results/code", + "save_results": true, + "save_trajectories": true, + "trajectory_only": true, + "save_summary": false +} diff --git a/sandbox/server/backends/resources/__init__.py b/sandbox/server/backends/resources/__init__.py index b958141..8ac0433 100644 --- a/sandbox/server/backends/resources/__init__.py +++ b/sandbox/server/backends/resources/__init__.py @@ -62,7 +62,27 @@ from .code import CodeBackend from .mcp import MCPBackend, ToolathlonGymBackend from .rag import RAGBackend, create_rag_backend -from .vm import VMBackend, create_vm_backend + +_VM_IMPORT_ERROR = None +try: + from .vm import VMBackend, create_vm_backend +except ImportError as exc: + if "cssselect" not in str(exc): + raise + _VM_IMPORT_ERROR = exc + + class VMBackend: # type: ignore[no-redef] + def __init__(self, *args, **kwargs): + del args, kwargs + raise ImportError( + "VMBackend requires the optional 'cssselect' dependency" + ) from _VM_IMPORT_ERROR + + def create_vm_backend(*args, **kwargs): # type: ignore[no-redef] + del args, kwargs + raise ImportError( + "VMBackend requires the optional 'cssselect' dependency" + ) from _VM_IMPORT_ERROR __all__ = [ "VMBackend", diff --git a/seeds/code/seed/demo_repo/README.md b/seeds/code/seed/demo_repo/README.md new file mode 100644 index 0000000..6bdffd1 --- /dev/null +++ b/seeds/code/seed/demo_repo/README.md @@ -0,0 +1,8 @@ +# Coding Example Demo Repo + +This tiny repository is bundled for AgentFlow's CodingAgent example. + +- `app.py` builds a greeting string. +- `config/app_config.json` stores the expected name and suffix. +- `lib/helpers.py` contains the formatting helper. +- `tests/smoke_test.py` is the verification command used by the rollout example. 
diff --git a/seeds/code/seed/demo_repo/app.py b/seeds/code/seed/demo_repo/app.py new file mode 100644 index 0000000..a172cac --- /dev/null +++ b/seeds/code/seed/demo_repo/app.py @@ -0,0 +1,20 @@ +import json +from pathlib import Path + +from lib.helpers import render_greeting + + +CONFIG_PATH = Path(__file__).parent / "config" / "app_config.json" + + +def load_config() -> dict: + return json.loads(CONFIG_PATH.read_text(encoding="utf-8")) + + +def build_message() -> str: + config = load_config() + return render_greeting(config["default_name"], "?") + + +if __name__ == "__main__": + print(build_message()) diff --git a/seeds/code/seed/demo_repo/config/app_config.json b/seeds/code/seed/demo_repo/config/app_config.json new file mode 100644 index 0000000..31f4249 --- /dev/null +++ b/seeds/code/seed/demo_repo/config/app_config.json @@ -0,0 +1,4 @@ +{ + "default_name": "AgentFlow", + "suffix": "!" +} diff --git a/seeds/code/seed/demo_repo/tests/smoke_test.py b/seeds/code/seed/demo_repo/tests/smoke_test.py new file mode 100644 index 0000000..a6b76a1 --- /dev/null +++ b/seeds/code/seed/demo_repo/tests/smoke_test.py @@ -0,0 +1,11 @@ +from app import build_message + + +def main() -> None: + message = build_message() + assert message == "Hello, AgentFlow!", message + print("SMOKE_OK") + + +if __name__ == "__main__": + main() diff --git a/seeds/code/seeds.jsonl b/seeds/code/seeds.jsonl new file mode 100644 index 0000000..ba6a9b3 --- /dev/null +++ b/seeds/code/seeds.jsonl @@ -0,0 +1,2 @@ +{"content": "Inspect the demo repository and trace how the greeting is assembled from config and helper code.", "kwargs": {}} +{"content": "Look for a small repository bug that can be fixed with a minimal edit and validated with the committed smoke test.", "kwargs": {}} From d665dd9a760ebade44ac0f66b981ba2a196acc11 Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Tue, 21 Apr 2026 10:34:19 +0800 Subject: [PATCH 35/39] fix: include demo helper module --- 
seeds/code/seed/demo_repo/lib/helpers.py | 2 ++ 1 file changed, 2 insertions(+) create mode 100644 seeds/code/seed/demo_repo/lib/helpers.py diff --git a/seeds/code/seed/demo_repo/lib/helpers.py b/seeds/code/seed/demo_repo/lib/helpers.py new file mode 100644 index 0000000..099ce38 --- /dev/null +++ b/seeds/code/seed/demo_repo/lib/helpers.py @@ -0,0 +1,2 @@ +def render_greeting(name: str, suffix: str) -> str: + return f"Hello, {name}{suffix}" From 237ac5ab18275016194efa89dd06fb16a2801a29 Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Tue, 21 Apr 2026 10:46:45 +0800 Subject: [PATCH 36/39] Fix coding smoke test runtime contract --- rollout/tests/test_code_example_assets.py | 39 +++++++++++++++++++ seeds/code/seed/demo_repo/tests/smoke_test.py | 5 +++ 2 files changed, 44 insertions(+) diff --git a/rollout/tests/test_code_example_assets.py b/rollout/tests/test_code_example_assets.py index 416620f..7019992 100644 --- a/rollout/tests/test_code_example_assets.py +++ b/rollout/tests/test_code_example_assets.py @@ -1,4 +1,7 @@ import json +import shutil +import subprocess +import sys from pathlib import Path from rollout.core.config import RolloutConfig @@ -80,3 +83,39 @@ def test_code_demo_repo_contract(): smoke_test = (repo_root / "tests" / "smoke_test.py").read_text(encoding="utf-8") assert "build_message" in smoke_test assert "SMOKE_OK" in smoke_test + + +def test_code_demo_repo_smoke_test_runtime_contract(tmp_path): + source_repo = REPO_ROOT / "seeds" / "code" / "seed" / "demo_repo" + repo_copy = tmp_path / "demo_repo" + shutil.copytree(source_repo, repo_copy) + + pre_fix = subprocess.run( + [sys.executable, "tests/smoke_test.py"], + cwd=repo_copy, + capture_output=True, + text=True, + check=False, + ) + + assert pre_fix.returncode != 0 + assert "AssertionError: Hello, AgentFlow?" 
in pre_fix.stderr + assert "ModuleNotFoundError" not in pre_fix.stderr + + app_path = repo_copy / "app.py" + app_text = app_path.read_text(encoding="utf-8") + app_path.write_text( + app_text.replace('render_greeting(config["default_name"], "?")', 'render_greeting(config["default_name"], "!")'), + encoding="utf-8", + ) + + post_fix = subprocess.run( + [sys.executable, "tests/smoke_test.py"], + cwd=repo_copy, + capture_output=True, + text=True, + check=False, + ) + + assert post_fix.returncode == 0, post_fix.stderr + assert post_fix.stdout.strip() == "SMOKE_OK" diff --git a/seeds/code/seed/demo_repo/tests/smoke_test.py b/seeds/code/seed/demo_repo/tests/smoke_test.py index a6b76a1..9f9c9eb 100644 --- a/seeds/code/seed/demo_repo/tests/smoke_test.py +++ b/seeds/code/seed/demo_repo/tests/smoke_test.py @@ -1,3 +1,8 @@ +import sys +from pathlib import Path + +sys.path.insert(0, str(Path(__file__).resolve().parents[1])) + from app import build_message From 2900503fae9e70b5188e7820dc3a9eb33dc1c167 Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Tue, 21 Apr 2026 10:53:40 +0800 Subject: [PATCH 37/39] docs: add Coding example guide --- examples/CodingAgent.md | 134 +++++++++++++++++++++++++ rollout/tests/test_code_example_doc.py | 45 +++++++++ 2 files changed, 179 insertions(+) create mode 100644 examples/CodingAgent.md create mode 100644 rollout/tests/test_code_example_doc.py diff --git a/examples/CodingAgent.md b/examples/CodingAgent.md new file mode 100644 index 0000000..10466bd --- /dev/null +++ b/examples/CodingAgent.md @@ -0,0 +1,134 @@ +# CodingAgent: Repository QA / Edit Agent - Data Synthesis Guide (No Training/Deployment) + +This guide explains how to use AgentFlow's CodingAgent example to synthesize QA data and trajectory data for a small repository workflow. + +CodingAgent uses the `code` backend's six tools through `code-*`, so the same setup covers repository inspection, search, shell validation, and small file edits. 
+ +## Overview + +CodingAgent is a repository-grounded coding agent example. It works against a local code workspace and uses six code tools exposed through `code-*`: + +- `code-read` +- `code-glob` +- `code-grep` +- `code-bash` +- `code-edit` +- `code-write` + +The bundled example is intentionally small and example-oriented. It includes both a read-only question style and an edit-task style based on the committed demo repository, so you can synthesize data for repository inspection and minimal bug-fix workflows from the same assets. + +## Prerequisites + +Install and enter the repository: + +```bash +git clone https://github.com/OpenDCAI/AgentFlow +cd AgentFlow +pip install -e . +``` + +Configure model access and the repo-root contract used by the committed code example configs: + +```bash +export OPENAI_API_KEY=YOUR_KEY +export OPENAI_API_URL=https://openrouter.ai/api/v1 +export AGENTFLOW_REPO_ROOT=$(pwd) +``` + +The default committed repository for this example lives at `${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo`. + +Seed prompts are stored in `seeds/code/seeds.jsonl`. If you want to run the same pipeline on your own repository instead of the bundled demo repo, replace `resource_init_configs.code.content.source_dir` with your own repository path. + +## Pipeline Overview + +This CodingAgent example uses a simple three-step flow: + +```text +Step 1 Sandbox Server -> Step 2 QA Synthesis -> Step 3 Trajectory Data +``` + +The assets in this repo are already aligned for that flow: + +- Sandbox config: `configs/sandbox-server/code_config.json` +- QA synthesis config: `configs/synthesis/code_config.json` +- Trajectory rollout config: `configs/trajectory/code_trajectory.json` +- Benchmark file: `benchmark/code_benchmark.jsonl` + +The benchmark mixes a bundled read-only plus edit-task example style. One task asks the agent to inspect the repo and answer a question; another asks it to make a minimal fix and verify it with `python tests/smoke_test.py`. 
+ +## Step 1: Start the Sandbox Server + +Start the sandbox server before synthesis or rollout: + +```bash +./start_sandbox_server.sh --config configs/sandbox-server/code_config.json +``` + +This launches the code resource backend and prepares per-run workspaces under the sandbox workspace root. + +## Step 2: Synthesize QA Data + +Use the committed synthesis config to generate repository-grounded QA from the coding seeds: + +```bash +python3 synthesis/pipeline.py \ + --config configs/synthesis/code_config.json \ + --seeds seeds/code/seeds.jsonl \ + --output-dir results/code +``` + +By default, the synthesis config initializes the code resource from `${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo` through `resource_init_configs.code.content.source_dir`. + +The committed prompts are designed around the bundled demo repository and support both repository-reading questions and a lightweight edit-validation workflow. + +## Step 3: Synthesize Trajectory Data + +Use rollout to generate trajectory-only records with the committed benchmark: + +```bash +python -m rollout.pipeline \ + --config configs/trajectory/code_trajectory.json \ + --output-dir trajectory_results/code +``` + +This config reads tasks from `benchmark/code_benchmark.jsonl` and keeps the same default repo-root contract via `${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo`. + +One bundled task explicitly validates the edit workflow by asking the agent to run `python tests/smoke_test.py` after making a minimal fix. + +## Configuration Reference + +### Sandbox config + +`configs/sandbox-server/code_config.json` enables the `code` resource and points the sandbox to a temporary workspace root. 
+ +### Synthesis config + +`configs/synthesis/code_config.json` defines: + +- `available_tools` as `code-*` +- `seeds_file` as `seeds/code/seeds.jsonl` +- `resource_init_configs.code.content.source_dir` as `${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo` + +If you want to use a different repository, update `source_dir` to your own path while keeping the rest of the pipeline structure the same. + +### Trajectory config + +`configs/trajectory/code_trajectory.json` defines: + +- `available_tools` as `code-*` +- `data_path` as `benchmark/code_benchmark.jsonl` +- `resource_init_configs.code.content.source_dir` as `${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo` + +## FAQ + +### What repository does the example use by default? + +The committed default is `seeds/code/seed/demo_repo`, resolved in config as `${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo`. + +### Can I point the example at my own repository? + +Yes. Replace `resource_init_configs.code.content.source_dir` with your own repo path in the synthesis or trajectory config you want to run. + +### Does this guide cover training or deployment? + +No. Later training / deployment / infer are not covered yet, so this guide stops after QA synthesis and trajectory generation. 
diff --git a/rollout/tests/test_code_example_doc.py b/rollout/tests/test_code_example_doc.py new file mode 100644 index 0000000..ba00313 --- /dev/null +++ b/rollout/tests/test_code_example_doc.py @@ -0,0 +1,45 @@ +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def test_coding_example_doc_has_required_sections_and_repo_root_contract(): + content = (REPO_ROOT / "examples" / "CodingAgent.md").read_text(encoding="utf-8") + + required_sections = [ + "## Overview", + "## Prerequisites", + "## Pipeline Overview", + "## Step 1: Start the Sandbox Server", + "## Step 2: Synthesize QA Data", + "## Step 3: Synthesize Trajectory Data", + "## Configuration Reference", + "## FAQ", + ] + for section in required_sections: + assert section in content + + required_strings = [ + "cd AgentFlow", + "export AGENTFLOW_REPO_ROOT=$(pwd)", + "code-*", + "configs/sandbox-server/code_config.json", + "configs/synthesis/code_config.json", + "configs/trajectory/code_trajectory.json", + "benchmark/code_benchmark.jsonl", + "seeds/code/seeds.jsonl", + "seeds/code/seed/demo_repo", + "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo", + "source_dir", + "./start_sandbox_server.sh --config configs/sandbox-server/code_config.json", + "python tests/smoke_test.py", + "training / deployment / infer are not covered yet", + ] + for needle in required_strings: + assert needle in content + + assert "/home/a1/sdb/dxd/DataFlow" not in content + assert "DataFlow" not in content + assert "Step 4" not in content + assert "Step 5" not in content From 2fcf88ee1435de6fb0aa7ba974f41eee9fb82d09 Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Tue, 21 Apr 2026 11:00:18 +0800 Subject: [PATCH 38/39] Fix coding example doc contract --- examples/CodingAgent.md | 2 +- rollout/tests/test_code_example_doc.py | 3 +++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/examples/CodingAgent.md b/examples/CodingAgent.md index 10466bd..2009751 100644 --- 
a/examples/CodingAgent.md +++ b/examples/CodingAgent.md @@ -1,4 +1,4 @@ -# CodingAgent: Repository QA / Edit Agent - Data Synthesis Guide (No Training/Deployment) +# CodingAgent: Repository QA / Edit Agent — 3-Step Example Guide (No Training/Deployment/Inference) This guide explains how to use AgentFlow's CodingAgent example to synthesize QA data and trajectory data for a small repository workflow. diff --git a/rollout/tests/test_code_example_doc.py b/rollout/tests/test_code_example_doc.py index ba00313..fcaa367 100644 --- a/rollout/tests/test_code_example_doc.py +++ b/rollout/tests/test_code_example_doc.py @@ -21,7 +21,10 @@ def test_coding_example_doc_has_required_sections_and_repo_root_contract(): assert section in content required_strings = [ + "3-Step Example Guide", "cd AgentFlow", + "export OPENAI_API_KEY=", + "export OPENAI_API_URL=", "export AGENTFLOW_REPO_ROOT=$(pwd)", "code-*", "configs/sandbox-server/code_config.json", From c72c658e1f269e1ef0ee7e0d84131636c7361b75 Mon Sep 17 00:00:00 2001 From: Dingxingdi <784318034@qq.com> Date: Tue, 21 Apr 2026 15:43:07 +0800 Subject: [PATCH 39/39] Add verified MCP and coding examples --- ...-mcp-and-coding-examples-implementation.md | 1575 +++++++++++++++++ examples/CodingAgent.md | 5 + examples/MCPAgent.md | 5 + rollout/core/config.py | 3 + rollout/tests/test_code_example_assets.py | 54 +- rollout/tests/test_code_example_doc.py | 31 + rollout/tests/test_mcp_example_assets.py | 55 +- rollout/tests/test_mcp_example_doc.py | 58 + sandbox/result_formatter.py | 54 + .../resources/code_vendor/file_tools.py | 8 +- .../server/backends/resources/mcp/client.py | 81 + .../backends/resources/mcp/configs/12306.yaml | 22 +- .../resources/mcp/configs/canvas.yaml | 17 +- .../resources/mcp/configs/filesystem.yaml | 7 +- .../resources/mcp/configs/snowflake.yaml | 16 +- .../resources/mcp/configs/woocommerce.yaml | 14 +- .../resources/mcp/configs/yahoo-finance.yaml | 20 +- .../resources/mcp/configs/youtube.yaml | 23 +- 
.../mcp/configs/youtube_transcript.yaml | 30 +- sandbox/tests/test_code_vendor_tools.py | 52 + sandbox/tests/test_mcp_client.py | 459 +++++ sandbox/tests/test_result_formatter.py | 166 ++ synthesis/core/config.py | 3 + .../test_code_example_synthesis_config.py | 54 +- .../test_mcp_example_synthesis_configs.py | 55 +- 25 files changed, 2777 insertions(+), 90 deletions(-) create mode 100644 docs/superpowers/plans/2026-04-20-mcp-and-coding-examples-implementation.md create mode 100644 sandbox/tests/test_result_formatter.py diff --git a/docs/superpowers/plans/2026-04-20-mcp-and-coding-examples-implementation.md b/docs/superpowers/plans/2026-04-20-mcp-and-coding-examples-implementation.md new file mode 100644 index 0000000..7730586 --- /dev/null +++ b/docs/superpowers/plans/2026-04-20-mcp-and-coding-examples-implementation.md @@ -0,0 +1,1575 @@ +# MCP and Coding Examples Implementation Plan + +> **For agentic workers:** REQUIRED: Use superpowers:subagent-driven-development (if subagents available) or superpowers:executing-plans to implement this plan. Steps use checkbox (`- [ ]`) syntax for tracking. + +**Goal:** Add official MCP and Coding example docs, configs, seeds, demo assets, and validation coverage so both backends have runnable three-step examples aligned with the existing `examples/` set. + +**Architecture:** Implement the work in four isolated chunks. Chunk 1 locks the shared MCP sandbox template contract so the example path has a stable server subset and `mcp_servers_path` behavior. Chunk 2 adds the six-domain MCP example assets and doc with MCP-specific tests. Chunk 3 adds the Coding example assets, bundled demo repo, and doc with Coding-specific tests. Chunk 4 runs the final combined verification suite plus optional real-environment dry runs without expanding scope to training or infer. 
+ +**Tech Stack:** Python, pytest, JSON, JSONL, Markdown, AgentFlow sandbox config loader, synthesis config loader, rollout config loader, Toolathlon-GYM MCP backend, Code backend + +--- + +**Assumptions and Guardrails** + +- The approved spec at `docs/superpowers/specs/2026-04-20-mcp-and-coding-examples-design.md` remains the source of truth. +- Keep `examples/MCPAgent.md` and `examples/CodingAgent.md` at the same granularity as `examples/DSAgent.md`: Overview, Prerequisites, Pipeline Overview, Step 1, Step 2, Step 3, Configuration Reference, FAQ. +- Official committed docs and configs must stay generic. Do not mention `/home/a1/sdb/dxd/DataFlow` or any other machine-local absolute path in committed example assets. +- MCP examples assume a local `toolathlon_gym` checkout is already initialized and running before AgentFlow starts. AgentFlow does not bootstrap Toolathlon-GYM services in this task. +- Coding examples use a committed demo repo by default through `${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo`, but the docs must also say `source_dir` can be overridden to point at a user-provided repo. +- Both new docs must explicitly stop at Step 3 and explain that training / deployment / infer are not covered yet, matching the DS-style scope the user approved. +- If the current synthesis pipeline still writes to the repo’s shared aggregation directory instead of respecting per-config `output_dir`, document the actual observed behavior in the new example docs rather than broadening this task into a synthesis pipeline refactor. + +## File Map + +### Shared MCP sandbox contract + +- Modify: `configs/sandbox-server/mcp_config.json` + Responsibility: official MCP sandbox entry point for the example path; declare `mcp_servers_path`, the approved MCP server subset, localhost-style env defaults, and warmup settings. 
+- Modify: `sandbox/tests/test_sandbox_config_loading.py` + Responsibility: verify `${TOOLATHLON_GYM_ROOT}/local_servers` survives env expansion when the env var is unset. +- Modify: `sandbox/tests/test_mcp_backend.py` + Responsibility: verify the checked-in MCP sandbox template exposes the exact approved config contract, including warmup. +- Modify: `sandbox/tests/test_mcp_client.py` + Responsibility: verify MCP YAML resolution still maps `${local_servers_paths}` to the JSON config’s `mcp_servers_path`. + +### MCP example validation and assets + +- Create: `synthesis/tests/test_mcp_example_synthesis_configs.py` + Responsibility: validate all six MCP synthesis configs, tool exposure, seed references, and empty `resource_init_configs`. +- Create: `rollout/tests/test_mcp_example_assets.py` + Responsibility: validate all six MCP rollout configs plus the MCP seeds and benchmark files. +- Create: `rollout/tests/test_mcp_example_doc.py` + Responsibility: validate `examples/MCPAgent.md` has the DS-style structure and the exact prerequisite/config references the user approved. 
+- Create: `examples/MCPAgent.md` +- Create: `configs/synthesis/mcp_canvas_config.json` +- Create: `configs/synthesis/mcp_snowflake_config.json` +- Create: `configs/synthesis/mcp_woocommerce_config.json` +- Create: `configs/synthesis/mcp_yahoo_finance_config.json` +- Create: `configs/synthesis/mcp_youtube_config.json` +- Create: `configs/synthesis/mcp_train_config.json` +- Create: `configs/trajectory/mcp_canvas_trajectory.json` +- Create: `configs/trajectory/mcp_snowflake_trajectory.json` +- Create: `configs/trajectory/mcp_woocommerce_trajectory.json` +- Create: `configs/trajectory/mcp_yahoo_finance_trajectory.json` +- Create: `configs/trajectory/mcp_youtube_trajectory.json` +- Create: `configs/trajectory/mcp_train_trajectory.json` +- Create: `seeds/mcp/canvas_seeds.jsonl` +- Create: `seeds/mcp/snowflake_seeds.jsonl` +- Create: `seeds/mcp/woocommerce_seeds.jsonl` +- Create: `seeds/mcp/yahoo_finance_seeds.jsonl` +- Create: `seeds/mcp/youtube_seeds.jsonl` +- Create: `seeds/mcp/train_seeds.jsonl` +- Create: `benchmark/mcp_canvas_benchmark.jsonl` +- Create: `benchmark/mcp_snowflake_benchmark.jsonl` +- Create: `benchmark/mcp_woocommerce_benchmark.jsonl` +- Create: `benchmark/mcp_yahoo_finance_benchmark.jsonl` +- Create: `benchmark/mcp_youtube_benchmark.jsonl` +- Create: `benchmark/mcp_train_benchmark.jsonl` + +### Coding example validation and assets + +- Create: `synthesis/tests/test_code_example_synthesis_config.py` + Responsibility: validate the Coding synthesis config, `code-*` tool exposure, and repo-local `source_dir` contract. +- Create: `rollout/tests/test_code_example_assets.py` + Responsibility: validate the Coding rollout config, bundled demo repo, seed data, and mixed read/edit benchmark tasks. +- Create: `rollout/tests/test_code_example_doc.py` + Responsibility: validate `examples/CodingAgent.md` has the DS-style structure, exact `AGENTFLOW_REPO_ROOT` setup steps, and no machine-local path leakage. 
+- Create: `examples/CodingAgent.md` +- Create: `configs/synthesis/code_config.json` +- Create: `configs/trajectory/code_trajectory.json` +- Create: `seeds/code/seeds.jsonl` +- Create: `seeds/code/seed/demo_repo/README.md` +- Create: `seeds/code/seed/demo_repo/app.py` +- Create: `seeds/code/seed/demo_repo/config/app_config.json` +- Create: `seeds/code/seed/demo_repo/lib/helpers.py` +- Create: `seeds/code/seed/demo_repo/tests/smoke_test.py` +- Create: `benchmark/code_benchmark.jsonl` + +## Chunk 1: Shared MCP Sandbox Contract + +### Task 1: Lock the official MCP sandbox template + +**Files:** +- Modify: `configs/sandbox-server/mcp_config.json` +- Modify: `sandbox/tests/test_sandbox_config_loading.py` +- Modify: `sandbox/tests/test_mcp_backend.py` +- Modify: `sandbox/tests/test_mcp_client.py` + +- [ ] **Step 1: Add the failing MCP contract tests** + +Add these exact assertions. + +In `sandbox/tests/test_sandbox_config_loading.py`, add: + +```python +def test_load_server_config_keeps_required_mcp_servers_path_placeholder_when_env_missing( + tmp_path, monkeypatch +): + monkeypatch.delenv("TOOLATHLON_GYM_ROOT", raising=False) + + config_path = tmp_path / "mcp_config.json" + raw_config = { + "resources": { + "mcp": { + "enabled": True, + "config": { + "mcp_servers_path": "${TOOLATHLON_GYM_ROOT}/local_servers" + }, + } + } + } + config_path.write_text(json.dumps(raw_config), encoding="utf-8") + + sandbox = Sandbox(config=SandboxConfig(server_config_path=str(config_path))) + loaded = sandbox._load_server_config() + + assert ( + loaded["resources"]["mcp"]["config"]["mcp_servers_path"] + == "${TOOLATHLON_GYM_ROOT}/local_servers" + ) +``` + +In `sandbox/tests/test_mcp_backend.py`, replace the current lightweight template parse check with: + +```python +def test_mcp_config_template_declares_example_server_subset(monkeypatch): + monkeypatch.delenv("TOOLATHLON_GYM_ROOT", raising=False) + monkeypatch.delenv("PGHOST", raising=False) + monkeypatch.delenv("PGPORT", raising=False) + 
monkeypatch.delenv("PGUSER", raising=False) + monkeypatch.delenv("PGPASSWORD", raising=False) + monkeypatch.delenv("PGDATABASE", raising=False) + monkeypatch.delenv("CANVAS_DOMAIN", raising=False) + monkeypatch.delenv("WORDPRESS_SITE_URL", raising=False) + + loader = ConfigLoader() + config_path = ( + Path(__file__).resolve().parents[2] + / "configs" + / "sandbox-server" + / "mcp_config.json" + ) + + config = loader.load(str(config_path)) + mcp_resource = config.resources["mcp"] + mcp_config = mcp_resource.config + + assert mcp_resource.backend_class == ( + "sandbox.server.backends.resources.mcp.toolathlon_gym.ToolathlonGymBackend" + ) + assert mcp_config["mcp_servers_path"] == "${TOOLATHLON_GYM_ROOT}/local_servers" + assert mcp_config["enabled_mcp_servers"] == [ + "canvas", + "snowflake", + "woocommerce", + "yahoo-finance", + "youtube", + "youtube-transcript", + "rail_12306", + "filesystem", + ] + assert mcp_config["env_overrides"] == { + "PGHOST": "localhost", + "PGPORT": "5432", + "PGUSER": "eigent", + "PGPASSWORD": "camel", + "PGDATABASE": "toolathlon_gym", + "CANVAS_DOMAIN": "localhost:8080", + "WORDPRESS_SITE_URL": "http://localhost:8081", + } + assert config.warmup.enabled is True + assert config.warmup.resources == ["mcp"] +``` + +In `sandbox/tests/test_mcp_client.py`, add: + +```python +def test_load_mcp_process_config_resolves_toolathlon_local_servers_path(tmp_path): + module = load_mcp_client_module() + config_dir = tmp_path / "configs" / "mcp_servers" + config_dir.mkdir(parents=True) + (config_dir / "filesystem.yaml").write_text( + """ +type: stdio +name: filesystem +params: + command: node + args: + - ${local_servers_paths}/filesystem/environment/dist/index.js + - ${agent_workspace} + """.strip() + + "\n", + encoding="utf-8", + ) + + config = module.load_mcp_process_config( + server_name="filesystem", + agent_workspace="/tmp/agentflow-worker", + mcp_servers_path="/tmp/toolathlon/local_servers", + config_dir=config_dir, + ) + + assert config.command == 
"node" + assert config.args == [ + "/tmp/toolathlon/local_servers/filesystem/environment/dist/index.js", + "/tmp/agentflow-worker", + ] +``` + +- [ ] **Step 2: Run the MCP contract tests and confirm they fail** + +Run: + +```bash +pytest \ + sandbox/tests/test_sandbox_config_loading.py::test_load_server_config_keeps_required_mcp_servers_path_placeholder_when_env_missing \ + sandbox/tests/test_mcp_backend.py::test_mcp_config_template_declares_example_server_subset \ + sandbox/tests/test_mcp_client.py::test_load_mcp_process_config_resolves_toolathlon_local_servers_path \ + -v +``` + +Expected: the suite should fail before the template update; likely causes are the missing `mcp_servers_path`, the old server subset, or the old `PGHOST` default, but the exact failing assertion may vary slightly if the branch state drifts. + +- [ ] **Step 3: Update `configs/sandbox-server/mcp_config.json` to match the approved example contract** + +Make `resources.mcp.config` match this exact shape: + +```json +{ + "mcp_servers_path": "${TOOLATHLON_GYM_ROOT}/local_servers", + "enabled_mcp_servers": [ + "canvas", + "snowflake", + "woocommerce", + "yahoo-finance", + "youtube", + "youtube-transcript", + "rail_12306", + "filesystem" + ], + "workspace_root": "${TOOLATHLON_WORKSPACE_ROOT:-/tmp/agentflow_mcp}", + "env_overrides": { + "PGHOST": "${PGHOST:-localhost}", + "PGPORT": "${PGPORT:-5432}", + "PGUSER": "${PGUSER:-eigent}", + "PGPASSWORD": "${PGPASSWORD:-camel}", + "PGDATABASE": "${PGDATABASE:-toolathlon_gym}", + "CANVAS_DOMAIN": "${CANVAS_DOMAIN:-localhost:8080}", + "WORDPRESS_SITE_URL": "${WORDPRESS_SITE_URL:-http://localhost:8081}" + } +} +``` + +Keep: + +```json +"warmup": { + "enabled": true, + "resources": ["mcp"] +} +``` + +Do not add `terminal` back into the official example template. + +- [ ] **Step 4: Re-run the MCP contract tests and verify they pass** + +Run the same pytest command from Step 2. + +Expected: PASS for all three tests. 
+ +- [ ] **Step 5: Commit the MCP sandbox contract change** + +```bash +git add \ + configs/sandbox-server/mcp_config.json \ + sandbox/tests/test_sandbox_config_loading.py \ + sandbox/tests/test_mcp_backend.py \ + sandbox/tests/test_mcp_client.py +git commit -m "test: lock MCP example sandbox contract" +``` + +## Chunk 2: MCP Example Assets and Guide + +### Task 2: Add MCP config and asset tests first + +**Files:** +- Create: `synthesis/tests/test_mcp_example_synthesis_configs.py` +- Create: `rollout/tests/test_mcp_example_assets.py` + +- [ ] **Step 1: Write the failing synthesis-config test file** + +Create `synthesis/tests/test_mcp_example_synthesis_configs.py` with: + +```python +import json +from pathlib import Path + +import pytest + +from synthesis.core.config import SynthesisConfig + +REPO_ROOT = Path(__file__).resolve().parents[2] +EXPECTED = { + "canvas": { + "tools": ["mcp:canvas.*", "mcp:filesystem.*"], + "seeds_file": "seeds/mcp/canvas_seeds.jsonl", + "seed_description": "Canvas MCP prompts", + }, + "snowflake": { + "tools": ["mcp:snowflake.*", "mcp:filesystem.*"], + "seeds_file": "seeds/mcp/snowflake_seeds.jsonl", + "seed_description": "Snowflake MCP prompts", + }, + "woocommerce": { + "tools": ["mcp:woocommerce.*", "mcp:filesystem.*"], + "seeds_file": "seeds/mcp/woocommerce_seeds.jsonl", + "seed_description": "WooCommerce MCP prompts", + }, + "yahoo_finance": { + "tools": ["mcp:yahoo-finance.*", "mcp:filesystem.*"], + "seeds_file": "seeds/mcp/yahoo_finance_seeds.jsonl", + "seed_description": "Yahoo Finance MCP prompts", + }, + "youtube": { + "tools": [ + "mcp:youtube.*", + "mcp:youtube-transcript.*", + "mcp:filesystem.*", + ], + "seeds_file": "seeds/mcp/youtube_seeds.jsonl", + "seed_description": "YouTube MCP prompts", + }, + "train": { + "tools": ["mcp:rail_12306.*", "mcp:filesystem.*"], + "seeds_file": "seeds/mcp/train_seeds.jsonl", + "seed_description": "Train MCP prompts", + }, +} + + +@pytest.mark.parametrize("domain", sorted(EXPECTED)) +def 
test_mcp_synthesis_config_contract(domain): + expected = EXPECTED[domain] + config_path = REPO_ROOT / "configs" / "synthesis" / f"mcp_{domain}_config.json" + raw = json.loads(config_path.read_text(encoding="utf-8")) + config = SynthesisConfig.from_json(str(config_path)) + raw_init = raw.get("resource_init_configs", {}) + + assert config.sandbox_config_path == "configs/sandbox-server/mcp_config.json" + assert config.sandbox_auto_start is False + assert config.resource_types == ["mcp"] + assert raw_init in ({}, {"mcp": {"content": {}}}) + assert config.resource_init_configs in ({}, {"mcp": {"content": {}}}) + assert config.model_name == "openai/gpt-oss-120b" + assert config.api_key == "${OPENAI_API_KEY}" + assert config.base_url == "${OPENAI_API_URL}" + assert config.max_depth == 12 + assert config.branching_factor == 2 + assert config.depth_threshold == 2 + assert config.min_depth == 2 + assert config.max_selected_traj == 1 + assert config.path_similarity_threshold == 0.7 + assert config.available_tools == expected["tools"] + assert config.seeds_file == expected["seeds_file"] + assert config.output_dir == f"results/mcp_{domain}" + assert raw["seed_description"] == expected["seed_description"] + assert len(config.qa_examples) >= 2 + assert config.sampling_tips.strip() + assert config.synthesis_tips.strip() +``` + +- [ ] **Step 2: Write the failing rollout-asset test file** + +Create `rollout/tests/test_mcp_example_assets.py` with: + +```python +import json +from pathlib import Path + +import pytest + +from rollout.core.config import RolloutConfig + +REPO_ROOT = Path(__file__).resolve().parents[2] +EXPECTED = { + "canvas": { + "tools": ["mcp:canvas.*", "mcp:filesystem.*"], + "benchmark": "benchmark/mcp_canvas_benchmark.jsonl", + "benchmark_name": "mcp_canvas_trajectory", + }, + "snowflake": { + "tools": ["mcp:snowflake.*", "mcp:filesystem.*"], + "benchmark": "benchmark/mcp_snowflake_benchmark.jsonl", + "benchmark_name": "mcp_snowflake_trajectory", + }, + 
"woocommerce": { + "tools": ["mcp:woocommerce.*", "mcp:filesystem.*"], + "benchmark": "benchmark/mcp_woocommerce_benchmark.jsonl", + "benchmark_name": "mcp_woocommerce_trajectory", + }, + "yahoo_finance": { + "tools": ["mcp:yahoo-finance.*", "mcp:filesystem.*"], + "benchmark": "benchmark/mcp_yahoo_finance_benchmark.jsonl", + "benchmark_name": "mcp_yahoo_finance_trajectory", + }, + "youtube": { + "tools": [ + "mcp:youtube.*", + "mcp:youtube-transcript.*", + "mcp:filesystem.*", + ], + "benchmark": "benchmark/mcp_youtube_benchmark.jsonl", + "benchmark_name": "mcp_youtube_trajectory", + }, + "train": { + "tools": ["mcp:rail_12306.*", "mcp:filesystem.*"], + "benchmark": "benchmark/mcp_train_benchmark.jsonl", + "benchmark_name": "mcp_train_trajectory", + }, +} + + +def _read_jsonl(path: Path): + return [ + json.loads(line) + for line in path.read_text(encoding="utf-8").splitlines() + if line.strip() + ] + + +@pytest.mark.parametrize("domain", sorted(EXPECTED)) +def test_mcp_rollout_config_contract(domain): + expected = EXPECTED[domain] + config_path = REPO_ROOT / "configs" / "trajectory" / f"mcp_{domain}_trajectory.json" + raw = json.loads(config_path.read_text(encoding="utf-8")) + config = RolloutConfig.from_json(str(config_path)) + raw_init = raw.get("resource_init_configs", {}) + + assert config.benchmark_name == expected["benchmark_name"] + assert config.data_path == expected["benchmark"] + assert config.model_name == "openai/gpt-oss-120b" + assert config.api_key == "${OPENAI_API_KEY}" + assert config.base_url == "${OPENAI_API_URL}" + assert config.max_turns == 20 + assert config.available_tools == expected["tools"] + assert config.sandbox_config_path == "configs/sandbox-server/mcp_config.json" + assert config.sandbox_auto_start is False + assert config.resource_types == ["mcp"] + assert raw_init in ({}, {"mcp": {"content": {}}}) + assert config.resource_init_configs in ({}, {"mcp": {"content": {}}}) + assert "MCP domain assistant" in config.system_prompt + assert 
"Use only the available MCP tools" in config.system_prompt + assert "Reply with the final answer only" in config.system_prompt + assert config.evaluate_results is False + assert config.output_dir == f"trajectory_results/mcp_{domain}" + assert config.save_results is True + assert config.trajectory_only is True + assert config.save_trajectories is True + + +@pytest.mark.parametrize("domain", sorted(EXPECTED)) +def test_mcp_seed_files_are_two_row_jsonl(domain): + seed_path = REPO_ROOT / "seeds" / "mcp" / f"{domain}_seeds.jsonl" + rows = _read_jsonl(seed_path) + + assert len(rows) == 2 + assert all(set(row.keys()) == {"content", "kwargs"} for row in rows) + assert all(isinstance(row["content"], str) and row["content"].strip() for row in rows) + assert all(row["kwargs"] == {} for row in rows) + + +@pytest.mark.parametrize("domain", sorted(EXPECTED)) +def test_mcp_benchmark_files_have_two_row_jsonl_contract(domain): + benchmark_path = REPO_ROOT / "benchmark" / f"mcp_{domain}_benchmark.jsonl" + rows = _read_jsonl(benchmark_path) + + assert len(rows) == 2 + assert all(set(row.keys()) == {"id", "question", "answer"} for row in rows) + assert all(isinstance(row["question"], str) and row["question"].strip() for row in rows) + assert all(isinstance(row["answer"], str) for row in rows) +``` + +- [ ] **Step 3: Run the new MCP tests and verify they fail because the assets do not exist yet** + +Run: + +```bash +pytest \ + synthesis/tests/test_mcp_example_synthesis_configs.py \ + rollout/tests/test_mcp_example_assets.py \ + -v +``` + +Expected: FAIL with missing-file errors for the new MCP configs, seeds, and benchmarks. 
+ +### Task 3: Create the six-domain MCP seeds, benchmarks, and configs + +**Files:** +- Create: `configs/synthesis/mcp_canvas_config.json` +- Create: `configs/synthesis/mcp_snowflake_config.json` +- Create: `configs/synthesis/mcp_woocommerce_config.json` +- Create: `configs/synthesis/mcp_yahoo_finance_config.json` +- Create: `configs/synthesis/mcp_youtube_config.json` +- Create: `configs/synthesis/mcp_train_config.json` +- Create: `configs/trajectory/mcp_canvas_trajectory.json` +- Create: `configs/trajectory/mcp_snowflake_trajectory.json` +- Create: `configs/trajectory/mcp_woocommerce_trajectory.json` +- Create: `configs/trajectory/mcp_yahoo_finance_trajectory.json` +- Create: `configs/trajectory/mcp_youtube_trajectory.json` +- Create: `configs/trajectory/mcp_train_trajectory.json` +- Create: `seeds/mcp/canvas_seeds.jsonl` +- Create: `seeds/mcp/snowflake_seeds.jsonl` +- Create: `seeds/mcp/woocommerce_seeds.jsonl` +- Create: `seeds/mcp/yahoo_finance_seeds.jsonl` +- Create: `seeds/mcp/youtube_seeds.jsonl` +- Create: `seeds/mcp/train_seeds.jsonl` +- Create: `benchmark/mcp_canvas_benchmark.jsonl` +- Create: `benchmark/mcp_snowflake_benchmark.jsonl` +- Create: `benchmark/mcp_woocommerce_benchmark.jsonl` +- Create: `benchmark/mcp_yahoo_finance_benchmark.jsonl` +- Create: `benchmark/mcp_youtube_benchmark.jsonl` +- Create: `benchmark/mcp_train_benchmark.jsonl` + +- [ ] **Step 1: Create the six MCP seed files** + +Use exactly two rows per file, each shaped as `{"content": "...", "kwargs": {}}`. 
+ +Seed rows: + +- `seeds/mcp/canvas_seeds.jsonl` + - `Use the available Canvas MCP tools to inspect courses, assignments, and enrollment information.` + - `Find a small Canvas reporting task that can be answered from the mock data and save intermediate notes with filesystem tools if helpful.` +- `seeds/mcp/snowflake_seeds.jsonl` + - `Use the available Snowflake MCP tools to inspect schemas, tables, and small analytical queries in the mock warehouse.` + - `Find one compact warehouse reporting question that can be answered from the available Snowflake tools.` +- `seeds/mcp/woocommerce_seeds.jsonl` + - `Use the WooCommerce MCP tools to inspect customers, products, and orders in the mock store.` + - `Find one small sales or operations question that can be answered from the WooCommerce mock data.` +- `seeds/mcp/yahoo_finance_seeds.jsonl` + - `Use the Yahoo Finance MCP tools to inspect the mock ticker and market data available locally.` + - `Find one small finance lookup or comparison question that can be answered directly from the available tools.` +- `seeds/mcp/youtube_seeds.jsonl` + - `Use the YouTube and YouTube Transcript MCP tools to inspect mock video metadata and transcript data.` + - `Find one small content-discovery or transcript lookup question that can be answered from the local mock data.` +- `seeds/mcp/train_seeds.jsonl` + - `Use the rail_12306 MCP tools to inspect mock train, station, and route information.` + - `Find one small travel-planning or route lookup question that can be answered from the available railway tools.` + +- [ ] **Step 2: Create the six MCP benchmark files** + +Use exactly two rows per file with schema `{"id": "...", "question": "...", "answer": "..."}`. + +Questions: + +- `benchmark/mcp_canvas_benchmark.jsonl` + - `{"id": "mcp_canvas_001", "question": "Use Canvas MCP tools to list the first three course names in alphabetical order. 
Reply as a comma-separated list only.", "answer": ""}` + - `{"id": "mcp_canvas_002", "question": "Use Canvas MCP tools to find one course and report its course code plus enrollment count as code=, enrolled=.", "answer": ""}` +- `benchmark/mcp_snowflake_benchmark.jsonl` + - `{"id": "mcp_snowflake_001", "question": "Use Snowflake MCP tools to list the first three tables visible in the default schema in alphabetical order. Reply as a comma-separated list only.", "answer": ""}` + - `{"id": "mcp_snowflake_002", "question": "Use Snowflake MCP tools to compute one small aggregate from a mock table and reply as key=value.", "answer": ""}` +- `benchmark/mcp_woocommerce_benchmark.jsonl` + - `{"id": "mcp_woocommerce_001", "question": "Use WooCommerce MCP tools to list the first three product names in alphabetical order. Reply as a comma-separated list only.", "answer": ""}` + - `{"id": "mcp_woocommerce_002", "question": "Use WooCommerce MCP tools to identify one customer email and that customer's order count. Reply as email=, orders=.", "answer": ""}` +- `benchmark/mcp_yahoo_finance_benchmark.jsonl` + - `{"id": "mcp_yahoo_finance_001", "question": "Use Yahoo Finance MCP tools to list the first three ticker symbols available in the mock dataset in alphabetical order. Reply as a comma-separated list only.", "answer": ""}` + - `{"id": "mcp_yahoo_finance_002", "question": "Use Yahoo Finance MCP tools to compare two available mock tickers and reply with the one that has the larger price as symbol=.", "answer": ""}` +- `benchmark/mcp_youtube_benchmark.jsonl` + - `{"id": "mcp_youtube_001", "question": "Use YouTube MCP tools to list the first three video titles returned by the local mock dataset in alphabetical order. 
Reply as a comma-separated list only.", "answer": ""}`
  - `{"id": "mcp_youtube_002", "question": "Use YouTube Transcript MCP tools to find one video and report the video id plus transcript language as video=, language=.", "answer": ""}`
- `benchmark/mcp_train_benchmark.jsonl`
  - `{"id": "mcp_train_001", "question": "Use rail_12306 MCP tools to list the first three station names available in the local mock dataset in alphabetical order. Reply as a comma-separated list only.", "answer": ""}`
  - `{"id": "mcp_train_002", "question": "Use rail_12306 MCP tools to find one route and reply with departure=, arrival=.", "answer": ""}`

During Chunk 2, keep all twelve `answer` fields as empty strings. Do not guess or invent answers here; Chunk 4 Task 9 is the required live grounding step that will fill them from the prepared mock environment.

- [ ] **Step 3: Create the six MCP synthesis configs**

Use `configs/synthesis/ds_config.json` for field ordering, but make each MCP file follow this exact contract, replacing `<domain>` with the domain name:

```json
{
  "model_name": "openai/gpt-oss-120b",
  "api_key": "${OPENAI_API_KEY}",
  "base_url": "${OPENAI_API_URL}",
  "max_depth": 12,
  "branching_factor": 2,
  "depth_threshold": 2,
  "min_depth": 2,
  "max_selected_traj": 1,
  "path_similarity_threshold": 0.7,
  "sandbox_server_url": "http://127.0.0.1:18890",
  "sandbox_auto_start": false,
  "sandbox_config_path": "configs/sandbox-server/mcp_config.json",
  "resource_types": ["mcp"],
  "resource_init_configs": {},
  "available_tools": ["mcp:<domain>.*", "mcp:filesystem.*"],
  "sampling_tips": [
    "Domain-specific MCP exploration guidance",
    "Prefer using filesystem tools only for scratch notes or short saved artifacts."
  ],
  "synthesis_tips": [
    "Generate domain-grounded factual QA only.",
    "Keep answers short and directly verifiable from tool outputs."
  ],
  "qa_examples": [
    {"question": "Example question 1", "answer": "Example answer 1"},
    {"question": "Example question 2", "answer": "Example answer 2"}
  ],
  "seed_description": "Domain-specific MCP prompts",
  "seeds_file": "seeds/mcp/<domain>_seeds.jsonl",
  "output_dir": "results/mcp_<domain>"
}
```

Exact tool mappings:

- `canvas`: `["mcp:canvas.*", "mcp:filesystem.*"]`
- `snowflake`: `["mcp:snowflake.*", "mcp:filesystem.*"]`
- `woocommerce`: `["mcp:woocommerce.*", "mcp:filesystem.*"]`
- `yahoo_finance`: `["mcp:yahoo-finance.*", "mcp:filesystem.*"]`
- `youtube`: `["mcp:youtube.*", "mcp:youtube-transcript.*", "mcp:filesystem.*"]`
- `train`: `["mcp:rail_12306.*", "mcp:filesystem.*"]`

Exact `seed_description` strings:

- `Canvas MCP prompts`
- `Snowflake MCP prompts`
- `WooCommerce MCP prompts`
- `Yahoo Finance MCP prompts`
- `YouTube MCP prompts`
- `Train MCP prompts`

Each file must use 2-3 domain-specific `qa_examples`. Keep them domain-grounded and format-focused, but do not invent benchmark answers that depend on live mock data.

For `resource_init_configs`, the committed files should prefer:

```json
"resource_init_configs": {}
```

but the tests must also accept:

```json
"resource_init_configs": {
  "mcp": {
    "content": {}
  }
}
```

because the approved spec allows MCP init content to be omitted or explicitly empty.

- [ ] **Step 4: Create the six MCP rollout configs**

Use `configs/trajectory/ds_trajectory.json` for field ordering, but make each MCP rollout config follow this exact contract, replacing `<domain>` with the domain name:

```json
{
  "benchmark_name": "mcp_<domain>_trajectory",
  "model_name": "openai/gpt-oss-120b",
  "api_key": "${OPENAI_API_KEY}",
  "base_url": "${OPENAI_API_URL}",
  "max_turns": 20,
  "available_tools": ["mcp:<domain>.*", "mcp:filesystem.*"],
  "sandbox_server_url": "http://127.0.0.1:18890",
  "sandbox_auto_start": false,
  "sandbox_config_path": "configs/sandbox-server/mcp_config.json",
  "resource_types": ["mcp"],
  "resource_init_configs": {},
  "system_prompt": [
    "You are an MCP domain assistant.",
    "Use only the available MCP tools to answer the question.",
    "Reply with the final answer only unless the question explicitly asks for another format."
  ],
  "evaluate_results": false,
  "data_path": "benchmark/mcp_<domain>_benchmark.jsonl",
  "output_dir": "trajectory_results/mcp_<domain>",
  "save_results": true,
  "save_trajectories": true,
  "trajectory_only": true
}
```

For `youtube`, the `available_tools` list must include both `mcp:youtube.*` and `mcp:youtube-transcript.*`.

As in Step 3, prefer `"resource_init_configs": {}` in the committed files, but keep the tests permissive enough to allow the spec-approved explicit-empty MCP init form.

- [ ] **Step 5: Run the MCP synthesis-config and rollout-asset tests and make them pass**

Run:

```bash
pytest \
  synthesis/tests/test_mcp_example_synthesis_configs.py \
  rollout/tests/test_mcp_example_assets.py \
  -v
```

Expected: PASS.

- [ ] **Step 6: Defer MCP benchmark answer grounding to Chunk 4**

Keep this chunk deterministic. Do not block Chunk 2 on external MCP services or LLM credentials. The required live answer-grounding step for MCP benchmarks happens later in Chunk 4 Task 9.
+ +- [ ] **Step 7: Commit the MCP assets and configs** + +```bash +git add \ + synthesis/tests/test_mcp_example_synthesis_configs.py \ + rollout/tests/test_mcp_example_assets.py \ + configs/synthesis/mcp_*.json \ + configs/trajectory/mcp_*.json \ + seeds/mcp \ + benchmark/mcp_*.jsonl +git commit -m "feat: add MCP example assets" +``` + +### Task 4: Add the MCP guide and lock the doc contract + +**Files:** +- Create: `rollout/tests/test_mcp_example_doc.py` +- Create: `examples/MCPAgent.md` + +- [ ] **Step 1: Write the failing MCP doc contract test** + +Create `rollout/tests/test_mcp_example_doc.py` with: + +```python +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def test_mcp_example_doc_has_required_sections_and_exact_prerequisite_contract(): + content = (REPO_ROOT / "examples" / "MCPAgent.md").read_text(encoding="utf-8") + + required_sections = [ + "## Overview", + "## Prerequisites", + "## Pipeline Overview", + "## Step 1: Start the Sandbox Server", + "## Step 2: Synthesize QA Data", + "## Step 3: Synthesize Trajectory Data", + "## Configuration Reference", + "## FAQ", + ] + for section in required_sections: + assert section in content + + required_strings = [ + "configs/sandbox-server/mcp_config.json", + "configs/synthesis/mcp_canvas_config.json", + "configs/synthesis/mcp_snowflake_config.json", + "configs/synthesis/mcp_woocommerce_config.json", + "configs/synthesis/mcp_yahoo_finance_config.json", + "configs/synthesis/mcp_youtube_config.json", + "configs/synthesis/mcp_train_config.json", + "configs/trajectory/mcp_canvas_trajectory.json", + "configs/trajectory/mcp_snowflake_trajectory.json", + "configs/trajectory/mcp_woocommerce_trajectory.json", + "configs/trajectory/mcp_yahoo_finance_trajectory.json", + "configs/trajectory/mcp_youtube_trajectory.json", + "configs/trajectory/mcp_train_trajectory.json", + "export TOOLATHLON_GYM_ROOT=", + "${TOOLATHLON_GYM_ROOT}/local_servers", + "./start_sandbox_server.sh --config 
configs/sandbox-server/mcp_config.json", + "node", + "uv", + "PGHOST", + "PGPORT", + "PGUSER", + "PGPASSWORD", + "PGDATABASE", + "CANVAS_DOMAIN", + "WORDPRESS_SITE_URL", + ] + for needle in required_strings: + assert needle in content + + lowered = content.lower() + assert "/home/" not in content + assert "training" in lowered + assert "deployment" in lowered + assert "infer" in lowered + assert "not covered" in lowered + assert "Step 4" not in content + assert "Step 5" not in content +``` + +- [ ] **Step 2: Run the doc contract test and confirm it fails** + +Run: + +```bash +pytest rollout/tests/test_mcp_example_doc.py -v +``` + +Expected: FAIL because `examples/MCPAgent.md` does not exist yet. + +- [ ] **Step 3: Write `examples/MCPAgent.md` in DS-style structure** + +The guide must: + +- open with a DS-style three-step title, not a five-step title +- cover the six domains: `canvas`, `snowflake`, `woocommerce`, `yahoo_finance`, `youtube`, `train` +- explain that the example uses `configs/sandbox-server/mcp_config.json` +- include prerequisites for: + - `cd AgentFlow` + - `export OPENAI_API_KEY=...` + - `export OPENAI_API_URL=...` + - `export TOOLATHLON_GYM_ROOT=/path/to/toolathlon_gym` + - an already prepared and already running Toolathlon-GYM environment + - `node` and `uv` +- mention that the MCP server bundle is resolved from `${TOOLATHLON_GYM_ROOT}/local_servers` +- list the env override fields surfaced by `mcp_config.json` +- give one Step 1 command: + +```bash +./start_sandbox_server.sh --config configs/sandbox-server/mcp_config.json +``` + +- give Step 2 commands for all six synthesis configs +- give Step 3 commands for all six rollout configs +- explicitly say later training / deployment / infer are not covered yet +- keep explanations short and example-oriented; do not add internal orchestration advice about restarting sandboxes between domains + +- [ ] **Step 4: Run the MCP doc test and a targeted MCP suite** + +Run: + +```bash +pytest \ + 
rollout/tests/test_mcp_example_doc.py \ + synthesis/tests/test_mcp_example_synthesis_configs.py \ + rollout/tests/test_mcp_example_assets.py \ + -v +``` + +Expected: PASS. + +- [ ] **Step 5: Commit the MCP guide** + +```bash +git add rollout/tests/test_mcp_example_doc.py examples/MCPAgent.md +git commit -m "docs: add MCP example guide" +``` + +## Chunk 3: Coding Example Assets and Guide + +### Task 5: Add Coding config and asset tests first + +**Files:** +- Create: `synthesis/tests/test_code_example_synthesis_config.py` +- Create: `rollout/tests/test_code_example_assets.py` + +- [ ] **Step 1: Write the failing Coding synthesis-config test** + +Create `synthesis/tests/test_code_example_synthesis_config.py` with: + +```python +import json +from pathlib import Path + +from synthesis.core.config import SynthesisConfig + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def test_code_synthesis_config_contract(): + config_path = REPO_ROOT / "configs" / "synthesis" / "code_config.json" + raw = json.loads(config_path.read_text(encoding="utf-8")) + config = SynthesisConfig.from_json(str(config_path)) + + assert config.sandbox_config_path == "configs/sandbox-server/code_config.json" + assert config.sandbox_auto_start is False + assert config.resource_types == ["code"] + assert config.resource_init_configs == { + "code": { + "content": { + "source_dir": "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo" + } + } + } + assert config.available_tools == ["code-*"] + assert config.seeds_file == "seeds/code/seeds.jsonl" + assert raw["seed_description"] == "Coding demo repository prompts" + assert len(config.qa_examples) >= 2 + assert config.sampling_tips.strip() + assert config.synthesis_tips.strip() +``` + +- [ ] **Step 2: Write the failing Coding rollout-asset test** + +Create `rollout/tests/test_code_example_assets.py` with: + +```python +import json +from pathlib import Path + +from rollout.core.config import RolloutConfig + +REPO_ROOT = Path(__file__).resolve().parents[2] 
+ + +def _read_jsonl(path: Path): + return [ + json.loads(line) + for line in path.read_text(encoding="utf-8").splitlines() + if line.strip() + ] + + +def test_code_rollout_config_contract(): + config_path = REPO_ROOT / "configs" / "trajectory" / "code_trajectory.json" + config = RolloutConfig.from_json(str(config_path)) + + assert config.benchmark_name == "code_trajectory" + assert config.data_path == "benchmark/code_benchmark.jsonl" + assert config.available_tools == ["code-*"] + assert config.sandbox_config_path == "configs/sandbox-server/code_config.json" + assert config.sandbox_auto_start is False + assert config.resource_types == ["code"] + assert config.resource_init_configs == { + "code": { + "content": { + "source_dir": "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo" + } + } + } + assert config.evaluate_results is False + assert config.trajectory_only is True + assert config.save_trajectories is True + assert config.save_summary is False + + +def test_code_seed_file_contract(): + rows = _read_jsonl(REPO_ROOT / "seeds" / "code" / "seeds.jsonl") + + assert len(rows) == 2 + assert all(set(row.keys()) == {"content", "kwargs"} for row in rows) + assert all(isinstance(row["content"], str) and row["content"].strip() for row in rows) + assert all(row["kwargs"] == {} for row in rows) + + +def test_code_benchmark_contract(): + rows = _read_jsonl(REPO_ROOT / "benchmark" / "code_benchmark.jsonl") + + assert len(rows) == 2 + assert all({"id", "question", "answer"} <= set(row.keys()) for row in rows) + assert rows[0]["id"] == "code_read_001" + assert "metadata" not in rows[0] + assert rows[1]["id"] == "code_edit_001" + assert "tests/smoke_test.py" in rows[1]["question"] + assert rows[1]["answer"] == "smoke test passed" + assert rows[1]["metadata"] == { + "target_files": ["app.py"], + "check_command": "python tests/smoke_test.py", + } + assert all("/home/" not in json.dumps(row, ensure_ascii=False) for row in rows) + assert all("DataFlow" not in json.dumps(row, 
ensure_ascii=False) for row in rows) + + +def test_code_demo_repo_contract(): + repo_root = REPO_ROOT / "seeds" / "code" / "seed" / "demo_repo" + + required_paths = [ + repo_root / "README.md", + repo_root / "app.py", + repo_root / "config" / "app_config.json", + repo_root / "lib" / "helpers.py", + repo_root / "tests" / "smoke_test.py", + ] + for path in required_paths: + assert path.exists(), path + + smoke_test = (repo_root / "tests" / "smoke_test.py").read_text(encoding="utf-8") + assert "build_message" in smoke_test + assert "SMOKE_OK" in smoke_test +``` + +- [ ] **Step 3: Run the new Coding tests and verify they fail because the assets do not exist yet** + +Run: + +```bash +pytest \ + synthesis/tests/test_code_example_synthesis_config.py \ + rollout/tests/test_code_example_assets.py \ + -v +``` + +Expected: FAIL with missing-file errors for the new Coding config, seed, benchmark, and demo repo files. + +### Task 6: Create the Coding demo repo, seeds, benchmark, and configs + +**Files:** +- Create: `configs/synthesis/code_config.json` +- Create: `configs/trajectory/code_trajectory.json` +- Create: `seeds/code/seeds.jsonl` +- Create: `seeds/code/seed/demo_repo/README.md` +- Create: `seeds/code/seed/demo_repo/app.py` +- Create: `seeds/code/seed/demo_repo/config/app_config.json` +- Create: `seeds/code/seed/demo_repo/lib/helpers.py` +- Create: `seeds/code/seed/demo_repo/tests/smoke_test.py` +- Create: `benchmark/code_benchmark.jsonl` + +- [ ] **Step 1: Create the committed demo repo contents** + +Use these exact file contents. + +`seeds/code/seed/demo_repo/README.md` + +```md +# Coding Example Demo Repo + +This tiny repository is bundled for AgentFlow's CodingAgent example. + +- `app.py` builds a greeting string. +- `config/app_config.json` stores the expected name and suffix. +- `lib/helpers.py` contains the formatting helper. +- `tests/smoke_test.py` is the verification command used by the rollout example. 
+``` + +`seeds/code/seed/demo_repo/config/app_config.json` + +```json +{ + "default_name": "AgentFlow", + "suffix": "!" +} +``` + +`seeds/code/seed/demo_repo/lib/helpers.py` + +```python +def render_greeting(name: str, suffix: str) -> str: + return f"Hello, {name}{suffix}" +``` + +`seeds/code/seed/demo_repo/app.py` + +```python +import json +from pathlib import Path + +from lib.helpers import render_greeting + + +CONFIG_PATH = Path(__file__).parent / "config" / "app_config.json" + + +def load_config() -> dict: + return json.loads(CONFIG_PATH.read_text(encoding="utf-8")) + + +def build_message() -> str: + config = load_config() + return render_greeting(config["default_name"], "?") + + +if __name__ == "__main__": + print(build_message()) +``` + +`seeds/code/seed/demo_repo/tests/smoke_test.py` + +```python +from app import build_message + + +def main() -> None: + message = build_message() + assert message == "Hello, AgentFlow!", message + print("SMOKE_OK") + + +if __name__ == "__main__": + main() +``` + +Keep the `app.py` / `smoke_test.py` mismatch intentional here: the hard-coded `"?"` is the planned edit-task bug that the bundled Coding benchmark will exercise later. + +- [ ] **Step 2: Create the Coding seed and benchmark files** + +`seeds/code/seeds.jsonl` must contain exactly: + +```jsonl +{"content": "Inspect the demo repository and trace how the greeting is assembled from config and helper code.", "kwargs": {}} +{"content": "Look for a small repository bug that can be fixed with a minimal edit and validated with the committed smoke test.", "kwargs": {}} +``` + +`benchmark/code_benchmark.jsonl` must contain exactly two rows: + +```jsonl +{"id": "code_read_001", "question": "Use code tools to inspect the demo repository. What default name does the app greet? Reply with the name only.", "answer": "AgentFlow"} +{"id": "code_edit_001", "question": "Update the demo repository so `python tests/smoke_test.py` succeeds. 
Preserve the config-driven greeting behavior, verify the fix with that command, then reply with exactly `smoke test passed`.", "answer": "smoke test passed", "metadata": {"target_files": ["app.py"], "check_command": "python tests/smoke_test.py"}} +``` + +- [ ] **Step 3: Create the Coding synthesis config** + +Create `configs/synthesis/code_config.json` with this exact contract: + +```json +{ + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_depth": 10, + "branching_factor": 2, + "depth_threshold": 2, + "min_depth": 2, + "max_selected_traj": 1, + "path_similarity_threshold": 0.7, + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/code_config.json", + "resource_types": ["code"], + "resource_init_configs": { + "code": { + "content": { + "source_dir": "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo" + } + } + }, + "available_tools": ["code-*"], + "sampling_tips": [ + "Inspect the repository before proposing edits.", + "Use code-bash only for lightweight checks that fit the bundled demo repo." + ], + "synthesis_tips": [ + "Generate repo-grounded QA only.", + "Prefer file-path, function-behavior, and small edit-validation questions over open-ended design prompts." + ], + "qa_examples": [ + { + "question": "Which file stores the greeting suffix used by the demo app? Reply with the relative file path only.", + "answer": "config/app_config.json" + }, + { + "question": "What string does `build_message()` return before any edits? Reply with the exact string only.", + "answer": "Hello, AgentFlow?" 
+ } + ], + "seed_description": "Coding demo repository prompts", + "seeds_file": "seeds/code/seeds.jsonl", + "output_dir": "results/code" +} +``` + +- [ ] **Step 4: Create the Coding rollout config** + +Create `configs/trajectory/code_trajectory.json` with this exact contract: + +```json +{ + "benchmark_name": "code_trajectory", + "model_name": "openai/gpt-oss-120b", + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "max_turns": 12, + "available_tools": ["code-*"], + "sandbox_server_url": "http://127.0.0.1:18890", + "sandbox_auto_start": false, + "sandbox_config_path": "configs/sandbox-server/code_config.json", + "resource_types": ["code"], + "resource_init_configs": { + "code": { + "content": { + "source_dir": "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo" + } + } + }, + "system_prompt": [ + "You are a coding assistant working inside a small repository.", + "Inspect files before editing them.", + "When a task asks for verification, run the requested command inside the coding workspace before giving the final answer." + ], + "evaluate_results": false, + "data_path": "benchmark/code_benchmark.jsonl", + "output_dir": "trajectory_results/code", + "save_results": true, + "save_trajectories": true, + "trajectory_only": true, + "save_summary": false +} +``` + +- [ ] **Step 5: Run the Coding config and asset tests plus deterministic backend coverage** + +Run: + +```bash +pytest \ + synthesis/tests/test_code_example_synthesis_config.py \ + rollout/tests/test_code_example_assets.py \ + sandbox/tests/test_code_backend.py \ + sandbox/tests/test_code_tool_schemas.py \ + -v +``` + +Expected: PASS. + +- [ ] **Step 6: Defer the credential-dependent live Coding rollout to Chunk 4** + +Do not block this chunk on external credentials or a running sandbox. The required gate for Chunk 3 is the deterministic pytest suite from Step 5. Perform the representative live Coding rollout later using Chunk 4 Task 10. 
+ +- [ ] **Step 7: Commit the Coding assets** + +```bash +git add \ + synthesis/tests/test_code_example_synthesis_config.py \ + rollout/tests/test_code_example_assets.py \ + configs/synthesis/code_config.json \ + configs/trajectory/code_trajectory.json \ + seeds/code \ + benchmark/code_benchmark.jsonl +git commit -m "feat: add Coding example assets" +``` + +### Task 7: Add the Coding guide and lock the doc contract + +**Files:** +- Create: `rollout/tests/test_code_example_doc.py` +- Create: `examples/CodingAgent.md` + +- [ ] **Step 1: Write the failing Coding doc contract test** + +Create `rollout/tests/test_code_example_doc.py` with: + +```python +from pathlib import Path + + +REPO_ROOT = Path(__file__).resolve().parents[2] + + +def test_coding_example_doc_has_required_sections_and_repo_root_contract(): + content = (REPO_ROOT / "examples" / "CodingAgent.md").read_text(encoding="utf-8") + + required_sections = [ + "## Overview", + "## Prerequisites", + "## Pipeline Overview", + "## Step 1: Start the Sandbox Server", + "## Step 2: Synthesize QA Data", + "## Step 3: Synthesize Trajectory Data", + "## Configuration Reference", + "## FAQ", + ] + for section in required_sections: + assert section in content + + required_strings = [ + "cd AgentFlow", + "export AGENTFLOW_REPO_ROOT=$(pwd)", + "code-*", + "configs/sandbox-server/code_config.json", + "configs/synthesis/code_config.json", + "configs/trajectory/code_trajectory.json", + "benchmark/code_benchmark.jsonl", + "seeds/code/seeds.jsonl", + "seeds/code/seed/demo_repo", + "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo", + "source_dir", + "./start_sandbox_server.sh --config configs/sandbox-server/code_config.json", + "python tests/smoke_test.py", + "training / deployment / infer are not covered yet", + ] + for needle in required_strings: + assert needle in content + + assert "/home/a1/sdb/dxd/DataFlow" not in content + assert "DataFlow" not in content + assert "Step 4" not in content + assert "Step 5" not in content 
+``` + +- [ ] **Step 2: Run the Coding doc test and confirm it fails** + +Run: + +```bash +pytest rollout/tests/test_code_example_doc.py -v +``` + +Expected: FAIL because `examples/CodingAgent.md` does not exist yet. + +- [ ] **Step 3: Write `examples/CodingAgent.md` in DS-style structure** + +The guide must: + +- use a DS-style three-step title +- explain that CodingAgent uses the `code` backend’s six tools through `code-*` +- include prerequisites for: + - `cd AgentFlow` + - `export OPENAI_API_KEY=...` + - `export OPENAI_API_URL=...` + - `export AGENTFLOW_REPO_ROOT=$(pwd)` +- document the default committed repo path `${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo` +- say users can replace `resource_init_configs.code.content.source_dir` with their own repo path if desired +- use `configs/sandbox-server/code_config.json` for Step 1 +- use `configs/synthesis/code_config.json` for Step 2 +- use `configs/trajectory/code_trajectory.json` for Step 3 +- mention `benchmark/code_benchmark.jsonl` +- mention the bundled read-only plus edit-task example style +- explicitly say later training / deployment / infer are not covered yet +- keep the explanation example-oriented; do not mention the user’s local DataFlow path + +- [ ] **Step 4: Run the Coding doc test and the targeted Coding suite** + +Run: + +```bash +pytest \ + rollout/tests/test_code_example_doc.py \ + synthesis/tests/test_code_example_synthesis_config.py \ + rollout/tests/test_code_example_assets.py \ + sandbox/tests/test_code_backend.py \ + sandbox/tests/test_code_tool_schemas.py \ + -v +``` + +Expected: PASS. + +- [ ] **Step 5: Commit the Coding guide** + +```bash +git add rollout/tests/test_code_example_doc.py examples/CodingAgent.md +git commit -m "docs: add Coding example guide" +``` + +## Chunk 4: Final Verification and Local Dry Runs + +### Task 8: Run the required deterministic verification suite + +**Files:** +- No planned file changes. 
If any failures appear here, fix them in the owning chunk and create a normal corrective commit before reporting completion. + +- [ ] **Step 1: Run the full targeted example suite** + +Run: + +```bash +pytest \ + sandbox/tests/test_sandbox_config_loading.py \ + sandbox/tests/test_mcp_backend.py \ + sandbox/tests/test_mcp_client.py \ + sandbox/tests/test_code_backend.py \ + sandbox/tests/test_code_tool_schemas.py \ + synthesis/tests/test_mcp_example_synthesis_configs.py \ + synthesis/tests/test_code_example_synthesis_config.py \ + rollout/tests/test_mcp_example_assets.py \ + rollout/tests/test_mcp_example_doc.py \ + rollout/tests/test_code_example_assets.py \ + rollout/tests/test_code_example_doc.py \ + -v +``` + +Expected: PASS. + +- [ ] **Step 2: Record final status without creating a verification-only commit** + +If Step 1 is green, continue to Task 9 to finalize the MCP benchmark answers. If a verification failure requires code or doc fixes, make the fix in the owning chunk, rerun the affected tests, and create a normal corrective commit rather than a “verification only” commit. + +### Task 9: Finalize MCP benchmark answers against the prepared Toolathlon-GYM environment + +**Files:** +- Modify: `benchmark/mcp_canvas_benchmark.jsonl` +- Modify: `benchmark/mcp_snowflake_benchmark.jsonl` +- Modify: `benchmark/mcp_woocommerce_benchmark.jsonl` +- Modify: `benchmark/mcp_yahoo_finance_benchmark.jsonl` +- Modify: `benchmark/mcp_youtube_benchmark.jsonl` +- Modify: `benchmark/mcp_train_benchmark.jsonl` + +This task is required to convert the placeholder MCP benchmark answers from Chunk 2 into real benchmark answers. If `OPENAI_API_KEY`, `OPENAI_API_URL`, or the prepared Toolathlon-GYM environment are unavailable, stop here and report a blocker instead of inventing answers. 
+ +- [ ] **Step 1: Start the MCP sandbox server** + +Run in a dedicated terminal from the AgentFlow repo root: + +```bash +export TOOLATHLON_GYM_ROOT=/path/to/toolathlon_gym +./start_sandbox_server.sh --config configs/sandbox-server/mcp_config.json +``` + +Expected: the MCP sandbox stays running while Step 2 and Step 3 execute. + +- [ ] **Step 2: Run one MCP synthesis smoke check** + +Run: + +```bash +export TOOLATHLON_GYM_ROOT=/path/to/toolathlon_gym +export OPENAI_API_KEY=... +export OPENAI_API_URL=... +python synthesis/pipeline.py \ + --config configs/synthesis/mcp_canvas_config.json \ + --seeds seeds/mcp/canvas_seeds.jsonl \ + --output-dir /tmp/agentflow-mcp-canvas-synth +``` + +Expected: the command starts successfully with the checked-in MCP config. If the current synthesis pipeline still writes to the repo’s shared aggregation directory, confirm that new QA / trajectory rows appear there; otherwise confirm output appears under `/tmp/agentflow-mcp-canvas-synth/`. + +- [ ] **Step 3: Run the six MCP rollout configs and transcribe grounded answers** + +Run: + +```bash +export TOOLATHLON_GYM_ROOT=/path/to/toolathlon_gym +export OPENAI_API_KEY=... +export OPENAI_API_URL=... 
+python -m rollout.pipeline --config configs/trajectory/mcp_canvas_trajectory.json --output-dir /tmp/agentflow-mcp-canvas-check
+python -m rollout.pipeline --config configs/trajectory/mcp_snowflake_trajectory.json --output-dir /tmp/agentflow-mcp-snowflake-check
+python -m rollout.pipeline --config configs/trajectory/mcp_woocommerce_trajectory.json --output-dir /tmp/agentflow-mcp-woocommerce-check
+python -m rollout.pipeline --config configs/trajectory/mcp_yahoo_finance_trajectory.json --output-dir /tmp/agentflow-mcp-yahoo-finance-check
+python -m rollout.pipeline --config configs/trajectory/mcp_youtube_trajectory.json --output-dir /tmp/agentflow-mcp-youtube-check
+python -m rollout.pipeline --config configs/trajectory/mcp_train_trajectory.json --output-dir /tmp/agentflow-mcp-train-check
+```
+
+For each domain:
+
+- open the newest results JSONL under the matching `/tmp/agentflow-mcp-<domain>-check/` directory
+- for each task, verify the candidate answer against the saved trajectory’s MCP tool output (`trajectory.messages` tool entries and/or `trajectory.tool_calls[*].result`) rather than trusting only `predicted_answer`
+- copy the tool-supported final textual answers into the corresponding two rows in `benchmark/mcp_<domain>_benchmark.jsonl`
+
+Expected: all six rollout commands start successfully, each output directory contains a results JSONL file, and every committed MCP benchmark answer is backed by observed tool output. 
+ +- [ ] **Step 4: Validate that all MCP benchmark answers are now populated** + +Run: + +```bash +python - <<'PY' +import json +from pathlib import Path + +for path in sorted(Path("benchmark").glob("mcp_*_benchmark.jsonl")): + rows = [ + json.loads(line) + for line in path.read_text(encoding="utf-8").splitlines() + if line.strip() + ] + assert rows, path + assert all(isinstance(row["answer"], str) and row["answer"].strip() for row in rows), path +print("MCP benchmark answers verified") +PY +``` + +Expected: the script prints `MCP benchmark answers verified`. + +- [ ] **Step 5: Re-run the MCP asset contract test after editing the benchmark files** + +Run: + +```bash +pytest rollout/tests/test_mcp_example_assets.py -v +``` + +Expected: PASS. + +- [ ] **Step 6: Commit the grounded MCP benchmark answers** + +```bash +git add \ + benchmark/mcp_canvas_benchmark.jsonl \ + benchmark/mcp_snowflake_benchmark.jsonl \ + benchmark/mcp_woocommerce_benchmark.jsonl \ + benchmark/mcp_yahoo_finance_benchmark.jsonl \ + benchmark/mcp_youtube_benchmark.jsonl \ + benchmark/mcp_train_benchmark.jsonl +git commit -m "test: ground MCP benchmark answers" +``` + +### Task 10: Run optional Coding live smoke checks when LLM credentials are available + +**Files:** +- No planned file changes. Skip this task if `OPENAI_API_KEY` or `OPENAI_API_URL` are unavailable. Record the skip reason in the execution report. + +- [ ] **Step 1: Start the Code sandbox server** + +Run in a dedicated terminal from the AgentFlow repo root: + +```bash +./start_sandbox_server.sh --config configs/sandbox-server/code_config.json +``` + +Expected: the Code sandbox stays running while Step 2-4 execute. + +- [ ] **Step 2: Run one Coding synthesis smoke check with the bundled repo** + +Run: + +```bash +# from the AgentFlow repo root +export OPENAI_API_KEY=... +export OPENAI_API_URL=... 
+export AGENTFLOW_REPO_ROOT=$(pwd) +python synthesis/pipeline.py \ + --config configs/synthesis/code_config.json \ + --seeds seeds/code/seeds.jsonl \ + --output-dir /tmp/agentflow-code-synth-check +``` + +Expected: the command starts successfully with the checked-in Coding synthesis config. If the current synthesis pipeline still writes to the repo’s shared aggregation directory, confirm that new QA / trajectory rows appear there; otherwise confirm output appears under `/tmp/agentflow-code-synth-check/`. + +- [ ] **Step 3: Run one Coding rollout smoke check with the bundled repo** + +Run: + +```bash +# from the AgentFlow repo root +export OPENAI_API_KEY=... +export OPENAI_API_URL=... +export AGENTFLOW_REPO_ROOT=$(pwd) +python -m rollout.pipeline \ + --config configs/trajectory/code_trajectory.json \ + --task-ids code_edit_001 \ + --output-dir /tmp/agentflow-code-final-check +``` + +Expected: the rollout starts successfully, copies the bundled demo repo into the code workspace, and writes results JSONL under `/tmp/agentflow-code-final-check/`. + +- [ ] **Step 4: Optionally prove the documented `source_dir` override path works with another local repo** + +Run: + +```bash +# from the AgentFlow repo root +export OPENAI_API_KEY=... +export OPENAI_API_URL=... 
+export AGENTFLOW_REPO_ROOT=$(pwd) +export LOCAL_CODE_REPO=/abs/path/to/local/repo +python - <<'PY' +import json +import os +from pathlib import Path + +src = Path("configs/trajectory/code_trajectory.json") +dst = Path("/tmp/code_trajectory_local_repo.json") +payload = json.loads(src.read_text(encoding="utf-8")) +payload["resource_init_configs"]["code"]["content"]["source_dir"] = os.environ["LOCAL_CODE_REPO"] +payload["data_path"] = "/tmp/code_override_benchmark.jsonl" +dst.write_text(json.dumps(payload, indent=2), encoding="utf-8") +Path("/tmp/code_override_benchmark.jsonl").write_text( + json.dumps( + { + "id": "code_override_read_001", + "question": "Use code tools to inspect the repository and reply with the relative path of any one file located at the repository root.", + } + ) + + "\n", + encoding="utf-8", +) +print(dst) +PY +python -m rollout.pipeline \ + --config /tmp/code_trajectory_local_repo.json \ + --max-tasks 1 \ + --output-dir /tmp/agentflow-code-local-repo-check +``` + +Expected: the override-repo rollout starts successfully against the temporary generic benchmark and writes a result file, proving the documented `source_dir` override works without hard-coding any machine-local repo path into committed assets. + +Plan complete and saved to `docs/superpowers/plans/2026-04-20-mcp-and-coding-examples-implementation.md`. Ready to execute? diff --git a/examples/CodingAgent.md b/examples/CodingAgent.md index 2009751..8d754c2 100644 --- a/examples/CodingAgent.md +++ b/examples/CodingAgent.md @@ -77,6 +77,11 @@ python3 synthesis/pipeline.py \ --output-dir results/code ``` +> Note: in this repo, synthesis currently writes to the fixed aggregation directory `results/ds_synthesized_qa/`, even if you provide a different `--output-dir`. 
+ +- QA: `results/ds_synthesized_qa/synthesized_qa.jsonl` +- Trajectory: `results/ds_synthesized_qa/trajectories.jsonl` + By default, the synthesis config initializes the code resource from `${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo` through `resource_init_configs.code.content.source_dir`. The committed prompts are designed around the bundled demo repository and support both repository-reading questions and a lightweight edit-validation workflow. diff --git a/examples/MCPAgent.md b/examples/MCPAgent.md index 62c42cd..6c48595 100644 --- a/examples/MCPAgent.md +++ b/examples/MCPAgent.md @@ -95,6 +95,11 @@ python3 synthesis/pipeline.py \ --output-dir results/mcp_train ``` +> Note: in this repo, synthesis currently writes to the fixed aggregation directory `results/ds_synthesized_qa/`, even if you provide a different `--output-dir`. + +- QA: `results/ds_synthesized_qa/synthesized_qa.jsonl` +- Trajectory: `results/ds_synthesized_qa/trajectories.jsonl` + These runs synthesize QA pairs and save the corresponding tool-use traces for the selected MCP domain. 
## Step 3: Synthesize Trajectory Data diff --git a/rollout/core/config.py b/rollout/core/config.py index b67ca75..ac8c532 100644 --- a/rollout/core/config.py +++ b/rollout/core/config.py @@ -8,6 +8,8 @@ from typing import Dict, List, Any, Optional from dataclasses import dataclass, field, fields +from sandbox.server.config_loader import expand_env_vars + # Optional yaml support yaml = None try: @@ -84,6 +86,7 @@ def from_dict(cls, config_dict: Dict[str, Any]) -> 'RolloutConfig': valid_fields = {f.name for f in fields(cls)} filtered = {k: v for k, v in config_dict.items() if k in valid_fields} + filtered = expand_env_vars(filtered) # Normalize text fields (allow list[str] for easier editing) def _normalize_text_field(v: Any) -> str: diff --git a/rollout/tests/test_code_example_assets.py b/rollout/tests/test_code_example_assets.py index 7019992..87200e1 100644 --- a/rollout/tests/test_code_example_assets.py +++ b/rollout/tests/test_code_example_assets.py @@ -17,8 +17,9 @@ def _read_jsonl(path: Path): ] -def test_code_rollout_config_contract(): +def test_code_rollout_config_contract_expands_repo_root_when_set(monkeypatch): config_path = REPO_ROOT / "configs" / "trajectory" / "code_trajectory.json" + monkeypatch.setenv("AGENTFLOW_REPO_ROOT", str(REPO_ROOT)) config = RolloutConfig.from_json(str(config_path)) assert config.benchmark_name == "code_trajectory" @@ -30,7 +31,7 @@ def test_code_rollout_config_contract(): assert config.resource_init_configs == { "code": { "content": { - "source_dir": "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo" + "source_dir": f"{REPO_ROOT}/seeds/code/seed/demo_repo" } } } @@ -40,6 +41,55 @@ def test_code_rollout_config_contract(): assert config.save_summary is False +def test_code_rollout_config_preserves_placeholder_when_repo_root_unset(monkeypatch): + config_path = REPO_ROOT / "configs" / "trajectory" / "code_trajectory.json" + monkeypatch.delenv("AGENTFLOW_REPO_ROOT", raising=False) + + config = 
RolloutConfig.from_json(str(config_path)) + + assert config.resource_init_configs == { + "code": { + "content": { + "source_dir": "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo" + } + } + } + + +def test_rollout_config_from_dict_expands_nested_env_values(monkeypatch): + monkeypatch.setenv("CODE_ROOT", "/tmp/demo") + monkeypatch.delenv("UNSET_VALUE", raising=False) + + config = RolloutConfig.from_dict( + { + "resource_init_configs": { + "code": { + "content": { + "source_dir": "${CODE_ROOT}/repo", + "fallback_dir": "${UNSET_VALUE:-/tmp/fallback}", + "preserved_dir": "${UNSET_VALUE}/repo", + "artifacts": [ + "${CODE_ROOT}/one", + "${UNSET_VALUE:-/tmp/two}", + "${UNSET_VALUE}/three", + ], + } + } + } + } + ) + + content = config.resource_init_configs["code"]["content"] + assert content["source_dir"] == "/tmp/demo/repo" + assert content["fallback_dir"] == "/tmp/fallback" + assert content["preserved_dir"] == "${UNSET_VALUE}/repo" + assert content["artifacts"] == [ + "/tmp/demo/one", + "/tmp/two", + "${UNSET_VALUE}/three", + ] + + def test_code_seed_file_contract(): rows = _read_jsonl(REPO_ROOT / "seeds" / "code" / "seeds.jsonl") diff --git a/rollout/tests/test_code_example_doc.py b/rollout/tests/test_code_example_doc.py index fcaa367..b4b470d 100644 --- a/rollout/tests/test_code_example_doc.py +++ b/rollout/tests/test_code_example_doc.py @@ -4,6 +4,12 @@ REPO_ROOT = Path(__file__).resolve().parents[2] +def _get_section(content: str, heading: str, next_heading: str) -> str: + start = content.index(heading) + end = content.index(next_heading, start) + return content[start:end] + + def test_coding_example_doc_has_required_sections_and_repo_root_contract(): content = (REPO_ROOT / "examples" / "CodingAgent.md").read_text(encoding="utf-8") @@ -42,6 +48,31 @@ def test_coding_example_doc_has_required_sections_and_repo_root_contract(): for needle in required_strings: assert needle in content + step_2_content = _get_section( + content, + "## Step 2: Synthesize QA Data", + "## 
Step 3: Synthesize Trajectory Data", + ) + + step_2_requirements = [ + "configs/synthesis/code_config.json", + "results/code", + "results/ds_synthesized_qa/", + "results/ds_synthesized_qa/synthesized_qa.jsonl", + "results/ds_synthesized_qa/trajectories.jsonl", + "> Note: in this repo, synthesis currently writes to the fixed aggregation directory `results/ds_synthesized_qa/`, even if you provide a different `--output-dir`.", + ] + for needle in step_2_requirements: + assert needle in step_2_content + + expected_command = """python3 synthesis/pipeline.py \\ + --config configs/synthesis/code_config.json \\ + --seeds seeds/code/seeds.jsonl \\ + --output-dir results/code""" + assert expected_command in step_2_content + + assert "### Output files" not in step_2_content + assert "/home/a1/sdb/dxd/DataFlow" not in content assert "DataFlow" not in content assert "Step 4" not in content diff --git a/rollout/tests/test_mcp_example_assets.py b/rollout/tests/test_mcp_example_assets.py index 9308fbc..b330d6a 100644 --- a/rollout/tests/test_mcp_example_assets.py +++ b/rollout/tests/test_mcp_example_assets.py @@ -6,6 +6,8 @@ from rollout.core.config import RolloutConfig REPO_ROOT = Path(__file__).resolve().parents[2] +OPENAI_API_KEY = "secret" +OPENAI_API_URL = "https://example.test/v1" EXPECTED = { "canvas": { "tools": ["mcp:canvas.*", "mcp:filesystem.*"], @@ -52,9 +54,27 @@ def _read_jsonl(path: Path): ] +def _set_openai_env(monkeypatch: pytest.MonkeyPatch, enabled: bool) -> None: + if enabled: + monkeypatch.setenv("OPENAI_API_KEY", OPENAI_API_KEY) + monkeypatch.setenv("OPENAI_API_URL", OPENAI_API_URL) + return + + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + monkeypatch.delenv("OPENAI_API_URL", raising=False) + + +def _expected_openai_value(enabled: bool, env_value: str, placeholder: str) -> str: + if enabled: + return env_value + return placeholder + + +@pytest.mark.parametrize("env_enabled", [False, True], ids=["env-unset", "env-set"]) 
@pytest.mark.parametrize("domain", sorted(EXPECTED)) -def test_mcp_rollout_config_contract(domain): +def test_mcp_rollout_config_contract(domain, env_enabled, monkeypatch): expected = EXPECTED[domain] + _set_openai_env(monkeypatch, env_enabled) config_path = REPO_ROOT / "configs" / "trajectory" / f"mcp_{domain}_trajectory.json" raw = json.loads(config_path.read_text(encoding="utf-8")) config = RolloutConfig.from_json(str(config_path)) @@ -63,8 +83,12 @@ def test_mcp_rollout_config_contract(domain): assert config.benchmark_name == expected["benchmark_name"] assert config.data_path == expected["benchmark"] assert config.model_name == "openai/gpt-oss-120b" - assert config.api_key == "${OPENAI_API_KEY}" - assert config.base_url == "${OPENAI_API_URL}" + assert config.api_key == _expected_openai_value( + env_enabled, OPENAI_API_KEY, "${OPENAI_API_KEY}" + ) + assert config.base_url == _expected_openai_value( + env_enabled, OPENAI_API_URL, "${OPENAI_API_URL}" + ) assert config.max_turns == 20 assert config.available_tools == expected["tools"] assert config.sandbox_config_path == "configs/sandbox-server/mcp_config.json" @@ -82,6 +106,31 @@ def test_mcp_rollout_config_contract(domain): assert config.save_trajectories is True +@pytest.mark.parametrize("env_enabled", [False, True], ids=["env-unset", "env-set"]) +def test_mcp_rollout_config_ignores_unknown_env_placeholders( + caplog, monkeypatch, env_enabled +): + _set_openai_env(monkeypatch, env_enabled) + monkeypatch.delenv("IGNORED_ROLLOUT_VAR", raising=False) + + with caplog.at_level("WARNING", logger="ConfigLoader"): + config = RolloutConfig.from_dict( + { + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "unknown_field": "${IGNORED_ROLLOUT_VAR}", + } + ) + + assert config.api_key == _expected_openai_value( + env_enabled, OPENAI_API_KEY, "${OPENAI_API_KEY}" + ) + assert config.base_url == _expected_openai_value( + env_enabled, OPENAI_API_URL, "${OPENAI_API_URL}" + ) + assert "IGNORED_ROLLOUT_VAR" not 
in caplog.text + + @pytest.mark.parametrize("domain", sorted(EXPECTED)) def test_mcp_seed_files_are_two_row_jsonl(domain): seed_path = REPO_ROOT / "seeds" / "mcp" / f"{domain}_seeds.jsonl" diff --git a/rollout/tests/test_mcp_example_doc.py b/rollout/tests/test_mcp_example_doc.py index f7efa4b..fef80be 100644 --- a/rollout/tests/test_mcp_example_doc.py +++ b/rollout/tests/test_mcp_example_doc.py @@ -4,6 +4,12 @@ REPO_ROOT = Path(__file__).resolve().parents[2] +def _get_section(content: str, heading: str, next_heading: str) -> str: + start = content.index(heading) + end = content.index(next_heading, start) + return content[start:end] + + def test_mcp_example_doc_has_required_sections_and_exact_prerequisite_contract(): content = (REPO_ROOT / "examples" / "MCPAgent.md").read_text(encoding="utf-8") @@ -50,6 +56,58 @@ def test_mcp_example_doc_has_required_sections_and_exact_prerequisite_contract() for needle in required_strings: assert needle in content + step_2_content = _get_section( + content, + "## Step 2: Synthesize QA Data", + "## Step 3: Synthesize Trajectory Data", + ) + + step_2_requirements = [ + "configs/synthesis/mcp_canvas_config.json", + "configs/synthesis/mcp_snowflake_config.json", + "configs/synthesis/mcp_woocommerce_config.json", + "configs/synthesis/mcp_yahoo_finance_config.json", + "configs/synthesis/mcp_youtube_config.json", + "configs/synthesis/mcp_train_config.json", + "results/ds_synthesized_qa/", + "results/ds_synthesized_qa/synthesized_qa.jsonl", + "results/ds_synthesized_qa/trajectories.jsonl", + "> Note: in this repo, synthesis currently writes to the fixed aggregation directory `results/ds_synthesized_qa/`, even if you provide a different `--output-dir`.", + ] + for needle in step_2_requirements: + assert needle in step_2_content + + expected_commands = [ + """python3 synthesis/pipeline.py \\ + --config configs/synthesis/mcp_canvas_config.json \\ + --seeds seeds/mcp/canvas_seeds.jsonl \\ + --output-dir results/mcp_canvas""", + """python3 
synthesis/pipeline.py \\ + --config configs/synthesis/mcp_snowflake_config.json \\ + --seeds seeds/mcp/snowflake_seeds.jsonl \\ + --output-dir results/mcp_snowflake""", + """python3 synthesis/pipeline.py \\ + --config configs/synthesis/mcp_woocommerce_config.json \\ + --seeds seeds/mcp/woocommerce_seeds.jsonl \\ + --output-dir results/mcp_woocommerce""", + """python3 synthesis/pipeline.py \\ + --config configs/synthesis/mcp_yahoo_finance_config.json \\ + --seeds seeds/mcp/yahoo_finance_seeds.jsonl \\ + --output-dir results/mcp_yahoo_finance""", + """python3 synthesis/pipeline.py \\ + --config configs/synthesis/mcp_youtube_config.json \\ + --seeds seeds/mcp/youtube_seeds.jsonl \\ + --output-dir results/mcp_youtube""", + """python3 synthesis/pipeline.py \\ + --config configs/synthesis/mcp_train_config.json \\ + --seeds seeds/mcp/train_seeds.jsonl \\ + --output-dir results/mcp_train""", + ] + for command in expected_commands: + assert command in step_2_content + + assert "### Output files" not in step_2_content + lowered = content.lower() assert "/home/" not in content assert "training" in lowered diff --git a/sandbox/result_formatter.py b/sandbox/result_formatter.py index e134433..30b8dfe 100644 --- a/sandbox/result_formatter.py +++ b/sandbox/result_formatter.py @@ -251,6 +251,9 @@ def to_str(self, verbose: bool = False) -> str: error_msg = self.metadata.get("message", "Code execution failed") return f"[Error] {error_msg}" + if isinstance(self.raw_data, str): + return self.raw_data + stdout = self.raw_data.get("stdout", "") stderr = self.raw_data.get("stderr", "") return_code = self.raw_data.get("return_code", 0) @@ -359,6 +362,56 @@ def to_str(self, verbose: bool = False) -> str: return json.dumps(self.raw_data, indent=2, ensure_ascii=False) +# ============================================================================ +# MCP tool result. 
+# ============================================================================ + +class MCPResult(ToolResult): + """MCP tool result.""" + + def to_str(self, verbose: bool = False) -> str: + del verbose + + if not self.success: + error_msg = self.metadata.get("message", "MCP tool execution failed") + return f"[Error] {error_msg}" + + if isinstance(self.raw_data, str): + return self.raw_data + + content = self.raw_data.get("content", []) + if not isinstance(content, list): + return json.dumps(self.raw_data, indent=2, ensure_ascii=False) + + lines = [] + has_text_content = False + for item in content: + if isinstance(item, dict): + if item.get("type") == "text": + text = str(item.get("text", "")) + lines.append(text) + if text.strip(): + has_text_content = True + else: + item_type = item.get("type", "content") + lines.append(f"[{item_type} content]") + else: + text = str(item) + lines.append(text) + if text.strip(): + has_text_content = True + + rendered_content = "\n".join(lines) + if has_text_content: + return rendered_content + + structured_content = self.raw_data.get("structuredContent") + if structured_content is not None: + return json.dumps(structured_content, indent=2, ensure_ascii=False) + + return rendered_content + + # ============================================================================ # Web search tool result. 
# ============================================================================ @@ -724,6 +777,7 @@ class ResultFormatter: "vm": VMResult, "doc": DocResult, "ds": DocResult, + "mcp": MCPResult, } @classmethod diff --git a/sandbox/server/backends/resources/code_vendor/file_tools.py b/sandbox/server/backends/resources/code_vendor/file_tools.py index 1a61ff2..dd47f0f 100644 --- a/sandbox/server/backends/resources/code_vendor/file_tools.py +++ b/sandbox/server/backends/resources/code_vendor/file_tools.py @@ -89,6 +89,10 @@ def _format_command_error(tool_name: str, returncode: int, stdout: str, stderr: return f"{summary}\n{details}" +def _resolve_search_base(args: dict[str, Any], ctx: Any) -> Path: + return Path(args.get("path") or ctx.cwd) + + class ReadTool(Tool): name = "Read" description = "Read a file and return its contents with line numbers." @@ -141,7 +145,7 @@ def input_schema(self) -> dict[str, Any]: } async def call(self, args: dict[str, Any], ctx: Any) -> str: - base = Path(args.get("path", ctx.cwd)) + base = _resolve_search_base(args, ctx) pattern = args["pattern"] matches = sorted(base.glob(pattern)) return "\n".join(str(match) for match in matches) or "(no matches)" @@ -168,7 +172,7 @@ def input_schema(self) -> dict[str, Any]: } async def call(self, args: dict[str, Any], ctx: Any) -> str: - base = Path(args.get("path", ctx.cwd)) + base = _resolve_search_base(args, ctx) cmd = ["grep", "-r", "-n"] if "glob" in args: cmd += ["--include", args["glob"]] diff --git a/sandbox/server/backends/resources/mcp/client.py b/sandbox/server/backends/resources/mcp/client.py index da83acc..7171c3f 100644 --- a/sandbox/server/backends/resources/mcp/client.py +++ b/sandbox/server/backends/resources/mcp/client.py @@ -17,6 +17,10 @@ _PLACEHOLDER_PATTERN = re.compile(r"\$\{([^}]+)\}") _SUPPORTED_PLACEHOLDERS = {"local_servers_paths", "agent_workspace", "task_dir"} _BUNDLED_CONFIG_DIR = Path(__file__).parent / "configs" +_BUNDLED_PYTHON_SERVER_FALLBACKS = { + "yahoo-finance": 
("yahoo-finance-mcp", "server.py"), + "youtube-transcript": ("mcp-youtube-transcript", "run_server.py"), +} @dataclass @@ -80,6 +84,65 @@ def build_server_env( return {k: str(v) for k, v in merged.items() if v is not None} +def discover_mcp_config_dir(mcp_servers_path: str | Path | None) -> Path | None: + if not mcp_servers_path: + return None + + servers_path = Path(mcp_servers_path) + if servers_path.name != "local_servers" or not servers_path.is_dir(): + return None + + candidate = servers_path.parent / "configs" / "mcp_servers" + if candidate.is_dir(): + return candidate + return None + + +def _is_usable_executable(path: str) -> bool: + if not path: + return False + + executable = Path(path) + return executable.is_file() and os.access(executable, os.X_OK) + + +def _is_toolathlon_config_dir(config_dir: Path, mcp_servers_path: str | Path | None) -> bool: + resolved_config_dir = config_dir.resolve() + if resolved_config_dir == _BUNDLED_CONFIG_DIR.resolve(): + return True + + discovered_config_dir = discover_mcp_config_dir(mcp_servers_path) + if discovered_config_dir is None: + return False + + return resolved_config_dir == discovered_config_dir.resolve() + + +def _apply_bundled_python_server_fallback( + *, + server_name: str, + config_dir: Path, + command: str, + args: list[str], + cwd: str, + local_servers_path: str, +) -> tuple[str, list[str], str]: + fallback = _BUNDLED_PYTHON_SERVER_FALLBACKS.get(server_name) + if fallback is None or not _is_toolathlon_config_dir(config_dir, local_servers_path): + return command, args, cwd + + if ".venv/bin/python3" not in command or _is_usable_executable(command): + return command, args, cwd + + project_dir_name, entrypoint = fallback + project_dir = Path(local_servers_path) / project_dir_name if local_servers_path else Path(cwd) + if not (project_dir / entrypoint).is_file(): + return command, args, cwd + + project_dir_str = str(project_dir) + return "uv", ["--directory", project_dir_str, "run", "python", entrypoint], 
project_dir_str + + def load_mcp_process_config( *, server_name: str, @@ -99,6 +162,9 @@ def load_mcp_process_config( if mcp_servers_path is None: mcp_servers_path = str(toolathlon_root / "local_servers") + if config_dir is None: + config_dir = discover_mcp_config_dir(mcp_servers_path) + config_path = Path(config_dir) if config_dir else _BUNDLED_CONFIG_DIR if not config_path.exists(): raise FileNotFoundError(f"MCP config dir not found: {config_path}") @@ -155,7 +221,22 @@ def load_mcp_process_config( cwd_value = resolve(params.get("cwd", agent_workspace)) runtime_env = dict(os.environ) if process_env is None else dict(process_env) + command, args, cwd_value = _apply_bundled_python_server_fallback( + server_name=server_name, + config_dir=config_path, + command=command, + args=args, + cwd=cwd_value, + local_servers_path=local_servers_path, + ) + full_env = build_server_env(yaml_env=env_values, process_env=runtime_env) + if ( + command == "uv" + and "UV_CACHE_DIR" not in full_env + and _is_toolathlon_config_dir(config_path, mcp_servers_path) + ): + full_env["UV_CACHE_DIR"] = str(Path(workspace) / ".cache" / "uv") timeout_seconds = float(cfg.get("client_session_timeout_seconds", 60.0)) diff --git a/sandbox/server/backends/resources/mcp/configs/12306.yaml b/sandbox/server/backends/resources/mcp/configs/12306.yaml index a969de2..fcc9c16 100644 --- a/sandbox/server/backends/resources/mcp/configs/12306.yaml +++ b/sandbox/server/backends/resources/mcp/configs/12306.yaml @@ -1,19 +1,19 @@ -# 12306 China Railway Ticket Query System +# 12306 China Railway Ticket Query System (PostgreSQL-backed, no real API calls) # Source: https://github.com/Joooook/12306-mcp -# Toolathlon Version: https://github.com/lockon-n/12306-mcp -# Mock: PG-backed local server (replaces real kyfw.12306.cn API) +# Toolathlon Version: local pg-backed version in local_mcp_servers_copy/12306-mcp +# Schema: train (stations, trains, train_seats, train_routes) +# Toolathlon_GYM: ../../local_servers/12306-mcp 
type: stdio name: rail_12306 params: command: node args: - - "${local_servers_paths}/12306-mcp/environment/build/index.js" - cwd: "${agent_workspace}" + - "${local_servers_paths}/12306-mcp/build/index.js" env: - PG_HOST: "${config.pg_host}" - PG_PORT: "5434" - PG_DATABASE: "${config.pg_database}" - PG_USER: "eigent" - PG_PASSWORD: "camel" + PG_HOST: "postgres" + PG_PORT: "5432" + PG_DATABASE: "toolathlon" + PG_USER: "postgres" + PG_PASSWORD: "postgres" cache_tools_list: true -client_session_timeout_seconds: 60 +client_session_timeout_seconds: 20 diff --git a/sandbox/server/backends/resources/mcp/configs/canvas.yaml b/sandbox/server/backends/resources/mcp/configs/canvas.yaml index f3c2ffc..8d66f72 100644 --- a/sandbox/server/backends/resources/mcp/configs/canvas.yaml +++ b/sandbox/server/backends/resources/mcp/configs/canvas.yaml @@ -1,23 +1,20 @@ # canvas # Source: https://github.com/DMontgomery40/mcp-canvas-lms # Toolathlon Version: https://github.com/lockon-n/mcp-canvas-lms -# Mock: PG-backed local server +# Toolathlon_GYM: ../../local_servers/mcp-canvas-lms type: stdio name: canvas params: command: node args: - - "${local_servers_paths}/mcp-canvas-lms/environment/build/index.js" + - "${local_servers_paths}/mcp-canvas-lms/build/index.js" env: - CANVAS_API_TOKEN: "${token.canvas_api_token}" - CANVAS_STUDENT_EMAIL: "${token.canvas_student_email}" + CANVAS_API_TOKEN: "placeholder" CANVAS_DOMAIN: "localhost:8080" + # skip self-signed certificate detection, set NODE_TLS_REJECT_UNAUTHORIZED=0 NODE_TLS_REJECT_UNAUTHORIZED: "0" - PG_HOST: "${config.pg_host}" - PG_PORT: "5434" - PG_DATABASE: "${config.pg_database}" - PG_USER: "eigent" - PG_PASSWORD: "camel" + # if you don't need to load custom CA certificates, you can comment out the following line + # NODE_EXTRA_CA_CERTS: "deployment/canvas/logs/cert.pem" cwd: "${agent_workspace}" -client_session_timeout_seconds: 60 +client_session_timeout_seconds: 10 cache_tools_list: true diff --git 
a/sandbox/server/backends/resources/mcp/configs/filesystem.yaml b/sandbox/server/backends/resources/mcp/configs/filesystem.yaml index 077ffb9..f9d43b1 100644 --- a/sandbox/server/backends/resources/mcp/configs/filesystem.yaml +++ b/sandbox/server/backends/resources/mcp/configs/filesystem.yaml @@ -1,13 +1,14 @@ # file system server - for reading and writing files # Source: https://github.com/modelcontextprotocol/servers/tree/main/src/filesystem -# Mock: local server +# Toolathlon Version: The same +# Toolathlon_GYM: ../../local_servers/filesystem type: stdio name: filesystem params: command: node args: - - "${local_servers_paths}/filesystem/environment/dist/index.js" + - "${local_servers_paths}/filesystem/dist/index.js" - "${agent_workspace}" cwd: "${agent_workspace}" -client_session_timeout_seconds: 900 +client_session_timeout_seconds: 300 cache_tools_list: true diff --git a/sandbox/server/backends/resources/mcp/configs/snowflake.yaml b/sandbox/server/backends/resources/mcp/configs/snowflake.yaml index 2daa5f6..81f20ba 100644 --- a/sandbox/server/backends/resources/mcp/configs/snowflake.yaml +++ b/sandbox/server/backends/resources/mcp/configs/snowflake.yaml @@ -1,14 +1,14 @@ # snowflake # Source: https://github.com/isaacwasserman/mcp-snowflake-server # Toolathlon Version: https://github.com/lockon-n/mcp-snowflake-server -# Mock: PG-backed local server +# Toolathlon_GYM: ../../local_servers/mcp-snowflake-server type: stdio name: snowflake params: command: uv args: - "--directory" - - "${local_servers_paths}/mcp-snowflake-server/environment" + - "${local_servers_paths}/mcp-snowflake-server" - "run" - "mcp_snowflake_server" - "--account" @@ -19,20 +19,20 @@ params: - "eigent" - "--private_key_path" - "" + - "--role" + - "PUBLIC" - "--database" - - "HR_ANALYTICS" + - "toolathlon_gym" - "--schema" - "sf" - - "--allowed_databases" - - "HR_ANALYTICS,SUPPORT_CENTER,SALES_DW" - "--allow_write" - "--exclude-json-results" env: - PG_HOST: "${config.pg_host}" - PG_PORT: "5434" 
- PG_DATABASE: "${config.pg_database}" PG_USER: "eigent" PG_PASSWORD: "camel" + PG_HOST: "toolathlon_pg" + PG_PORT: "5432" + PG_DATABASE: "toolathlon_gym" cwd: "${agent_workspace}" client_session_timeout_seconds: 120 cache_tools_list: true diff --git a/sandbox/server/backends/resources/mcp/configs/woocommerce.yaml b/sandbox/server/backends/resources/mcp/configs/woocommerce.yaml index eacd62c..883add3 100644 --- a/sandbox/server/backends/resources/mcp/configs/woocommerce.yaml +++ b/sandbox/server/backends/resources/mcp/configs/woocommerce.yaml @@ -1,21 +1,17 @@ # WooCommerce MCP Server # Source: https://github.com/lockon-n/woocommerce-mcp -# Mock: PG-backed local server +# Toolathlon Version: The same +# Toolathlon_GYM: ../../local_servers/woocommerce-mcp type: stdio name: woocommerce params: command: node args: - - "${local_servers_paths}/woocommerce-mcp/environment/dist/index.js" + - "${local_servers_paths}/woocommerce-mcp/dist/index.js" env: - WORDPRESS_SITE_URL: "http://localhost" + WORDPRESS_SITE_URL: "http://localhost:8081" WOOCOMMERCE_CONSUMER_KEY: "placeholder" WOOCOMMERCE_CONSUMER_SECRET: "placeholder" - PG_HOST: "${config.pg_host}" - PG_PORT: "5434" - PG_DATABASE: "${config.pg_database}" - PG_USER: "eigent" - PG_PASSWORD: "camel" cwd: "${agent_workspace}" -client_session_timeout_seconds: 60 +client_session_timeout_seconds: 10 cache_tools_list: true diff --git a/sandbox/server/backends/resources/mcp/configs/yahoo-finance.yaml b/sandbox/server/backends/resources/mcp/configs/yahoo-finance.yaml index 1dc350f..66f437e 100644 --- a/sandbox/server/backends/resources/mcp/configs/yahoo-finance.yaml +++ b/sandbox/server/backends/resources/mcp/configs/yahoo-finance.yaml @@ -1,19 +1,15 @@ # Yahoo Finance # Source: https://github.com/Alex2Yang97/yahoo-finance-mcp # Toolathlon Version: https://github.com/lockon-n/yahoo-finance-mcp +# Toolathlon_GYM: ../../local_servers/yahoo-finance-mcp type: stdio name: yahoo-finance params: - command: uv + # Use the server venv 
directly instead of `uv run` so stdio passes through to FastMCP + # and the server resolves dependencies from its own project environment. + command: "${local_servers_paths}/yahoo-finance-mcp/.venv/bin/python3" args: - - "run" - - "${local_servers_paths}/yahoo-finance-mcp/environment/server.py" - env: - PG_HOST: "${config.pg_host}" - PG_PORT: "5434" - PG_DATABASE: "${config.pg_database}" - PG_USER: "eigent" - PG_PASSWORD: "camel" - # cwd: "${agent_workspace}" # do not add this for compatibility -client_session_timeout_seconds: 180 -cache_tools_list: true \ No newline at end of file + - "server.py" + cwd: "${local_servers_paths}/yahoo-finance-mcp" +client_session_timeout_seconds: 60 +cache_tools_list: true diff --git a/sandbox/server/backends/resources/mcp/configs/youtube.yaml b/sandbox/server/backends/resources/mcp/configs/youtube.yaml index edbc164..9b7e299 100644 --- a/sandbox/server/backends/resources/mcp/configs/youtube.yaml +++ b/sandbox/server/backends/resources/mcp/configs/youtube.yaml @@ -1,19 +1,20 @@ -# mcp 4 youtube +# YouTube MCP Server (PostgreSQL-backed, no real API calls) # Source: https://github.com/ZubeidHendricks/youtube-mcp-server -# Toolathlon Version: https://github.com/lockon-n/youtube-mcp-server +# Toolathlon Version: local pg-backed version in local_mcp_servers_copy/youtube-mcp-server +# Schema: youtube (channels, videos, playlists, playlist_items, transcripts) +# Toolathlon_GYM: ../../local_servers/youtube-mcp-server type: stdio name: youtube params: command: node args: - - "${local_servers_paths}/youtube-mcp-server/environment/dist/index.js" + - "${local_servers_paths}/youtube-mcp-server/dist/index.js" env: - YOUTUBE_API_KEY: "${token.google_cloud_console_api_key}" - PG_HOST: "${config.pg_host}" - PG_PORT: "5434" - PG_DATABASE: "${config.pg_database}" - PG_USER: "eigent" - PG_PASSWORD: "camel" + PG_HOST: "postgres" + PG_PORT: "5432" + PG_DATABASE: "toolathlon" + PG_USER: "postgres" + PG_PASSWORD: "postgres" # cwd: "${agent_workspace}" # do 
not add this for compatibility -client_session_timeout_seconds: 60 -cache_tools_list: true \ No newline at end of file +client_session_timeout_seconds: 120 +cache_tools_list: true diff --git a/sandbox/server/backends/resources/mcp/configs/youtube_transcript.yaml b/sandbox/server/backends/resources/mcp/configs/youtube_transcript.yaml index 35539e6..a9e078f 100644 --- a/sandbox/server/backends/resources/mcp/configs/youtube_transcript.yaml +++ b/sandbox/server/backends/resources/mcp/configs/youtube_transcript.yaml @@ -1,21 +1,23 @@ -# YouTube Transcript +# YouTube Transcript MCP Server (PostgreSQL-backed, no real YouTube API calls) # Source: https://github.com/jkawamoto/mcp-youtube-transcript -# Mock: PG-backed local server +# Toolathlon Version: local pg-backed version in local_mcp_servers_copy/mcp-youtube-transcript +# Schema: youtube (transcripts, videos tables) +# Toolathlon_GYM: ../../local_servers/mcp-youtube-transcript type: stdio name: youtube-transcript params: - command: uv + # Use venv python directly instead of `uv run` to avoid stdin being consumed by uv + # (uv run reads stdin before passing to the child process, causing FastMCP to see EOF and exit) + command: "${local_servers_paths}/mcp-youtube-transcript/.venv/bin/python3" args: - - "--directory" - - "${local_servers_paths}/mcp-youtube-transcript/environment" - - "run" - - "mcp-youtube-transcript" + - "-c" + - "import sys,os; os.chdir('${local_servers_paths}/mcp-youtube-transcript'); from mcp_youtube_transcript import server; import anyio; anyio.run(server(50000).run_stdio_async)" env: - PG_HOST: "${config.pg_host}" - PG_PORT: "5434" - PG_DATABASE: "${config.pg_database}" - PG_USER: "eigent" - PG_PASSWORD: "camel" - cwd: "${agent_workspace}" -client_session_timeout_seconds: 60 + PG_HOST: "postgres" + PG_PORT: "5432" + PG_DATABASE: "toolathlon" + PG_USER: "postgres" + PG_PASSWORD: "postgres" + cwd: "${local_servers_paths}/mcp-youtube-transcript" +client_session_timeout_seconds: 20 cache_tools_list: true 
diff --git a/sandbox/tests/test_code_vendor_tools.py b/sandbox/tests/test_code_vendor_tools.py index 4bd5893..945d60d 100644 --- a/sandbox/tests/test_code_vendor_tools.py +++ b/sandbox/tests/test_code_vendor_tools.py @@ -149,6 +149,32 @@ def test_glob_tool_returns_sorted_matches(tmp_path): assert result == f"{tmp_path / 'a.py'}\n{tmp_path / 'pkg' / 'b.py'}" +def test_glob_tool_uses_ctx_cwd_when_path_is_empty_string(tmp_path): + target = tmp_path / "target.py" + target.write_text("print('target')\n", encoding="utf-8") + + result = call_tool( + GlobTool(), + {"pattern": "target.py", "path": ""}, + make_ctx(tmp_path), + ) + + assert result == str(target) + + +def test_glob_tool_uses_ctx_cwd_when_path_is_omitted(tmp_path): + target = tmp_path / "target.py" + target.write_text("print('target')\n", encoding="utf-8") + + result = call_tool( + GlobTool(), + {"pattern": "target.py"}, + make_ctx(tmp_path), + ) + + assert result == str(target) + + def test_grep_tool_returns_matches_with_line_numbers_for_filtered_files(tmp_path): first = tmp_path / "first.txt" second = tmp_path / "second.txt" @@ -189,6 +215,32 @@ def test_grep_tool_searches_recursively_without_glob_filter(tmp_path): } +def test_grep_tool_uses_ctx_cwd_when_path_is_empty_string(tmp_path): + target = tmp_path / "target.txt" + target.write_text("needle\n", encoding="utf-8") + + result = call_tool( + GrepTool(), + {"pattern": "needle", "path": ""}, + make_ctx(tmp_path), + ) + + assert result == f"{target}:1:needle\n" + + +def test_grep_tool_uses_ctx_cwd_when_path_is_omitted(tmp_path): + target = tmp_path / "target.txt" + target.write_text("needle\n", encoding="utf-8") + + result = call_tool( + GrepTool(), + {"pattern": "needle"}, + make_ctx(tmp_path), + ) + + assert result == f"{target}:1:needle\n" + + def test_grep_tool_returns_no_matches_for_exit_code_one(tmp_path): target = tmp_path / "sample.txt" target.write_text("alpha\nbeta\n", encoding="utf-8") diff --git a/sandbox/tests/test_mcp_client.py 
b/sandbox/tests/test_mcp_client.py index 6861ec0..ed79ca5 100644 --- a/sandbox/tests/test_mcp_client.py +++ b/sandbox/tests/test_mcp_client.py @@ -28,6 +28,12 @@ def load_mcp_client_module(): return module +def make_executable(path: Path) -> None: + path.parent.mkdir(parents=True, exist_ok=True) + path.write_text("#!/bin/sh\nexit 0\n", encoding="utf-8") + path.chmod(0o755) + + def test_resolve_toolathlon_placeholders(tmp_path): module = load_mcp_client_module() @@ -419,6 +425,68 @@ def test_load_mcp_process_config_resolves_with_mcp_servers_path(tmp_path): assert config.timeout_seconds == 42 +def test_load_mcp_process_config_prefers_toolathlon_configs_next_to_local_servers(tmp_path): + module = load_mcp_client_module() + toolathlon_root = tmp_path / "toolathlon" + config_dir = toolathlon_root / "configs" / "mcp_servers" + local_servers_dir = toolathlon_root / "local_servers" + config_dir.mkdir(parents=True) + local_servers_dir.mkdir() + (config_dir / "canvas.yaml").write_text( + """ +type: stdio +name: canvas +params: + command: node + args: + - ${local_servers_paths}/mcp-canvas-lms/build/index.js + env: + CANVAS_API_TOKEN: placeholder + cwd: ${agent_workspace} +client_session_timeout_seconds: 10 + """.strip() + + "\n", + encoding="utf-8", + ) + + config = module.load_mcp_process_config( + server_name="canvas", + agent_workspace=str(tmp_path / "workspace"), + mcp_servers_path=str(local_servers_dir), + process_env={}, + ) + + assert config.command == "node" + assert config.args == [str(local_servers_dir / "mcp-canvas-lms" / "build" / "index.js")] + assert config.env["CANVAS_API_TOKEN"] == "placeholder" + assert config.cwd == str(tmp_path / "workspace") + assert config.timeout_seconds == 10 + + +def test_discover_mcp_config_dir_requires_toolathlon_local_servers_layout(tmp_path): + module = load_mcp_client_module() + toolathlon_root = tmp_path / "toolathlon" + config_dir = toolathlon_root / "configs" / "mcp_servers" + config_dir.mkdir(parents=True) + (toolathlon_root / 
"custom_servers").mkdir() + (toolathlon_root / "local_servers").mkdir() + + assert module.discover_mcp_config_dir(toolathlon_root / "custom_servers") is None + assert module.discover_mcp_config_dir(toolathlon_root / "local_servers") == config_dir + + +def test_discover_mcp_config_dir_requires_real_directory(tmp_path): + module = load_mcp_client_module() + toolathlon_root = tmp_path / "toolathlon" + local_servers_dir = toolathlon_root / "local_servers" + local_servers_dir.mkdir(parents=True) + config_path = toolathlon_root / "configs" / "mcp_servers" + config_path.parent.mkdir(parents=True) + config_path.write_text("not a directory\n", encoding="utf-8") + + assert module.discover_mcp_config_dir(local_servers_dir) is None + + def test_load_mcp_process_config_resolves_toolathlon_local_servers_path(tmp_path): module = load_mcp_client_module() config_dir = tmp_path / "configs" / "mcp_servers" @@ -509,3 +577,394 @@ def test_load_mcp_process_config_uses_process_env_overrides(tmp_path): assert config.env["PG_HOST"] == "from_process" assert config.env["PG_PORT"] == "15432" + + +def test_load_mcp_process_config_sets_workspace_uv_cache_for_uv_servers(tmp_path): + module = load_mcp_client_module() + workspace = tmp_path / "workspace" + local_servers_dir = tmp_path / "toolathlon" / "local_servers" + (local_servers_dir / "mcp-snowflake-server").mkdir(parents=True) + + config = module.load_mcp_process_config( + server_name="snowflake", + agent_workspace=str(workspace), + mcp_servers_path=str(local_servers_dir), + process_env={}, + ) + + assert config.command == "uv" + assert config.env["UV_CACHE_DIR"] == str(workspace / ".cache" / "uv") + + +def test_load_mcp_process_config_preserves_existing_uv_cache_dir(tmp_path): + module = load_mcp_client_module() + workspace = tmp_path / "workspace" + local_servers_dir = tmp_path / "toolathlon" / "local_servers" + (local_servers_dir / "mcp-snowflake-server").mkdir(parents=True) + + config = module.load_mcp_process_config( + 
server_name="snowflake", + agent_workspace=str(workspace), + mcp_servers_path=str(local_servers_dir), + process_env={"UV_CACHE_DIR": "/tmp/custom-uv-cache"}, + ) + + assert config.env["UV_CACHE_DIR"] == "/tmp/custom-uv-cache" + + +def test_load_mcp_process_config_does_not_inject_uv_cache_for_custom_config_dir(tmp_path): + module = load_mcp_client_module() + config_dir = tmp_path / "custom-configs" + config_dir.mkdir(parents=True) + (config_dir / "custom-uv.yaml").write_text( + """ +type: stdio +name: custom-uv +params: + command: uv + args: + - run + - python + - server.py + """.strip() + + "\n", + encoding="utf-8", + ) + + config = module.load_mcp_process_config( + server_name="custom-uv", + agent_workspace=str(tmp_path / "workspace"), + config_dir=config_dir, + process_env={}, + ) + + assert config.command == "uv" + assert "UV_CACHE_DIR" not in config.env + + +def test_load_mcp_process_config_keeps_direct_python_fast_path_when_venv_exists(tmp_path): + module = load_mcp_client_module() + local_servers_dir = tmp_path / "toolathlon" / "local_servers" + project_dir = local_servers_dir / "yahoo-finance-mcp" + make_executable(project_dir / ".venv" / "bin" / "python3") + (project_dir / "server.py").write_text("print('ok')\n", encoding="utf-8") + + config = module.load_mcp_process_config( + server_name="yahoo-finance", + agent_workspace=str(tmp_path / "workspace"), + mcp_servers_path=str(local_servers_dir), + process_env={}, + ) + + assert config.command == str(project_dir / ".venv" / "bin" / "python3") + assert config.args == ["server.py"] + assert config.cwd == str(project_dir) + + +def test_load_mcp_process_config_falls_back_to_uv_for_yahoo_finance_without_venv_launcher(tmp_path): + module = load_mcp_client_module() + local_servers_dir = tmp_path / "toolathlon" / "local_servers" + project_dir = local_servers_dir / "yahoo-finance-mcp" + project_dir.mkdir(parents=True) + (project_dir / "server.py").write_text("print('ok')\n", encoding="utf-8") + + config = 
module.load_mcp_process_config( + server_name="yahoo-finance", + agent_workspace=str(tmp_path / "workspace"), + mcp_servers_path=str(local_servers_dir), + process_env={}, + ) + + assert config.command == "uv" + assert config.args == ["--directory", str(project_dir), "run", "python", "server.py"] + assert config.cwd == str(project_dir) + + +def test_load_mcp_process_config_does_not_fallback_when_bundled_entrypoint_is_missing(tmp_path): + module = load_mcp_client_module() + local_servers_dir = tmp_path / "toolathlon" / "local_servers" + project_dir = local_servers_dir / "yahoo-finance-mcp" + project_dir.mkdir(parents=True) + + config = module.load_mcp_process_config( + server_name="yahoo-finance", + agent_workspace=str(tmp_path / "workspace"), + mcp_servers_path=str(local_servers_dir), + process_env={}, + ) + + assert config.command == str(project_dir / ".venv" / "bin" / "python3") + assert config.args == ["server.py"] + assert config.cwd == str(project_dir) + + +def test_load_mcp_process_config_falls_back_to_uv_for_youtube_transcript_when_launcher_unusable( + tmp_path, +): + module = load_mcp_client_module() + local_servers_dir = tmp_path / "toolathlon" / "local_servers" + project_dir = local_servers_dir / "mcp-youtube-transcript" + launcher = project_dir / ".venv" / "bin" / "python3" + launcher.parent.mkdir(parents=True, exist_ok=True) + launcher.write_text("#!/bin/sh\nexit 0\n", encoding="utf-8") + launcher.chmod(0o644) + (project_dir / "run_server.py").write_text("print('ok')\n", encoding="utf-8") + + config = module.load_mcp_process_config( + server_name="youtube-transcript", + agent_workspace=str(tmp_path / "workspace"), + mcp_servers_path=str(local_servers_dir), + process_env={}, + ) + + assert config.command == "uv" + assert config.args == ["--directory", str(project_dir), "run", "python", "run_server.py"] + assert config.cwd == str(project_dir) + + +@pytest.mark.parametrize( + ("server_name", "project_subdir", "entrypoint"), + [ + ("yahoo-finance", 
"yahoo-finance-mcp", "server.py"), + ("youtube-transcript", "mcp-youtube-transcript", "run_server.py"), + ], +) +def test_load_mcp_process_config_falls_back_to_uv_for_discovered_toolathlon_python_servers( + tmp_path, + server_name, + project_subdir, + entrypoint, +): + module = load_mcp_client_module() + toolathlon_root = tmp_path / "toolathlon" + config_dir = toolathlon_root / "configs" / "mcp_servers" + local_servers_dir = toolathlon_root / "local_servers" + project_dir = local_servers_dir / project_subdir + config_dir.mkdir(parents=True) + project_dir.mkdir(parents=True) + (project_dir / entrypoint).write_text("print('ok')\n", encoding="utf-8") + (config_dir / f"{server_name}.yaml").write_text( + f""" +type: stdio +name: {server_name} +params: + command: ${{local_servers_paths}}/{project_subdir}/.venv/bin/python3 + args: + - {entrypoint} + cwd: ${{local_servers_paths}}/{project_subdir} + """.strip() + + "\n", + encoding="utf-8", + ) + + config = module.load_mcp_process_config( + server_name=server_name, + agent_workspace=str(tmp_path / "workspace"), + mcp_servers_path=str(local_servers_dir), + process_env={}, + ) + + assert config.command == "uv" + assert config.args == ["--directory", str(project_dir), "run", "python", entrypoint] + assert config.cwd == str(project_dir) + + +def assert_resolved_runtime_paths(config) -> None: + all_values = [config.command, config.cwd, *config.args] + for value in all_values: + assert "${local_servers_paths}" not in value + assert "/environment/" not in value + + +def assert_arg_contains_path(config, expected_path: str) -> None: + assert expected_path in config.args + + +def assert_python_server_launch(config, server_subdir: str, fallback_entrypoint: str) -> None: + expected_project_dir = f"/toolathlon/local_servers/{server_subdir}" + assert config.cwd == expected_project_dir + + if config.command.endswith("/.venv/bin/python3"): + assert config.command == f"{expected_project_dir}/.venv/bin/python3" + if fallback_entrypoint == 
"server.py": + assert config.args == ["server.py"] + else: + assert config.args[0] == "-c" + assert "mcp_youtube_transcript" in config.args[1] + assert expected_project_dir in config.args[1] + return + + assert "--directory" in config.args + assert_arg_contains_path(config, expected_project_dir) + assert "run" in config.args + assert fallback_entrypoint in config.args + + +@pytest.mark.parametrize( + ("server_name", "expected"), + [ + ( + "canvas", + { + "launch": "node", + "server_subdir": "mcp-canvas-lms", + "entrypoint_suffix": "/build/index.js", + "cwd": "/workspace", + "timeout_seconds": 10, + "env_subset": { + "CANVAS_API_TOKEN": "placeholder", + "CANVAS_DOMAIN": "localhost:8080", + "NODE_TLS_REJECT_UNAUTHORIZED": "0", + }, + }, + ), + ( + "snowflake", + { + "launch": "uv", + "server_subdir": "mcp-snowflake-server", + "entrypoint": "mcp_snowflake_server", + "cwd": "/workspace", + "timeout_seconds": 120, + "env_subset": { + "PG_HOST": "toolathlon_pg", + "PG_PORT": "5432", + "PG_DATABASE": "toolathlon_gym", + "PG_USER": "eigent", + "PG_PASSWORD": "camel", + }, + }, + ), + ( + "woocommerce", + { + "launch": "node", + "server_subdir": "woocommerce-mcp", + "entrypoint_suffix": "/dist/index.js", + "cwd": "/workspace", + "timeout_seconds": 10, + "env_subset": { + "WORDPRESS_SITE_URL": "http://localhost:8081", + "WOOCOMMERCE_CONSUMER_KEY": "placeholder", + "WOOCOMMERCE_CONSUMER_SECRET": "placeholder", + }, + }, + ), + ( + "yahoo-finance", + { + "launch": "python_or_uv", + "server_subdir": "yahoo-finance-mcp", + "fallback_entrypoint": "server.py", + "cwd": "/toolathlon/local_servers/yahoo-finance-mcp", + "timeout_seconds": 60, + "env_subset": {}, + }, + ), + ( + "youtube", + { + "launch": "node", + "server_subdir": "youtube-mcp-server", + "entrypoint_suffix": "/dist/index.js", + "cwd": "/workspace", + "timeout_seconds": 120, + "env_subset": { + "PG_HOST": "postgres", + "PG_PORT": "5432", + "PG_DATABASE": "toolathlon", + "PG_USER": "postgres", + "PG_PASSWORD": 
"postgres", + }, + }, + ), + ( + "youtube-transcript", + { + "launch": "python_or_uv", + "server_subdir": "mcp-youtube-transcript", + "fallback_entrypoint": "run_server.py", + "cwd": "/toolathlon/local_servers/mcp-youtube-transcript", + "timeout_seconds": 20, + "env_subset": { + "PG_HOST": "postgres", + "PG_PORT": "5432", + "PG_DATABASE": "toolathlon", + "PG_USER": "postgres", + "PG_PASSWORD": "postgres", + }, + }, + ), + ( + "rail_12306", + { + "launch": "node", + "server_subdir": "12306-mcp", + "entrypoint_suffix": "/build/index.js", + "cwd": "/workspace", + "timeout_seconds": 20, + "env_subset": { + "PG_HOST": "postgres", + "PG_PORT": "5432", + "PG_DATABASE": "toolathlon", + "PG_USER": "postgres", + "PG_PASSWORD": "postgres", + }, + }, + ), + ( + "filesystem", + { + "launch": "node_with_workspace_arg", + "server_subdir": "filesystem", + "entrypoint_suffix": "/dist/index.js", + "cwd": "/workspace", + "timeout_seconds": 300, + "env_subset": {}, + }, + ), + ], +) +def test_bundled_mcp_runtime_configs_match_current_toolathlon_layout(server_name, expected): + module = load_mcp_client_module() + + config = module.load_mcp_process_config( + server_name=server_name, + agent_workspace="/workspace", + mcp_servers_path="/toolathlon/local_servers", + process_env={}, + ) + + assert_resolved_runtime_paths(config) + assert config.cwd == expected["cwd"] + assert config.timeout_seconds == expected["timeout_seconds"] + + launch = expected["launch"] + if launch == "node": + assert_arg_contains_path( + config, + f"/toolathlon/local_servers/{expected['server_subdir']}{expected['entrypoint_suffix']}", + ) + elif launch == "node_with_workspace_arg": + assert_arg_contains_path( + config, + f"/toolathlon/local_servers/{expected['server_subdir']}{expected['entrypoint_suffix']}", + ) + assert_arg_contains_path(config, "/workspace") + elif launch == "uv": + expected_project_dir = f"/toolathlon/local_servers/{expected['server_subdir']}" + assert "--directory" in config.args + 
assert_arg_contains_path(config, expected_project_dir) + assert "run" in config.args + assert expected["entrypoint"] in config.args + elif launch == "python_or_uv": + assert_python_server_launch( + config, + server_subdir=expected["server_subdir"], + fallback_entrypoint=expected["fallback_entrypoint"], + ) + else: + raise AssertionError(f"Unknown launch mode: {launch}") + + for key, value in expected["env_subset"].items(): + assert config.env[key] == value diff --git a/sandbox/tests/test_result_formatter.py b/sandbox/tests/test_result_formatter.py new file mode 100644 index 0000000..8904585 --- /dev/null +++ b/sandbox/tests/test_result_formatter.py @@ -0,0 +1,166 @@ +from sandbox.result_formatter import format_tool_result + + +def test_format_tool_result_returns_plain_string_for_successful_code_response(): + response = { + "code": 0, + "message": "success", + "data": " 1→hello", + "meta": { + "tool": "code:read", + "resource_type": "code", + "execution_time_ms": 1.2, + }, + } + + assert format_tool_result(response) == " 1→hello" + + +def test_format_tool_result_preserves_whitespace_only_plain_string_for_successful_code_response(): + response = { + "code": 0, + "message": "success", + "data": " \n\t ", + "meta": { + "tool": "code:read", + "resource_type": "code", + "execution_time_ms": 1.2, + }, + } + + assert format_tool_result(response) == " \n\t " + + +def test_format_tool_result_keeps_dict_style_successful_code_response_behavior(): + response = { + "code": 0, + "message": "success", + "data": { + "stdout": "print('ok')\n", + "stderr": "", + "return_code": 0, + }, + "meta": { + "tool": "code:run", + "resource_type": "code", + "execution_time_ms": 1.2, + }, + } + + assert format_tool_result(response) == "print('ok')" + + +def test_format_tool_result_preserves_error_behavior_for_failed_code_response(): + response = { + "code": 1, + "message": "read failed", + "data": "ignored plain string payload", + "meta": { + "tool": "code:read", + "resource_type": "code", + }, 
+ } + + assert format_tool_result(response) == "[Error] read failed" + + +def test_format_tool_result_returns_text_for_successful_mcp_response(): + response = { + "code": 0, + "message": "success", + "data": { + "content": [ + {"type": "text", "text": "ok"}, + ] + }, + "meta": { + "tool": "mcp:canvas.canvas_list_courses", + "resource_type": "mcp", + "execution_time_ms": 1.2, + }, + } + + assert format_tool_result(response) == "ok" + + +def test_format_tool_result_handles_mixed_mcp_content_without_crashing(): + response = { + "code": 0, + "message": "success", + "data": { + "content": [ + {"type": "text", "text": "first"}, + {"type": "image", "mimeType": "image/png"}, + {"type": "text", "text": "second"}, + ] + }, + "meta": { + "tool": "mcp:canvas.canvas_list_courses", + "resource_type": "mcp", + "execution_time_ms": 1.2, + }, + } + + assert format_tool_result(response) == "first\n[image content]\nsecond" + + +def test_format_tool_result_falls_back_to_structured_content_for_successful_mcp_response(): + response = { + "code": 0, + "message": "success", + "data": { + "content": [], + "structuredContent": { + "name": "Canvas", + "status": "ok", + }, + }, + "meta": { + "tool": "mcp:canvas.canvas_health_check", + "resource_type": "mcp", + "execution_time_ms": 1.2, + }, + } + + assert format_tool_result(response) == '{\n "name": "Canvas",\n "status": "ok"\n}' + + +def test_format_tool_result_prefers_structured_content_when_successful_mcp_content_has_no_text(): + response = { + "code": 0, + "message": "success", + "data": { + "content": [ + {"type": "image", "mimeType": "image/png"}, + ], + "structuredContent": { + "name": "Canvas", + "status": "ok", + }, + }, + "meta": { + "tool": "mcp:canvas.canvas_health_check", + "resource_type": "mcp", + "execution_time_ms": 1.2, + }, + } + + assert format_tool_result(response) == '{\n "name": "Canvas",\n "status": "ok"\n}' + + +def test_format_tool_result_preserves_error_behavior_for_failed_mcp_response(): + response = { + "code": 1, 
+ "message": "mcp failed", + "data": { + "content": [ + {"type": "text", "text": "ignored"}, + ] + }, + "meta": { + "tool": "mcp:canvas.canvas_list_courses", + "resource_type": "mcp", + }, + } + + assert format_tool_result(response) == "[Error] mcp failed" diff --git a/synthesis/core/config.py b/synthesis/core/config.py index b8527d9..eaa491b 100644 --- a/synthesis/core/config.py +++ b/synthesis/core/config.py @@ -7,6 +7,8 @@ from typing import Dict, List, Any, Optional from dataclasses import dataclass, field, fields +from sandbox.server.config_loader import expand_env_vars + @dataclass class SynthesisConfig: @@ -67,6 +69,7 @@ def from_dict(cls, config_dict: Dict[str, Any]) -> 'SynthesisConfig': valid_fields = {f.name for f in fields(cls)} filtered = {k: v for k, v in config_dict.items() if k in valid_fields} + filtered = expand_env_vars(filtered) # Normalize text fields (allow list[str] for easier editing) def _normalize_text_field(v: Any) -> str: diff --git a/synthesis/tests/test_code_example_synthesis_config.py b/synthesis/tests/test_code_example_synthesis_config.py index c3ed52c..dddc9e6 100644 --- a/synthesis/tests/test_code_example_synthesis_config.py +++ b/synthesis/tests/test_code_example_synthesis_config.py @@ -6,9 +6,10 @@ REPO_ROOT = Path(__file__).resolve().parents[2] -def test_code_synthesis_config_contract(): +def test_code_synthesis_config_contract_expands_repo_root_when_set(monkeypatch): config_path = REPO_ROOT / "configs" / "synthesis" / "code_config.json" raw = json.loads(config_path.read_text(encoding="utf-8")) + monkeypatch.setenv("AGENTFLOW_REPO_ROOT", str(REPO_ROOT)) config = SynthesisConfig.from_json(str(config_path)) assert config.sandbox_config_path == "configs/sandbox-server/code_config.json" @@ -17,7 +18,7 @@ def test_code_synthesis_config_contract(): assert config.resource_init_configs == { "code": { "content": { - "source_dir": "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo" + "source_dir": f"{REPO_ROOT}/seeds/code/seed/demo_repo" } 
} } @@ -27,3 +28,52 @@ def test_code_synthesis_config_contract(): assert len(config.qa_examples) >= 2 assert config.sampling_tips.strip() assert config.synthesis_tips.strip() + + +def test_code_synthesis_config_preserves_placeholder_when_repo_root_unset(monkeypatch): + config_path = REPO_ROOT / "configs" / "synthesis" / "code_config.json" + monkeypatch.delenv("AGENTFLOW_REPO_ROOT", raising=False) + + config = SynthesisConfig.from_json(str(config_path)) + + assert config.resource_init_configs == { + "code": { + "content": { + "source_dir": "${AGENTFLOW_REPO_ROOT}/seeds/code/seed/demo_repo" + } + } + } + + +def test_synthesis_config_from_dict_expands_nested_env_values(monkeypatch): + monkeypatch.setenv("CODE_ROOT", "/tmp/demo") + monkeypatch.delenv("UNSET_VALUE", raising=False) + + config = SynthesisConfig.from_dict( + { + "resource_init_configs": { + "code": { + "content": { + "source_dir": "${CODE_ROOT}/repo", + "fallback_dir": "${UNSET_VALUE:-/tmp/fallback}", + "preserved_dir": "${UNSET_VALUE}/repo", + "artifacts": [ + "${CODE_ROOT}/one", + "${UNSET_VALUE:-/tmp/two}", + "${UNSET_VALUE}/three", + ], + } + } + } + } + ) + + content = config.resource_init_configs["code"]["content"] + assert content["source_dir"] == "/tmp/demo/repo" + assert content["fallback_dir"] == "/tmp/fallback" + assert content["preserved_dir"] == "${UNSET_VALUE}/repo" + assert content["artifacts"] == [ + "/tmp/demo/one", + "/tmp/two", + "${UNSET_VALUE}/three", + ] diff --git a/synthesis/tests/test_mcp_example_synthesis_configs.py b/synthesis/tests/test_mcp_example_synthesis_configs.py index 253879a..806edeb 100644 --- a/synthesis/tests/test_mcp_example_synthesis_configs.py +++ b/synthesis/tests/test_mcp_example_synthesis_configs.py @@ -6,6 +6,8 @@ from synthesis.core.config import SynthesisConfig REPO_ROOT = Path(__file__).resolve().parents[2] +OPENAI_API_KEY = "secret" +OPENAI_API_URL = "https://example.test/v1" EXPECTED = { "canvas": { "tools": ["mcp:canvas.*", "mcp:filesystem.*"], @@ -44,9 
+46,27 @@ } +def _set_openai_env(monkeypatch: pytest.MonkeyPatch, enabled: bool) -> None: + if enabled: + monkeypatch.setenv("OPENAI_API_KEY", OPENAI_API_KEY) + monkeypatch.setenv("OPENAI_API_URL", OPENAI_API_URL) + return + + monkeypatch.delenv("OPENAI_API_KEY", raising=False) + monkeypatch.delenv("OPENAI_API_URL", raising=False) + + +def _expected_openai_value(enabled: bool, env_value: str, placeholder: str) -> str: + if enabled: + return env_value + return placeholder + + +@pytest.mark.parametrize("env_enabled", [False, True], ids=["env-unset", "env-set"]) @pytest.mark.parametrize("domain", sorted(EXPECTED)) -def test_mcp_synthesis_config_contract(domain): +def test_mcp_synthesis_config_contract(domain, env_enabled, monkeypatch): expected = EXPECTED[domain] + _set_openai_env(monkeypatch, env_enabled) config_path = REPO_ROOT / "configs" / "synthesis" / f"mcp_{domain}_config.json" raw = json.loads(config_path.read_text(encoding="utf-8")) config = SynthesisConfig.from_json(str(config_path)) @@ -58,8 +78,12 @@ def test_mcp_synthesis_config_contract(domain): assert raw_init in ({}, {"mcp": {"content": {}}}) assert config.resource_init_configs in ({}, {"mcp": {"content": {}}}) assert config.model_name == "openai/gpt-oss-120b" - assert config.api_key == "${OPENAI_API_KEY}" - assert config.base_url == "${OPENAI_API_URL}" + assert config.api_key == _expected_openai_value( + env_enabled, OPENAI_API_KEY, "${OPENAI_API_KEY}" + ) + assert config.base_url == _expected_openai_value( + env_enabled, OPENAI_API_URL, "${OPENAI_API_URL}" + ) assert config.max_depth == 12 assert config.branching_factor == 2 assert config.depth_threshold == 2 @@ -73,3 +97,28 @@ def test_mcp_synthesis_config_contract(domain): assert len(config.qa_examples) >= 2 assert config.sampling_tips.strip() assert config.synthesis_tips.strip() + + +@pytest.mark.parametrize("env_enabled", [False, True], ids=["env-unset", "env-set"]) +def test_mcp_synthesis_config_ignores_unknown_env_placeholders( + caplog, 
monkeypatch, env_enabled +): + _set_openai_env(monkeypatch, env_enabled) + monkeypatch.delenv("IGNORED_SYNTHESIS_VAR", raising=False) + + with caplog.at_level("WARNING", logger="ConfigLoader"): + config = SynthesisConfig.from_dict( + { + "api_key": "${OPENAI_API_KEY}", + "base_url": "${OPENAI_API_URL}", + "unknown_field": "${IGNORED_SYNTHESIS_VAR}", + } + ) + + assert config.api_key == _expected_openai_value( + env_enabled, OPENAI_API_KEY, "${OPENAI_API_KEY}" + ) + assert config.base_url == _expected_openai_value( + env_enabled, OPENAI_API_URL, "${OPENAI_API_URL}" + ) + assert "IGNORED_SYNTHESIS_VAR" not in caplog.text