From c533b49141a022a8c7518066ac52d62b1ddd2943 Mon Sep 17 00:00:00 2001 From: deimagjas Date: Thu, 21 May 2026 20:33:10 -0500 Subject: [PATCH 1/9] test(cli): add opt-in end-to-end orchestrator<->agent tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add real-container E2E tests that validate the full round-trip between the orchestrator (the q CLI) and agents running in Apple containers: - test_claude_agent_e2e.py: spawn a Claude agent, then assert status.json, the persisted log, the [agent:status] markers and the worktree commit all reach the orchestrator. - test_pi_agent_e2e.py: the same round-trip for a PI agent (local mlx_lm backend), asserting the report is tagged agent_kind=pi. Tests are gated by STACKAI_E2E=1 via collect_ignore_glob, so the default pytest run and the mutmut suite never collect them — they require Apple Container CLI and are local-only, like the acceptance suite. Adds the `e2e` marker and a `make e2e-test` target (excluded from CI and local-qa). Co-Authored-By: Claude Opus 4.7 --- app/cli/Makefile | 7 + app/cli/pyproject.toml | 3 + app/cli/tests/e2e/__init__.py | 0 app/cli/tests/e2e/conftest.py | 171 +++++++++++++++++++++ app/cli/tests/e2e/test_claude_agent_e2e.py | 68 ++++++++ app/cli/tests/e2e/test_pi_agent_e2e.py | 70 +++++++++ 6 files changed, 319 insertions(+) create mode 100644 app/cli/tests/e2e/__init__.py create mode 100644 app/cli/tests/e2e/conftest.py create mode 100644 app/cli/tests/e2e/test_claude_agent_e2e.py create mode 100644 app/cli/tests/e2e/test_pi_agent_e2e.py diff --git a/app/cli/Makefile b/app/cli/Makefile index b6568a9..a0fc9ee 100644 --- a/app/cli/Makefile +++ b/app/cli/Makefile @@ -42,6 +42,13 @@ test-all: eval-skills: claude -p "/skill-creator:skill-creator run evals for the spawn-agent skill at ~/.claude/skills/spawn-agent/" +# End-to-end container tests — opt-in, local-only (needs Apple Container CLI). +# Spawns real agent containers; the Claude round-trip spends Claude credits. +# Excluded from CI and from `local-qa` by design. +.PHONY: e2e-test +e2e-test: + STACKAI_E2E=1 uv run pytest tests/e2e -v -m e2e + # Pre-PR quality gate local-qa: acceptance-test eval-skills diff --git a/app/cli/pyproject.toml b/app/cli/pyproject.toml index 310893f..f29f2d1 100644 --- a/app/cli/pyproject.toml +++ b/app/cli/pyproject.toml @@ -25,6 +25,9 @@ dev = [ [tool.pytest.ini_options] testpaths = ["tests", "tests/acceptance"] python_files = ["test_*.py", "*_steps.py"] +markers = [ + "e2e: real-container end-to-end tests (opt-in via STACKAI_E2E=1, local-only)", +] [tool.mutmut] paths_to_mutate = ["src/container_cli/"] diff --git a/app/cli/tests/e2e/__init__.py b/app/cli/tests/e2e/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/cli/tests/e2e/conftest.py b/app/cli/tests/e2e/conftest.py new file mode 100644 index 0000000..66ec2d8 --- /dev/null +++ b/app/cli/tests/e2e/conftest.py @@ -0,0 +1,171 @@ +"""Shared fixtures and collection gating for the opt-in end-to-end tests. + +These tests spawn real Apple Container CLI containers, so they are: + +* **local-only** — GitHub Actions has no Apple Container CLI; +* **opt-in** — collected only when ``STACKAI_E2E=1`` is exported; +* **slow and billable** — the Claude round-trip spends real Claude credits. + +Run them with ``make e2e-test`` (from ``app/cli/``), which exports the gate +variable. Without it, pytest never imports the test modules, so the default +``uv run pytest`` and the mutmut suite ignore them entirely. +""" + +from __future__ import annotations + +import json +import os +import shutil +import subprocess +import sys +import time +import urllib.error +import urllib.request +from collections.abc import Callable, Iterator +from pathlib import Path + +import pytest + +# Collection gate: without STACKAI_E2E=1, pytest skips collecting these modules. +collect_ignore_glob = [] if os.environ.get("STACKAI_E2E") == "1" else ["test_*.py"] + +# Terminal phases written by config/entrypoint*.sh into .agent/status.json. +_TERMINAL_PHASES = frozenset({"completed", "errored"}) + + +def _run_q(*args: str, timeout: float = 600) -> subprocess.CompletedProcess[str]: + """Invoke the real q CLI in a subprocess, inheriting the current environment.""" + return subprocess.run( + [sys.executable, "-m", "container_cli.main", *args], + capture_output=True, + text=True, + timeout=timeout, + check=False, + ) + + +def _image_present(image: str) -> bool | None: + """Report whether a container image exists locally. + + Returns: + True or False when the Apple Container CLI answers, None when the + command is unavailable so the caller should not skip on that basis. + """ + result = subprocess.run( + ["container", "image", "list"], + capture_output=True, + text=True, + check=False, + ) + if result.returncode != 0: + return None + return image.split(":")[0] in result.stdout + + +@pytest.fixture() +def run_q() -> Callable[..., subprocess.CompletedProcess[str]]: + """Return a helper that runs the q CLI and captures its output.""" + return _run_q + + +@pytest.fixture() +def require_container_cli() -> None: + """Skip the test unless the Apple `container` CLI is on PATH.""" + if shutil.which("container") is None: + pytest.skip("Apple Container CLI ('container') not found — E2E needs macOS 26+ ARM64") + + +@pytest.fixture() +def require_claude_token() -> None: + """Skip the test unless a Claude container OAuth token is exported.""" + if not os.environ.get("CLAUDE_CONTAINER_OAUTH_TOKEN"): + pytest.skip("CLAUDE_CONTAINER_OAUTH_TOKEN not set — required to spawn a Claude agent") + + +@pytest.fixture() +def require_claude_image() -> None: + """Skip the test unless the claude-agent:wolfi image is built.""" + if _image_present("claude-agent:wolfi") is False: + pytest.skip("image claude-agent:wolfi not built — run `q build` first") + + +@pytest.fixture() +def require_pi_image() -> None: + """Skip the test unless the claude-pi:ubuntu image is built.""" + if _image_present("claude-pi:ubuntu") is False: + pytest.skip("image claude-pi:ubuntu not built — run `q pi build` first") + + +@pytest.fixture() +def require_mlx_server() -> None: + """Skip the test unless the local mlx_lm.server is reachable on the host.""" + url = os.environ.get("STACKAI_E2E_MLX_URL", "http://localhost:8080/v1/models") + try: + with urllib.request.urlopen(url, timeout=5) as resp: + if resp.status != 200: + pytest.skip(f"mlx_lm.server at {url} returned HTTP {resp.status}") + except (urllib.error.URLError, OSError) as exc: + pytest.skip(f"mlx_lm.server unreachable at {url} ({exc}) — run `uv run iac server start`") + + +@pytest.fixture() +def isolated_agents_home(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: + """Point AGENTS_HOME at a throwaway directory for the duration of the test.""" + home = tmp_path / "worktrees" + home.mkdir() + monkeypatch.setenv("AGENTS_HOME", str(home)) + return home + + +@pytest.fixture() +def unique_branch() -> str: + """A collision-free, flat branch name (no slashes — see docs/agents/cli.md).""" + return f"e2e-{int(time.time())}-{os.getpid()}" + + +@pytest.fixture() +def wait_for_terminal_phase() -> Callable[..., dict]: + """Return a poller that blocks until an agent writes a terminal status.json.""" + + def _wait(agents_home: Path, branch: str, timeout: float = 600) -> dict: + status_file = agents_home / branch / ".agent" / "status.json" + deadline = time.monotonic() + timeout + last_phase = "" + while time.monotonic() < deadline: + if status_file.exists(): + try: + data = json.loads(status_file.read_text()) + except json.JSONDecodeError: + data = {} + last_phase = data.get("phase", "") + if last_phase in _TERMINAL_PHASES: + return data + time.sleep(5) + raise AssertionError( + f"agent '{branch}' did not reach a terminal phase within {timeout}s " + f"(last phase seen: {last_phase})" + ) + + return _wait + + +@pytest.fixture() +def agent_cleanup(isolated_agents_home: Path) -> Iterator[Callable[..., None]]: + """Register (branch, kind) pairs to be stopped and pruned after the test.""" + registered: list[tuple[str, str]] = [] + + def _register(branch: str, kind: str = "claude") -> None: + registered.append((branch, kind)) + + yield _register + + for branch, kind in registered: + stop = ["pi", "stop"] if kind == "pi" else ["agents", "stop"] + _run_q(*stop, "--branch", branch, timeout=120) + worktree = isolated_agents_home / branch + for cmd in ( + ["git", "worktree", "remove", "--force", str(worktree)], + ["git", "worktree", "prune"], + ["git", "branch", "-D", branch], + ): + subprocess.run(cmd, capture_output=True, text=True, check=False) diff --git a/app/cli/tests/e2e/test_claude_agent_e2e.py b/app/cli/tests/e2e/test_claude_agent_e2e.py new file mode 100644 index 0000000..1eaecdc --- /dev/null +++ b/app/cli/tests/e2e/test_claude_agent_e2e.py @@ -0,0 +1,68 @@ +"""End-to-end test: orchestrator <-> Claude agent communication. + +Spawns a real Claude agent container through the ``q`` CLI, waits for it to +finish, and asserts the orchestrator receives the agent's report through every +channel: ``status.json``, the persisted log, the ``[agent:status]`` markers, +and the git commit on the worktree branch. + +Opt-in and local-only — see ``tests/e2e/conftest.py``. +""" + +from __future__ import annotations + +import json + +import pytest + +_TASK = ( + "Create a file named E2E_OK.txt in the current directory containing " + "exactly the text PASS, then stage and commit it with git." +) + + +@pytest.mark.e2e +def test_claude_agent_round_trip( + require_container_cli, + require_claude_token, + require_claude_image, + isolated_agents_home, + unique_branch, + run_q, + wait_for_terminal_phase, + agent_cleanup, +): + """A spawned Claude agent does the task and reports back to the orchestrator.""" + branch = unique_branch + + # Outbound — the orchestrator ensures the network exists, then dispatches. + network = run_q("network") + assert network.returncode == 0, network.stderr + + spawn = run_q("spawn", "--branch", branch, "--task", _TASK) + agent_cleanup(branch, "claude") + assert spawn.returncode == 0, f"spawn failed:\n{spawn.stdout}\n{spawn.stderr}" + + # Inbound channel 1 — status.json reaches a terminal phase. + status = wait_for_terminal_phase(isolated_agents_home, branch) + assert status["phase"] == "completed", f"agent did not complete: {status}" + assert status["exit_code"] == 0, f"agent exited non-zero: {status}" + assert status["commits"] >= 1, f"agent reported no commit: {status}" + + # Outbound result — the task produced real work in the shared worktree. + artifact = isolated_agents_home / branch / "E2E_OK.txt" + assert artifact.is_file(), f"agent did not create {artifact}" + assert artifact.read_text().strip() == "PASS" + + # Inbound channel 2 — `q agents status` surfaces the report to the orchestrator. + cli_status = run_q("agents", "status", "--branch", branch) + assert cli_status.returncode == 0, cli_status.stderr + assert json.loads(cli_status.stdout)["phase"] == "completed" + + # Inbound channel 3 — the persisted log and the [agent:status] markers. + logs = run_q("agents", "logs", "--branch", branch) + assert logs.returncode == 0, logs.stderr + assert logs.stdout.strip(), "agent log is empty" + + summary = run_q("agents", "summary", "--branch", branch) + assert summary.returncode == 0, summary.stderr + assert "PHASE=completed" in summary.stdout, summary.stdout diff --git a/app/cli/tests/e2e/test_pi_agent_e2e.py b/app/cli/tests/e2e/test_pi_agent_e2e.py new file mode 100644 index 0000000..32ceaa3 --- /dev/null +++ b/app/cli/tests/e2e/test_pi_agent_e2e.py @@ -0,0 +1,70 @@ +"""End-to-end test: orchestrator <-> PI agent communication. + +Spawns a real PI agent container (local ``mlx_lm.server`` backend) through the +``q pi`` CLI and asserts the orchestrator receives the agent's report. PI +agents need no Claude token — they authenticate against the local model — but +they do need the model server running on the host. + +This test validates the *communication* round-trip, not model quality: a +terminal ``status.json`` carrying ``agent_kind="pi"`` plus a captured log proves +the task was delivered and the report came back. + +``MAX_PI_AGENTS`` enforcement is intentionally not E2E-tested here — it would +require a racy spawn-while-running attempt; it is covered by the Makefile logic +and the mocked acceptance suite. + +Opt-in and local-only — see ``tests/e2e/conftest.py``. +""" + +from __future__ import annotations + +import json + +import pytest + +_TASK = ( + "Create a file named E2E_PI_OK.txt in the current directory containing exactly the text PASS." +) + + +@pytest.mark.e2e +def test_pi_agent_round_trip( + require_container_cli, + require_pi_image, + require_mlx_server, + isolated_agents_home, + unique_branch, + run_q, + wait_for_terminal_phase, + agent_cleanup, +): + """A spawned PI agent runs against the local model and reports back.""" + branch = unique_branch + + # Outbound — dispatch the task to a PI container (no Claude token needed). + network = run_q("network") + assert network.returncode == 0, network.stderr + + spawn = run_q("pi", "spawn", "--branch", branch, "--task", _TASK) + agent_cleanup(branch, "pi") + assert spawn.returncode == 0, f"pi spawn failed:\n{spawn.stdout}\n{spawn.stderr}" + + # Inbound channel 1 — status.json reaches a terminal phase, tagged as PI. + status = wait_for_terminal_phase(isolated_agents_home, branch) + assert status["phase"] == "completed", f"PI agent did not complete: {status}" + assert status["agent_kind"] == "pi", f"status not tagged as PI: {status}" + + # Inbound channel 2 — `q pi status` surfaces the PI report to the orchestrator. + cli_status = run_q("pi", "status", "--branch", branch) + assert cli_status.returncode == 0, cli_status.stderr + assert json.loads(cli_status.stdout)["agent_kind"] == "pi" + + # Inbound channel 3 — `q pi list` filters this worktree in by agent_kind=pi. + pi_list = run_q("pi", "list") + assert pi_list.returncode == 0, pi_list.stderr + assert branch in pi_list.stdout, f"branch {branch} not listed by `q pi list`" + + # Inbound channel 4 — the persisted log is captured. + logs = run_q("pi", "logs", "--branch", branch) + assert logs.returncode == 0, logs.stderr + assert logs.stdout.strip(), "PI agent log is empty" From 783b049c8b8f565f7e6aeb45cd9478c144c0b326 Mon Sep 17 00:00:00 2001 From: deimagjas Date: Thu, 21 May 2026 20:37:42 -0500 Subject: [PATCH 2/9] refactor(cli): centralise Makefile target names in a contract module MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The CLI's knowledge of which config/Makefile targets exist was scattered across five command modules as bare string literals, with nothing to catch a renamed target until a user hit the failure at runtime. Add src/container_cli/targets.py — a Target StrEnum that is the single source of truth for the 19 targets the CLI invokes. The command modules now reference Target.SPAWN etc.; run_make is typed against Target. Because StrEnum members equal their string value, every existing assertion stays green unchanged. tests/test_targets.py adds a contract test that parses config/Makefile and fails if any Target member is missing — a rename is now caught in CI. Co-Authored-By: Claude Opus 4.7 --- app/cli/src/container_cli/commands/agents.py | 13 +++--- app/cli/src/container_cli/commands/build.py | 9 ++-- app/cli/src/container_cli/commands/network.py | 3 +- .../src/container_cli/commands/pi_agents.py | 13 +++--- app/cli/src/container_cli/commands/run.py | 5 ++- app/cli/src/container_cli/targets.py | 45 +++++++++++++++++++ app/cli/src/container_cli/utils.py | 9 +++- app/cli/tests/test_targets.py | 32 +++++++++++++ 8 files changed, 108 insertions(+), 21 deletions(-) create mode 100644 app/cli/src/container_cli/targets.py create mode 100644 app/cli/tests/test_targets.py diff --git a/app/cli/src/container_cli/commands/agents.py b/app/cli/src/container_cli/commands/agents.py index 30ef3fb..0b1f706 100644 --- a/app/cli/src/container_cli/commands/agents.py +++ b/app/cli/src/container_cli/commands/agents.py @@ -7,6 +7,7 @@ import typer +from container_cli.targets import Target from container_cli.utils import check_token, find_git_root, run_make app = typer.Typer(help="Agent lifecycle commands") @@ -37,13 +38,13 @@ def spawn( make_vars["MEMORY"] = memory if image: make_vars["IMAGE"] = image - run_make("spawn", make_vars) + run_make(Target.SPAWN, make_vars) @app.command(name="list") def list_agents() -> None: """List active agent containers and worktrees.""" - run_make("list-agents") + run_make(Target.LIST_AGENTS) @app.command() @@ -51,7 +52,7 @@ def logs( branch: Annotated[str, typer.Option("--branch", help="Agent branch name")], ) -> None: """Show logs for a branch agent.""" - run_make("logs-agent", {"BRANCH": branch}) + run_make(Target.LOGS_AGENT, {"BRANCH": branch}) @app.command() @@ -59,7 +60,7 @@ def follow( branch: Annotated[str, typer.Option("--branch", help="Agent branch name")], ) -> None: """Follow live streaming logs for a branch agent.""" - run_make("follow-agent", {"BRANCH": branch}, tty=True) + run_make(Target.FOLLOW_AGENT, {"BRANCH": branch}, tty=True) @app.command() @@ -67,7 +68,7 @@ def stop( branch: Annotated[str, typer.Option("--branch", help="Agent branch name")], ) -> None: """Stop a branch agent container.""" - run_make("stop-agent", {"BRANCH": branch}) + run_make(Target.STOP_AGENT, {"BRANCH": branch}) @app.command() @@ -90,4 +91,4 @@ def summary( branch: Annotated[str, typer.Option("--branch", help="Agent branch name")], ) -> None: """Show structured lifecycle events for a branch agent.""" - run_make("summary-agent", {"BRANCH": branch}) + run_make(Target.SUMMARY_AGENT, {"BRANCH": branch}) diff --git a/app/cli/src/container_cli/commands/build.py b/app/cli/src/container_cli/commands/build.py index ed72b03..d1bab21 100644 --- a/app/cli/src/container_cli/commands/build.py +++ b/app/cli/src/container_cli/commands/build.py @@ -4,6 +4,7 @@ import typer +from container_cli.targets import Target from container_cli.utils import run_make app = typer.Typer(help="Image build commands") @@ -20,22 +21,22 @@ def build( make_vars["IMAGE"] = image if dockerfile: make_vars["DOCKERFILE"] = dockerfile - run_make("build", make_vars) + run_make(Target.BUILD, make_vars) @app.command() def clean() -> None: """Remove the container image.""" - run_make("clean") + run_make(Target.CLEAN) @app.command(name="clean-network") def clean_network() -> None: """Remove the bridge network.""" - run_make("clean-network") + run_make(Target.CLEAN_NETWORK) @app.command(name="clean-all") def clean_all() -> None: """Remove image and network.""" - run_make("clean-all") + run_make(Target.CLEAN_ALL) diff --git a/app/cli/src/container_cli/commands/network.py b/app/cli/src/container_cli/commands/network.py index 75f08dc..714b60f 100644 --- a/app/cli/src/container_cli/commands/network.py +++ b/app/cli/src/container_cli/commands/network.py @@ -4,6 +4,7 @@ import typer +from container_cli.targets import Target from container_cli.utils import run_make app = typer.Typer(help="Network management commands") @@ -20,4 +21,4 @@ def network( make_vars["SUBNET"] = subnet if network_name: make_vars["NETWORK"] = network_name - run_make("network", make_vars) + run_make(Target.NETWORK, make_vars) diff --git a/app/cli/src/container_cli/commands/pi_agents.py b/app/cli/src/container_cli/commands/pi_agents.py index 0241acb..f624dfa 100644 --- a/app/cli/src/container_cli/commands/pi_agents.py +++ b/app/cli/src/container_cli/commands/pi_agents.py @@ -17,6 +17,7 @@ import typer +from container_cli.targets import Target from container_cli.utils import find_git_root, run_make app = typer.Typer(help="PI agent lifecycle (local mlx_lm.server backend)") @@ -43,7 +44,7 @@ def build( make_vars["PI_IMAGE"] = image if dockerfile: make_vars["PI_DOCKERFILE"] = dockerfile - run_make("build-pi", make_vars) + run_make(Target.BUILD_PI, make_vars) @app.command() @@ -87,13 +88,13 @@ def spawn( make_vars["PI_BASE_URL"] = base_url if model_id: make_vars["PI_MODEL_ID"] = model_id - run_make("spawn-pi", make_vars) + run_make(Target.SPAWN_PI, make_vars) @app.command(name="list") def list_agents() -> None: """List active PI agent containers and PI worktrees.""" - run_make("list-pi-agents") + run_make(Target.LIST_PI_AGENTS) @app.command() @@ -101,7 +102,7 @@ def logs( branch: Annotated[str, typer.Option("--branch", help="PI agent branch name")], ) -> None: """Show logs for a PI agent (live container or persisted log).""" - run_make("logs-pi-agent", {"BRANCH": branch}) + run_make(Target.LOGS_PI_AGENT, {"BRANCH": branch}) @app.command() @@ -109,7 +110,7 @@ def follow( branch: Annotated[str, typer.Option("--branch", help="PI agent branch name")], ) -> None: """Follow live streaming logs for a PI agent.""" - run_make("follow-pi-agent", {"BRANCH": branch}, tty=True) + run_make(Target.FOLLOW_PI_AGENT, {"BRANCH": branch}, tty=True) @app.command() @@ -117,7 +118,7 @@ def stop( branch: Annotated[str, typer.Option("--branch", help="PI agent branch name")], ) -> None: """Stop a PI agent container.""" - run_make("stop-pi-agent", {"BRANCH": branch}) + run_make(Target.STOP_PI_AGENT, {"BRANCH": branch}) @app.command() diff --git a/app/cli/src/container_cli/commands/run.py b/app/cli/src/container_cli/commands/run.py index 1f17828..12dc3ba 100644 --- a/app/cli/src/container_cli/commands/run.py +++ b/app/cli/src/container_cli/commands/run.py @@ -4,6 +4,7 @@ import typer +from container_cli.targets import Target from container_cli.utils import check_token, run_make app = typer.Typer(help="Run coordinator container") @@ -24,7 +25,7 @@ def run( make_vars["MEMORY"] = memory if name: make_vars["NAME"] = name - run_make("run", make_vars, tty=True) + run_make(Target.RUN, make_vars, tty=True) @app.command() @@ -42,4 +43,4 @@ def shell( make_vars["MEMORY"] = memory if name: make_vars["NAME"] = name - run_make("shell", make_vars, tty=True) + run_make(Target.SHELL, make_vars, tty=True) diff --git a/app/cli/src/container_cli/targets.py b/app/cli/src/container_cli/targets.py new file mode 100644 index 0000000..69e8def --- /dev/null +++ b/app/cli/src/container_cli/targets.py @@ -0,0 +1,45 @@ +"""Single source of truth for the config/Makefile targets the q CLI invokes. + +Centralising the target names here turns the implicit, scattered functional +coupling between the CLI and the orchestration Makefile into one explicit +contract: a renamed target now has a single Python counterpart to update, and +``tests/test_targets.py`` verifies every member against the real Makefile. + +``Target`` is a ``StrEnum``, so each member compares equal to its string value +and can be passed anywhere a plain target string is expected. +""" + +from enum import StrEnum + + +class Target(StrEnum): + """Names of the config/Makefile targets invoked through ``utils.run_make``.""" + + # Image build and cleanup. + BUILD = "build" + CLEAN = "clean" + CLEAN_NETWORK = "clean-network" + CLEAN_ALL = "clean-all" + + # Bridge network. + NETWORK = "network" + + # Interactive coordinator container. + RUN = "run" + SHELL = "shell" + + # Claude agent lifecycle. + SPAWN = "spawn" + LIST_AGENTS = "list-agents" + LOGS_AGENT = "logs-agent" + FOLLOW_AGENT = "follow-agent" + STOP_AGENT = "stop-agent" + SUMMARY_AGENT = "summary-agent" + + # PI agent lifecycle (local mlx_lm backend). + BUILD_PI = "build-pi" + SPAWN_PI = "spawn-pi" + LIST_PI_AGENTS = "list-pi-agents" + LOGS_PI_AGENT = "logs-pi-agent" + FOLLOW_PI_AGENT = "follow-pi-agent" + STOP_PI_AGENT = "stop-pi-agent" diff --git a/app/cli/src/container_cli/utils.py b/app/cli/src/container_cli/utils.py index c22fdde..8fd77b4 100644 --- a/app/cli/src/container_cli/utils.py +++ b/app/cli/src/container_cli/utils.py @@ -6,6 +6,8 @@ import typer +from container_cli.targets import Target + def find_git_root() -> Path: """Return the absolute path of the repository root. @@ -50,11 +52,14 @@ def check_token() -> None: raise typer.Exit(1) -def run_make(target: str, extra_vars: dict[str, str] | None = None, *, tty: bool = False) -> None: +def run_make( + target: Target, extra_vars: dict[str, str] | None = None, *, tty: bool = False +) -> None: """Invoke a Makefile target inside the project's `config/` directory. Args: - target: The Make target name to execute. + target: The Makefile target to execute (a `Target` member, which is a + `StrEnum` and therefore usable directly as the make argument). extra_vars: Optional mapping of variables passed as `KEY=VALUE` to make. tty: If True, replace the current process with `make` so it inherits the TTY (used by interactive commands like `run` and `follow`). diff --git a/app/cli/tests/test_targets.py b/app/cli/tests/test_targets.py new file mode 100644 index 0000000..5818e9e --- /dev/null +++ b/app/cli/tests/test_targets.py @@ -0,0 +1,32 @@ +from __future__ import annotations + +import re +from pathlib import Path + +from container_cli.targets import Target + +_REPO_ROOT = Path(__file__).resolve().parents[3] +_MAKEFILE = _REPO_ROOT / "config" / "Makefile" +_TARGET_DEF = re.compile(r"^([a-zA-Z0-9_-]+):", re.MULTILINE) + + +class TestTargetValues: + def test_every_member_value_is_kebab_case(self): + for target in Target: + assert re.fullmatch(r"[a-z][a-z-]*", target.value), target + + def test_member_equals_its_string_value(self): + assert Target.SPAWN == "spawn" + assert Target.LIST_AGENTS == "list-agents" + assert Target.SPAWN_PI == "spawn-pi" + + def test_no_duplicate_values(self): + values = [t.value for t in Target] + assert len(values) == len(set(values)) + + +class TestMakefileContract: + def test_every_target_exists_in_the_makefile(self): + defined = set(_TARGET_DEF.findall(_MAKEFILE.read_text())) + missing = {t.value for t in Target} - defined + assert not missing, f"Target members absent from config/Makefile: {sorted(missing)}" From fe58b191dac7f6417d6b99319f77b2b4500c5e1b Mon Sep 17 00:00:00 2001 From: deimagjas Date: Thu, 21 May 2026 20:41:51 -0500 Subject: [PATCH 3/9] refactor(cli): consolidate worktree and status logic in utils.py MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit _agents_home() was duplicated byte-for-byte in agents.py and pi_agents.py, and the status command bodies were near-identical copies that also knew the private .agent/status.json layout. run/shell shared an identical body too. - Add utils.agents_home() — the single definition of the worktree-root rule (was duplicated 2x in Python). - Add utils.print_agent_status(branch, label=...) — the single reader of .agent/status.json; agents.status and pi.status now delegate to it, keeping their [status] / [pi-status] message tags via the label arg. - Extract run._coordinator() so run and shell stop duplicating their body. Behaviour is unchanged: status still reads the file directly and still exits 1 when it is missing, so no .feature file changes. The acceptance conftest now patches find_git_root in utils (one patch instead of two), and the moved TestAgentsHome tests live in test_utils.py alongside new coverage for print_agent_status. Co-Authored-By: Claude Opus 4.7 --- app/cli/src/container_cli/commands/agents.py | 22 +------- .../src/container_cli/commands/pi_agents.py | 21 +------ app/cli/src/container_cli/commands/run.py | 31 +++++------ app/cli/src/container_cli/utils.py | 36 +++++++++++- app/cli/tests/acceptance/conftest.py | 3 +- app/cli/tests/test_agents.py | 17 +----- app/cli/tests/test_pi_agents.py | 19 ------- app/cli/tests/test_utils.py | 55 ++++++++++++++++++- 8 files changed, 109 insertions(+), 95 deletions(-) diff --git a/app/cli/src/container_cli/commands/agents.py b/app/cli/src/container_cli/commands/agents.py index 0b1f706..9c1c3d5 100644 --- a/app/cli/src/container_cli/commands/agents.py +++ b/app/cli/src/container_cli/commands/agents.py @@ -1,26 +1,15 @@ """Agent lifecycle commands (spawn, list, logs, follow, stop, status, summary).""" -import json -import os -from pathlib import Path from typing import Annotated import typer from container_cli.targets import Target -from container_cli.utils import check_token, find_git_root, run_make +from container_cli.utils import check_token, print_agent_status, run_make app = typer.Typer(help="Agent lifecycle commands") -def _agents_home() -> Path: - """Resolve AGENTS_HOME, falling back to sibling .worktrees/ directory.""" - env_val = os.environ.get("AGENTS_HOME") - if env_val: - return Path(env_val) - return find_git_root().parent / ".worktrees" - - @app.command() def spawn( branch: Annotated[str, typer.Option("--branch", help="Git branch for the agent worktree")], @@ -76,14 +65,7 @@ def status( branch: Annotated[str, typer.Option("--branch", help="Agent branch name")], ) -> None: """Show agent status from persisted status.json file.""" - status_file = _agents_home() / branch / ".agent" / "status.json" - if not status_file.exists(): - typer.echo(f"[status] No status file found for branch '{branch}'.") - typer.echo(f"[status] Expected at: {status_file}") - raise typer.Exit(1) - - data = json.loads(status_file.read_text()) - typer.echo(json.dumps(data, indent=2)) + print_agent_status(branch, label="status") @app.command() diff --git a/app/cli/src/container_cli/commands/pi_agents.py b/app/cli/src/container_cli/commands/pi_agents.py index f624dfa..09bb976 100644 --- a/app/cli/src/container_cli/commands/pi_agents.py +++ b/app/cli/src/container_cli/commands/pi_agents.py @@ -10,27 +10,16 @@ from __future__ import annotations -import json -import os -from pathlib import Path from typing import Annotated import typer from container_cli.targets import Target -from container_cli.utils import find_git_root, run_make +from container_cli.utils import print_agent_status, run_make app = typer.Typer(help="PI agent lifecycle (local mlx_lm.server backend)") -def _agents_home() -> Path: - """Resolve AGENTS_HOME, falling back to sibling .worktrees/ directory.""" - env_val = os.environ.get("AGENTS_HOME") - if env_val: - return Path(env_val) - return find_git_root().parent / ".worktrees" - - @app.command() def build( image: Annotated[str | None, typer.Option("--image", help="PI image tag")] = None, @@ -126,10 +115,4 @@ def status( branch: Annotated[str, typer.Option("--branch", help="PI agent branch name")], ) -> None: """Show PI agent status from persisted status.json file.""" - status_file = _agents_home() / branch / ".agent" / "status.json" - if not status_file.exists(): - typer.echo(f"[pi-status] No status file found for branch '{branch}'.") - typer.echo(f"[pi-status] Expected at: {status_file}") - raise typer.Exit(1) - data = json.loads(status_file.read_text()) - typer.echo(json.dumps(data, indent=2)) + print_agent_status(branch, label="pi-status") diff --git a/app/cli/src/container_cli/commands/run.py b/app/cli/src/container_cli/commands/run.py index 12dc3ba..6d853b8 100644 --- a/app/cli/src/container_cli/commands/run.py +++ b/app/cli/src/container_cli/commands/run.py @@ -10,13 +10,8 @@ app = typer.Typer(help="Run coordinator container") -@app.command() -def run( - cpus: Annotated[int | None, typer.Option("--cpus", help="CPU count")] = None, - memory: Annotated[str | None, typer.Option("--memory", help="Memory limit (e.g. 12G)")] = None, - name: Annotated[str | None, typer.Option("--name", help="Container name")] = None, -) -> None: - """Run an interactive coordinator shell (hands off TTY).""" +def _coordinator(target: Target, cpus: int | None, memory: str | None, name: str | None) -> None: + """Validate the token and launch the coordinator container via `target`.""" check_token() make_vars: dict[str, str] = {} if cpus is not None: @@ -25,7 +20,17 @@ def run( make_vars["MEMORY"] = memory if name: make_vars["NAME"] = name - run_make(Target.RUN, make_vars, tty=True) + run_make(target, make_vars, tty=True) + + +@app.command() +def run( + cpus: Annotated[int | None, typer.Option("--cpus", help="CPU count")] = None, + memory: Annotated[str | None, typer.Option("--memory", help="Memory limit (e.g. 12G)")] = None, + name: Annotated[str | None, typer.Option("--name", help="Container name")] = None, +) -> None: + """Run an interactive coordinator shell (hands off TTY).""" + _coordinator(Target.RUN, cpus, memory, name) @app.command() @@ -35,12 +40,4 @@ def shell( name: Annotated[str | None, typer.Option("--name", help="Container name")] = None, ) -> None: """Alias for run — open an interactive shell in the coordinator container.""" - check_token() - make_vars: dict[str, str] = {} - if cpus is not None: - make_vars["CPUS"] = str(cpus) - if memory: - make_vars["MEMORY"] = memory - if name: - make_vars["NAME"] = name - run_make(Target.SHELL, make_vars, tty=True) + _coordinator(Target.SHELL, cpus, memory, name) diff --git a/app/cli/src/container_cli/utils.py b/app/cli/src/container_cli/utils.py index 8fd77b4..b004724 100644 --- a/app/cli/src/container_cli/utils.py +++ b/app/cli/src/container_cli/utils.py @@ -1,5 +1,6 @@ -"""Shared helpers: git root resolution, Makefile runner, and token validation.""" +"""Shared helpers: git/worktree paths, the Makefile runner, token and status I/O.""" +import json import os import subprocess from pathlib import Path @@ -37,6 +38,19 @@ def makefile_dir() -> Path: return find_git_root() / "config" +def agents_home() -> Path: + """Return the directory that holds agent worktrees. + + Returns: + The `AGENTS_HOME` environment variable when set, otherwise a sibling + `.worktrees/` directory next to the repository root. + """ + env_val = os.environ.get("AGENTS_HOME") + if env_val: + return Path(env_val) + return find_git_root().parent / ".worktrees" + + def check_token() -> None: """Verify that the Claude container OAuth token is exported. @@ -76,3 +90,23 @@ def run_make( result = subprocess.run(cmd) if result.returncode != 0: raise typer.Exit(result.returncode) + + +def print_agent_status(branch: str, *, label: str) -> None: + """Print the persisted `status.json` for an agent worktree branch. + + Args: + branch: The agent branch whose status should be displayed. + label: Tag used in the not-found messages (e.g. `status`, `pi-status`). + + Raises: + typer.Exit: With code 1 when no status file exists for the branch. + """ + status_file = agents_home() / branch / ".agent" / "status.json" + if not status_file.exists(): + typer.echo(f"[{label}] No status file found for branch '{branch}'.") + typer.echo(f"[{label}] Expected at: {status_file}") + raise typer.Exit(1) + + data = json.loads(status_file.read_text()) + typer.echo(json.dumps(data, indent=2)) diff --git a/app/cli/tests/acceptance/conftest.py b/app/cli/tests/acceptance/conftest.py index 6d00b06..7406de6 100644 --- a/app/cli/tests/acceptance/conftest.py +++ b/app/cli/tests/acceptance/conftest.py @@ -42,8 +42,7 @@ def invocation_context( patch("container_cli.commands.run.run_make") as m_run, patch("container_cli.commands.network.run_make") as m_network, patch("container_cli.commands.pi_agents.run_make") as m_pi, - patch("container_cli.commands.agents.find_git_root", return_value=repo), - patch("container_cli.commands.pi_agents.find_git_root", return_value=repo), + patch("container_cli.utils.find_git_root", return_value=repo), ): ctx = InvocationContext( runner=CliRunner(), diff --git a/app/cli/tests/test_agents.py b/app/cli/tests/test_agents.py index c229235..719e9c8 100644 --- a/app/cli/tests/test_agents.py +++ b/app/cli/tests/test_agents.py @@ -7,7 +7,6 @@ import typer from container_cli.commands.agents import ( - _agents_home, follow, list_agents, logs, @@ -98,22 +97,8 @@ def test_reads_status_file(self, tmp_path, monkeypatch): def test_agents_home_fallback(self, monkeypatch): monkeypatch.delenv("AGENTS_HOME", raising=False) with ( - patch("container_cli.commands.agents.find_git_root", return_value=Path("/fake/repo")), + patch("container_cli.utils.find_git_root", return_value=Path("/fake/repo")), pytest.raises(typer.Exit) as exc_info, ): status(branch="nonexistent-branch") assert exc_info.value.exit_code == 1 - - -class TestAgentsHome: - def test_uses_env_var(self, tmp_path, monkeypatch): - monkeypatch.setenv("AGENTS_HOME", str(tmp_path)) - assert _agents_home() == tmp_path - - def test_fallback_path_name(self, monkeypatch): - monkeypatch.delenv("AGENTS_HOME", raising=False) - with patch( - "container_cli.commands.agents.find_git_root", return_value=Path("/home/user/repo") - ): - result = _agents_home() - assert result == Path("/home/user/.worktrees") diff --git a/app/cli/tests/test_pi_agents.py b/app/cli/tests/test_pi_agents.py index 5193ad4..21b10d2 100644 --- a/app/cli/tests/test_pi_agents.py +++ b/app/cli/tests/test_pi_agents.py @@ -6,14 +6,10 @@ from __future__ import annotations -from pathlib import Path -from unittest.mock import patch - import pytest import typer from container_cli.commands.pi_agents import ( - _agents_home, build, follow, list_agents, @@ -148,18 +144,3 @@ def test_reads_status_file(self, tmp_path, monkeypatch): status_file.parent.mkdir(parents=True) status_file.write_text('{"phase": "completed", "agent_kind": "pi"}') status(branch="pi/x") - - -class TestAgentsHome: - def test_uses_env_var(self, tmp_path, monkeypatch): - monkeypatch.setenv("AGENTS_HOME", str(tmp_path)) - assert _agents_home() == tmp_path - - def test_fallback_path_name(self, monkeypatch): - monkeypatch.delenv("AGENTS_HOME", raising=False) - with patch( - "container_cli.commands.pi_agents.find_git_root", - return_value=Path("/home/user/repo"), - ): - result = _agents_home() - assert result == Path("/home/user/.worktrees") diff --git a/app/cli/tests/test_utils.py b/app/cli/tests/test_utils.py index 554fe8a..efa26e8 100644 --- a/app/cli/tests/test_utils.py +++ b/app/cli/tests/test_utils.py @@ -7,7 +7,14 @@ import pytest import typer -from container_cli.utils import check_token, find_git_root, makefile_dir, run_make +from container_cli.utils import ( + agents_home, + check_token, + find_git_root, + makefile_dir, + print_agent_status, + run_make, +) # ---------- find_git_root ---------- @@ -132,3 +139,49 @@ def test_default_tty_uses_subprocess(self, fake_git_root: Path): run_make("build") m_sub.assert_called_once() m_exec.assert_not_called() + + +# ---------- agents_home ---------- + + +class TestAgentsHome: + def test_uses_env_var(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch): + monkeypatch.setenv("AGENTS_HOME", str(tmp_path)) + assert agents_home() == tmp_path + + def test_fallback_path_name(self, monkeypatch: pytest.MonkeyPatch): + monkeypatch.delenv("AGENTS_HOME", raising=False) + with patch("container_cli.utils.find_git_root", return_value=Path("/home/user/repo")): + assert agents_home() == Path("/home/user/.worktrees") + + +# ---------- print_agent_status ---------- + + +class TestPrintAgentStatus: + def test_exits_when_status_file_missing( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys + ): + monkeypatch.setenv("AGENTS_HOME", str(tmp_path)) + with pytest.raises(typer.Exit) as exc_info: + print_agent_status("feat-x", label="status") + assert exc_info.value.exit_code == 1 + assert "[status] No status file found" in capsys.readouterr().out + + def test_prints_json_when_status_file_exists( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys + ): + monkeypatch.setenv("AGENTS_HOME", str(tmp_path)) + status_file = tmp_path / "feat-x" / ".agent" / "status.json" + status_file.parent.mkdir(parents=True) + status_file.write_text('{"phase": "completed"}') + print_agent_status("feat-x", label="status") + assert "completed" in capsys.readouterr().out + + def test_label_tags_the_not_found_message( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys + ): + monkeypatch.setenv("AGENTS_HOME", str(tmp_path)) + with pytest.raises(typer.Exit): + print_agent_status("feat-x", label="pi-status") + assert "[pi-status]" in capsys.readouterr().out From 94c7c37be9941d9718e2b1a70e7e33f45ad80c29 Mon Sep 17 00:00:00 2001 From: deimagjas Date: Thu, 21 May 2026 20:44:26 -0500 Subject: [PATCH 4/9] refactor(cli): drop the dead Typer apps from build/network/run MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit build.py, network.py and run.py each created a typer.Typer() app and decorated their functions with @app.command(), but main.py never mounted those apps — it registers the bare functions directly. The app objects were dead code that misled readers into expecting a mounted sub-app. Remove the unused typer.Typer() and @app.command() decorators from the three modules; their functions are now plain `def`s, registered by main.py exactly as before. agents.py and pi_agents.py keep their Typer app because they genuinely are mounted as sub-apps — that becomes the one consistent pattern. The q CLI surface is unchanged. Co-Authored-By: Claude Opus 4.7 --- app/cli/src/container_cli/commands/build.py | 11 ++++------- app/cli/src/container_cli/commands/network.py | 8 ++++---- app/cli/src/container_cli/commands/run.py | 9 ++++----- 3 files changed, 12 insertions(+), 16 deletions(-) diff --git a/app/cli/src/container_cli/commands/build.py b/app/cli/src/container_cli/commands/build.py index d1bab21..3d3bb89 100644 --- a/app/cli/src/container_cli/commands/build.py +++ b/app/cli/src/container_cli/commands/build.py @@ -1,4 +1,7 @@ -"""Image build and cleanup commands.""" +"""Image build and cleanup commands. + +These functions are registered as top-level commands by `container_cli.main`. +""" from typing import Annotated @@ -7,10 +10,7 @@ from container_cli.targets import Target from container_cli.utils import run_make -app = typer.Typer(help="Image build commands") - -@app.command() def build( image: Annotated[str | None, typer.Option("--image", help="Image tag")] = None, dockerfile: Annotated[str | None, typer.Option("--dockerfile", help="Dockerfile path")] = None, @@ -24,19 +24,16 @@ def build( run_make(Target.BUILD, make_vars) -@app.command() def clean() -> None: """Remove the container image.""" run_make(Target.CLEAN) -@app.command(name="clean-network") def clean_network() -> None: """Remove the bridge network.""" run_make(Target.CLEAN_NETWORK) -@app.command(name="clean-all") def clean_all() -> None: """Remove image and network.""" run_make(Target.CLEAN_ALL) diff --git a/app/cli/src/container_cli/commands/network.py b/app/cli/src/container_cli/commands/network.py index 714b60f..a39374f 100644 --- a/app/cli/src/container_cli/commands/network.py +++ b/app/cli/src/container_cli/commands/network.py @@ -1,4 +1,7 @@ -"""Bridge network management commands.""" +"""Bridge network management command. + +Registered as a top-level command by `container_cli.main`. +""" from typing import Annotated @@ -7,10 +10,7 @@ from container_cli.targets import Target from container_cli.utils import run_make -app = typer.Typer(help="Network management commands") - -@app.command() def network( subnet: Annotated[str | None, typer.Option("--subnet", help="Subnet CIDR")] = None, network_name: Annotated[str | None, typer.Option("--network-name", help="Network name")] = None, diff --git a/app/cli/src/container_cli/commands/run.py b/app/cli/src/container_cli/commands/run.py index 6d853b8..5c9da95 100644 --- a/app/cli/src/container_cli/commands/run.py +++ b/app/cli/src/container_cli/commands/run.py @@ -1,4 +1,7 @@ -"""Interactive coordinator container (`run` and `shell` aliases).""" +"""Interactive coordinator container (`run` and `shell` aliases). + +Registered as top-level commands by `container_cli.main`. +""" from typing import Annotated @@ -7,8 +10,6 @@ from container_cli.targets import Target from container_cli.utils import check_token, run_make -app = typer.Typer(help="Run coordinator container") - def _coordinator(target: Target, cpus: int | None, memory: str | None, name: str | None) -> None: """Validate the token and launch the coordinator container via `target`.""" @@ -23,7 +24,6 @@ def _coordinator(target: Target, cpus: int | None, memory: str | None, name: str run_make(target, make_vars, tty=True) -@app.command() def run( cpus: Annotated[int | None, typer.Option("--cpus", help="CPU count")] = None, memory: Annotated[str | None, typer.Option("--memory", help="Memory limit (e.g. 12G)")] = None, @@ -33,7 +33,6 @@ def run( _coordinator(Target.RUN, cpus, memory, name) -@app.command() def shell( cpus: Annotated[int | None, typer.Option("--cpus", help="CPU count")] = None, memory: Annotated[str | None, typer.Option("--memory", help="Memory limit (e.g. 12G)")] = None, From 6511dec48e46ab523d286a86cc572337e3c1ceb4 Mon Sep 17 00:00:00 2001 From: deimagjas Date: Fri, 22 May 2026 07:14:23 -0500 Subject: [PATCH 5/9] fix(cli): locate config/Makefile by walking up in the contract test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit test_targets.py resolved the repo root with a fixed parent index, which pointed at the wrong directory when the test ran from the mutmut copy under mutants/ (one extra path level). Walk up from the test file until a config/Makefile is found instead — correct both in tests/ and mutants/tests/. Co-Authored-By: Claude Opus 4.7 --- app/cli/tests/test_targets.py | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/app/cli/tests/test_targets.py b/app/cli/tests/test_targets.py index 5818e9e..971ffbb 100644 --- a/app/cli/tests/test_targets.py +++ b/app/cli/tests/test_targets.py @@ -5,11 +5,25 @@ from container_cli.targets import Target -_REPO_ROOT = Path(__file__).resolve().parents[3] -_MAKEFILE = _REPO_ROOT / "config" / "Makefile" _TARGET_DEF = re.compile(r"^([a-zA-Z0-9_-]+):", re.MULTILINE) +def _find_makefile() -> Path: + """Locate config/Makefile by walking up from this test file. + + Walking up (rather than a fixed parent index) keeps the lookup correct when + the test runs from the mutmut copy under ``mutants/``. + """ + for parent in Path(__file__).resolve().parents: + candidate = parent / "config" / "Makefile" + if candidate.is_file(): + return candidate + raise FileNotFoundError("config/Makefile not found in any parent directory") + + +_MAKEFILE = _find_makefile() + + class TestTargetValues: def test_every_member_value_is_kebab_case(self): for target in Target: From 125e5b1c15ada8d964d8f5511b4da92de7055ea4 Mon Sep 17 00:00:00 2001 From: deimagjas Date: Fri, 22 May 2026 07:14:23 -0500 Subject: [PATCH 6/9] docs: document the E2E tests and refresh the CLI reference - Add docs/agents/e2e-tests.md: prerequisites, how to run `make e2e-test`, what each round-trip test asserts, and the cost/CI caveats. - cli.md: add targets.py to the package structure, note the Target contract and the tests/e2e/ suite. - CLAUDE.md: list `make e2e-test` in the Testing section. Co-Authored-By: Claude Opus 4.7 --- CLAUDE.md | 3 ++ docs/agents/cli.md | 12 +++++-- docs/agents/e2e-tests.md | 77 ++++++++++++++++++++++++++++++++++++++++ 3 files changed, 89 insertions(+), 3 deletions(-) create mode 100644 docs/agents/e2e-tests.md diff --git a/CLAUDE.md b/CLAUDE.md index 96508f2..5c4be47 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -53,6 +53,9 @@ uv run pytest -k test_spawn # single test by name # Acceptance tests only (Gherkin/BDD, local — no real containers) make acceptance-test +# End-to-end tests (real containers, opt-in, local-only — see docs/agents/e2e-tests.md) +make e2e-test + # Mutation testing gate (≥ 70% kill rate) make mutation-ci-threshold diff --git a/docs/agents/cli.md b/docs/agents/cli.md index e91bc61..b5d5dc0 100644 --- a/docs/agents/cli.md +++ b/docs/agents/cli.md @@ -235,16 +235,22 @@ app/cli/ ├── pyproject.toml ← uv project, entry point: q └── src/ └── container_cli/ - ├── main.py ← registers all commands + agents sub-app - ├── utils.py ← find_git_root, makefile_dir, check_token, run_make + ├── main.py ← registers all commands + agents/pi sub-apps + ├── targets.py ← Target enum: the config/Makefile target contract + ├── utils.py ← git/worktree paths, run_make, check_token, print_agent_status └── commands/ ├── build.py ← build, clean, clean-network, clean-all ├── network.py ← network ├── run.py ← run, shell - ├── agents.py ← spawn, list, logs, follow, stop + ├── agents.py ← spawn, list, logs, follow, stop, status, summary └── pi_agents.py ← pi build/spawn/list/logs/follow/stop/status ``` +> Target names are not free strings: every Makefile target the CLI invokes is a +> member of the `Target` enum in `targets.py`, and `tests/test_targets.py` +> verifies each one against `config/Makefile`. Real-container round-trip tests +> live in `tests/e2e/` — see [e2e-tests.md](./e2e-tests.md). + --- ## CLI ↔ Makefile mapping diff --git a/docs/agents/e2e-tests.md b/docs/agents/e2e-tests.md new file mode 100644 index 0000000..39b8302 --- /dev/null +++ b/docs/agents/e2e-tests.md @@ -0,0 +1,77 @@ +# End-to-end tests — orchestrator ↔ agent + +The E2E tests spawn **real Apple Container CLI containers** and validate the +full round-trip between the orchestrator (the `q` CLI) and an agent running in +a container. They complement — they do not replace — the mocked acceptance +suite: acceptance tests are the fast contract, E2E is the real integration +smoke. + +**Location:** `app/cli/tests/e2e/` + +## Why opt-in and local-only + +The tests need Apple Container CLI (macOS 26+, ARM64), which GitHub Actions +does not have — exactly like the acceptance suite. They are also slow and +billable. So they are gated: + +- Collection is guarded by `STACKAI_E2E=1`. Without it, pytest never imports + the test modules, so the default `uv run pytest` and the `mutmut` suite + ignore them entirely. +- `make e2e-test` (from `app/cli/`) exports the variable and runs them. +- They are excluded from CI and from `make local-qa` by design. + +## Prerequisites + +Each test skips itself, with a reason, when a prerequisite is missing. + +| Test | Needs | +|---|---| +| both | Apple Container CLI on `PATH`; the bridge network (created automatically) | +| E2E-1 (Claude) | `CLAUDE_CONTAINER_OAUTH_TOKEN` exported; `claude-agent:wolfi` built (`q build`) | +| E2E-2 (PI) | `claude-pi:ubuntu` built (`q pi build`); `mlx_lm.server` reachable (`uv run iac server start`) | + +> **Cost:** E2E-1 spends real Claude credits — it runs a live headless agent. +> E2E-2 uses the local model, so it is free but needs the model downloaded and +> the server running. + +The PI server URL defaults to `http://localhost:8080/v1/models`; override the +reachability check with `STACKAI_E2E_MLX_URL` if your server listens elsewhere. + +## How to run + +```bash +cd app/cli +make e2e-test # STACKAI_E2E=1 uv run pytest tests/e2e -v -m e2e +``` + +Run a single test: + +```bash +STACKAI_E2E=1 uv run pytest tests/e2e/test_claude_agent_e2e.py -v +``` + +## What each test asserts + +Both tests dispatch a deterministic task, wait for the agent to reach a +terminal phase, then assert the orchestrator receives the report through every +channel. Each test runs in an isolated `AGENTS_HOME` (a temp directory) and +tears down its container, worktree and branch afterwards. + +### E2E-1 — Claude agent (`test_claude_agent_e2e.py`) + +1. `q network`, then `q spawn` with a task that creates `E2E_OK.txt` and commits. +2. `status.json` reaches `phase=completed`, `exit_code=0`, `commits>=1`. +3. The committed file exists in the shared worktree. +4. `q agents status`, `q agents logs` and `q agents summary` surface the + report (status JSON, persisted log, `[agent:status]` markers). + +### E2E-2 — PI agent (`test_pi_agent_e2e.py`) + +1. `q pi spawn` against the local `mlx_lm.server` backend (no Claude token). +2. `status.json` reaches `phase=completed` and is tagged `agent_kind=pi`. +3. `q pi status`, `q pi list` and `q pi logs` surface the PI report; `q pi list` + filters this worktree in by `agent_kind=pi`. + +`MAX_PI_AGENTS` enforcement is not E2E-tested — it would need a racy +spawn-while-running attempt; it is covered by the Makefile logic and the mocked +acceptance suite. From 74747651e3725590c4d128ff735383aa6fff235b Mon Sep 17 00:00:00 2001 From: deimagjas Date: Fri, 22 May 2026 07:20:20 -0500 Subject: [PATCH 7/9] add: modularity review --- .../2026-05-21/modularity-review.html | 473 ++++++++++++++++++ .../2026-05-21/modularity-review.md | 128 +++++ 2 files changed, 601 insertions(+) create mode 100644 docs/modularity-review/2026-05-21/modularity-review.html create mode 100644 docs/modularity-review/2026-05-21/modularity-review.md diff --git a/docs/modularity-review/2026-05-21/modularity-review.html b/docs/modularity-review/2026-05-21/modularity-review.html new file mode 100644 index 0000000..95eb179 --- /dev/null +++ b/docs/modularity-review/2026-05-21/modularity-review.html @@ -0,0 +1,473 @@ + + + + + +Modularity Review + + + + + +
+

Modularity Review

+

Scope: The q Python CLI (app/cli/) and its integration boundary with config/Makefile
+Date: 2026-05-21

+ +

Executive Summary

+

stackai orchestrates parallel Claude Code agents in sandboxed Apple Containers; the q CLI is a typed Typer front end that wraps the config/Makefile orchestration targets. In the small the CLI is healthy — utils.run_make cleanly encapsulates the mechanism of invoking make, and every module is short and single-purpose. The modularity weakness is concentrated at one boundary: the CLI's knowledge of which Makefile targets exist and what variables they accept is functionally coupled and implicit — encoded as bare string literals scattered across five command modules with no contract to make the dependency explicit or fail loudly. Because the Makefile is the core subdomain of the product and evolves continuously, this is the most important finding: the CLI silently breaks at runtime when a target is renamed. A second, self-inflicted issue compounds it — the status command reaches past the Makefile into the container's private .agent/status.json file, reproducing an existing status-agent target with intrusive coupling.

+

Overall status: needs attention. There is no critical distributed-monolith failure, but two unbalanced-and-volatile integrations will cause avoidable pain as the Makefile evolves, and two smaller cohesion defects add cognitive friction.

+ +

Coupling Overview

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
IntegrationStrengthDistanceVolatilityBalanced?
commands/*config/Makefile (target & variable names)Functional, implicitHigh — separate language, separate directory, subprocessHigh — orchestration is the core subdomain, targets evolveNo — unbalanced & volatile
agents.status / pi.status → container .agent/status.jsonIntrusiveHigh — bash entrypoint, runs inside the containerMedium–High — monitoring actively developedNo — unbalanced & volatile
agents._agents_homepi_agents._agents_homeFunctional, duplicatedLow — sibling modules, one packageLow–MediumNo — duplication mistaken for cohesion
_agents_home() fallback ≡ Makefile AGENTS_HOMEFunctional, duplicatedHigh — cross-languageLowTolerable — low volatility neutralizes
main.pycommands/* (registration)Model, inconsistentLow — one packageLowBorderline — low cohesion
check_token() ≡ Makefile _check-tokenFunctional, duplicatedHigh — cross-languageLowYes — low volatility neutralizes
+

The last row is included deliberately as a contrast: it is duplicated, unbalanced coupling, but volatility is low enough that the balance rule(STRENGTH XOR DISTANCE) OR NOT VOLATILE — is satisfied. Not every imbalance is a defect; only the volatile ones below are.

+ +
+

Issue: Makefile interface is an implicit, scattered contract

+

Integration: container_cli/commands/*config/Makefile
+Severity: Significant

+ +

Knowledge Leakage

+

Every command function knows, as hard-coded strings, two things about the Makefile: the name of a target ("spawn", "list-agents", "spawn-pi", "summary-agent", …) and the names of the variables that target accepts (BRANCH, TASK, CPUS, MEMORY, IMAGE, DOCKERFILE, SUBNET, NETWORK, PI_IMAGE, PI_DOCKERFILE, PI_BASE_URL, PI_MODEL_ID, NAME). That is the Makefile's functional requirements — what it does and how it must be invoked — replicated inside Python. The shared knowledge is implicit: there is no enum, no schema, no generated mapping. The only place the correspondence is written down is a prose table in docs/agents/cli.md, which nothing keeps honest. run_make centralizes the mechanism (make -C config <target> KEY=VALUE) but not the vocabulary, so each of the five commands/* modules independently owns a fragment of an undeclared contract.

+

This is functional coupling — the components must change together when a target's interface changes — and it is implicit functional coupling, which the model singles out as particularly dangerous: there is no compiler, type checker, or test that links agents.py:33's "BRANCH" to the Makefile's BRANCH ?=.

+ +

Complexity Impact

+

A developer changing a Makefile target name or variable cannot see, from the Makefile, that Python depends on it — the dependency is invisible at the point of change. To safely rename one target the developer must simultaneously hold in mind: the Makefile target, the run_make call site in commands/*, the cli.md mapping table, and the acceptance test. That is knowledge spread across four artifacts in two languages for what should be a trivial rename. It exceeds the 4±1 working-memory budget, which is precisely what turns a change from modular (predictable outcome) into complex (outcome discovered only by running it).

+ +

Cascading Changes

+
    +
  • Renaming list-agentsagents-list in the Makefile: the Python literal "list-agents" still type-checks, still passes ruff, still imports cleanly. The break surfaces only when a user runs q agents list and make reports No rule to make target. The acceptance tests mock run_make, so they assert which string was passed, not that the target exists — there is no test-time signal either.
  • +
  • Adding a variable to an existing target (e.g. a --no-cache option for build): requires coordinated edits in build.py, the Makefile, and cli.md, with nothing enforcing that the three agree.
  • +
  • The coupling is genuinely tight: high integration strength (functional) across high distance — separate language, separate top-level directory, subprocess boundary. The Makefile is a core subdomain: the orchestration logic is the product, and git history shows targets added continuously (PI agents, apply-sensors). High strength + high distance + high volatility fails the balance rule.
  • +
+

Why Significant and not Critical: the same person maintains both sides, so socio-technical distance is low — there is no cross-team coordination cost — and Makefile change is mostly additive (new targets), so existing literals rarely break. The danger is real but not yet frequent.

+ +

Recommended Improvement

+

Introduce a single contract module — e.g. container_cli/makefile.py — as the only place target names and their permitted variables are written down. One small structure per target (target name + allowed variable keys) converts the implicit functional coupling into explicit contract coupling localized in one file; command modules then reference Targets.SPAWN instead of the literal "spawn". Optionally add one test that runs make -C config -n <target> (dry-run) for every target in the registry, so a renamed or removed target fails CI instead of a user's terminal.

+

This deliberately does not reduce distance — the CLI must wrap the Makefile, that is its purpose, and decomposing further would only raise distance. It reduces strength by encapsulating the contract, and it concentrates the volatile knowledge so a Makefile change has exactly one Python counterpart to update. Trade-off: a thin indirection layer plus the discipline of keeping the registry current — cheap relative to silent runtime breakage in the product's most-changed area.

+
+ +
+

Issue: status command bypasses the Makefile and reads the container's private file

+

Integration: agents.status / pi_agents.status → container worktree .agent/status.json
+Severity: Significant

+ +

Knowledge Leakage

+

agents.py:78 and pi_agents.py:128 build the path $AGENTS_HOME/<branch>/.agent/status.json and read it directly off disk. .agent/ is internal bookkeeping created by the container entrypoint (config/entrypoint.sh / entrypoint-pi.sh) for its own use; the CLI treats that private layout as if it were a public interface. This is intrusive coupling — the highest integration strength — because the CLI depends on an implementation detail of a different component.

+

The leak is twofold: the directory layout (.agent/status.json) and even the fallback message text ([status] No status file found…, [status] Expected at: …) are reproduced in Python. They are reproduced because the Makefile already has a status-agent target (Makefile:179) that performs the identical read and prints near-identical messages. So this is also functional coupling through duplication — the knowledge is copied, not shared.

+ +

Complexity Impact

+

There are now two implementations of "show agent status" — status-agent in Make and status() in Python — and nothing keeps them consistent. q agents summary routes through make summary-agent, but q agents status does not route through make status-agent; a reader cannot predict from the CLI surface which commands delegate and which reimplement. The Python file read looks like ordinary I/O, not an integration point, so it is easy to miss when reasoning about what depends on the entrypoint's layout — the integration is hidden, which is the hallmark of complexity.

+ +

Cascading Changes

+
    +
  • The entrypoint relocates status.json (say to .agent/state/status.json): status-agent, status-pi-agent, agents.status, and pi_agents.status all break — four sites, two languages — and the two Python ones give no signal until invoked.
  • +
  • The "no status file" UX is reworded in the Makefile only: q … status silently keeps the old wording; the two surfaces drift apart.
  • +
  • status.json is part of actively-developed structured monitoring (recent commits #7 and #9), so this sits in a volatile area — the duplication will be exercised. Intrusive strength + high distance + volatility is unbalanced and not neutralized.
  • +
+ +

Recommended Improvement

+

Route status through the target that already exists: run_make("status-agent", {"BRANCH": branch}) — exactly as summary already does. This removes the intrusive coupling and the duplication in one move: the Makefile becomes the single component that knows the .agent/ layout, and the CLI holds only a target name (governed by the contract module proposed in the first issue). The documented rationale for the direct read — "works after the container has exited" — is already satisfied by status-agent, which is a pure file read with no live-container dependency, so no capability is lost.

+

Trade-off: the Python status currently raises typer.Exit(1) when the file is missing, whereas the Makefile target prints a message and exits 0. If a non-zero exit code is contractually required, add an explicit exit 1 to the missing-file branch of status-agent rather than reimplementing the read in Python. The cost is one line of Make; the gain is eliminating an intrusive, duplicated integration in a volatile area.

+
+ +
+

Issue: Worktree-path logic duplicated across command modules

+

Integration: agents._agents_homepi_agents._agents_home (and both ≡ Makefile AGENTS_HOME)
+Severity: Minor

+ +

Knowledge Leakage

+

_agents_home() is byte-for-byte identical in agents.py:15 and pi_agents.py:25: "use $AGENTS_HOME if set, else <git-root>/../.worktrees". The same rule is the Makefile's AGENTS_HOME ?= $(dirname GIT_ROOT)/.worktrees (Makefile:54). One business rule — where agent worktrees live — implemented three times. This is functional coupling through duplicated knowledge. The run/shell pair in run.py is a smaller instance of the same pattern: two functions with identical bodies that differ only in the target string passed to run_make.

+ +

Complexity Impact

+

The two Python copies sit at low distance — sibling modules in one package. High strength at low distance would normally read as high cohesion, which is modular. But this is duplication, not cohesion: the copies are not one deliberate unit, they are two units that secretly share a rule. The STRENGTH XOR DISTANCE formula says "balanced" only if you mistake the duplication for cohesion. The real cost is the hidden obligation every developer carries — to remember that editing one copy means editing the twin.

+ +

Cascading Changes

+
    +
  • Changing the fallback (e.g. .worktrees.agent-worktrees) requires editing both Python copies; missing one makes q agents … and q pi … resolve different worktree roots — a confusing, hard-to-spot bug.
  • +
  • The Python ↔ Makefile copy is unbalanced (high strength, high cross-language distance), but volatility is low — the worktree convention rarely changes. By the balance rule, low volatility neutralizes it: tolerable technical debt, not urgent.
  • +
+ +

Recommended Improvement

+

Hoist _agents_home() into utils.py, beside find_git_root, and have both command modules import it — one definition, genuine high cohesion instead of duplication. Collapse run/shell the same way, into one shared helper parameterised by target name. Both are pure DRY fixes at low distance, with no trade-off.

+

Leave the Python-vs-Makefile AGENTS_HOME duplication as-is. Its low volatility makes the imbalance acceptable, and unifying it would mean either the CLI parsing the Makefile or the Makefile shelling into Python — both add more coupling than they remove. This is the model's pragmatism in action: not every imbalance is worth fixing.

+
+ +
+

Issue: Two inconsistent command-registration patterns

+

Integration: main.pycommands/*
+Severity: Minor

+ +

Knowledge Leakage

+

main.py mounts agents and pi_agents as Typer sub-apps via add_typer(agents.app, …), but registers build, network, and run by cherry-picking individual functions: app.command("build")(build.build). Meanwhile build.py, network.py, and run.py each still create a typer.Typer() app (e.g. build.py:9) and decorate their functions with @app.command() — yet that app object is never mounted anywhere. main.py carries an inconsistent model of what a "command module" is: sometimes a sub-app, sometimes a bag of functions.

+ +

Complexity Impact

+

This is a low cohesion signal: the app object in build.py/network.py/run.py is dead code that actively misleads. A reader who has seen agents.app mounted reasonably assumes build.app is mounted too, and must trace main.py to discover it is not. The cost is small and contained to one short file, but every new command module forces an unstated choice between two patterns — accidental complexity with no functional purpose.

+ +

Cascading Changes

+

Low. main.py and commands/* are at minimal distance — one package — and registration changes are infrequent. There is no volatile cascade here; the harm is purely cognitive friction, which is why this is Minor.

+ +

Recommended Improvement

+

Pick one pattern and apply it everywhere. The simplest: every command module exposes a Typer app, and main.py only ever calls add_typer. If flat top-level commands like q build must be preserved (rather than q build build), then drop the unused typer.Typer() from build.py, network.py, and run.py so their functions are honestly just functions, registered directly. Either way the rule "a command module looks like X" becomes true everywhere, and the dead app objects disappear. The only cost is a one-time edit; the gain is one fewer decision per future command module.

+
+ +
+ +
+ + + + diff --git a/docs/modularity-review/2026-05-21/modularity-review.md b/docs/modularity-review/2026-05-21/modularity-review.md new file mode 100644 index 0000000..d80b520 --- /dev/null +++ b/docs/modularity-review/2026-05-21/modularity-review.md @@ -0,0 +1,128 @@ +# Modularity Review + +**Scope**: The `q` Python CLI (`app/cli/`) and its integration boundary with `config/Makefile` +**Date**: 2026-05-21 + +## Executive Summary + +stackai orchestrates parallel Claude Code agents in sandboxed Apple Containers; the `q` CLI is a typed Typer front end that wraps the `config/Makefile` orchestration targets. In the small the CLI is healthy — `utils.run_make` cleanly encapsulates the *mechanism* of invoking `make`, and every module is short and single-purpose. The [modularity](https://coupling.dev/posts/core-concepts/modularity/) weakness is concentrated at one boundary: the CLI's knowledge of *which* Makefile targets exist and *what variables they accept* is [functionally coupled](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) and **implicit** — encoded as bare string literals scattered across five command modules with no contract to make the dependency explicit or fail loudly. Because the Makefile is the [core subdomain](https://coupling.dev/posts/dimensions-of-coupling/volatility/) of the product and evolves continuously, this is the most important finding: the CLI silently breaks at runtime when a target is renamed. A second, self-inflicted issue compounds it — the `status` command reaches *past* the Makefile into the container's private `.agent/status.json` file, reproducing an existing `status-agent` target with [intrusive coupling](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/). + +Overall status: **needs attention**. There is no critical distributed-monolith failure, but two unbalanced-and-volatile integrations will cause avoidable pain as the Makefile evolves, and two smaller cohesion defects add cognitive friction. + +## Coupling Overview + +| Integration | [Strength](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) | [Distance](https://coupling.dev/posts/dimensions-of-coupling/distance/) | [Volatility](https://coupling.dev/posts/dimensions-of-coupling/volatility/) | [Balanced?](https://coupling.dev/posts/core-concepts/balance/) | +| --- | --- | --- | --- | --- | +| `commands/*` → `config/Makefile` (target & variable names) | [Functional](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/), implicit | High — separate language, separate directory, subprocess | High — orchestration is the core subdomain, targets evolve | **No** — unbalanced & volatile | +| `agents.status` / `pi.status` → container `.agent/status.json` | [Intrusive](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) | High — bash entrypoint, runs inside the container | Medium–High — monitoring actively developed | **No** — unbalanced & volatile | +| `agents._agents_home` ≡ `pi_agents._agents_home` | [Functional](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/), duplicated | Low — sibling modules, one package | Low–Medium | **No** — duplication mistaken for cohesion | +| `_agents_home()` fallback ≡ Makefile `AGENTS_HOME` | [Functional](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/), duplicated | High — cross-language | Low | Tolerable — low volatility neutralizes | +| `main.py` → `commands/*` (registration) | [Model](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/), inconsistent | Low — one package | Low | Borderline — low cohesion | +| `check_token()` ≡ Makefile `_check-token` | [Functional](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/), duplicated | High — cross-language | Low | Yes — low volatility neutralizes | + +The last row is included deliberately as a contrast: it *is* duplicated, unbalanced coupling, but [volatility](https://coupling.dev/posts/dimensions-of-coupling/volatility/) is low enough that the [balance rule](https://coupling.dev/posts/core-concepts/balance/) — `(STRENGTH XOR DISTANCE) OR NOT VOLATILE` — is satisfied. Not every imbalance is a defect; only the volatile ones below are. + +## Issue: Makefile interface is an implicit, scattered contract + +**Integration**: `container_cli/commands/*` → `config/Makefile` +**Severity**: Significant + +### Knowledge Leakage + +Every command function knows, as hard-coded strings, two things about the Makefile: the **name of a target** (`"spawn"`, `"list-agents"`, `"spawn-pi"`, `"summary-agent"`, …) and the **names of the variables that target accepts** (`BRANCH`, `TASK`, `CPUS`, `MEMORY`, `IMAGE`, `DOCKERFILE`, `SUBNET`, `NETWORK`, `PI_IMAGE`, `PI_DOCKERFILE`, `PI_BASE_URL`, `PI_MODEL_ID`, `NAME`). That is the Makefile's *functional requirements* — what it does and how it must be invoked — replicated inside Python. The shared knowledge is **implicit**: there is no enum, no schema, no generated mapping. The only place the correspondence is written down is a prose table in `docs/agents/cli.md`, which nothing keeps honest. `run_make` centralizes the *mechanism* (`make -C config KEY=VALUE`) but not the *vocabulary*, so each of the five `commands/*` modules independently owns a fragment of an undeclared contract. + +This is [functional coupling](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) — the components must change together when a target's interface changes — and it is implicit functional coupling, which the model singles out as particularly dangerous: there is no compiler, type checker, or test that links `agents.py:33`'s `"BRANCH"` to the Makefile's `BRANCH ?=`. + +### Complexity Impact + +A developer changing a Makefile target name or variable cannot see, *from the Makefile*, that Python depends on it — the dependency is invisible at the point of change. To safely rename one target the developer must simultaneously hold in mind: the Makefile target, the `run_make` call site in `commands/*`, the `cli.md` mapping table, and the acceptance test. That is knowledge spread across four artifacts in two languages for what should be a trivial rename. It exceeds the 4±1 working-memory budget, which is precisely what turns a change from [modular](https://coupling.dev/posts/core-concepts/modularity/) (predictable outcome) into [complex](https://coupling.dev/posts/core-concepts/complexity/) (outcome discovered only by running it). + +### Cascading Changes + +- **Renaming `list-agents` → `agents-list` in the Makefile**: the Python literal `"list-agents"` still type-checks, still passes `ruff`, still imports cleanly. The break surfaces only when a user runs `q agents list` and `make` reports `No rule to make target`. The acceptance tests mock `run_make`, so they assert *which string was passed*, not that the target *exists* — there is no test-time signal either. +- **Adding a variable to an existing target** (e.g. a `--no-cache` option for `build`): requires coordinated edits in `build.py`, the Makefile, and `cli.md`, with nothing enforcing that the three agree. +- The coupling is genuinely [tight](https://coupling.dev/posts/core-concepts/balance/): high [integration strength](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) (functional) across high [distance](https://coupling.dev/posts/dimensions-of-coupling/distance/) — separate language, separate top-level directory, subprocess boundary. The Makefile is a [core subdomain](https://coupling.dev/posts/dimensions-of-coupling/volatility/): the orchestration logic *is* the product, and git history shows targets added continuously (PI agents, `apply-sensors`). High strength + high distance + high [volatility](https://coupling.dev/posts/dimensions-of-coupling/volatility/) fails the balance rule. + +Why **Significant** and not Critical: the same person maintains both sides, so [socio-technical distance](https://coupling.dev/posts/dimensions-of-coupling/distance/) is low — there is no cross-team coordination cost — and Makefile change is mostly *additive* (new targets), so existing literals rarely break. The danger is real but not yet frequent. + +### Recommended Improvement + +Introduce a single [contract](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) module — e.g. `container_cli/makefile.py` — as the *only* place target names and their permitted variables are written down. One small structure per target (target name + allowed variable keys) converts the implicit functional coupling into explicit [contract coupling](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) localized in one file; command modules then reference `Targets.SPAWN` instead of the literal `"spawn"`. Optionally add one test that runs `make -C config -n ` (dry-run) for every target in the registry, so a renamed or removed target fails CI instead of a user's terminal. + +This deliberately does **not** reduce distance — the CLI must wrap the Makefile, that is its purpose, and decomposing further would only raise distance. It reduces *strength* by encapsulating the contract, and it concentrates the volatile knowledge so a Makefile change has exactly one Python counterpart to update. Trade-off: a thin indirection layer plus the discipline of keeping the registry current — cheap relative to silent runtime breakage in the product's most-changed area. + +## Issue: `status` command bypasses the Makefile and reads the container's private file + +**Integration**: `agents.status` / `pi_agents.status` → container worktree `.agent/status.json` +**Severity**: Significant + +### Knowledge Leakage + +`agents.py:78` and `pi_agents.py:128` build the path `$AGENTS_HOME//.agent/status.json` and read it directly off disk. `.agent/` is internal bookkeeping created by the container entrypoint (`config/entrypoint.sh` / `entrypoint-pi.sh`) for its own use; the CLI treats that private layout as if it were a public interface. This is [intrusive coupling](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) — the highest [integration strength](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) — because the CLI depends on an implementation detail of a different component. + +The leak is twofold: the *directory layout* (`.agent/status.json`) and even the *fallback message text* (`[status] No status file found…`, `[status] Expected at: …`) are reproduced in Python. They are reproduced because the Makefile **already has** a `status-agent` target (`Makefile:179`) that performs the identical read and prints near-identical messages. So this is also [functional coupling](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) through duplication — the knowledge is copied, not shared. + +### Complexity Impact + +There are now two implementations of "show agent status" — `status-agent` in Make and `status()` in Python — and nothing keeps them consistent. `q agents summary` routes through `make summary-agent`, but `q agents status` does *not* route through `make status-agent`; a reader cannot predict from the CLI surface which commands delegate and which reimplement. The Python file read looks like ordinary I/O, not an integration point, so it is easy to miss when reasoning about what depends on the entrypoint's layout — the integration is hidden, which is the hallmark of [complexity](https://coupling.dev/posts/core-concepts/complexity/). + +### Cascading Changes + +- **The entrypoint relocates `status.json`** (say to `.agent/state/status.json`): `status-agent`, `status-pi-agent`, `agents.status`, and `pi_agents.status` all break — four sites, two languages — and the two Python ones give no signal until invoked. +- **The "no status file" UX is reworded in the Makefile only**: `q … status` silently keeps the old wording; the two surfaces drift apart. +- `status.json` is part of actively-developed structured monitoring (recent commits #7 and #9), so this sits in a [volatile](https://coupling.dev/posts/dimensions-of-coupling/volatility/) area — the duplication *will* be exercised. Intrusive strength + high distance + volatility is unbalanced and not neutralized. + +### Recommended Improvement + +Route `status` through the target that already exists: `run_make("status-agent", {"BRANCH": branch})` — exactly as `summary` already does. This removes the intrusive coupling *and* the duplication in one move: the Makefile becomes the single component that knows the `.agent/` layout, and the CLI holds only a target name (governed by the contract module proposed in the first issue). The documented rationale for the direct read — "works after the container has exited" — is *already* satisfied by `status-agent`, which is a pure file read with no live-container dependency, so no capability is lost. + +Trade-off: the Python `status` currently raises `typer.Exit(1)` when the file is missing, whereas the Makefile target prints a message and exits `0`. If a non-zero exit code is contractually required, add an explicit `exit 1` to the missing-file branch of `status-agent` rather than reimplementing the read in Python. The cost is one line of Make; the gain is eliminating an intrusive, duplicated integration in a volatile area. + +## Issue: Worktree-path logic duplicated across command modules + +**Integration**: `agents._agents_home` ≡ `pi_agents._agents_home` (and both ≡ Makefile `AGENTS_HOME`) +**Severity**: Minor + +### Knowledge Leakage + +`_agents_home()` is byte-for-byte identical in `agents.py:15` and `pi_agents.py:25`: "use `$AGENTS_HOME` if set, else `/../.worktrees`". The same rule is the Makefile's `AGENTS_HOME ?= $(dirname GIT_ROOT)/.worktrees` (`Makefile:54`). One business rule — where agent worktrees live — implemented three times. This is [functional coupling](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) through duplicated knowledge. The `run`/`shell` pair in `run.py` is a smaller instance of the same pattern: two functions with identical bodies that differ only in the target string passed to `run_make`. + +### Complexity Impact + +The two Python copies sit at low [distance](https://coupling.dev/posts/dimensions-of-coupling/distance/) — sibling modules in one package. High strength at low distance would normally read as [high cohesion](https://coupling.dev/posts/core-concepts/balance/), which is modular. But this is duplication, not cohesion: the copies are not one deliberate unit, they are two units that *secretly* share a rule. The `STRENGTH XOR DISTANCE` formula says "balanced" only if you mistake the duplication for cohesion. The real cost is the hidden obligation every developer carries — to remember that editing one copy means editing the twin. + +### Cascading Changes + +- **Changing the fallback** (e.g. `.worktrees` → `.agent-worktrees`) requires editing both Python copies; missing one makes `q agents …` and `q pi …` resolve *different* worktree roots — a confusing, hard-to-spot bug. +- The Python ↔ Makefile copy is unbalanced (high strength, high cross-language distance), but [volatility](https://coupling.dev/posts/dimensions-of-coupling/volatility/) is low — the worktree convention rarely changes. By the [balance rule](https://coupling.dev/posts/core-concepts/balance/), low volatility neutralizes it: tolerable [technical debt](https://coupling.dev/posts/core-concepts/balance/), not urgent. + +### Recommended Improvement + +Hoist `_agents_home()` into `utils.py`, beside `find_git_root`, and have both command modules import it — one definition, genuine [high cohesion](https://coupling.dev/posts/core-concepts/balance/) instead of duplication. Collapse `run`/`shell` the same way, into one shared helper parameterised by target name. Both are pure DRY fixes at low distance, with no trade-off. + +Leave the Python-vs-Makefile `AGENTS_HOME` duplication as-is. Its low volatility makes the imbalance acceptable, and unifying it would mean either the CLI parsing the Makefile or the Makefile shelling into Python — both add more coupling than they remove. This is the model's pragmatism in action: not every imbalance is worth fixing. + +## Issue: Two inconsistent command-registration patterns + +**Integration**: `main.py` → `commands/*` +**Severity**: Minor + +### Knowledge Leakage + +`main.py` mounts `agents` and `pi_agents` as Typer sub-apps via `add_typer(agents.app, …)`, but registers `build`, `network`, and `run` by cherry-picking individual functions: `app.command("build")(build.build)`. Meanwhile `build.py`, `network.py`, and `run.py` each still create a `typer.Typer()` `app` (e.g. `build.py:9`) and decorate their functions with `@app.command()` — yet that `app` object is never mounted anywhere. `main.py` carries an inconsistent [model](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) of what a "command module" is: sometimes a sub-app, sometimes a bag of functions. + +### Complexity Impact + +This is a [low cohesion](https://coupling.dev/posts/core-concepts/balance/) signal: the `app` object in `build.py`/`network.py`/`run.py` is dead code that actively misleads. A reader who has seen `agents.app` mounted reasonably assumes `build.app` is mounted too, and must trace `main.py` to discover it is not. The cost is small and contained to one short file, but every new command module forces an unstated choice between two patterns — accidental complexity with no functional purpose. + +### Cascading Changes + +Low. `main.py` and `commands/*` are at minimal [distance](https://coupling.dev/posts/dimensions-of-coupling/distance/) — one package — and registration changes are infrequent. There is no volatile cascade here; the harm is purely cognitive friction, which is why this is Minor. + +### Recommended Improvement + +Pick one pattern and apply it everywhere. The simplest: every command module exposes a Typer `app`, and `main.py` only ever calls `add_typer`. If flat top-level commands like `q build` must be preserved (rather than `q build build`), then drop the unused `typer.Typer()` from `build.py`, `network.py`, and `run.py` so their functions are honestly just functions, registered directly. Either way the rule "a command module looks like *X*" becomes true everywhere, and the dead `app` objects disappear. The only cost is a one-time edit; the gain is one fewer decision per future command module. + +--- + +_This analysis was performed using the [Balanced Coupling](https://coupling.dev) model by [Vlad Khononov](https://vladikk.com)._ From 1401f19bd5ffea775f7a3546236df96143ec11f6 Mon Sep 17 00:00:00 2001 From: deimagjas Date: Fri, 22 May 2026 21:24:18 -0500 Subject: [PATCH 8/9] feat(agents): add --model flag (default opus), persist lifecycle markers, bump bundled claude CLI Add a --model option to `q spawn` so the orchestrator governs the headless agent's Claude model instead of inheriting the host's interactive ~/.claude/settings.json preference (which can belong to a different account than the container's OAuth token). The flag flows CLI -> Makefile (MODEL ?= opus) -> -e AGENT_MODEL -> entrypoint (`claude --model "$AGENT_MODEL"`), default opus. Fixes a second bug that the original failure masked: emit_marker() echoed [agent:status] events only to the container's stdout, so once the --rm container was gone `q agents summary` fell back to agent.log and printed "(no structured events found)". emit_marker now also appends to agent.log (group redirect so the open error is suppressed cleanly in the shellspec mock env), and tee -a preserves the early starting/working markers across the claude run. Bumps the bundled claude CLI in the Wolfi image. The previous version (2.1.138) had a bug rejecting Opus with `400 role 'system' is not supported on this model`. The Dockerfile now ends the install RUN with `&& claude --version`, both as a build-time trace and as a cache-busting tail so a plain rebuild picks up newer CLIs without forcing --no-cache. Tests and docs: - new acceptance scenario for `q spawn --model opus` - unit tests for MODEL presence/absence in make_vars - shellspec asserts default opus and AGENT_MODEL=sonnet override - SKILL.md, docs/agents/{cli,container-agent,spawn-agent-skill}.md Gates after rebuild to claude 2.1.149: ruff, 100 unit/acceptance, 59 shellspec, mutation 95.2%, and make e2e-test (Claude round-trip) all green; PI e2e remains opt-in (local mlx server). Co-Authored-By: Claude Opus 4.7 --- .claude/skills/spawn-agent/SKILL.md | 12 ++++++++++ app/cli/src/container_cli/commands/agents.py | 6 +++++ .../tests/acceptance/features/spawn.feature | 6 +++++ app/cli/tests/test_agents.py | 13 +++++++++- config/Dockerfile.wolfi | 11 +++++++-- config/Makefile | 8 +++++++ config/entrypoint.sh | 24 ++++++++++++++++--- config/spec/entrypoint_spec.sh | 16 ++++++++++++- docs/agents/cli.md | 8 +++++++ docs/agents/container-agent.md | 6 +++-- docs/agents/spawn-agent-skill.md | 2 ++ 11 files changed, 103 insertions(+), 9 deletions(-) diff --git a/.claude/skills/spawn-agent/SKILL.md b/.claude/skills/spawn-agent/SKILL.md index af3dd7e..ec4b3aa 100644 --- a/.claude/skills/spawn-agent/SKILL.md +++ b/.claude/skills/spawn-agent/SKILL.md @@ -46,8 +46,17 @@ PROJECT_NAME=$(basename "$GIT_ROOT") # e.g. stackai WORKTREES_DIR="${AGENTS_HOME}" # from env var, e.g. ~/agents NETWORK=claude-agent-net IMAGE=claude-agent:wolfi +AGENT_MODEL=opus # model the headless agent runs ``` +**Agent model.** The headless agent runs whatever `AGENT_MODEL` is passed via +`-e AGENT_MODEL`; it defaults to `opus` so the agent runs with the most capable +model out of the box. Do **not** rely on the host's `~/.claude/settings.json` +`model` preference — it is copied into the container but a headless agent must +not inherit a personal interactive setting, and the container's OAuth token may +belong to a different plan. Override per spawn when a lighter task warrants it +(`q spawn --model sonnet`, or `-e AGENT_MODEL=sonnet` on a raw `container run`). + Container names follow the pattern: `-` Branch sanitization — each `/`, `_`, or space becomes a single `-`, lowercased: ```bash @@ -143,6 +152,7 @@ container run -d --rm \ -v "${HOME}/.claude:/root/.claudenew:ro" \ -v "${HOME}/.claude.json:/root/.claudenew.json:ro" \ -e CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1 \ + -e "AGENT_MODEL=${AGENT_MODEL:-opus}" \ -e "CLAUDE_CODE_OAUTH_TOKEN=${CLAUDE_CONTAINER_OAUTH_TOKEN}" \ claude-agent:wolfi \ --worktree "${BRANCH}" --task "${TASK}" @@ -405,6 +415,8 @@ Full docs: https://github.com/apple/container/blob/main/docs/command-reference.m the container exits so you can review the agent's work. - **CLAUDE_CONTAINER_OAUTH_TOKEN** must be set — containers authenticate with this, not the host Claude session. +- **AGENT_MODEL** controls the agent's Claude model (`opus` by default). The agent + does not inherit the host's `settings.json` model preference — pass it explicitly. - The image `claude-agent:wolfi` must exist. If not: `cd /config && make build` - Multiple agents run in parallel — each gets a unique container + worktree. - Branch names with `/` (e.g., `feat/auth`) are valid for git; sanitized for container diff --git a/app/cli/src/container_cli/commands/agents.py b/app/cli/src/container_cli/commands/agents.py index 9c1c3d5..c99c850 100644 --- a/app/cli/src/container_cli/commands/agents.py +++ b/app/cli/src/container_cli/commands/agents.py @@ -17,6 +17,10 @@ def spawn( cpus: Annotated[int | None, typer.Option("--cpus", help="CPU count")] = None, memory: Annotated[str | None, typer.Option("--memory", help="Memory limit (e.g. 12G)")] = None, image: Annotated[str | None, typer.Option("--image", help="Image tag")] = None, + model: Annotated[ + str | None, + typer.Option("--model", help="Claude model the agent runs (e.g. sonnet, opus, haiku)"), + ] = None, ) -> None: """Spawn a detached headless agent container.""" check_token() @@ -27,6 +31,8 @@ def spawn( make_vars["MEMORY"] = memory if image: make_vars["IMAGE"] = image + if model: + make_vars["MODEL"] = model run_make(Target.SPAWN, make_vars) diff --git a/app/cli/tests/acceptance/features/spawn.feature b/app/cli/tests/acceptance/features/spawn.feature index 36d98cc..37e6cb1 100644 --- a/app/cli/tests/acceptance/features/spawn.feature +++ b/app/cli/tests/acceptance/features/spawn.feature @@ -24,3 +24,9 @@ Feature: Spawn agent containers When I run "q spawn --branch feat/bar --task work --cpus 8 --memory 16G --image custom:tag" Then the command exits successfully And the make vars include CPUS="8" and MEMORY="16G" and IMAGE="custom:tag" + + Scenario: Spawn with an explicit agent model + Given the CLAUDE_CONTAINER_OAUTH_TOKEN is set + When I run "q spawn --branch feat/baz --task work --model opus" + Then the command exits successfully + And the make vars include MODEL="opus" diff --git a/app/cli/tests/test_agents.py b/app/cli/tests/test_agents.py index 719e9c8..b6374dd 100644 --- a/app/cli/tests/test_agents.py +++ b/app/cli/tests/test_agents.py @@ -42,8 +42,18 @@ def test_spawn_with_image(self, mock_run_make, mock_check_token): call_vars = mock_run_make["agents"].call_args[0][1] assert call_vars["IMAGE"] == "my-img:latest" + def test_spawn_with_model(self, mock_run_make, mock_check_token): + spawn(branch="b", task="t", cpus=None, memory=None, image=None, model="opus") + call_vars = mock_run_make["agents"].call_args[0][1] + assert call_vars["MODEL"] == "opus" + + def test_spawn_omits_model_when_unset(self, mock_run_make, mock_check_token): + spawn(branch="b", task="t", cpus=None, memory=None, image=None, model=None) + call_vars = mock_run_make["agents"].call_args[0][1] + assert "MODEL" not in call_vars + def test_spawn_all_optional_params(self, mock_run_make, mock_check_token): - spawn(branch="b", task="t", cpus=2, memory="4G", image="img") + spawn(branch="b", task="t", cpus=2, memory="4G", image="img", model="opus") call_vars = mock_run_make["agents"].call_args[0][1] assert call_vars == { "BRANCH": "b", @@ -51,6 +61,7 @@ def test_spawn_all_optional_params(self, mock_run_make, mock_check_token): "CPUS": "2", "MEMORY": "4G", "IMAGE": "img", + "MODEL": "opus", } diff --git a/config/Dockerfile.wolfi b/config/Dockerfile.wolfi index 7a17aa1..1bed1c8 100644 --- a/config/Dockerfile.wolfi +++ b/config/Dockerfile.wolfi @@ -80,10 +80,17 @@ COPY --from=builder /root/.cargo/bin/dust /usr/local/bin/dust COPY --from=builder /root/.cargo/bin/procs /usr/local/bin/procs COPY --from=builder /root/.cargo/bin/btm /usr/local/bin/btm -# Claude Code CLI +# Claude Code CLI — pin the install at build time. CLAUDE_CODE_DISABLE_AUTOUPDATE=1 +# in this image means the CLI never updates inside the container, so the image +# ships exactly the version the installer returns at build time. `claude --version` +# at the end serves two purposes: log the version into the build output and act +# as a cache-busting tail so a `--no-cache-filter` rebuild picks up a newer CLI +# (e.g. 2.1.148+ for the Opus `role 'system'` fix) without forcing a full +# --no-cache that would rebuild the slow Rust builder stage. RUN curl -fsSL https://claude.ai/install.sh -o /tmp/claude-install.sh \ && yes | bash /tmp/claude-install.sh \ - && rm -f /tmp/claude-install.sh + && rm -f /tmp/claude-install.sh \ + && claude --version # OpenCode RUN curl -fsSL https://opencode.ai/install | bash \ diff --git a/config/Makefile b/config/Makefile index ae31d69..d56ba76 100644 --- a/config/Makefile +++ b/config/Makefile @@ -44,6 +44,11 @@ MEMORY ?= 3G # Agent variables BRANCH ?= agent-$(shell date +%s) TASK ?= Explore the codebase and report what you find. +# MODEL: Claude model the headless agent runs with. Defaults to opus — the +# agent doesn't inherit the host's interactive `model` preference (see the +# entrypoint); a fixed default keeps the choice in the orchestrator. Override +# per spawn, e.g. `make spawn ... MODEL=sonnet`. +MODEL ?= opus # Derived paths GIT_ROOT := $(shell git -C $(CURDIR) rev-parse --show-toplevel) @@ -109,6 +114,7 @@ spawn: network _check-token @echo "[spawn] Launching agent: $(PROJECT_NAME)-$(CONTAINER_BRANCH)" @echo "[spawn] Worktree: $(WORKTREES_DIR)/$(BRANCH)" @echo "[spawn] Task: $(TASK)" + @echo "[spawn] Model: $(MODEL)" container run -d --rm \ --name $(PROJECT_NAME)-$(CONTAINER_BRANCH) \ --network $(NETWORK) \ @@ -120,6 +126,7 @@ spawn: network _check-token -v $(HOME)/.claude:/root/.claudenew:ro \ -v $(HOME)/.claude.json:/root/.claudenew.json:ro \ -e CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1 \ + -e AGENT_MODEL=$(MODEL) \ -e CLAUDE_CODE_OAUTH_TOKEN=$${$(HOST_TOKEN_VAR)} \ $(IMAGE) \ --worktree "$(BRANCH)" --task "$(TASK)" @@ -392,6 +399,7 @@ help: @printf " %-20s %s\n" "CPUS" "$(CPUS)" @printf " %-20s %s\n" "MEMORY" "$(MEMORY)" @printf " %-20s %s\n" "BRANCH" "$(BRANCH)" + @printf " %-20s %s\n" "MODEL" "$(MODEL)" @printf " %-20s %s\n" "PROJECT_NAME" "$(PROJECT_NAME)" @printf " %-20s %s\n" "GIT_ROOT" "$(GIT_ROOT)" @printf " %-20s %s\n" "AGENTS_HOME" "$(AGENTS_HOME)" diff --git a/config/entrypoint.sh b/config/entrypoint.sh index cbbfd82..7f4ba60 100644 --- a/config/entrypoint.sh +++ b/config/entrypoint.sh @@ -22,6 +22,12 @@ AGENT_TASK="" PROJECT_NAME="" PASSTHROUGH_ARGS=() +# Model the headless agent runs with. The orchestrator passes it via +# -e AGENT_MODEL; it defaults to opus so the agent runs with the most capable +# model by default and never inherits the host user's personal `model` +# preference from the copied settings.json (which can be a different account). +AGENT_MODEL="${AGENT_MODEL:-opus}" + # ── Functions ───────────────────────────────────────────────────────────────── parse_args() { @@ -70,6 +76,7 @@ create_worktree() { setup_agent_perms() { echo "[entrypoint] Starting Claude agent (headless)..." echo "[entrypoint] Task: ${AGENT_TASK}" + echo "[entrypoint] Model: ${AGENT_MODEL}" echo "---" # Make claude's install path traversable for non-root users (installed under /root/) # go+x required: agent is in group root (gid=0), so group bits apply, not others bits @@ -94,7 +101,12 @@ write_status() { emit_marker() { local phase="$1"; shift - echo "[agent:status] PHASE=${phase} BRANCH=${WORKTREE_BRANCH} $*" + local line="[agent:status] PHASE=${phase} BRANCH=${WORKTREE_BRANCH} $*" + # Echo to the container's stdout (read live via `container logs`) and append + # to the persisted log so `summary-agent` still finds the markers once the + # --rm container is gone and `container logs` no longer works. + echo "$line" + { echo "$line" >> "${AGENT_DIR}/agent.log"; } 2>/dev/null || true } run_agent() { @@ -102,6 +114,11 @@ run_agent() { mkdir -p "$AGENT_DIR" chown -R agent:agent "$AGENT_DIR" + # Start the persisted log empty so a re-spawn on an existing worktree does + # not accumulate a previous run's output. Markers and the claude output + # below both append to it. + { : > "${AGENT_DIR}/agent.log"; } 2>/dev/null || true + # Add .agent/ to worktree .gitignore (safe if dir doesn't exist yet) if ! grep -qxF '.agent/' "${WORKTREE_PATH}/.gitignore" 2>/dev/null; then echo '.agent/' >> "${WORKTREE_PATH}/.gitignore" 2>/dev/null || true @@ -117,10 +134,11 @@ run_agent() { write_status "working" emit_marker "working" - # Run claude with tee to persist logs; capture exit code through pipe + # Run claude with tee to persist logs; capture exit code through pipe. + # `tee -a` appends so the "starting"/"working" markers above survive. set +e su-exec agent env HOME=/home/agent claude --dangerously-skip-permissions \ - -p "$AGENT_TASK" 2>&1 | tee "$AGENT_DIR/agent.log" + --model "$AGENT_MODEL" -p "$AGENT_TASK" 2>&1 | tee -a "$AGENT_DIR/agent.log" local exit_code=${PIPESTATUS[0]} set -e diff --git a/config/spec/entrypoint_spec.sh b/config/spec/entrypoint_spec.sh index 0770c0e..c7485c8 100644 --- a/config/spec/entrypoint_spec.sh +++ b/config/spec/entrypoint_spec.sh @@ -342,7 +342,21 @@ WRAPPER_EOF It "runs su-exec with correct claude command" When run run_entrypoint --worktree agent-br --task "do work" - The output should include "[MOCK] su-exec agent env HOME=/home/agent claude --dangerously-skip-permissions -p do work" + The output should include "[MOCK] su-exec agent env HOME=/home/agent claude --dangerously-skip-permissions --model opus -p do work" + The status should equal 0 + End + + It "defaults the agent model to opus" + When run run_entrypoint --worktree agent-br --task "do work" + The output should include "[entrypoint] Model: opus" + The status should equal 0 + End + + It "honors AGENT_MODEL when the orchestrator sets it" + export AGENT_MODEL=sonnet + When run run_entrypoint --worktree agent-br --task "do work" + The output should include "claude --dangerously-skip-permissions --model sonnet -p do work" + The output should include "[entrypoint] Model: sonnet" The status should equal 0 End diff --git a/docs/agents/cli.md b/docs/agents/cli.md index b5d5dc0..f4e1ae8 100644 --- a/docs/agents/cli.md +++ b/docs/agents/cli.md @@ -69,6 +69,7 @@ Spawns a detached headless agent in an isolated git worktree. ```bash q spawn --branch feat/oauth2 --task "Implement OAuth2 with JWT tokens" q spawn --branch test/payments --task "Write unit tests for payments module" --cpus 4 +q spawn --branch feat/quick-fix --task "Tighten the readme" --model sonnet ``` | Option | Required | Description | @@ -78,9 +79,16 @@ q spawn --branch test/payments --task "Write unit tests for payments module" --c | `--cpus` | no | CPU count override | | `--memory` | no | Memory limit override (e.g. `8G`) | | `--image` | no | Image tag override | +| `--model` | no | Claude model the agent runs (`opus` default; e.g. `sonnet`, `haiku`) | Requires `CLAUDE_CONTAINER_OAUTH_TOKEN` to be set. +> **Agent model:** the agent runs `opus` unless `--model` says otherwise. It +> does **not** inherit the host's `~/.claude/settings.json` `model` preference — +> that file is copied into the container for credentials only, and a headless +> agent must not depend on a personal interactive setting. Drop to `sonnet` for +> lighter tasks. The Makefile passes the choice through as `-e AGENT_MODEL`. + > **Branch naming:** avoid `/` in branch names — they create nested subdirectories in `$AGENTS_HOME` which the entrypoint cannot handle. Use flat names like `feat-oauth2` instead of `feat/oauth2`. --- diff --git a/docs/agents/container-agent.md b/docs/agents/container-agent.md index 1af685c..d6f33c6 100644 --- a/docs/agents/container-agent.md +++ b/docs/agents/container-agent.md @@ -127,10 +127,12 @@ container run -d --rm claude-agent:wolfi \ 2. `git -C /workspace worktree add /worktrees/ -b ` - If the branch already exists: `git worktree add /worktrees/ ` 3. `cd /worktrees/` -4. `claude --dangerously-skip-permissions -p ""` +4. `claude --dangerously-skip-permissions --model "$AGENT_MODEL" -p ""` **Why `--dangerously-skip-permissions`:** In headless mode there is no interactive user to approve permissions. The container is a sandboxed environment with access only to the mounted worktree, so it is safe to skip confirmations. +**Why `--model "$AGENT_MODEL"`:** The host's `~/.claude` is copied into the container for credentials, and that copy includes the host user's personal `settings.json` — whose `model` preference must not silently govern a headless agent (the container's OAuth token may belong to a different plan). The entrypoint pins the model explicitly: `AGENT_MODEL` is passed via `-e AGENT_MODEL` and defaults to `opus`. Override per spawn with `q spawn --model ` or `make spawn ... MODEL=`. + **Why run as `agent` (non-root):** Claude CLI blocks `--dangerously-skip-permissions` when the process runs as `root` (uid 0). The entrypoint uses `su-exec` to drop to the `agent` user before executing Claude. IMPORTANT: **Why the worktree is created inside the container:** Git needs access to the repository to register the worktree in `.git/worktrees/`. Since the repo is mounted at `/workspace` inside the container, the worktree must be created from there. If it were created directly from the host, the path registered in git would be the host path (`/Users/...`), which would not exist inside the container. @@ -366,7 +368,7 @@ RUN addgroup -S agent \ │ └─► /home/agent/.claude/ (copied + chown → agent) │ - su-exec agent env HOME=/home/agent claude --dangerously-skip-permissions -p "..." + su-exec agent env HOME=/home/agent claude --dangerously-skip-permissions --model "$AGENT_MODEL" -p "..." ``` The entrypoint: diff --git a/docs/agents/spawn-agent-skill.md b/docs/agents/spawn-agent-skill.md index f7b20d7..78acdfd 100644 --- a/docs/agents/spawn-agent-skill.md +++ b/docs/agents/spawn-agent-skill.md @@ -85,6 +85,7 @@ export CLAUDE_CONTAINER_OAUTH_TOKEN= 5. container run -d --rm ← detached (non-blocking) • -v $GIT_ROOT:/workspace ← full repo (read/write) • -v $AGENTS_HOME:/worktrees ← worktree destination + • -e AGENT_MODEL=$MODEL → claude --model $MODEL (default opus) • --worktree $BRANCH → entrypoint creates the worktree • --task "$TASK" → claude -p "$TASK" in the worktree │ @@ -245,6 +246,7 @@ container run -d --rm \ -v "${HOME}/.claude:/root/.claudenew:ro" \ -v "${HOME}/.claude.json:/root/.claudenew.json:ro" \ -e CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1 \ + -e "AGENT_MODEL=${AGENT_MODEL:-opus}" \ -e "CLAUDE_CODE_OAUTH_TOKEN=${CLAUDE_CONTAINER_OAUTH_TOKEN}" \ claude-agent:wolfi \ --worktree "feat/oauth2" --task "${TASK}" From 10ed154185d2dfeec0fd1d662ee7471be40ba289 Mon Sep 17 00:00:00 2001 From: deimagjas Date: Fri, 22 May 2026 22:01:52 -0500 Subject: [PATCH 9/9] test(e2e): opt-in mlx_lm.server autostart fixture for the PI round-trip MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a session-scoped fixture in tests/e2e/conftest.py that — when STACKAI_E2E_AUTOSTART_MLX=1 is exported — spawns mlx_lm.server itself, waits for /v1/models to respond, yields to the tests, and tears the subprocess down on session teardown. Without the env var, or when a server is already reachable on the URL, the fixture is a no-op (a manually-started server is honoured and not touched). The invocation is the exact set of flags the PI agent expects (model mlx-community/gemma-4-26b-a4b-it-4bit, port 8080, --temp 0.9, --top-p 0.95, 6 GB prompt cache, etc.) — no coupling to the iac CLI: the e2e suite must run standalone. Boot timeout is configurable via STACKAI_E2E_MLX_BOOT_TIMEOUT (default 600s) to absorb first-run model downloads, and the subprocess log lands in $TMPDIR/stackai-e2e-mlx.log for diagnosis. Verified end-to-end on a warm cache: both the Claude and PI round-trips pass in ~33s. Co-Authored-By: Claude Opus 4.7 --- app/cli/tests/e2e/conftest.py | 123 +++++++++++++++++++++++++++++++--- docs/agents/e2e-tests.md | 26 +++++++ 2 files changed, 140 insertions(+), 9 deletions(-) diff --git a/app/cli/tests/e2e/conftest.py b/app/cli/tests/e2e/conftest.py index 66ec2d8..eb65fce 100644 --- a/app/cli/tests/e2e/conftest.py +++ b/app/cli/tests/e2e/conftest.py @@ -18,6 +18,7 @@ import shutil import subprocess import sys +import tempfile import time import urllib.error import urllib.request @@ -96,16 +97,120 @@ def require_pi_image() -> None: pytest.skip("image claude-pi:ubuntu not built — run `q pi build` first") -@pytest.fixture() -def require_mlx_server() -> None: - """Skip the test unless the local mlx_lm.server is reachable on the host.""" - url = os.environ.get("STACKAI_E2E_MLX_URL", "http://localhost:8080/v1/models") +def _http_ok(url: str, *, timeout: float) -> bool: + """Tiny probe — True when the URL responds 200.""" try: - with urllib.request.urlopen(url, timeout=5) as resp: - if resp.status != 200: - pytest.skip(f"mlx_lm.server at {url} returned HTTP {resp.status}") - except (urllib.error.URLError, OSError) as exc: - pytest.skip(f"mlx_lm.server unreachable at {url} ({exc}) — run `uv run iac server start`") + with urllib.request.urlopen(url, timeout=timeout) as resp: + return resp.status == 200 + except (urllib.error.URLError, OSError): + return False + + +def _mlx_url() -> str: + return os.environ.get("STACKAI_E2E_MLX_URL", "http://localhost:8080/v1/models") + + +@pytest.fixture(scope="session") +def _mlx_server_lifecycle() -> Iterator[subprocess.Popen[bytes] | None]: + """Optionally spawn mlx_lm.server for the duration of the e2e session. + + Activated by exporting ``STACKAI_E2E_AUTOSTART_MLX=1``. Without that flag, + or when a server is already reachable on the expected URL, this is a no-op + — tests still see whatever process the user manages out of band. + + The exact invocation mirrors the parameters the PI agent expects (no + coupling to the `iac` CLI: the e2e suite must run standalone). First-run + cost is significant — model download + warmup can take several minutes; + raise the wait ceiling with ``STACKAI_E2E_MLX_BOOT_TIMEOUT`` if needed. + """ + if os.environ.get("STACKAI_E2E_AUTOSTART_MLX") != "1": + yield None + return + + url = _mlx_url() + if _http_ok(url, timeout=2): + # Pre-existing server (e.g. started via `uv run iac server start`) — + # leave it alone so manual setups survive teardown. + yield None + return + + # conftest.py lives at app/cli/tests/e2e/ → repo root is parents[4]. + iac_dir = Path(__file__).resolve().parents[4] / "iac" + cmd = [ + "uv", "run", "--directory", str(iac_dir), "mlx_lm.server", + "--model", "mlx-community/gemma-4-26b-a4b-it-4bit", + "--host", "0.0.0.0", + "--port", "8080", + "--prompt-cache-size", "5", + "--prompt-cache-bytes", "6GB", + "--decode-concurrency", "4", + "--prompt-concurrency", "2", + "--prefill-step-size", "1024", + "--temp", "0.9", + "--top-p", "0.95", + "--top-k", "40", + "--min-p", "0.0", + "--max-tokens", "2048", + "--use-default-chat-template", + "--log-level", "INFO", + ] + + log_path = Path(tempfile.gettempdir()) / "stackai-e2e-mlx.log" + log_handle = log_path.open("ab") + print(f"\n[mlx] starting mlx_lm.server (logs: {log_path})", flush=True) + proc = subprocess.Popen( # noqa: S603 — fixed argv, no shell. + cmd, + stdout=log_handle, + stderr=subprocess.STDOUT, + stdin=subprocess.DEVNULL, + start_new_session=True, + ) + + try: + timeout = float(os.environ.get("STACKAI_E2E_MLX_BOOT_TIMEOUT", "600")) + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + if proc.poll() is not None: + raise RuntimeError( + f"mlx_lm.server exited during startup (code {proc.returncode});" + f" see {log_path}" + ) + if _http_ok(url, timeout=2): + break + time.sleep(3) + else: + proc.terminate() + raise TimeoutError( + f"mlx_lm.server did not become reachable at {url} within" + f" {timeout}s; see {log_path}" + ) + print(f"[mlx] ready on {url}", flush=True) + yield proc + finally: + log_handle.close() + if proc.poll() is None: + proc.terminate() + try: + proc.wait(timeout=10) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait(timeout=5) + + +@pytest.fixture() +def require_mlx_server(_mlx_server_lifecycle) -> None: + """Skip the test unless the local mlx_lm.server is reachable on the host. + + Composes with ``_mlx_server_lifecycle`` so when the autostart env var is + set, the server is already up by the time this check runs. + """ + url = _mlx_url() + if _http_ok(url, timeout=5): + return + pytest.skip( + f"mlx_lm.server unreachable at {url} — run `uv run iac server start`" + " or set STACKAI_E2E_AUTOSTART_MLX=1 to let the suite spawn it" + ) @pytest.fixture() diff --git a/docs/agents/e2e-tests.md b/docs/agents/e2e-tests.md index 39b8302..7d2c687 100644 --- a/docs/agents/e2e-tests.md +++ b/docs/agents/e2e-tests.md @@ -37,6 +37,25 @@ Each test skips itself, with a reason, when a prerequisite is missing. The PI server URL defaults to `http://localhost:8080/v1/models`; override the reachability check with `STACKAI_E2E_MLX_URL` if your server listens elsewhere. +### Auto-launching `mlx_lm.server` from the suite + +Export `STACKAI_E2E_AUTOSTART_MLX=1` and the e2e session bootstraps the server +itself — there's no need to start it out of band. The session-scoped fixture +in `tests/e2e/conftest.py`: + +1. Skips if the URL is already reachable (an existing server you launched + manually is honoured and not torn down). +2. Otherwise spawns the exact `mlx_lm.server` invocation the PI agent + expects (`mlx-community/gemma-4-26b-a4b-it-4bit`, port 8080, + `--temp 0.9 --top-p 0.95`, `--prompt-cache-bytes 6GB`, etc.). +3. Polls `/v1/models` until ready, then yields to the tests. +4. Terminates the subprocess at session teardown. + +Logs land in `$TMPDIR/stackai-e2e-mlx.log`. The first run downloads the model +(~16 GB) and warms the prompt cache, so the boot can take several minutes — +override the wait ceiling with `STACKAI_E2E_MLX_BOOT_TIMEOUT` (seconds, +default 600). + ## How to run ```bash @@ -44,6 +63,13 @@ cd app/cli make e2e-test # STACKAI_E2E=1 uv run pytest tests/e2e -v -m e2e ``` +With auto-launched mlx_lm.server (PI test runs end-to-end too): + +```bash +cd app/cli +STACKAI_E2E_AUTOSTART_MLX=1 make e2e-test +``` + Run a single test: ```bash