diff --git a/.claude/skills/spawn-agent/SKILL.md b/.claude/skills/spawn-agent/SKILL.md index af3dd7e..ec4b3aa 100644 --- a/.claude/skills/spawn-agent/SKILL.md +++ b/.claude/skills/spawn-agent/SKILL.md @@ -46,8 +46,17 @@ PROJECT_NAME=$(basename "$GIT_ROOT") # e.g. stackai WORKTREES_DIR="${AGENTS_HOME}" # from env var, e.g. ~/agents NETWORK=claude-agent-net IMAGE=claude-agent:wolfi +AGENT_MODEL=opus # model the headless agent runs ``` +**Agent model.** The headless agent runs whatever `AGENT_MODEL` is passed via +`-e AGENT_MODEL`; it defaults to `opus` so the agent runs with the most capable +model out of the box. Do **not** rely on the host's `~/.claude/settings.json` +`model` preference — it is copied into the container but a headless agent must +not inherit a personal interactive setting, and the container's OAuth token may +belong to a different plan. Override per spawn when a lighter task warrants it +(`q spawn --model sonnet`, or `-e AGENT_MODEL=sonnet` on a raw `container run`). + Container names follow the pattern: `-` Branch sanitization — each `/`, `_`, or space becomes a single `-`, lowercased: ```bash @@ -143,6 +152,7 @@ container run -d --rm \ -v "${HOME}/.claude:/root/.claudenew:ro" \ -v "${HOME}/.claude.json:/root/.claudenew.json:ro" \ -e CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1 \ + -e "AGENT_MODEL=${AGENT_MODEL:-opus}" \ -e "CLAUDE_CODE_OAUTH_TOKEN=${CLAUDE_CONTAINER_OAUTH_TOKEN}" \ claude-agent:wolfi \ --worktree "${BRANCH}" --task "${TASK}" @@ -405,6 +415,8 @@ Full docs: https://github.com/apple/container/blob/main/docs/command-reference.m the container exits so you can review the agent's work. - **CLAUDE_CONTAINER_OAUTH_TOKEN** must be set — containers authenticate with this, not the host Claude session. +- **AGENT_MODEL** controls the agent's Claude model (`opus` by default). The agent + does not inherit the host's `settings.json` model preference — pass it explicitly. - The image `claude-agent:wolfi` must exist. If not: `cd /config && make build` - Multiple agents run in parallel — each gets a unique container + worktree. - Branch names with `/` (e.g., `feat/auth`) are valid for git; sanitized for container diff --git a/CLAUDE.md b/CLAUDE.md index 96508f2..5c4be47 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -53,6 +53,9 @@ uv run pytest -k test_spawn # single test by name # Acceptance tests only (Gherkin/BDD, local — no real containers) make acceptance-test +# End-to-end tests (real containers, opt-in, local-only — see docs/agents/e2e-tests.md) +make e2e-test + # Mutation testing gate (≥ 70% kill rate) make mutation-ci-threshold diff --git a/app/cli/Makefile b/app/cli/Makefile index b6568a9..a0fc9ee 100644 --- a/app/cli/Makefile +++ b/app/cli/Makefile @@ -42,6 +42,13 @@ test-all: eval-skills: claude -p "/skill-creator:skill-creator run evals for the spawn-agent skill at ~/.claude/skills/spawn-agent/" +# End-to-end container tests — opt-in, local-only (needs Apple Container CLI). +# Spawns real agent containers; the Claude round-trip spends Claude credits. +# Excluded from CI and from `local-qa` by design. +.PHONY: e2e-test +e2e-test: + STACKAI_E2E=1 uv run pytest tests/e2e -v -m e2e + # Pre-PR quality gate local-qa: acceptance-test eval-skills diff --git a/app/cli/pyproject.toml b/app/cli/pyproject.toml index 310893f..f29f2d1 100644 --- a/app/cli/pyproject.toml +++ b/app/cli/pyproject.toml @@ -25,6 +25,9 @@ dev = [ [tool.pytest.ini_options] testpaths = ["tests", "tests/acceptance"] python_files = ["test_*.py", "*_steps.py"] +markers = [ + "e2e: real-container end-to-end tests (opt-in via STACKAI_E2E=1, local-only)", +] [tool.mutmut] paths_to_mutate = ["src/container_cli/"] diff --git a/app/cli/src/container_cli/commands/agents.py b/app/cli/src/container_cli/commands/agents.py index 30ef3fb..c99c850 100644 --- a/app/cli/src/container_cli/commands/agents.py +++ b/app/cli/src/container_cli/commands/agents.py @@ -1,25 +1,15 @@ """Agent lifecycle commands (spawn, list, logs, follow, stop, status, summary).""" -import json -import os -from pathlib import Path from typing import Annotated import typer -from container_cli.utils import check_token, find_git_root, run_make +from container_cli.targets import Target +from container_cli.utils import check_token, print_agent_status, run_make app = typer.Typer(help="Agent lifecycle commands") -def _agents_home() -> Path: - """Resolve AGENTS_HOME, falling back to sibling .worktrees/ directory.""" - env_val = os.environ.get("AGENTS_HOME") - if env_val: - return Path(env_val) - return find_git_root().parent / ".worktrees" - - @app.command() def spawn( branch: Annotated[str, typer.Option("--branch", help="Git branch for the agent worktree")], @@ -27,6 +17,10 @@ def spawn( cpus: Annotated[int | None, typer.Option("--cpus", help="CPU count")] = None, memory: Annotated[str | None, typer.Option("--memory", help="Memory limit (e.g. 12G)")] = None, image: Annotated[str | None, typer.Option("--image", help="Image tag")] = None, + model: Annotated[ + str | None, + typer.Option("--model", help="Claude model the agent runs (e.g. sonnet, opus, haiku)"), + ] = None, ) -> None: """Spawn a detached headless agent container.""" check_token() @@ -37,13 +31,15 @@ def spawn( make_vars["MEMORY"] = memory if image: make_vars["IMAGE"] = image - run_make("spawn", make_vars) + if model: + make_vars["MODEL"] = model + run_make(Target.SPAWN, make_vars) @app.command(name="list") def list_agents() -> None: """List active agent containers and worktrees.""" - run_make("list-agents") + run_make(Target.LIST_AGENTS) @app.command() @@ -51,7 +47,7 @@ def logs( branch: Annotated[str, typer.Option("--branch", help="Agent branch name")], ) -> None: """Show logs for a branch agent.""" - run_make("logs-agent", {"BRANCH": branch}) + run_make(Target.LOGS_AGENT, {"BRANCH": branch}) @app.command() @@ -59,7 +55,7 @@ def follow( branch: Annotated[str, typer.Option("--branch", help="Agent branch name")], ) -> None: """Follow live streaming logs for a branch agent.""" - run_make("follow-agent", {"BRANCH": branch}, tty=True) + run_make(Target.FOLLOW_AGENT, {"BRANCH": branch}, tty=True) @app.command() @@ -67,7 +63,7 @@ def stop( branch: Annotated[str, typer.Option("--branch", help="Agent branch name")], ) -> None: """Stop a branch agent container.""" - run_make("stop-agent", {"BRANCH": branch}) + run_make(Target.STOP_AGENT, {"BRANCH": branch}) @app.command() @@ -75,14 +71,7 @@ def status( branch: Annotated[str, typer.Option("--branch", help="Agent branch name")], ) -> None: """Show agent status from persisted status.json file.""" - status_file = _agents_home() / branch / ".agent" / "status.json" - if not status_file.exists(): - typer.echo(f"[status] No status file found for branch '{branch}'.") - typer.echo(f"[status] Expected at: {status_file}") - raise typer.Exit(1) - - data = json.loads(status_file.read_text()) - typer.echo(json.dumps(data, indent=2)) + print_agent_status(branch, label="status") @app.command() @@ -90,4 +79,4 @@ def summary( branch: Annotated[str, typer.Option("--branch", help="Agent branch name")], ) -> None: """Show structured lifecycle events for a branch agent.""" - run_make("summary-agent", {"BRANCH": branch}) + run_make(Target.SUMMARY_AGENT, {"BRANCH": branch}) diff --git a/app/cli/src/container_cli/commands/build.py b/app/cli/src/container_cli/commands/build.py index ed72b03..3d3bb89 100644 --- a/app/cli/src/container_cli/commands/build.py +++ b/app/cli/src/container_cli/commands/build.py @@ -1,15 +1,16 @@ -"""Image build and cleanup commands.""" +"""Image build and cleanup commands. + +These functions are registered as top-level commands by `container_cli.main`. +""" from typing import Annotated import typer +from container_cli.targets import Target from container_cli.utils import run_make -app = typer.Typer(help="Image build commands") - -@app.command() def build( image: Annotated[str | None, typer.Option("--image", help="Image tag")] = None, dockerfile: Annotated[str | None, typer.Option("--dockerfile", help="Dockerfile path")] = None, @@ -20,22 +21,19 @@ def build( make_vars["IMAGE"] = image if dockerfile: make_vars["DOCKERFILE"] = dockerfile - run_make("build", make_vars) + run_make(Target.BUILD, make_vars) -@app.command() def clean() -> None: """Remove the container image.""" - run_make("clean") + run_make(Target.CLEAN) -@app.command(name="clean-network") def clean_network() -> None: """Remove the bridge network.""" - run_make("clean-network") + run_make(Target.CLEAN_NETWORK) -@app.command(name="clean-all") def clean_all() -> None: """Remove image and network.""" - run_make("clean-all") + run_make(Target.CLEAN_ALL) diff --git a/app/cli/src/container_cli/commands/network.py b/app/cli/src/container_cli/commands/network.py index 75f08dc..a39374f 100644 --- a/app/cli/src/container_cli/commands/network.py +++ b/app/cli/src/container_cli/commands/network.py @@ -1,15 +1,16 @@ -"""Bridge network management commands.""" +"""Bridge network management command. + +Registered as a top-level command by `container_cli.main`. +""" from typing import Annotated import typer +from container_cli.targets import Target from container_cli.utils import run_make -app = typer.Typer(help="Network management commands") - -@app.command() def network( subnet: Annotated[str | None, typer.Option("--subnet", help="Subnet CIDR")] = None, network_name: Annotated[str | None, typer.Option("--network-name", help="Network name")] = None, @@ -20,4 +21,4 @@ def network( make_vars["SUBNET"] = subnet if network_name: make_vars["NETWORK"] = network_name - run_make("network", make_vars) + run_make(Target.NETWORK, make_vars) diff --git a/app/cli/src/container_cli/commands/pi_agents.py b/app/cli/src/container_cli/commands/pi_agents.py index 0241acb..09bb976 100644 --- a/app/cli/src/container_cli/commands/pi_agents.py +++ b/app/cli/src/container_cli/commands/pi_agents.py @@ -10,26 +10,16 @@ from __future__ import annotations -import json -import os -from pathlib import Path from typing import Annotated import typer -from container_cli.utils import find_git_root, run_make +from container_cli.targets import Target +from container_cli.utils import print_agent_status, run_make app = typer.Typer(help="PI agent lifecycle (local mlx_lm.server backend)") -def _agents_home() -> Path: - """Resolve AGENTS_HOME, falling back to sibling .worktrees/ directory.""" - env_val = os.environ.get("AGENTS_HOME") - if env_val: - return Path(env_val) - return find_git_root().parent / ".worktrees" - - @app.command() def build( image: Annotated[str | None, typer.Option("--image", help="PI image tag")] = None, @@ -43,7 +33,7 @@ def build( make_vars["PI_IMAGE"] = image if dockerfile: make_vars["PI_DOCKERFILE"] = dockerfile - run_make("build-pi", make_vars) + run_make(Target.BUILD_PI, make_vars) @app.command() @@ -87,13 +77,13 @@ def spawn( make_vars["PI_BASE_URL"] = base_url if model_id: make_vars["PI_MODEL_ID"] = model_id - run_make("spawn-pi", make_vars) + run_make(Target.SPAWN_PI, make_vars) @app.command(name="list") def list_agents() -> None: """List active PI agent containers and PI worktrees.""" - run_make("list-pi-agents") + run_make(Target.LIST_PI_AGENTS) @app.command() @@ -101,7 +91,7 @@ def logs( branch: Annotated[str, typer.Option("--branch", help="PI agent branch name")], ) -> None: """Show logs for a PI agent (live container or persisted log).""" - run_make("logs-pi-agent", {"BRANCH": branch}) + run_make(Target.LOGS_PI_AGENT, {"BRANCH": branch}) @app.command() @@ -109,7 +99,7 @@ def follow( branch: Annotated[str, typer.Option("--branch", help="PI agent branch name")], ) -> None: """Follow live streaming logs for a PI agent.""" - run_make("follow-pi-agent", {"BRANCH": branch}, tty=True) + run_make(Target.FOLLOW_PI_AGENT, {"BRANCH": branch}, tty=True) @app.command() @@ -117,7 +107,7 @@ def stop( branch: Annotated[str, typer.Option("--branch", help="PI agent branch name")], ) -> None: """Stop a PI agent container.""" - run_make("stop-pi-agent", {"BRANCH": branch}) + run_make(Target.STOP_PI_AGENT, {"BRANCH": branch}) @app.command() @@ -125,10 +115,4 @@ def status( branch: Annotated[str, typer.Option("--branch", help="PI agent branch name")], ) -> None: """Show PI agent status from persisted status.json file.""" - status_file = _agents_home() / branch / ".agent" / "status.json" - if not status_file.exists(): - typer.echo(f"[pi-status] No status file found for branch '{branch}'.") - typer.echo(f"[pi-status] Expected at: {status_file}") - raise typer.Exit(1) - data = json.loads(status_file.read_text()) - typer.echo(json.dumps(data, indent=2)) + print_agent_status(branch, label="pi-status") diff --git a/app/cli/src/container_cli/commands/run.py b/app/cli/src/container_cli/commands/run.py index 1f17828..5c9da95 100644 --- a/app/cli/src/container_cli/commands/run.py +++ b/app/cli/src/container_cli/commands/run.py @@ -1,21 +1,18 @@ -"""Interactive coordinator container (`run` and `shell` aliases).""" +"""Interactive coordinator container (`run` and `shell` aliases). + +Registered as top-level commands by `container_cli.main`. +""" from typing import Annotated import typer +from container_cli.targets import Target from container_cli.utils import check_token, run_make -app = typer.Typer(help="Run coordinator container") - -@app.command() -def run( - cpus: Annotated[int | None, typer.Option("--cpus", help="CPU count")] = None, - memory: Annotated[str | None, typer.Option("--memory", help="Memory limit (e.g. 12G)")] = None, - name: Annotated[str | None, typer.Option("--name", help="Container name")] = None, -) -> None: - """Run an interactive coordinator shell (hands off TTY).""" +def _coordinator(target: Target, cpus: int | None, memory: str | None, name: str | None) -> None: + """Validate the token and launch the coordinator container via `target`.""" check_token() make_vars: dict[str, str] = {} if cpus is not None: @@ -24,22 +21,22 @@ def run( make_vars["MEMORY"] = memory if name: make_vars["NAME"] = name - run_make("run", make_vars, tty=True) + run_make(target, make_vars, tty=True) + + +def run( + cpus: Annotated[int | None, typer.Option("--cpus", help="CPU count")] = None, + memory: Annotated[str | None, typer.Option("--memory", help="Memory limit (e.g. 12G)")] = None, + name: Annotated[str | None, typer.Option("--name", help="Container name")] = None, +) -> None: + """Run an interactive coordinator shell (hands off TTY).""" + _coordinator(Target.RUN, cpus, memory, name) -@app.command() def shell( cpus: Annotated[int | None, typer.Option("--cpus", help="CPU count")] = None, memory: Annotated[str | None, typer.Option("--memory", help="Memory limit (e.g. 12G)")] = None, name: Annotated[str | None, typer.Option("--name", help="Container name")] = None, ) -> None: """Alias for run — open an interactive shell in the coordinator container.""" - check_token() - make_vars: dict[str, str] = {} - if cpus is not None: - make_vars["CPUS"] = str(cpus) - if memory: - make_vars["MEMORY"] = memory - if name: - make_vars["NAME"] = name - run_make("shell", make_vars, tty=True) + _coordinator(Target.SHELL, cpus, memory, name) diff --git a/app/cli/src/container_cli/targets.py b/app/cli/src/container_cli/targets.py new file mode 100644 index 0000000..69e8def --- /dev/null +++ b/app/cli/src/container_cli/targets.py @@ -0,0 +1,45 @@ +"""Single source of truth for the config/Makefile targets the q CLI invokes. + +Centralising the target names here turns the implicit, scattered functional +coupling between the CLI and the orchestration Makefile into one explicit +contract: a renamed target now has a single Python counterpart to update, and +``tests/test_targets.py`` verifies every member against the real Makefile. + +``Target`` is a ``StrEnum``, so each member compares equal to its string value +and can be passed anywhere a plain target string is expected. +""" + +from enum import StrEnum + + +class Target(StrEnum): + """Names of the config/Makefile targets invoked through ``utils.run_make``.""" + + # Image build and cleanup. + BUILD = "build" + CLEAN = "clean" + CLEAN_NETWORK = "clean-network" + CLEAN_ALL = "clean-all" + + # Bridge network. + NETWORK = "network" + + # Interactive coordinator container. + RUN = "run" + SHELL = "shell" + + # Claude agent lifecycle. + SPAWN = "spawn" + LIST_AGENTS = "list-agents" + LOGS_AGENT = "logs-agent" + FOLLOW_AGENT = "follow-agent" + STOP_AGENT = "stop-agent" + SUMMARY_AGENT = "summary-agent" + + # PI agent lifecycle (local mlx_lm backend). + BUILD_PI = "build-pi" + SPAWN_PI = "spawn-pi" + LIST_PI_AGENTS = "list-pi-agents" + LOGS_PI_AGENT = "logs-pi-agent" + FOLLOW_PI_AGENT = "follow-pi-agent" + STOP_PI_AGENT = "stop-pi-agent" diff --git a/app/cli/src/container_cli/utils.py b/app/cli/src/container_cli/utils.py index c22fdde..b004724 100644 --- a/app/cli/src/container_cli/utils.py +++ b/app/cli/src/container_cli/utils.py @@ -1,11 +1,14 @@ -"""Shared helpers: git root resolution, Makefile runner, and token validation.""" +"""Shared helpers: git/worktree paths, the Makefile runner, token and status I/O.""" +import json import os import subprocess from pathlib import Path import typer +from container_cli.targets import Target + def find_git_root() -> Path: """Return the absolute path of the repository root. @@ -35,6 +38,19 @@ def makefile_dir() -> Path: return find_git_root() / "config" +def agents_home() -> Path: + """Return the directory that holds agent worktrees. + + Returns: + The `AGENTS_HOME` environment variable when set, otherwise a sibling + `.worktrees/` directory next to the repository root. + """ + env_val = os.environ.get("AGENTS_HOME") + if env_val: + return Path(env_val) + return find_git_root().parent / ".worktrees" + + def check_token() -> None: """Verify that the Claude container OAuth token is exported. @@ -50,11 +66,14 @@ def check_token() -> None: raise typer.Exit(1) -def run_make(target: str, extra_vars: dict[str, str] | None = None, *, tty: bool = False) -> None: +def run_make( + target: Target, extra_vars: dict[str, str] | None = None, *, tty: bool = False +) -> None: """Invoke a Makefile target inside the project's `config/` directory. Args: - target: The Make target name to execute. + target: The Makefile target to execute (a `Target` member, which is a + `StrEnum` and therefore usable directly as the make argument). extra_vars: Optional mapping of variables passed as `KEY=VALUE` to make. tty: If True, replace the current process with `make` so it inherits the TTY (used by interactive commands like `run` and `follow`). @@ -71,3 +90,23 @@ def run_make(target: str, extra_vars: dict[str, str] | None = None, *, tty: bool result = subprocess.run(cmd) if result.returncode != 0: raise typer.Exit(result.returncode) + + +def print_agent_status(branch: str, *, label: str) -> None: + """Print the persisted `status.json` for an agent worktree branch. + + Args: + branch: The agent branch whose status should be displayed. + label: Tag used in the not-found messages (e.g. `status`, `pi-status`). + + Raises: + typer.Exit: With code 1 when no status file exists for the branch. + """ + status_file = agents_home() / branch / ".agent" / "status.json" + if not status_file.exists(): + typer.echo(f"[{label}] No status file found for branch '{branch}'.") + typer.echo(f"[{label}] Expected at: {status_file}") + raise typer.Exit(1) + + data = json.loads(status_file.read_text()) + typer.echo(json.dumps(data, indent=2)) diff --git a/app/cli/tests/acceptance/conftest.py b/app/cli/tests/acceptance/conftest.py index 6d00b06..7406de6 100644 --- a/app/cli/tests/acceptance/conftest.py +++ b/app/cli/tests/acceptance/conftest.py @@ -42,8 +42,7 @@ def invocation_context( patch("container_cli.commands.run.run_make") as m_run, patch("container_cli.commands.network.run_make") as m_network, patch("container_cli.commands.pi_agents.run_make") as m_pi, - patch("container_cli.commands.agents.find_git_root", return_value=repo), - patch("container_cli.commands.pi_agents.find_git_root", return_value=repo), + patch("container_cli.utils.find_git_root", return_value=repo), ): ctx = InvocationContext( runner=CliRunner(), diff --git a/app/cli/tests/acceptance/features/spawn.feature b/app/cli/tests/acceptance/features/spawn.feature index 36d98cc..37e6cb1 100644 --- a/app/cli/tests/acceptance/features/spawn.feature +++ b/app/cli/tests/acceptance/features/spawn.feature @@ -24,3 +24,9 @@ Feature: Spawn agent containers When I run "q spawn --branch feat/bar --task work --cpus 8 --memory 16G --image custom:tag" Then the command exits successfully And the make vars include CPUS="8" and MEMORY="16G" and IMAGE="custom:tag" + + Scenario: Spawn with an explicit agent model + Given the CLAUDE_CONTAINER_OAUTH_TOKEN is set + When I run "q spawn --branch feat/baz --task work --model opus" + Then the command exits successfully + And the make vars include MODEL="opus" diff --git a/app/cli/tests/e2e/__init__.py b/app/cli/tests/e2e/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/app/cli/tests/e2e/conftest.py b/app/cli/tests/e2e/conftest.py new file mode 100644 index 0000000..eb65fce --- /dev/null +++ b/app/cli/tests/e2e/conftest.py @@ -0,0 +1,276 @@ +"""Shared fixtures and collection gating for the opt-in end-to-end tests. + +These tests spawn real Apple Container CLI containers, so they are: + +* **local-only** — GitHub Actions has no Apple Container CLI; +* **opt-in** — collected only when ``STACKAI_E2E=1`` is exported; +* **slow and billable** — the Claude round-trip spends real Claude credits. + +Run them with ``make e2e-test`` (from ``app/cli/``), which exports the gate +variable. Without it, pytest never imports the test modules, so the default +``uv run pytest`` and the mutmut suite ignore them entirely. +""" + +from __future__ import annotations + +import json +import os +import shutil +import subprocess +import sys +import tempfile +import time +import urllib.error +import urllib.request +from collections.abc import Callable, Iterator +from pathlib import Path + +import pytest + +# Collection gate: without STACKAI_E2E=1, pytest skips collecting these modules. +collect_ignore_glob = [] if os.environ.get("STACKAI_E2E") == "1" else ["test_*.py"] + +# Terminal phases written by config/entrypoint*.sh into .agent/status.json. +_TERMINAL_PHASES = frozenset({"completed", "errored"}) + + +def _run_q(*args: str, timeout: float = 600) -> subprocess.CompletedProcess[str]: + """Invoke the real q CLI in a subprocess, inheriting the current environment.""" + return subprocess.run( + [sys.executable, "-m", "container_cli.main", *args], + capture_output=True, + text=True, + timeout=timeout, + check=False, + ) + + +def _image_present(image: str) -> bool | None: + """Report whether a container image exists locally. + + Returns: + True or False when the Apple Container CLI answers, None when the + command is unavailable so the caller should not skip on that basis. + """ + result = subprocess.run( + ["container", "image", "list"], + capture_output=True, + text=True, + check=False, + ) + if result.returncode != 0: + return None + return image.split(":")[0] in result.stdout + + +@pytest.fixture() +def run_q() -> Callable[..., subprocess.CompletedProcess[str]]: + """Return a helper that runs the q CLI and captures its output.""" + return _run_q + + +@pytest.fixture() +def require_container_cli() -> None: + """Skip the test unless the Apple `container` CLI is on PATH.""" + if shutil.which("container") is None: + pytest.skip("Apple Container CLI ('container') not found — E2E needs macOS 26+ ARM64") + + +@pytest.fixture() +def require_claude_token() -> None: + """Skip the test unless a Claude container OAuth token is exported.""" + if not os.environ.get("CLAUDE_CONTAINER_OAUTH_TOKEN"): + pytest.skip("CLAUDE_CONTAINER_OAUTH_TOKEN not set — required to spawn a Claude agent") + + +@pytest.fixture() +def require_claude_image() -> None: + """Skip the test unless the claude-agent:wolfi image is built.""" + if _image_present("claude-agent:wolfi") is False: + pytest.skip("image claude-agent:wolfi not built — run `q build` first") + + +@pytest.fixture() +def require_pi_image() -> None: + """Skip the test unless the claude-pi:ubuntu image is built.""" + if _image_present("claude-pi:ubuntu") is False: + pytest.skip("image claude-pi:ubuntu not built — run `q pi build` first") + + +def _http_ok(url: str, *, timeout: float) -> bool: + """Tiny probe — True when the URL responds 200.""" + try: + with urllib.request.urlopen(url, timeout=timeout) as resp: + return resp.status == 200 + except (urllib.error.URLError, OSError): + return False + + +def _mlx_url() -> str: + return os.environ.get("STACKAI_E2E_MLX_URL", "http://localhost:8080/v1/models") + + +@pytest.fixture(scope="session") +def _mlx_server_lifecycle() -> Iterator[subprocess.Popen[bytes] | None]: + """Optionally spawn mlx_lm.server for the duration of the e2e session. + + Activated by exporting ``STACKAI_E2E_AUTOSTART_MLX=1``. Without that flag, + or when a server is already reachable on the expected URL, this is a no-op + — tests still see whatever process the user manages out of band. + + The exact invocation mirrors the parameters the PI agent expects (no + coupling to the `iac` CLI: the e2e suite must run standalone). First-run + cost is significant — model download + warmup can take several minutes; + raise the wait ceiling with ``STACKAI_E2E_MLX_BOOT_TIMEOUT`` if needed. + """ + if os.environ.get("STACKAI_E2E_AUTOSTART_MLX") != "1": + yield None + return + + url = _mlx_url() + if _http_ok(url, timeout=2): + # Pre-existing server (e.g. started via `uv run iac server start`) — + # leave it alone so manual setups survive teardown. + yield None + return + + # conftest.py lives at app/cli/tests/e2e/ → repo root is parents[4]. + iac_dir = Path(__file__).resolve().parents[4] / "iac" + cmd = [ + "uv", "run", "--directory", str(iac_dir), "mlx_lm.server", + "--model", "mlx-community/gemma-4-26b-a4b-it-4bit", + "--host", "0.0.0.0", + "--port", "8080", + "--prompt-cache-size", "5", + "--prompt-cache-bytes", "6GB", + "--decode-concurrency", "4", + "--prompt-concurrency", "2", + "--prefill-step-size", "1024", + "--temp", "0.9", + "--top-p", "0.95", + "--top-k", "40", + "--min-p", "0.0", + "--max-tokens", "2048", + "--use-default-chat-template", + "--log-level", "INFO", + ] + + log_path = Path(tempfile.gettempdir()) / "stackai-e2e-mlx.log" + log_handle = log_path.open("ab") + print(f"\n[mlx] starting mlx_lm.server (logs: {log_path})", flush=True) + proc = subprocess.Popen( # noqa: S603 — fixed argv, no shell. + cmd, + stdout=log_handle, + stderr=subprocess.STDOUT, + stdin=subprocess.DEVNULL, + start_new_session=True, + ) + + try: + timeout = float(os.environ.get("STACKAI_E2E_MLX_BOOT_TIMEOUT", "600")) + deadline = time.monotonic() + timeout + while time.monotonic() < deadline: + if proc.poll() is not None: + raise RuntimeError( + f"mlx_lm.server exited during startup (code {proc.returncode});" + f" see {log_path}" + ) + if _http_ok(url, timeout=2): + break + time.sleep(3) + else: + proc.terminate() + raise TimeoutError( + f"mlx_lm.server did not become reachable at {url} within" + f" {timeout}s; see {log_path}" + ) + print(f"[mlx] ready on {url}", flush=True) + yield proc + finally: + log_handle.close() + if proc.poll() is None: + proc.terminate() + try: + proc.wait(timeout=10) + except subprocess.TimeoutExpired: + proc.kill() + proc.wait(timeout=5) + + +@pytest.fixture() +def require_mlx_server(_mlx_server_lifecycle) -> None: + """Skip the test unless the local mlx_lm.server is reachable on the host. + + Composes with ``_mlx_server_lifecycle`` so when the autostart env var is + set, the server is already up by the time this check runs. + """ + url = _mlx_url() + if _http_ok(url, timeout=5): + return + pytest.skip( + f"mlx_lm.server unreachable at {url} — run `uv run iac server start`" + " or set STACKAI_E2E_AUTOSTART_MLX=1 to let the suite spawn it" + ) + + +@pytest.fixture() +def isolated_agents_home(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path: + """Point AGENTS_HOME at a throwaway directory for the duration of the test.""" + home = tmp_path / "worktrees" + home.mkdir() + monkeypatch.setenv("AGENTS_HOME", str(home)) + return home + + +@pytest.fixture() +def unique_branch() -> str: + """A collision-free, flat branch name (no slashes — see docs/agents/cli.md).""" + return f"e2e-{int(time.time())}-{os.getpid()}" + + +@pytest.fixture() +def wait_for_terminal_phase() -> Callable[..., dict]: + """Return a poller that blocks until an agent writes a terminal status.json.""" + + def _wait(agents_home: Path, branch: str, timeout: float = 600) -> dict: + status_file = agents_home / branch / ".agent" / "status.json" + deadline = time.monotonic() + timeout + last_phase = "" + while time.monotonic() < deadline: + if status_file.exists(): + try: + data = json.loads(status_file.read_text()) + except json.JSONDecodeError: + data = {} + last_phase = data.get("phase", "") + if last_phase in _TERMINAL_PHASES: + return data + time.sleep(5) + raise AssertionError( + f"agent '{branch}' did not reach a terminal phase within {timeout}s " + f"(last phase seen: {last_phase})" + ) + + return _wait + + +@pytest.fixture() +def agent_cleanup(isolated_agents_home: Path) -> Iterator[Callable[..., None]]: + """Register (branch, kind) pairs to be stopped and pruned after the test.""" + registered: list[tuple[str, str]] = [] + + def _register(branch: str, kind: str = "claude") -> None: + registered.append((branch, kind)) + + yield _register + + for branch, kind in registered: + stop = ["pi", "stop"] if kind == "pi" else ["agents", "stop"] + _run_q(*stop, "--branch", branch, timeout=120) + worktree = isolated_agents_home / branch + for cmd in ( + ["git", "worktree", "remove", "--force", str(worktree)], + ["git", "worktree", "prune"], + ["git", "branch", "-D", branch], + ): + subprocess.run(cmd, capture_output=True, text=True, check=False) diff --git a/app/cli/tests/e2e/test_claude_agent_e2e.py b/app/cli/tests/e2e/test_claude_agent_e2e.py new file mode 100644 index 0000000..1eaecdc --- /dev/null +++ b/app/cli/tests/e2e/test_claude_agent_e2e.py @@ -0,0 +1,68 @@ +"""End-to-end test: orchestrator <-> Claude agent communication. + +Spawns a real Claude agent container through the ``q`` CLI, waits for it to +finish, and asserts the orchestrator receives the agent's report through every +channel: ``status.json``, the persisted log, the ``[agent:status]`` markers, +and the git commit on the worktree branch. + +Opt-in and local-only — see ``tests/e2e/conftest.py``. +""" + +from __future__ import annotations + +import json + +import pytest + +_TASK = ( + "Create a file named E2E_OK.txt in the current directory containing " + "exactly the text PASS, then stage and commit it with git." +) + + +@pytest.mark.e2e +def test_claude_agent_round_trip( + require_container_cli, + require_claude_token, + require_claude_image, + isolated_agents_home, + unique_branch, + run_q, + wait_for_terminal_phase, + agent_cleanup, +): + """A spawned Claude agent does the task and reports back to the orchestrator.""" + branch = unique_branch + + # Outbound — the orchestrator ensures the network exists, then dispatches. + network = run_q("network") + assert network.returncode == 0, network.stderr + + spawn = run_q("spawn", "--branch", branch, "--task", _TASK) + agent_cleanup(branch, "claude") + assert spawn.returncode == 0, f"spawn failed:\n{spawn.stdout}\n{spawn.stderr}" + + # Inbound channel 1 — status.json reaches a terminal phase. + status = wait_for_terminal_phase(isolated_agents_home, branch) + assert status["phase"] == "completed", f"agent did not complete: {status}" + assert status["exit_code"] == 0, f"agent exited non-zero: {status}" + assert status["commits"] >= 1, f"agent reported no commit: {status}" + + # Outbound result — the task produced real work in the shared worktree. + artifact = isolated_agents_home / branch / "E2E_OK.txt" + assert artifact.is_file(), f"agent did not create {artifact}" + assert artifact.read_text().strip() == "PASS" + + # Inbound channel 2 — `q agents status` surfaces the report to the orchestrator. + cli_status = run_q("agents", "status", "--branch", branch) + assert cli_status.returncode == 0, cli_status.stderr + assert json.loads(cli_status.stdout)["phase"] == "completed" + + # Inbound channel 3 — the persisted log and the [agent:status] markers. + logs = run_q("agents", "logs", "--branch", branch) + assert logs.returncode == 0, logs.stderr + assert logs.stdout.strip(), "agent log is empty" + + summary = run_q("agents", "summary", "--branch", branch) + assert summary.returncode == 0, summary.stderr + assert "PHASE=completed" in summary.stdout, summary.stdout diff --git a/app/cli/tests/e2e/test_pi_agent_e2e.py b/app/cli/tests/e2e/test_pi_agent_e2e.py new file mode 100644 index 0000000..32ceaa3 --- /dev/null +++ b/app/cli/tests/e2e/test_pi_agent_e2e.py @@ -0,0 +1,70 @@ +"""End-to-end test: orchestrator <-> PI agent communication. + +Spawns a real PI agent container (local ``mlx_lm.server`` backend) through the +``q pi`` CLI and asserts the orchestrator receives the agent's report. PI +agents need no Claude token — they authenticate against the local model — but +they do need the model server running on the host. + +This test validates the *communication* round-trip, not model quality: a +terminal ``status.json`` carrying ``agent_kind="pi"`` plus a captured log proves +the task was delivered and the report came back. + +``MAX_PI_AGENTS`` enforcement is intentionally not E2E-tested here — it would +require a racy spawn-while-running attempt; it is covered by the Makefile logic +and the mocked acceptance suite. + +Opt-in and local-only — see ``tests/e2e/conftest.py``. +""" + +from __future__ import annotations + +import json + +import pytest + +_TASK = ( + "Create a file named E2E_PI_OK.txt in the current directory containing exactly the text PASS." +) + + +@pytest.mark.e2e +def test_pi_agent_round_trip( + require_container_cli, + require_pi_image, + require_mlx_server, + isolated_agents_home, + unique_branch, + run_q, + wait_for_terminal_phase, + agent_cleanup, +): + """A spawned PI agent runs against the local model and reports back.""" + branch = unique_branch + + # Outbound — dispatch the task to a PI container (no Claude token needed). + network = run_q("network") + assert network.returncode == 0, network.stderr + + spawn = run_q("pi", "spawn", "--branch", branch, "--task", _TASK) + agent_cleanup(branch, "pi") + assert spawn.returncode == 0, f"pi spawn failed:\n{spawn.stdout}\n{spawn.stderr}" + + # Inbound channel 1 — status.json reaches a terminal phase, tagged as PI. + status = wait_for_terminal_phase(isolated_agents_home, branch) + assert status["phase"] == "completed", f"PI agent did not complete: {status}" + assert status["agent_kind"] == "pi", f"status not tagged as PI: {status}" + + # Inbound channel 2 — `q pi status` surfaces the PI report to the orchestrator. + cli_status = run_q("pi", "status", "--branch", branch) + assert cli_status.returncode == 0, cli_status.stderr + assert json.loads(cli_status.stdout)["agent_kind"] == "pi" + + # Inbound channel 3 — `q pi list` filters this worktree in by agent_kind=pi. + pi_list = run_q("pi", "list") + assert pi_list.returncode == 0, pi_list.stderr + assert branch in pi_list.stdout, f"branch {branch} not listed by `q pi list`" + + # Inbound channel 4 — the persisted log is captured. + logs = run_q("pi", "logs", "--branch", branch) + assert logs.returncode == 0, logs.stderr + assert logs.stdout.strip(), "PI agent log is empty" diff --git a/app/cli/tests/test_agents.py b/app/cli/tests/test_agents.py index c229235..b6374dd 100644 --- a/app/cli/tests/test_agents.py +++ b/app/cli/tests/test_agents.py @@ -7,7 +7,6 @@ import typer from container_cli.commands.agents import ( - _agents_home, follow, list_agents, logs, @@ -43,8 +42,18 @@ def test_spawn_with_image(self, mock_run_make, mock_check_token): call_vars = mock_run_make["agents"].call_args[0][1] assert call_vars["IMAGE"] == "my-img:latest" + def test_spawn_with_model(self, mock_run_make, mock_check_token): + spawn(branch="b", task="t", cpus=None, memory=None, image=None, model="opus") + call_vars = mock_run_make["agents"].call_args[0][1] + assert call_vars["MODEL"] == "opus" + + def test_spawn_omits_model_when_unset(self, mock_run_make, mock_check_token): + spawn(branch="b", task="t", cpus=None, memory=None, image=None, model=None) + call_vars = mock_run_make["agents"].call_args[0][1] + assert "MODEL" not in call_vars + def test_spawn_all_optional_params(self, mock_run_make, mock_check_token): - spawn(branch="b", task="t", cpus=2, memory="4G", image="img") + spawn(branch="b", task="t", cpus=2, memory="4G", image="img", model="opus") call_vars = mock_run_make["agents"].call_args[0][1] assert call_vars == { "BRANCH": "b", @@ -52,6 +61,7 @@ def test_spawn_all_optional_params(self, mock_run_make, mock_check_token): "CPUS": "2", "MEMORY": "4G", "IMAGE": "img", + "MODEL": "opus", } @@ -98,22 +108,8 @@ def test_reads_status_file(self, tmp_path, monkeypatch): def test_agents_home_fallback(self, monkeypatch): monkeypatch.delenv("AGENTS_HOME", raising=False) with ( - patch("container_cli.commands.agents.find_git_root", return_value=Path("/fake/repo")), + patch("container_cli.utils.find_git_root", return_value=Path("/fake/repo")), pytest.raises(typer.Exit) as exc_info, ): status(branch="nonexistent-branch") assert exc_info.value.exit_code == 1 - - -class TestAgentsHome: - def test_uses_env_var(self, tmp_path, monkeypatch): - monkeypatch.setenv("AGENTS_HOME", str(tmp_path)) - assert _agents_home() == tmp_path - - def test_fallback_path_name(self, monkeypatch): - monkeypatch.delenv("AGENTS_HOME", raising=False) - with patch( - "container_cli.commands.agents.find_git_root", return_value=Path("/home/user/repo") - ): - result = _agents_home() - assert result == Path("/home/user/.worktrees") diff --git a/app/cli/tests/test_pi_agents.py b/app/cli/tests/test_pi_agents.py index 5193ad4..21b10d2 100644 --- a/app/cli/tests/test_pi_agents.py +++ b/app/cli/tests/test_pi_agents.py @@ -6,14 +6,10 @@ from __future__ import annotations -from pathlib import Path -from unittest.mock import patch - import pytest import typer from container_cli.commands.pi_agents import ( - _agents_home, build, follow, list_agents, @@ -148,18 +144,3 @@ def test_reads_status_file(self, tmp_path, monkeypatch): status_file.parent.mkdir(parents=True) status_file.write_text('{"phase": "completed", "agent_kind": "pi"}') status(branch="pi/x") - - -class TestAgentsHome: - def test_uses_env_var(self, tmp_path, monkeypatch): - monkeypatch.setenv("AGENTS_HOME", str(tmp_path)) - assert _agents_home() == tmp_path - - def test_fallback_path_name(self, monkeypatch): - monkeypatch.delenv("AGENTS_HOME", raising=False) - with patch( - "container_cli.commands.pi_agents.find_git_root", - return_value=Path("/home/user/repo"), - ): - result = _agents_home() - assert result == Path("/home/user/.worktrees") diff --git a/app/cli/tests/test_targets.py b/app/cli/tests/test_targets.py new file mode 100644 index 0000000..971ffbb --- /dev/null +++ b/app/cli/tests/test_targets.py @@ -0,0 +1,46 @@ +from __future__ import annotations + +import re +from pathlib import Path + +from container_cli.targets import Target + +_TARGET_DEF = re.compile(r"^([a-zA-Z0-9_-]+):", re.MULTILINE) + + +def _find_makefile() -> Path: + """Locate config/Makefile by walking up from this test file. + + Walking up (rather than a fixed parent index) keeps the lookup correct when + the test runs from the mutmut copy under ``mutants/``. + """ + for parent in Path(__file__).resolve().parents: + candidate = parent / "config" / "Makefile" + if candidate.is_file(): + return candidate + raise FileNotFoundError("config/Makefile not found in any parent directory") + + +_MAKEFILE = _find_makefile() + + +class TestTargetValues: + def test_every_member_value_is_kebab_case(self): + for target in Target: + assert re.fullmatch(r"[a-z][a-z-]*", target.value), target + + def test_member_equals_its_string_value(self): + assert Target.SPAWN == "spawn" + assert Target.LIST_AGENTS == "list-agents" + assert Target.SPAWN_PI == "spawn-pi" + + def test_no_duplicate_values(self): + values = [t.value for t in Target] + assert len(values) == len(set(values)) + + +class TestMakefileContract: + def test_every_target_exists_in_the_makefile(self): + defined = set(_TARGET_DEF.findall(_MAKEFILE.read_text())) + missing = {t.value for t in Target} - defined + assert not missing, f"Target members absent from config/Makefile: {sorted(missing)}" diff --git a/app/cli/tests/test_utils.py b/app/cli/tests/test_utils.py index 554fe8a..efa26e8 100644 --- a/app/cli/tests/test_utils.py +++ b/app/cli/tests/test_utils.py @@ -7,7 +7,14 @@ import pytest import typer -from container_cli.utils import check_token, find_git_root, makefile_dir, run_make +from container_cli.utils import ( + agents_home, + check_token, + find_git_root, + makefile_dir, + print_agent_status, + run_make, +) # ---------- find_git_root ---------- @@ -132,3 +139,49 @@ def test_default_tty_uses_subprocess(self, fake_git_root: Path): run_make("build") m_sub.assert_called_once() m_exec.assert_not_called() + + +# ---------- agents_home ---------- + + +class TestAgentsHome: + def test_uses_env_var(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch): + monkeypatch.setenv("AGENTS_HOME", str(tmp_path)) + assert agents_home() == tmp_path + + def test_fallback_path_name(self, monkeypatch: pytest.MonkeyPatch): + monkeypatch.delenv("AGENTS_HOME", raising=False) + with patch("container_cli.utils.find_git_root", return_value=Path("/home/user/repo")): + assert agents_home() == Path("/home/user/.worktrees") + + +# ---------- print_agent_status ---------- + + +class TestPrintAgentStatus: + def test_exits_when_status_file_missing( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys + ): + monkeypatch.setenv("AGENTS_HOME", str(tmp_path)) + with pytest.raises(typer.Exit) as exc_info: + print_agent_status("feat-x", label="status") + assert exc_info.value.exit_code == 1 + assert "[status] No status file found" in capsys.readouterr().out + + def test_prints_json_when_status_file_exists( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys + ): + monkeypatch.setenv("AGENTS_HOME", str(tmp_path)) + status_file = tmp_path / "feat-x" / ".agent" / "status.json" + status_file.parent.mkdir(parents=True) + status_file.write_text('{"phase": "completed"}') + print_agent_status("feat-x", label="status") + assert "completed" in capsys.readouterr().out + + def test_label_tags_the_not_found_message( + self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys + ): + monkeypatch.setenv("AGENTS_HOME", str(tmp_path)) + with pytest.raises(typer.Exit): + print_agent_status("feat-x", label="pi-status") + assert "[pi-status]" in capsys.readouterr().out diff --git a/config/Dockerfile.wolfi b/config/Dockerfile.wolfi index 7a17aa1..1bed1c8 100644 --- a/config/Dockerfile.wolfi +++ b/config/Dockerfile.wolfi @@ -80,10 +80,17 @@ COPY --from=builder /root/.cargo/bin/dust /usr/local/bin/dust COPY --from=builder /root/.cargo/bin/procs /usr/local/bin/procs COPY --from=builder /root/.cargo/bin/btm /usr/local/bin/btm -# Claude Code CLI +# Claude Code CLI — pin the install at build time. CLAUDE_CODE_DISABLE_AUTOUPDATE=1 +# in this image means the CLI never updates inside the container, so the image +# ships exactly the version the installer returns at build time. `claude --version` +# at the end serves two purposes: log the version into the build output and act +# as a cache-busting tail so a `--no-cache-filter` rebuild picks up a newer CLI +# (e.g. 2.1.148+ for the Opus `role 'system'` fix) without forcing a full +# --no-cache that would rebuild the slow Rust builder stage. RUN curl -fsSL https://claude.ai/install.sh -o /tmp/claude-install.sh \ && yes | bash /tmp/claude-install.sh \ - && rm -f /tmp/claude-install.sh + && rm -f /tmp/claude-install.sh \ + && claude --version # OpenCode RUN curl -fsSL https://opencode.ai/install | bash \ diff --git a/config/Makefile b/config/Makefile index ae31d69..d56ba76 100644 --- a/config/Makefile +++ b/config/Makefile @@ -44,6 +44,11 @@ MEMORY ?= 3G # Agent variables BRANCH ?= agent-$(shell date +%s) TASK ?= Explore the codebase and report what you find. +# MODEL: Claude model the headless agent runs with. Defaults to opus — the +# agent doesn't inherit the host's interactive `model` preference (see the +# entrypoint); a fixed default keeps the choice in the orchestrator. Override +# per spawn, e.g. `make spawn ... MODEL=sonnet`. +MODEL ?= opus # Derived paths GIT_ROOT := $(shell git -C $(CURDIR) rev-parse --show-toplevel) @@ -109,6 +114,7 @@ spawn: network _check-token @echo "[spawn] Launching agent: $(PROJECT_NAME)-$(CONTAINER_BRANCH)" @echo "[spawn] Worktree: $(WORKTREES_DIR)/$(BRANCH)" @echo "[spawn] Task: $(TASK)" + @echo "[spawn] Model: $(MODEL)" container run -d --rm \ --name $(PROJECT_NAME)-$(CONTAINER_BRANCH) \ --network $(NETWORK) \ @@ -120,6 +126,7 @@ spawn: network _check-token -v $(HOME)/.claude:/root/.claudenew:ro \ -v $(HOME)/.claude.json:/root/.claudenew.json:ro \ -e CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1 \ + -e AGENT_MODEL=$(MODEL) \ -e CLAUDE_CODE_OAUTH_TOKEN=$${$(HOST_TOKEN_VAR)} \ $(IMAGE) \ --worktree "$(BRANCH)" --task "$(TASK)" @@ -392,6 +399,7 @@ help: @printf " %-20s %s\n" "CPUS" "$(CPUS)" @printf " %-20s %s\n" "MEMORY" "$(MEMORY)" @printf " %-20s %s\n" "BRANCH" "$(BRANCH)" + @printf " %-20s %s\n" "MODEL" "$(MODEL)" @printf " %-20s %s\n" "PROJECT_NAME" "$(PROJECT_NAME)" @printf " %-20s %s\n" "GIT_ROOT" "$(GIT_ROOT)" @printf " %-20s %s\n" "AGENTS_HOME" "$(AGENTS_HOME)" diff --git a/config/entrypoint.sh b/config/entrypoint.sh index cbbfd82..7f4ba60 100644 --- a/config/entrypoint.sh +++ b/config/entrypoint.sh @@ -22,6 +22,12 @@ AGENT_TASK="" PROJECT_NAME="" PASSTHROUGH_ARGS=() +# Model the headless agent runs with. The orchestrator passes it via +# -e AGENT_MODEL; it defaults to opus so the agent runs with the most capable +# model by default and never inherits the host user's personal `model` +# preference from the copied settings.json (which can be a different account). +AGENT_MODEL="${AGENT_MODEL:-opus}" + # ── Functions ───────────────────────────────────────────────────────────────── parse_args() { @@ -70,6 +76,7 @@ create_worktree() { setup_agent_perms() { echo "[entrypoint] Starting Claude agent (headless)..." echo "[entrypoint] Task: ${AGENT_TASK}" + echo "[entrypoint] Model: ${AGENT_MODEL}" echo "---" # Make claude's install path traversable for non-root users (installed under /root/) # go+x required: agent is in group root (gid=0), so group bits apply, not others bits @@ -94,7 +101,12 @@ write_status() { emit_marker() { local phase="$1"; shift - echo "[agent:status] PHASE=${phase} BRANCH=${WORKTREE_BRANCH} $*" + local line="[agent:status] PHASE=${phase} BRANCH=${WORKTREE_BRANCH} $*" + # Echo to the container's stdout (read live via `container logs`) and append + # to the persisted log so `summary-agent` still finds the markers once the + # --rm container is gone and `container logs` no longer works. + echo "$line" + { echo "$line" >> "${AGENT_DIR}/agent.log"; } 2>/dev/null || true } run_agent() { @@ -102,6 +114,11 @@ run_agent() { mkdir -p "$AGENT_DIR" chown -R agent:agent "$AGENT_DIR" + # Start the persisted log empty so a re-spawn on an existing worktree does + # not accumulate a previous run's output. Markers and the claude output + # below both append to it. + { : > "${AGENT_DIR}/agent.log"; } 2>/dev/null || true + # Add .agent/ to worktree .gitignore (safe if dir doesn't exist yet) if ! grep -qxF '.agent/' "${WORKTREE_PATH}/.gitignore" 2>/dev/null; then echo '.agent/' >> "${WORKTREE_PATH}/.gitignore" 2>/dev/null || true @@ -117,10 +134,11 @@ run_agent() { write_status "working" emit_marker "working" - # Run claude with tee to persist logs; capture exit code through pipe + # Run claude with tee to persist logs; capture exit code through pipe. + # `tee -a` appends so the "starting"/"working" markers above survive. set +e su-exec agent env HOME=/home/agent claude --dangerously-skip-permissions \ - -p "$AGENT_TASK" 2>&1 | tee "$AGENT_DIR/agent.log" + --model "$AGENT_MODEL" -p "$AGENT_TASK" 2>&1 | tee -a "$AGENT_DIR/agent.log" local exit_code=${PIPESTATUS[0]} set -e diff --git a/config/spec/entrypoint_spec.sh b/config/spec/entrypoint_spec.sh index 0770c0e..c7485c8 100644 --- a/config/spec/entrypoint_spec.sh +++ b/config/spec/entrypoint_spec.sh @@ -342,7 +342,21 @@ WRAPPER_EOF It "runs su-exec with correct claude command" When run run_entrypoint --worktree agent-br --task "do work" - The output should include "[MOCK] su-exec agent env HOME=/home/agent claude --dangerously-skip-permissions -p do work" + The output should include "[MOCK] su-exec agent env HOME=/home/agent claude --dangerously-skip-permissions --model opus -p do work" + The status should equal 0 + End + + It "defaults the agent model to opus" + When run run_entrypoint --worktree agent-br --task "do work" + The output should include "[entrypoint] Model: opus" + The status should equal 0 + End + + It "honors AGENT_MODEL when the orchestrator sets it" + export AGENT_MODEL=sonnet + When run run_entrypoint --worktree agent-br --task "do work" + The output should include "claude --dangerously-skip-permissions --model sonnet -p do work" + The output should include "[entrypoint] Model: sonnet" The status should equal 0 End diff --git a/docs/agents/cli.md b/docs/agents/cli.md index e91bc61..f4e1ae8 100644 --- a/docs/agents/cli.md +++ b/docs/agents/cli.md @@ -69,6 +69,7 @@ Spawns a detached headless agent in an isolated git worktree. ```bash q spawn --branch feat/oauth2 --task "Implement OAuth2 with JWT tokens" q spawn --branch test/payments --task "Write unit tests for payments module" --cpus 4 +q spawn --branch feat/quick-fix --task "Tighten the readme" --model sonnet ``` | Option | Required | Description | @@ -78,9 +79,16 @@ q spawn --branch test/payments --task "Write unit tests for payments module" --c | `--cpus` | no | CPU count override | | `--memory` | no | Memory limit override (e.g. `8G`) | | `--image` | no | Image tag override | +| `--model` | no | Claude model the agent runs (`opus` default; e.g. `sonnet`, `haiku`) | Requires `CLAUDE_CONTAINER_OAUTH_TOKEN` to be set. +> **Agent model:** the agent runs `opus` unless `--model` says otherwise. It +> does **not** inherit the host's `~/.claude/settings.json` `model` preference — +> that file is copied into the container for credentials only, and a headless +> agent must not depend on a personal interactive setting. Drop to `sonnet` for +> lighter tasks. The Makefile passes the choice through as `-e AGENT_MODEL`. + > **Branch naming:** avoid `/` in branch names — they create nested subdirectories in `$AGENTS_HOME` which the entrypoint cannot handle. Use flat names like `feat-oauth2` instead of `feat/oauth2`. --- @@ -235,16 +243,22 @@ app/cli/ ├── pyproject.toml ← uv project, entry point: q └── src/ └── container_cli/ - ├── main.py ← registers all commands + agents sub-app - ├── utils.py ← find_git_root, makefile_dir, check_token, run_make + ├── main.py ← registers all commands + agents/pi sub-apps + ├── targets.py ← Target enum: the config/Makefile target contract + ├── utils.py ← git/worktree paths, run_make, check_token, print_agent_status └── commands/ ├── build.py ← build, clean, clean-network, clean-all ├── network.py ← network ├── run.py ← run, shell - ├── agents.py ← spawn, list, logs, follow, stop + ├── agents.py ← spawn, list, logs, follow, stop, status, summary └── pi_agents.py ← pi build/spawn/list/logs/follow/stop/status ``` +> Target names are not free strings: every Makefile target the CLI invokes is a +> member of the `Target` enum in `targets.py`, and `tests/test_targets.py` +> verifies each one against `config/Makefile`. Real-container round-trip tests +> live in `tests/e2e/` — see [e2e-tests.md](./e2e-tests.md). + --- ## CLI ↔ Makefile mapping diff --git a/docs/agents/container-agent.md b/docs/agents/container-agent.md index 1af685c..d6f33c6 100644 --- a/docs/agents/container-agent.md +++ b/docs/agents/container-agent.md @@ -127,10 +127,12 @@ container run -d --rm claude-agent:wolfi \ 2. `git -C /workspace worktree add /worktrees/ -b ` - If the branch already exists: `git worktree add /worktrees/ ` 3. `cd /worktrees/` -4. `claude --dangerously-skip-permissions -p ""` +4. `claude --dangerously-skip-permissions --model "$AGENT_MODEL" -p ""` **Why `--dangerously-skip-permissions`:** In headless mode there is no interactive user to approve permissions. The container is a sandboxed environment with access only to the mounted worktree, so it is safe to skip confirmations. +**Why `--model "$AGENT_MODEL"`:** The host's `~/.claude` is copied into the container for credentials, and that copy includes the host user's personal `settings.json` — whose `model` preference must not silently govern a headless agent (the container's OAuth token may belong to a different plan). The entrypoint pins the model explicitly: `AGENT_MODEL` is passed via `-e AGENT_MODEL` and defaults to `opus`. Override per spawn with `q spawn --model ` or `make spawn ... MODEL=`. + **Why run as `agent` (non-root):** Claude CLI blocks `--dangerously-skip-permissions` when the process runs as `root` (uid 0). The entrypoint uses `su-exec` to drop to the `agent` user before executing Claude. IMPORTANT: **Why the worktree is created inside the container:** Git needs access to the repository to register the worktree in `.git/worktrees/`. Since the repo is mounted at `/workspace` inside the container, the worktree must be created from there. If it were created directly from the host, the path registered in git would be the host path (`/Users/...`), which would not exist inside the container. @@ -366,7 +368,7 @@ RUN addgroup -S agent \ │ └─► /home/agent/.claude/ (copied + chown → agent) │ - su-exec agent env HOME=/home/agent claude --dangerously-skip-permissions -p "..." + su-exec agent env HOME=/home/agent claude --dangerously-skip-permissions --model "$AGENT_MODEL" -p "..." ``` The entrypoint: diff --git a/docs/agents/e2e-tests.md b/docs/agents/e2e-tests.md new file mode 100644 index 0000000..7d2c687 --- /dev/null +++ b/docs/agents/e2e-tests.md @@ -0,0 +1,103 @@ +# End-to-end tests — orchestrator ↔ agent + +The E2E tests spawn **real Apple Container CLI containers** and validate the +full round-trip between the orchestrator (the `q` CLI) and an agent running in +a container. They complement — they do not replace — the mocked acceptance +suite: acceptance tests are the fast contract, E2E is the real integration +smoke. + +**Location:** `app/cli/tests/e2e/` + +## Why opt-in and local-only + +The tests need Apple Container CLI (macOS 26+, ARM64), which GitHub Actions +does not have — exactly like the acceptance suite. They are also slow and +billable. So they are gated: + +- Collection is guarded by `STACKAI_E2E=1`. Without it, pytest never imports + the test modules, so the default `uv run pytest` and the `mutmut` suite + ignore them entirely. +- `make e2e-test` (from `app/cli/`) exports the variable and runs them. +- They are excluded from CI and from `make local-qa` by design. + +## Prerequisites + +Each test skips itself, with a reason, when a prerequisite is missing. + +| Test | Needs | +|---|---| +| both | Apple Container CLI on `PATH`; the bridge network (created automatically) | +| E2E-1 (Claude) | `CLAUDE_CONTAINER_OAUTH_TOKEN` exported; `claude-agent:wolfi` built (`q build`) | +| E2E-2 (PI) | `claude-pi:ubuntu` built (`q pi build`); `mlx_lm.server` reachable (`uv run iac server start`) | + +> **Cost:** E2E-1 spends real Claude credits — it runs a live headless agent. +> E2E-2 uses the local model, so it is free but needs the model downloaded and +> the server running. + +The PI server URL defaults to `http://localhost:8080/v1/models`; override the +reachability check with `STACKAI_E2E_MLX_URL` if your server listens elsewhere. + +### Auto-launching `mlx_lm.server` from the suite + +Export `STACKAI_E2E_AUTOSTART_MLX=1` and the e2e session bootstraps the server +itself — there's no need to start it out of band. The session-scoped fixture +in `tests/e2e/conftest.py`: + +1. Skips if the URL is already reachable (an existing server you launched + manually is honoured and not torn down). +2. Otherwise spawns the exact `mlx_lm.server` invocation the PI agent + expects (`mlx-community/gemma-4-26b-a4b-it-4bit`, port 8080, + `--temp 0.9 --top-p 0.95`, `--prompt-cache-bytes 6GB`, etc.). +3. Polls `/v1/models` until ready, then yields to the tests. +4. Terminates the subprocess at session teardown. + +Logs land in `$TMPDIR/stackai-e2e-mlx.log`. The first run downloads the model +(~16 GB) and warms the prompt cache, so the boot can take several minutes — +override the wait ceiling with `STACKAI_E2E_MLX_BOOT_TIMEOUT` (seconds, +default 600). + +## How to run + +```bash +cd app/cli +make e2e-test # STACKAI_E2E=1 uv run pytest tests/e2e -v -m e2e +``` + +With auto-launched mlx_lm.server (PI test runs end-to-end too): + +```bash +cd app/cli +STACKAI_E2E_AUTOSTART_MLX=1 make e2e-test +``` + +Run a single test: + +```bash +STACKAI_E2E=1 uv run pytest tests/e2e/test_claude_agent_e2e.py -v +``` + +## What each test asserts + +Both tests dispatch a deterministic task, wait for the agent to reach a +terminal phase, then assert the orchestrator receives the report through every +channel. Each test runs in an isolated `AGENTS_HOME` (a temp directory) and +tears down its container, worktree and branch afterwards. + +### E2E-1 — Claude agent (`test_claude_agent_e2e.py`) + +1. `q network`, then `q spawn` with a task that creates `E2E_OK.txt` and commits. +2. `status.json` reaches `phase=completed`, `exit_code=0`, `commits>=1`. +3. The committed file exists in the shared worktree. +4. `q agents status`, `q agents logs` and `q agents summary` surface the + report (status JSON, persisted log, `[agent:status]` markers). + +### E2E-2 — PI agent (`test_pi_agent_e2e.py`) + +1. `q pi spawn` against the local `mlx_lm.server` backend (no Claude token). +2. `status.json` reaches `phase=completed` and is tagged `agent_kind=pi`. +3. `q pi status`, `q pi list` and `q pi logs` surface the PI report; `q pi list` + filters this worktree in by `agent_kind=pi`. + +`MAX_PI_AGENTS` enforcement is not E2E-tested — it would need a racy +spawn-while-running attempt; it is covered by the Makefile logic and the mocked +acceptance suite. diff --git a/docs/agents/spawn-agent-skill.md b/docs/agents/spawn-agent-skill.md index f7b20d7..78acdfd 100644 --- a/docs/agents/spawn-agent-skill.md +++ b/docs/agents/spawn-agent-skill.md @@ -85,6 +85,7 @@ export CLAUDE_CONTAINER_OAUTH_TOKEN= 5. container run -d --rm ← detached (non-blocking) • -v $GIT_ROOT:/workspace ← full repo (read/write) • -v $AGENTS_HOME:/worktrees ← worktree destination + • -e AGENT_MODEL=$MODEL → claude --model $MODEL (default opus) • --worktree $BRANCH → entrypoint creates the worktree • --task "$TASK" → claude -p "$TASK" in the worktree │ @@ -245,6 +246,7 @@ container run -d --rm \ -v "${HOME}/.claude:/root/.claudenew:ro" \ -v "${HOME}/.claude.json:/root/.claudenew.json:ro" \ -e CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1 \ + -e "AGENT_MODEL=${AGENT_MODEL:-opus}" \ -e "CLAUDE_CODE_OAUTH_TOKEN=${CLAUDE_CONTAINER_OAUTH_TOKEN}" \ claude-agent:wolfi \ --worktree "feat/oauth2" --task "${TASK}" diff --git a/docs/modularity-review/2026-05-21/modularity-review.html b/docs/modularity-review/2026-05-21/modularity-review.html new file mode 100644 index 0000000..95eb179 --- /dev/null +++ b/docs/modularity-review/2026-05-21/modularity-review.html @@ -0,0 +1,473 @@ + + + + + +Modularity Review + + + + + +
+

Modularity Review

+

Scope: The q Python CLI (app/cli/) and its integration boundary with config/Makefile
+Date: 2026-05-21

+ +

Executive Summary

+

stackai orchestrates parallel Claude Code agents in sandboxed Apple Containers; the q CLI is a typed Typer front end that wraps the config/Makefile orchestration targets. In the small the CLI is healthy — utils.run_make cleanly encapsulates the mechanism of invoking make, and every module is short and single-purpose. The modularity weakness is concentrated at one boundary: the CLI's knowledge of which Makefile targets exist and what variables they accept is functionally coupled and implicit — encoded as bare string literals scattered across five command modules with no contract to make the dependency explicit or fail loudly. Because the Makefile is the core subdomain of the product and evolves continuously, this is the most important finding: the CLI silently breaks at runtime when a target is renamed. A second, self-inflicted issue compounds it — the status command reaches past the Makefile into the container's private .agent/status.json file, reproducing an existing status-agent target with intrusive coupling.

+

Overall status: needs attention. There is no critical distributed-monolith failure, but two unbalanced-and-volatile integrations will cause avoidable pain as the Makefile evolves, and two smaller cohesion defects add cognitive friction.

+ +

Coupling Overview

+ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
IntegrationStrengthDistanceVolatilityBalanced?
commands/*config/Makefile (target & variable names)Functional, implicitHigh — separate language, separate directory, subprocessHigh — orchestration is the core subdomain, targets evolveNo — unbalanced & volatile
agents.status / pi.status → container .agent/status.jsonIntrusiveHigh — bash entrypoint, runs inside the containerMedium–High — monitoring actively developedNo — unbalanced & volatile
agents._agents_homepi_agents._agents_homeFunctional, duplicatedLow — sibling modules, one packageLow–MediumNo — duplication mistaken for cohesion
_agents_home() fallback ≡ Makefile AGENTS_HOMEFunctional, duplicatedHigh — cross-languageLowTolerable — low volatility neutralizes
main.pycommands/* (registration)Model, inconsistentLow — one packageLowBorderline — low cohesion
check_token() ≡ Makefile _check-tokenFunctional, duplicatedHigh — cross-languageLowYes — low volatility neutralizes
+

The last row is included deliberately as a contrast: it is duplicated, unbalanced coupling, but volatility is low enough that the balance rule(STRENGTH XOR DISTANCE) OR NOT VOLATILE — is satisfied. Not every imbalance is a defect; only the volatile ones below are.

+ +
+

Issue: Makefile interface is an implicit, scattered contract

+

Integration: container_cli/commands/*config/Makefile
+Severity: Significant

+ +

Knowledge Leakage

+

Every command function knows, as hard-coded strings, two things about the Makefile: the name of a target ("spawn", "list-agents", "spawn-pi", "summary-agent", …) and the names of the variables that target accepts (BRANCH, TASK, CPUS, MEMORY, IMAGE, DOCKERFILE, SUBNET, NETWORK, PI_IMAGE, PI_DOCKERFILE, PI_BASE_URL, PI_MODEL_ID, NAME). That is the Makefile's functional requirements — what it does and how it must be invoked — replicated inside Python. The shared knowledge is implicit: there is no enum, no schema, no generated mapping. The only place the correspondence is written down is a prose table in docs/agents/cli.md, which nothing keeps honest. run_make centralizes the mechanism (make -C config <target> KEY=VALUE) but not the vocabulary, so each of the five commands/* modules independently owns a fragment of an undeclared contract.

+

This is functional coupling — the components must change together when a target's interface changes — and it is implicit functional coupling, which the model singles out as particularly dangerous: there is no compiler, type checker, or test that links agents.py:33's "BRANCH" to the Makefile's BRANCH ?=.

+ +

Complexity Impact

+

A developer changing a Makefile target name or variable cannot see, from the Makefile, that Python depends on it — the dependency is invisible at the point of change. To safely rename one target the developer must simultaneously hold in mind: the Makefile target, the run_make call site in commands/*, the cli.md mapping table, and the acceptance test. That is knowledge spread across four artifacts in two languages for what should be a trivial rename. It exceeds the 4±1 working-memory budget, which is precisely what turns a change from modular (predictable outcome) into complex (outcome discovered only by running it).

+ +

Cascading Changes

+
    +
  • Renaming list-agentsagents-list in the Makefile: the Python literal "list-agents" still type-checks, still passes ruff, still imports cleanly. The break surfaces only when a user runs q agents list and make reports No rule to make target. The acceptance tests mock run_make, so they assert which string was passed, not that the target exists — there is no test-time signal either.
  • +
  • Adding a variable to an existing target (e.g. a --no-cache option for build): requires coordinated edits in build.py, the Makefile, and cli.md, with nothing enforcing that the three agree.
  • +
  • The coupling is genuinely tight: high integration strength (functional) across high distance — separate language, separate top-level directory, subprocess boundary. The Makefile is a core subdomain: the orchestration logic is the product, and git history shows targets added continuously (PI agents, apply-sensors). High strength + high distance + high volatility fails the balance rule.
  • +
+

Why Significant and not Critical: the same person maintains both sides, so socio-technical distance is low — there is no cross-team coordination cost — and Makefile change is mostly additive (new targets), so existing literals rarely break. The danger is real but not yet frequent.

+ +

Recommended Improvement

+

Introduce a single contract module — e.g. container_cli/makefile.py — as the only place target names and their permitted variables are written down. One small structure per target (target name + allowed variable keys) converts the implicit functional coupling into explicit contract coupling localized in one file; command modules then reference Targets.SPAWN instead of the literal "spawn". Optionally add one test that runs make -C config -n <target> (dry-run) for every target in the registry, so a renamed or removed target fails CI instead of a user's terminal.

+

This deliberately does not reduce distance — the CLI must wrap the Makefile, that is its purpose, and decomposing further would only raise distance. It reduces strength by encapsulating the contract, and it concentrates the volatile knowledge so a Makefile change has exactly one Python counterpart to update. Trade-off: a thin indirection layer plus the discipline of keeping the registry current — cheap relative to silent runtime breakage in the product's most-changed area.

+
+ +
+

Issue: status command bypasses the Makefile and reads the container's private file

+

Integration: agents.status / pi_agents.status → container worktree .agent/status.json
+Severity: Significant

+ +

Knowledge Leakage

+

agents.py:78 and pi_agents.py:128 build the path $AGENTS_HOME/<branch>/.agent/status.json and read it directly off disk. .agent/ is internal bookkeeping created by the container entrypoint (config/entrypoint.sh / entrypoint-pi.sh) for its own use; the CLI treats that private layout as if it were a public interface. This is intrusive coupling — the highest integration strength — because the CLI depends on an implementation detail of a different component.

+

The leak is twofold: the directory layout (.agent/status.json) and even the fallback message text ([status] No status file found…, [status] Expected at: …) are reproduced in Python. They are reproduced because the Makefile already has a status-agent target (Makefile:179) that performs the identical read and prints near-identical messages. So this is also functional coupling through duplication — the knowledge is copied, not shared.

+ +

Complexity Impact

+

There are now two implementations of "show agent status" — status-agent in Make and status() in Python — and nothing keeps them consistent. q agents summary routes through make summary-agent, but q agents status does not route through make status-agent; a reader cannot predict from the CLI surface which commands delegate and which reimplement. The Python file read looks like ordinary I/O, not an integration point, so it is easy to miss when reasoning about what depends on the entrypoint's layout — the integration is hidden, which is the hallmark of complexity.

+ +

Cascading Changes

+
    +
  • The entrypoint relocates status.json (say to .agent/state/status.json): status-agent, status-pi-agent, agents.status, and pi_agents.status all break — four sites, two languages — and the two Python ones give no signal until invoked.
  • +
  • The "no status file" UX is reworded in the Makefile only: q … status silently keeps the old wording; the two surfaces drift apart.
  • +
  • status.json is part of actively-developed structured monitoring (recent commits #7 and #9), so this sits in a volatile area — the duplication will be exercised. Intrusive strength + high distance + volatility is unbalanced and not neutralized.
  • +
+ +

Recommended Improvement

+

Route status through the target that already exists: run_make("status-agent", {"BRANCH": branch}) — exactly as summary already does. This removes the intrusive coupling and the duplication in one move: the Makefile becomes the single component that knows the .agent/ layout, and the CLI holds only a target name (governed by the contract module proposed in the first issue). The documented rationale for the direct read — "works after the container has exited" — is already satisfied by status-agent, which is a pure file read with no live-container dependency, so no capability is lost.

+

Trade-off: the Python status currently raises typer.Exit(1) when the file is missing, whereas the Makefile target prints a message and exits 0. If a non-zero exit code is contractually required, add an explicit exit 1 to the missing-file branch of status-agent rather than reimplementing the read in Python. The cost is one line of Make; the gain is eliminating an intrusive, duplicated integration in a volatile area.

+
+ +
+

Issue: Worktree-path logic duplicated across command modules

+

Integration: agents._agents_homepi_agents._agents_home (and both ≡ Makefile AGENTS_HOME)
+Severity: Minor

+ +

Knowledge Leakage

+

_agents_home() is byte-for-byte identical in agents.py:15 and pi_agents.py:25: "use $AGENTS_HOME if set, else <git-root>/../.worktrees". The same rule is the Makefile's AGENTS_HOME ?= $(dirname GIT_ROOT)/.worktrees (Makefile:54). One business rule — where agent worktrees live — implemented three times. This is functional coupling through duplicated knowledge. The run/shell pair in run.py is a smaller instance of the same pattern: two functions with identical bodies that differ only in the target string passed to run_make.

+ +

Complexity Impact

+

The two Python copies sit at low distance — sibling modules in one package. High strength at low distance would normally read as high cohesion, which is modular. But this is duplication, not cohesion: the copies are not one deliberate unit, they are two units that secretly share a rule. The STRENGTH XOR DISTANCE formula says "balanced" only if you mistake the duplication for cohesion. The real cost is the hidden obligation every developer carries — to remember that editing one copy means editing the twin.

+ +

Cascading Changes

+
    +
  • Changing the fallback (e.g. .worktrees.agent-worktrees) requires editing both Python copies; missing one makes q agents … and q pi … resolve different worktree roots — a confusing, hard-to-spot bug.
  • +
  • The Python ↔ Makefile copy is unbalanced (high strength, high cross-language distance), but volatility is low — the worktree convention rarely changes. By the balance rule, low volatility neutralizes it: tolerable technical debt, not urgent.
  • +
+ +

Recommended Improvement

+

Hoist _agents_home() into utils.py, beside find_git_root, and have both command modules import it — one definition, genuine high cohesion instead of duplication. Collapse run/shell the same way, into one shared helper parameterised by target name. Both are pure DRY fixes at low distance, with no trade-off.

+

Leave the Python-vs-Makefile AGENTS_HOME duplication as-is. Its low volatility makes the imbalance acceptable, and unifying it would mean either the CLI parsing the Makefile or the Makefile shelling into Python — both add more coupling than they remove. This is the model's pragmatism in action: not every imbalance is worth fixing.

+
+ +
+

Issue: Two inconsistent command-registration patterns

+

Integration: main.pycommands/*
+Severity: Minor

+ +

Knowledge Leakage

+

main.py mounts agents and pi_agents as Typer sub-apps via add_typer(agents.app, …), but registers build, network, and run by cherry-picking individual functions: app.command("build")(build.build). Meanwhile build.py, network.py, and run.py each still create a typer.Typer() app (e.g. build.py:9) and decorate their functions with @app.command() — yet that app object is never mounted anywhere. main.py carries an inconsistent model of what a "command module" is: sometimes a sub-app, sometimes a bag of functions.

+ +

Complexity Impact

+

This is a low cohesion signal: the app object in build.py/network.py/run.py is dead code that actively misleads. A reader who has seen agents.app mounted reasonably assumes build.app is mounted too, and must trace main.py to discover it is not. The cost is small and contained to one short file, but every new command module forces an unstated choice between two patterns — accidental complexity with no functional purpose.

+ +

Cascading Changes

+

Low. main.py and commands/* are at minimal distance — one package — and registration changes are infrequent. There is no volatile cascade here; the harm is purely cognitive friction, which is why this is Minor.

+ +

Recommended Improvement

+

Pick one pattern and apply it everywhere. The simplest: every command module exposes a Typer app, and main.py only ever calls add_typer. If flat top-level commands like q build must be preserved (rather than q build build), then drop the unused typer.Typer() from build.py, network.py, and run.py so their functions are honestly just functions, registered directly. Either way the rule "a command module looks like X" becomes true everywhere, and the dead app objects disappear. The only cost is a one-time edit; the gain is one fewer decision per future command module.

+
+ +
+ +
+ + + + diff --git a/docs/modularity-review/2026-05-21/modularity-review.md b/docs/modularity-review/2026-05-21/modularity-review.md new file mode 100644 index 0000000..d80b520 --- /dev/null +++ b/docs/modularity-review/2026-05-21/modularity-review.md @@ -0,0 +1,128 @@ +# Modularity Review + +**Scope**: The `q` Python CLI (`app/cli/`) and its integration boundary with `config/Makefile` +**Date**: 2026-05-21 + +## Executive Summary + +stackai orchestrates parallel Claude Code agents in sandboxed Apple Containers; the `q` CLI is a typed Typer front end that wraps the `config/Makefile` orchestration targets. In the small the CLI is healthy — `utils.run_make` cleanly encapsulates the *mechanism* of invoking `make`, and every module is short and single-purpose. The [modularity](https://coupling.dev/posts/core-concepts/modularity/) weakness is concentrated at one boundary: the CLI's knowledge of *which* Makefile targets exist and *what variables they accept* is [functionally coupled](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) and **implicit** — encoded as bare string literals scattered across five command modules with no contract to make the dependency explicit or fail loudly. Because the Makefile is the [core subdomain](https://coupling.dev/posts/dimensions-of-coupling/volatility/) of the product and evolves continuously, this is the most important finding: the CLI silently breaks at runtime when a target is renamed. A second, self-inflicted issue compounds it — the `status` command reaches *past* the Makefile into the container's private `.agent/status.json` file, reproducing an existing `status-agent` target with [intrusive coupling](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/). + +Overall status: **needs attention**. There is no critical distributed-monolith failure, but two unbalanced-and-volatile integrations will cause avoidable pain as the Makefile evolves, and two smaller cohesion defects add cognitive friction. + +## Coupling Overview + +| Integration | [Strength](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) | [Distance](https://coupling.dev/posts/dimensions-of-coupling/distance/) | [Volatility](https://coupling.dev/posts/dimensions-of-coupling/volatility/) | [Balanced?](https://coupling.dev/posts/core-concepts/balance/) | +| --- | --- | --- | --- | --- | +| `commands/*` → `config/Makefile` (target & variable names) | [Functional](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/), implicit | High — separate language, separate directory, subprocess | High — orchestration is the core subdomain, targets evolve | **No** — unbalanced & volatile | +| `agents.status` / `pi.status` → container `.agent/status.json` | [Intrusive](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) | High — bash entrypoint, runs inside the container | Medium–High — monitoring actively developed | **No** — unbalanced & volatile | +| `agents._agents_home` ≡ `pi_agents._agents_home` | [Functional](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/), duplicated | Low — sibling modules, one package | Low–Medium | **No** — duplication mistaken for cohesion | +| `_agents_home()` fallback ≡ Makefile `AGENTS_HOME` | [Functional](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/), duplicated | High — cross-language | Low | Tolerable — low volatility neutralizes | +| `main.py` → `commands/*` (registration) | [Model](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/), inconsistent | Low — one package | Low | Borderline — low cohesion | +| `check_token()` ≡ Makefile `_check-token` | [Functional](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/), duplicated | High — cross-language | Low | Yes — low volatility neutralizes | + +The last row is included deliberately as a contrast: it *is* duplicated, unbalanced coupling, but [volatility](https://coupling.dev/posts/dimensions-of-coupling/volatility/) is low enough that the [balance rule](https://coupling.dev/posts/core-concepts/balance/) — `(STRENGTH XOR DISTANCE) OR NOT VOLATILE` — is satisfied. Not every imbalance is a defect; only the volatile ones below are. + +## Issue: Makefile interface is an implicit, scattered contract + +**Integration**: `container_cli/commands/*` → `config/Makefile` +**Severity**: Significant + +### Knowledge Leakage + +Every command function knows, as hard-coded strings, two things about the Makefile: the **name of a target** (`"spawn"`, `"list-agents"`, `"spawn-pi"`, `"summary-agent"`, …) and the **names of the variables that target accepts** (`BRANCH`, `TASK`, `CPUS`, `MEMORY`, `IMAGE`, `DOCKERFILE`, `SUBNET`, `NETWORK`, `PI_IMAGE`, `PI_DOCKERFILE`, `PI_BASE_URL`, `PI_MODEL_ID`, `NAME`). That is the Makefile's *functional requirements* — what it does and how it must be invoked — replicated inside Python. The shared knowledge is **implicit**: there is no enum, no schema, no generated mapping. The only place the correspondence is written down is a prose table in `docs/agents/cli.md`, which nothing keeps honest. `run_make` centralizes the *mechanism* (`make -C config KEY=VALUE`) but not the *vocabulary*, so each of the five `commands/*` modules independently owns a fragment of an undeclared contract. + +This is [functional coupling](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) — the components must change together when a target's interface changes — and it is implicit functional coupling, which the model singles out as particularly dangerous: there is no compiler, type checker, or test that links `agents.py:33`'s `"BRANCH"` to the Makefile's `BRANCH ?=`. + +### Complexity Impact + +A developer changing a Makefile target name or variable cannot see, *from the Makefile*, that Python depends on it — the dependency is invisible at the point of change. To safely rename one target the developer must simultaneously hold in mind: the Makefile target, the `run_make` call site in `commands/*`, the `cli.md` mapping table, and the acceptance test. That is knowledge spread across four artifacts in two languages for what should be a trivial rename. It exceeds the 4±1 working-memory budget, which is precisely what turns a change from [modular](https://coupling.dev/posts/core-concepts/modularity/) (predictable outcome) into [complex](https://coupling.dev/posts/core-concepts/complexity/) (outcome discovered only by running it). + +### Cascading Changes + +- **Renaming `list-agents` → `agents-list` in the Makefile**: the Python literal `"list-agents"` still type-checks, still passes `ruff`, still imports cleanly. The break surfaces only when a user runs `q agents list` and `make` reports `No rule to make target`. The acceptance tests mock `run_make`, so they assert *which string was passed*, not that the target *exists* — there is no test-time signal either. +- **Adding a variable to an existing target** (e.g. a `--no-cache` option for `build`): requires coordinated edits in `build.py`, the Makefile, and `cli.md`, with nothing enforcing that the three agree. +- The coupling is genuinely [tight](https://coupling.dev/posts/core-concepts/balance/): high [integration strength](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) (functional) across high [distance](https://coupling.dev/posts/dimensions-of-coupling/distance/) — separate language, separate top-level directory, subprocess boundary. The Makefile is a [core subdomain](https://coupling.dev/posts/dimensions-of-coupling/volatility/): the orchestration logic *is* the product, and git history shows targets added continuously (PI agents, `apply-sensors`). High strength + high distance + high [volatility](https://coupling.dev/posts/dimensions-of-coupling/volatility/) fails the balance rule. + +Why **Significant** and not Critical: the same person maintains both sides, so [socio-technical distance](https://coupling.dev/posts/dimensions-of-coupling/distance/) is low — there is no cross-team coordination cost — and Makefile change is mostly *additive* (new targets), so existing literals rarely break. The danger is real but not yet frequent. + +### Recommended Improvement + +Introduce a single [contract](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) module — e.g. `container_cli/makefile.py` — as the *only* place target names and their permitted variables are written down. One small structure per target (target name + allowed variable keys) converts the implicit functional coupling into explicit [contract coupling](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) localized in one file; command modules then reference `Targets.SPAWN` instead of the literal `"spawn"`. Optionally add one test that runs `make -C config -n ` (dry-run) for every target in the registry, so a renamed or removed target fails CI instead of a user's terminal. + +This deliberately does **not** reduce distance — the CLI must wrap the Makefile, that is its purpose, and decomposing further would only raise distance. It reduces *strength* by encapsulating the contract, and it concentrates the volatile knowledge so a Makefile change has exactly one Python counterpart to update. Trade-off: a thin indirection layer plus the discipline of keeping the registry current — cheap relative to silent runtime breakage in the product's most-changed area. + +## Issue: `status` command bypasses the Makefile and reads the container's private file + +**Integration**: `agents.status` / `pi_agents.status` → container worktree `.agent/status.json` +**Severity**: Significant + +### Knowledge Leakage + +`agents.py:78` and `pi_agents.py:128` build the path `$AGENTS_HOME//.agent/status.json` and read it directly off disk. `.agent/` is internal bookkeeping created by the container entrypoint (`config/entrypoint.sh` / `entrypoint-pi.sh`) for its own use; the CLI treats that private layout as if it were a public interface. This is [intrusive coupling](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) — the highest [integration strength](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) — because the CLI depends on an implementation detail of a different component. + +The leak is twofold: the *directory layout* (`.agent/status.json`) and even the *fallback message text* (`[status] No status file found…`, `[status] Expected at: …`) are reproduced in Python. They are reproduced because the Makefile **already has** a `status-agent` target (`Makefile:179`) that performs the identical read and prints near-identical messages. So this is also [functional coupling](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) through duplication — the knowledge is copied, not shared. + +### Complexity Impact + +There are now two implementations of "show agent status" — `status-agent` in Make and `status()` in Python — and nothing keeps them consistent. `q agents summary` routes through `make summary-agent`, but `q agents status` does *not* route through `make status-agent`; a reader cannot predict from the CLI surface which commands delegate and which reimplement. The Python file read looks like ordinary I/O, not an integration point, so it is easy to miss when reasoning about what depends on the entrypoint's layout — the integration is hidden, which is the hallmark of [complexity](https://coupling.dev/posts/core-concepts/complexity/). + +### Cascading Changes + +- **The entrypoint relocates `status.json`** (say to `.agent/state/status.json`): `status-agent`, `status-pi-agent`, `agents.status`, and `pi_agents.status` all break — four sites, two languages — and the two Python ones give no signal until invoked. +- **The "no status file" UX is reworded in the Makefile only**: `q … status` silently keeps the old wording; the two surfaces drift apart. +- `status.json` is part of actively-developed structured monitoring (recent commits #7 and #9), so this sits in a [volatile](https://coupling.dev/posts/dimensions-of-coupling/volatility/) area — the duplication *will* be exercised. Intrusive strength + high distance + volatility is unbalanced and not neutralized. + +### Recommended Improvement + +Route `status` through the target that already exists: `run_make("status-agent", {"BRANCH": branch})` — exactly as `summary` already does. This removes the intrusive coupling *and* the duplication in one move: the Makefile becomes the single component that knows the `.agent/` layout, and the CLI holds only a target name (governed by the contract module proposed in the first issue). The documented rationale for the direct read — "works after the container has exited" — is *already* satisfied by `status-agent`, which is a pure file read with no live-container dependency, so no capability is lost. + +Trade-off: the Python `status` currently raises `typer.Exit(1)` when the file is missing, whereas the Makefile target prints a message and exits `0`. If a non-zero exit code is contractually required, add an explicit `exit 1` to the missing-file branch of `status-agent` rather than reimplementing the read in Python. The cost is one line of Make; the gain is eliminating an intrusive, duplicated integration in a volatile area. + +## Issue: Worktree-path logic duplicated across command modules + +**Integration**: `agents._agents_home` ≡ `pi_agents._agents_home` (and both ≡ Makefile `AGENTS_HOME`) +**Severity**: Minor + +### Knowledge Leakage + +`_agents_home()` is byte-for-byte identical in `agents.py:15` and `pi_agents.py:25`: "use `$AGENTS_HOME` if set, else `/../.worktrees`". The same rule is the Makefile's `AGENTS_HOME ?= $(dirname GIT_ROOT)/.worktrees` (`Makefile:54`). One business rule — where agent worktrees live — implemented three times. This is [functional coupling](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) through duplicated knowledge. The `run`/`shell` pair in `run.py` is a smaller instance of the same pattern: two functions with identical bodies that differ only in the target string passed to `run_make`. + +### Complexity Impact + +The two Python copies sit at low [distance](https://coupling.dev/posts/dimensions-of-coupling/distance/) — sibling modules in one package. High strength at low distance would normally read as [high cohesion](https://coupling.dev/posts/core-concepts/balance/), which is modular. But this is duplication, not cohesion: the copies are not one deliberate unit, they are two units that *secretly* share a rule. The `STRENGTH XOR DISTANCE` formula says "balanced" only if you mistake the duplication for cohesion. The real cost is the hidden obligation every developer carries — to remember that editing one copy means editing the twin. + +### Cascading Changes + +- **Changing the fallback** (e.g. `.worktrees` → `.agent-worktrees`) requires editing both Python copies; missing one makes `q agents …` and `q pi …` resolve *different* worktree roots — a confusing, hard-to-spot bug. +- The Python ↔ Makefile copy is unbalanced (high strength, high cross-language distance), but [volatility](https://coupling.dev/posts/dimensions-of-coupling/volatility/) is low — the worktree convention rarely changes. By the [balance rule](https://coupling.dev/posts/core-concepts/balance/), low volatility neutralizes it: tolerable [technical debt](https://coupling.dev/posts/core-concepts/balance/), not urgent. + +### Recommended Improvement + +Hoist `_agents_home()` into `utils.py`, beside `find_git_root`, and have both command modules import it — one definition, genuine [high cohesion](https://coupling.dev/posts/core-concepts/balance/) instead of duplication. Collapse `run`/`shell` the same way, into one shared helper parameterised by target name. Both are pure DRY fixes at low distance, with no trade-off. + +Leave the Python-vs-Makefile `AGENTS_HOME` duplication as-is. Its low volatility makes the imbalance acceptable, and unifying it would mean either the CLI parsing the Makefile or the Makefile shelling into Python — both add more coupling than they remove. This is the model's pragmatism in action: not every imbalance is worth fixing. + +## Issue: Two inconsistent command-registration patterns + +**Integration**: `main.py` → `commands/*` +**Severity**: Minor + +### Knowledge Leakage + +`main.py` mounts `agents` and `pi_agents` as Typer sub-apps via `add_typer(agents.app, …)`, but registers `build`, `network`, and `run` by cherry-picking individual functions: `app.command("build")(build.build)`. Meanwhile `build.py`, `network.py`, and `run.py` each still create a `typer.Typer()` `app` (e.g. `build.py:9`) and decorate their functions with `@app.command()` — yet that `app` object is never mounted anywhere. `main.py` carries an inconsistent [model](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) of what a "command module" is: sometimes a sub-app, sometimes a bag of functions. + +### Complexity Impact + +This is a [low cohesion](https://coupling.dev/posts/core-concepts/balance/) signal: the `app` object in `build.py`/`network.py`/`run.py` is dead code that actively misleads. A reader who has seen `agents.app` mounted reasonably assumes `build.app` is mounted too, and must trace `main.py` to discover it is not. The cost is small and contained to one short file, but every new command module forces an unstated choice between two patterns — accidental complexity with no functional purpose. + +### Cascading Changes + +Low. `main.py` and `commands/*` are at minimal [distance](https://coupling.dev/posts/dimensions-of-coupling/distance/) — one package — and registration changes are infrequent. There is no volatile cascade here; the harm is purely cognitive friction, which is why this is Minor. + +### Recommended Improvement + +Pick one pattern and apply it everywhere. The simplest: every command module exposes a Typer `app`, and `main.py` only ever calls `add_typer`. If flat top-level commands like `q build` must be preserved (rather than `q build build`), then drop the unused `typer.Typer()` from `build.py`, `network.py`, and `run.py` so their functions are honestly just functions, registered directly. Either way the rule "a command module looks like *X*" becomes true everywhere, and the dead `app` objects disappear. The only cost is a one-time edit; the gain is one fewer decision per future command module. + +--- + +_This analysis was performed using the [Balanced Coupling](https://coupling.dev) model by [Vlad Khononov](https://vladikk.com)._