From c533b49141a022a8c7518066ac52d62b1ddd2943 Mon Sep 17 00:00:00 2001
From: deimagjas <deimagjas@127.0.0.1>
Date: Thu, 21 May 2026 20:33:10 -0500
Subject: [PATCH 1/9] test(cli): add opt-in end-to-end orchestrator<->agent
 tests
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add real-container E2E tests that validate the full round-trip between the
orchestrator (the q CLI) and agents running in Apple containers:

- test_claude_agent_e2e.py: spawn a Claude agent, then assert status.json,
  the persisted log, the [agent:status] markers and the worktree commit all
  reach the orchestrator.
- test_pi_agent_e2e.py: the same round-trip for a PI agent (local mlx_lm
  backend), asserting the report is tagged agent_kind=pi.

Tests are gated by STACKAI_E2E=1 via collect_ignore_glob, so the default
pytest run and the mutmut suite never collect them — they require Apple
Container CLI and are local-only, like the acceptance suite. Adds the
`e2e` marker and a `make e2e-test` target (excluded from CI and local-qa).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 app/cli/Makefile                           |   7 +
 app/cli/pyproject.toml                     |   3 +
 app/cli/tests/e2e/__init__.py              |   0
 app/cli/tests/e2e/conftest.py              | 171 +++++++++++++++++++++
 app/cli/tests/e2e/test_claude_agent_e2e.py |  68 ++++++++
 app/cli/tests/e2e/test_pi_agent_e2e.py     |  70 +++++++++
 6 files changed, 319 insertions(+)
 create mode 100644 app/cli/tests/e2e/__init__.py
 create mode 100644 app/cli/tests/e2e/conftest.py
 create mode 100644 app/cli/tests/e2e/test_claude_agent_e2e.py
 create mode 100644 app/cli/tests/e2e/test_pi_agent_e2e.py

diff --git a/app/cli/Makefile b/app/cli/Makefile
index b6568a9..a0fc9ee 100644
--- a/app/cli/Makefile
+++ b/app/cli/Makefile
@@ -42,6 +42,13 @@ test-all:
 eval-skills:
 	claude -p "/skill-creator:skill-creator run evals for the spawn-agent skill at ~/.claude/skills/spawn-agent/"
 
+# End-to-end container tests — opt-in, local-only (needs Apple Container CLI).
+# Spawns real agent containers; the Claude round-trip spends Claude credits.
+# Excluded from CI and from `local-qa` by design.
+.PHONY: e2e-test
+e2e-test:
+	STACKAI_E2E=1 uv run pytest tests/e2e -v -m e2e
+
 # Pre-PR quality gate
 local-qa: acceptance-test eval-skills
 
diff --git a/app/cli/pyproject.toml b/app/cli/pyproject.toml
index 310893f..f29f2d1 100644
--- a/app/cli/pyproject.toml
+++ b/app/cli/pyproject.toml
@@ -25,6 +25,9 @@ dev = [
 [tool.pytest.ini_options]
 testpaths = ["tests", "tests/acceptance"]
 python_files = ["test_*.py", "*_steps.py"]
+markers = [
+    "e2e: real-container end-to-end tests (opt-in via STACKAI_E2E=1, local-only)",
+]
 
 [tool.mutmut]
 paths_to_mutate = ["src/container_cli/"]
diff --git a/app/cli/tests/e2e/__init__.py b/app/cli/tests/e2e/__init__.py
new file mode 100644
index 0000000..e69de29
diff --git a/app/cli/tests/e2e/conftest.py b/app/cli/tests/e2e/conftest.py
new file mode 100644
index 0000000..66ec2d8
--- /dev/null
+++ b/app/cli/tests/e2e/conftest.py
@@ -0,0 +1,171 @@
+"""Shared fixtures and collection gating for the opt-in end-to-end tests.
+
+These tests spawn real Apple Container CLI containers, so they are:
+
+* **local-only** — GitHub Actions has no Apple Container CLI;
+* **opt-in** — collected only when ``STACKAI_E2E=1`` is exported;
+* **slow and billable** — the Claude round-trip spends real Claude credits.
+
+Run them with ``make e2e-test`` (from ``app/cli/``), which exports the gate
+variable. Without it, pytest never imports the test modules, so the default
+``uv run pytest`` and the mutmut suite ignore them entirely.
+"""
+
+from __future__ import annotations
+
+import json
+import os
+import shutil
+import subprocess
+import sys
+import time
+import urllib.error
+import urllib.request
+from collections.abc import Callable, Iterator
+from pathlib import Path
+
+import pytest
+
+# Collection gate: without STACKAI_E2E=1, pytest skips collecting these modules.
+collect_ignore_glob = [] if os.environ.get("STACKAI_E2E") == "1" else ["test_*.py"]
+
+# Terminal phases written by config/entrypoint*.sh into .agent/status.json.
+_TERMINAL_PHASES = frozenset({"completed", "errored"})
+
+
+def _run_q(*args: str, timeout: float = 600) -> subprocess.CompletedProcess[str]:
+    """Invoke the real q CLI in a subprocess, inheriting the current environment."""
+    return subprocess.run(
+        [sys.executable, "-m", "container_cli.main", *args],
+        capture_output=True,
+        text=True,
+        timeout=timeout,
+        check=False,
+    )
+
+
+def _image_present(image: str) -> bool | None:
+    """Report whether a container image exists locally.
+
+    Returns:
+        True or False when the Apple Container CLI answers, None when the
+        command is unavailable so the caller should not skip on that basis.
+    """
+    result = subprocess.run(
+        ["container", "image", "list"],
+        capture_output=True,
+        text=True,
+        check=False,
+    )
+    if result.returncode != 0:
+        return None
+    return image.split(":")[0] in result.stdout
+
+
+@pytest.fixture()
+def run_q() -> Callable[..., subprocess.CompletedProcess[str]]:
+    """Return a helper that runs the q CLI and captures its output."""
+    return _run_q
+
+
+@pytest.fixture()
+def require_container_cli() -> None:
+    """Skip the test unless the Apple `container` CLI is on PATH."""
+    if shutil.which("container") is None:
+        pytest.skip("Apple Container CLI ('container') not found — E2E needs macOS 26+ ARM64")
+
+
+@pytest.fixture()
+def require_claude_token() -> None:
+    """Skip the test unless a Claude container OAuth token is exported."""
+    if not os.environ.get("CLAUDE_CONTAINER_OAUTH_TOKEN"):
+        pytest.skip("CLAUDE_CONTAINER_OAUTH_TOKEN not set — required to spawn a Claude agent")
+
+
+@pytest.fixture()
+def require_claude_image() -> None:
+    """Skip the test unless the claude-agent:wolfi image is built."""
+    if _image_present("claude-agent:wolfi") is False:
+        pytest.skip("image claude-agent:wolfi not built — run `q build` first")
+
+
+@pytest.fixture()
+def require_pi_image() -> None:
+    """Skip the test unless the claude-pi:ubuntu image is built."""
+    if _image_present("claude-pi:ubuntu") is False:
+        pytest.skip("image claude-pi:ubuntu not built — run `q pi build` first")
+
+
+@pytest.fixture()
+def require_mlx_server() -> None:
+    """Skip the test unless the local mlx_lm.server is reachable on the host."""
+    url = os.environ.get("STACKAI_E2E_MLX_URL", "http://localhost:8080/v1/models")
+    try:
+        with urllib.request.urlopen(url, timeout=5) as resp:
+            if resp.status != 200:
+                pytest.skip(f"mlx_lm.server at {url} returned HTTP {resp.status}")
+    except (urllib.error.URLError, OSError) as exc:
+        pytest.skip(f"mlx_lm.server unreachable at {url} ({exc}) — run `uv run iac server start`")
+
+
+@pytest.fixture()
+def isolated_agents_home(tmp_path: Path, monkeypatch: pytest.MonkeyPatch) -> Path:
+    """Point AGENTS_HOME at a throwaway directory for the duration of the test."""
+    home = tmp_path / "worktrees"
+    home.mkdir()
+    monkeypatch.setenv("AGENTS_HOME", str(home))
+    return home
+
+
+@pytest.fixture()
+def unique_branch() -> str:
+    """A collision-free, flat branch name (no slashes — see docs/agents/cli.md)."""
+    return f"e2e-{int(time.time())}-{os.getpid()}"
+
+
+@pytest.fixture()
+def wait_for_terminal_phase() -> Callable[..., dict]:
+    """Return a poller that blocks until an agent writes a terminal status.json."""
+
+    def _wait(agents_home: Path, branch: str, timeout: float = 600) -> dict:
+        status_file = agents_home / branch / ".agent" / "status.json"
+        deadline = time.monotonic() + timeout
+        last_phase = "<no status.json>"
+        while time.monotonic() < deadline:
+            if status_file.exists():
+                try:
+                    data = json.loads(status_file.read_text())
+                except json.JSONDecodeError:
+                    data = {}
+                last_phase = data.get("phase", "<unparsed>")
+                if last_phase in _TERMINAL_PHASES:
+                    return data
+            time.sleep(5)
+        raise AssertionError(
+            f"agent '{branch}' did not reach a terminal phase within {timeout}s "
+            f"(last phase seen: {last_phase})"
+        )
+
+    return _wait
+
+
+@pytest.fixture()
+def agent_cleanup(isolated_agents_home: Path) -> Iterator[Callable[..., None]]:
+    """Register (branch, kind) pairs to be stopped and pruned after the test."""
+    registered: list[tuple[str, str]] = []
+
+    def _register(branch: str, kind: str = "claude") -> None:
+        registered.append((branch, kind))
+
+    yield _register
+
+    for branch, kind in registered:
+        stop = ["pi", "stop"] if kind == "pi" else ["agents", "stop"]
+        _run_q(*stop, "--branch", branch, timeout=120)
+        worktree = isolated_agents_home / branch
+        for cmd in (
+            ["git", "worktree", "remove", "--force", str(worktree)],
+            ["git", "worktree", "prune"],
+            ["git", "branch", "-D", branch],
+        ):
+            subprocess.run(cmd, capture_output=True, text=True, check=False)
diff --git a/app/cli/tests/e2e/test_claude_agent_e2e.py b/app/cli/tests/e2e/test_claude_agent_e2e.py
new file mode 100644
index 0000000..1eaecdc
--- /dev/null
+++ b/app/cli/tests/e2e/test_claude_agent_e2e.py
@@ -0,0 +1,68 @@
+"""End-to-end test: orchestrator <-> Claude agent communication.
+
+Spawns a real Claude agent container through the ``q`` CLI, waits for it to
+finish, and asserts the orchestrator receives the agent's report through every
+channel: ``status.json``, the persisted log, the ``[agent:status]`` markers,
+and the git commit on the worktree branch.
+
+Opt-in and local-only — see ``tests/e2e/conftest.py``.
+"""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+
+_TASK = (
+    "Create a file named E2E_OK.txt in the current directory containing "
+    "exactly the text PASS, then stage and commit it with git."
+)
+
+
+@pytest.mark.e2e
+def test_claude_agent_round_trip(
+    require_container_cli,
+    require_claude_token,
+    require_claude_image,
+    isolated_agents_home,
+    unique_branch,
+    run_q,
+    wait_for_terminal_phase,
+    agent_cleanup,
+):
+    """A spawned Claude agent does the task and reports back to the orchestrator."""
+    branch = unique_branch
+
+    # Outbound — the orchestrator ensures the network exists, then dispatches.
+    network = run_q("network")
+    assert network.returncode == 0, network.stderr
+
+    spawn = run_q("spawn", "--branch", branch, "--task", _TASK)
+    agent_cleanup(branch, "claude")
+    assert spawn.returncode == 0, f"spawn failed:\n{spawn.stdout}\n{spawn.stderr}"
+
+    # Inbound channel 1 — status.json reaches a terminal phase.
+    status = wait_for_terminal_phase(isolated_agents_home, branch)
+    assert status["phase"] == "completed", f"agent did not complete: {status}"
+    assert status["exit_code"] == 0, f"agent exited non-zero: {status}"
+    assert status["commits"] >= 1, f"agent reported no commit: {status}"
+
+    # Outbound result — the task produced real work in the shared worktree.
+    artifact = isolated_agents_home / branch / "E2E_OK.txt"
+    assert artifact.is_file(), f"agent did not create {artifact}"
+    assert artifact.read_text().strip() == "PASS"
+
+    # Inbound channel 2 — `q agents status` surfaces the report to the orchestrator.
+    cli_status = run_q("agents", "status", "--branch", branch)
+    assert cli_status.returncode == 0, cli_status.stderr
+    assert json.loads(cli_status.stdout)["phase"] == "completed"
+
+    # Inbound channel 3 — the persisted log and the [agent:status] markers.
+    logs = run_q("agents", "logs", "--branch", branch)
+    assert logs.returncode == 0, logs.stderr
+    assert logs.stdout.strip(), "agent log is empty"
+
+    summary = run_q("agents", "summary", "--branch", branch)
+    assert summary.returncode == 0, summary.stderr
+    assert "PHASE=completed" in summary.stdout, summary.stdout
diff --git a/app/cli/tests/e2e/test_pi_agent_e2e.py b/app/cli/tests/e2e/test_pi_agent_e2e.py
new file mode 100644
index 0000000..32ceaa3
--- /dev/null
+++ b/app/cli/tests/e2e/test_pi_agent_e2e.py
@@ -0,0 +1,70 @@
+"""End-to-end test: orchestrator <-> PI agent communication.
+
+Spawns a real PI agent container (local ``mlx_lm.server`` backend) through the
+``q pi`` CLI and asserts the orchestrator receives the agent's report. PI
+agents need no Claude token — they authenticate against the local model — but
+they do need the model server running on the host.
+
+This test validates the *communication* round-trip, not model quality: a
+terminal ``status.json`` carrying ``agent_kind="pi"`` plus a captured log proves
+the task was delivered and the report came back.
+
+``MAX_PI_AGENTS`` enforcement is intentionally not E2E-tested here — it would
+require a racy spawn-while-running attempt; it is covered by the Makefile logic
+and the mocked acceptance suite.
+
+Opt-in and local-only — see ``tests/e2e/conftest.py``.
+"""
+
+from __future__ import annotations
+
+import json
+
+import pytest
+
+_TASK = (
+    "Create a file named E2E_PI_OK.txt in the current directory containing exactly the text PASS."
+)
+
+
+@pytest.mark.e2e
+def test_pi_agent_round_trip(
+    require_container_cli,
+    require_pi_image,
+    require_mlx_server,
+    isolated_agents_home,
+    unique_branch,
+    run_q,
+    wait_for_terminal_phase,
+    agent_cleanup,
+):
+    """A spawned PI agent runs against the local model and reports back."""
+    branch = unique_branch
+
+    # Outbound — dispatch the task to a PI container (no Claude token needed).
+    network = run_q("network")
+    assert network.returncode == 0, network.stderr
+
+    spawn = run_q("pi", "spawn", "--branch", branch, "--task", _TASK)
+    agent_cleanup(branch, "pi")
+    assert spawn.returncode == 0, f"pi spawn failed:\n{spawn.stdout}\n{spawn.stderr}"
+
+    # Inbound channel 1 — status.json reaches a terminal phase, tagged as PI.
+    status = wait_for_terminal_phase(isolated_agents_home, branch)
+    assert status["phase"] == "completed", f"PI agent did not complete: {status}"
+    assert status["agent_kind"] == "pi", f"status not tagged as PI: {status}"
+
+    # Inbound channel 2 — `q pi status` surfaces the PI report to the orchestrator.
+    cli_status = run_q("pi", "status", "--branch", branch)
+    assert cli_status.returncode == 0, cli_status.stderr
+    assert json.loads(cli_status.stdout)["agent_kind"] == "pi"
+
+    # Inbound channel 3 — `q pi list` filters this worktree in by agent_kind=pi.
+    pi_list = run_q("pi", "list")
+    assert pi_list.returncode == 0, pi_list.stderr
+    assert branch in pi_list.stdout, f"branch {branch} not listed by `q pi list`"
+
+    # Inbound channel 4 — the persisted log is captured.
+    logs = run_q("pi", "logs", "--branch", branch)
+    assert logs.returncode == 0, logs.stderr
+    assert logs.stdout.strip(), "PI agent log is empty"

From 783b049c8b8f565f7e6aeb45cd9478c144c0b326 Mon Sep 17 00:00:00 2001
From: deimagjas <deimagjas@127.0.0.1>
Date: Thu, 21 May 2026 20:37:42 -0500
Subject: [PATCH 2/9] refactor(cli): centralise Makefile target names in a
 contract module
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The CLI's knowledge of which config/Makefile targets exist was scattered
across five command modules as bare string literals, with nothing to catch
a renamed target until a user hit the failure at runtime.

Add src/container_cli/targets.py — a Target StrEnum that is the single
source of truth for the 19 targets the CLI invokes. The command modules
now reference Target.SPAWN etc.; run_make is typed against Target. Because
StrEnum members equal their string value, every existing assertion stays
green unchanged.

tests/test_targets.py adds a contract test that parses config/Makefile and
fails if any Target member is missing — a rename is now caught in CI.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 app/cli/src/container_cli/commands/agents.py  | 13 +++---
 app/cli/src/container_cli/commands/build.py   |  9 ++--
 app/cli/src/container_cli/commands/network.py |  3 +-
 .../src/container_cli/commands/pi_agents.py   | 13 +++---
 app/cli/src/container_cli/commands/run.py     |  5 ++-
 app/cli/src/container_cli/targets.py          | 45 +++++++++++++++++++
 app/cli/src/container_cli/utils.py            |  9 +++-
 app/cli/tests/test_targets.py                 | 32 +++++++++++++
 8 files changed, 108 insertions(+), 21 deletions(-)
 create mode 100644 app/cli/src/container_cli/targets.py
 create mode 100644 app/cli/tests/test_targets.py

diff --git a/app/cli/src/container_cli/commands/agents.py b/app/cli/src/container_cli/commands/agents.py
index 30ef3fb..0b1f706 100644
--- a/app/cli/src/container_cli/commands/agents.py
+++ b/app/cli/src/container_cli/commands/agents.py
@@ -7,6 +7,7 @@
 
 import typer
 
+from container_cli.targets import Target
 from container_cli.utils import check_token, find_git_root, run_make
 
 app = typer.Typer(help="Agent lifecycle commands")
@@ -37,13 +38,13 @@ def spawn(
         make_vars["MEMORY"] = memory
     if image:
         make_vars["IMAGE"] = image
-    run_make("spawn", make_vars)
+    run_make(Target.SPAWN, make_vars)
 
 
 @app.command(name="list")
 def list_agents() -> None:
     """List active agent containers and worktrees."""
-    run_make("list-agents")
+    run_make(Target.LIST_AGENTS)
 
 
 @app.command()
@@ -51,7 +52,7 @@ def logs(
     branch: Annotated[str, typer.Option("--branch", help="Agent branch name")],
 ) -> None:
     """Show logs for a branch agent."""
-    run_make("logs-agent", {"BRANCH": branch})
+    run_make(Target.LOGS_AGENT, {"BRANCH": branch})
 
 
 @app.command()
@@ -59,7 +60,7 @@ def follow(
     branch: Annotated[str, typer.Option("--branch", help="Agent branch name")],
 ) -> None:
     """Follow live streaming logs for a branch agent."""
-    run_make("follow-agent", {"BRANCH": branch}, tty=True)
+    run_make(Target.FOLLOW_AGENT, {"BRANCH": branch}, tty=True)
 
 
 @app.command()
@@ -67,7 +68,7 @@ def stop(
     branch: Annotated[str, typer.Option("--branch", help="Agent branch name")],
 ) -> None:
     """Stop a branch agent container."""
-    run_make("stop-agent", {"BRANCH": branch})
+    run_make(Target.STOP_AGENT, {"BRANCH": branch})
 
 
 @app.command()
@@ -90,4 +91,4 @@ def summary(
     branch: Annotated[str, typer.Option("--branch", help="Agent branch name")],
 ) -> None:
     """Show structured lifecycle events for a branch agent."""
-    run_make("summary-agent", {"BRANCH": branch})
+    run_make(Target.SUMMARY_AGENT, {"BRANCH": branch})
diff --git a/app/cli/src/container_cli/commands/build.py b/app/cli/src/container_cli/commands/build.py
index ed72b03..d1bab21 100644
--- a/app/cli/src/container_cli/commands/build.py
+++ b/app/cli/src/container_cli/commands/build.py
@@ -4,6 +4,7 @@
 
 import typer
 
+from container_cli.targets import Target
 from container_cli.utils import run_make
 
 app = typer.Typer(help="Image build commands")
@@ -20,22 +21,22 @@ def build(
         make_vars["IMAGE"] = image
     if dockerfile:
         make_vars["DOCKERFILE"] = dockerfile
-    run_make("build", make_vars)
+    run_make(Target.BUILD, make_vars)
 
 
 @app.command()
 def clean() -> None:
     """Remove the container image."""
-    run_make("clean")
+    run_make(Target.CLEAN)
 
 
 @app.command(name="clean-network")
 def clean_network() -> None:
     """Remove the bridge network."""
-    run_make("clean-network")
+    run_make(Target.CLEAN_NETWORK)
 
 
 @app.command(name="clean-all")
 def clean_all() -> None:
     """Remove image and network."""
-    run_make("clean-all")
+    run_make(Target.CLEAN_ALL)
diff --git a/app/cli/src/container_cli/commands/network.py b/app/cli/src/container_cli/commands/network.py
index 75f08dc..714b60f 100644
--- a/app/cli/src/container_cli/commands/network.py
+++ b/app/cli/src/container_cli/commands/network.py
@@ -4,6 +4,7 @@
 
 import typer
 
+from container_cli.targets import Target
 from container_cli.utils import run_make
 
 app = typer.Typer(help="Network management commands")
@@ -20,4 +21,4 @@ def network(
         make_vars["SUBNET"] = subnet
     if network_name:
         make_vars["NETWORK"] = network_name
-    run_make("network", make_vars)
+    run_make(Target.NETWORK, make_vars)
diff --git a/app/cli/src/container_cli/commands/pi_agents.py b/app/cli/src/container_cli/commands/pi_agents.py
index 0241acb..f624dfa 100644
--- a/app/cli/src/container_cli/commands/pi_agents.py
+++ b/app/cli/src/container_cli/commands/pi_agents.py
@@ -17,6 +17,7 @@
 
 import typer
 
+from container_cli.targets import Target
 from container_cli.utils import find_git_root, run_make
 
 app = typer.Typer(help="PI agent lifecycle (local mlx_lm.server backend)")
@@ -43,7 +44,7 @@ def build(
         make_vars["PI_IMAGE"] = image
     if dockerfile:
         make_vars["PI_DOCKERFILE"] = dockerfile
-    run_make("build-pi", make_vars)
+    run_make(Target.BUILD_PI, make_vars)
 
 
 @app.command()
@@ -87,13 +88,13 @@ def spawn(
         make_vars["PI_BASE_URL"] = base_url
     if model_id:
         make_vars["PI_MODEL_ID"] = model_id
-    run_make("spawn-pi", make_vars)
+    run_make(Target.SPAWN_PI, make_vars)
 
 
 @app.command(name="list")
 def list_agents() -> None:
     """List active PI agent containers and PI worktrees."""
-    run_make("list-pi-agents")
+    run_make(Target.LIST_PI_AGENTS)
 
 
 @app.command()
@@ -101,7 +102,7 @@ def logs(
     branch: Annotated[str, typer.Option("--branch", help="PI agent branch name")],
 ) -> None:
     """Show logs for a PI agent (live container or persisted log)."""
-    run_make("logs-pi-agent", {"BRANCH": branch})
+    run_make(Target.LOGS_PI_AGENT, {"BRANCH": branch})
 
 
 @app.command()
@@ -109,7 +110,7 @@ def follow(
     branch: Annotated[str, typer.Option("--branch", help="PI agent branch name")],
 ) -> None:
     """Follow live streaming logs for a PI agent."""
-    run_make("follow-pi-agent", {"BRANCH": branch}, tty=True)
+    run_make(Target.FOLLOW_PI_AGENT, {"BRANCH": branch}, tty=True)
 
 
 @app.command()
@@ -117,7 +118,7 @@ def stop(
     branch: Annotated[str, typer.Option("--branch", help="PI agent branch name")],
 ) -> None:
     """Stop a PI agent container."""
-    run_make("stop-pi-agent", {"BRANCH": branch})
+    run_make(Target.STOP_PI_AGENT, {"BRANCH": branch})
 
 
 @app.command()
diff --git a/app/cli/src/container_cli/commands/run.py b/app/cli/src/container_cli/commands/run.py
index 1f17828..12dc3ba 100644
--- a/app/cli/src/container_cli/commands/run.py
+++ b/app/cli/src/container_cli/commands/run.py
@@ -4,6 +4,7 @@
 
 import typer
 
+from container_cli.targets import Target
 from container_cli.utils import check_token, run_make
 
 app = typer.Typer(help="Run coordinator container")
@@ -24,7 +25,7 @@ def run(
         make_vars["MEMORY"] = memory
     if name:
         make_vars["NAME"] = name
-    run_make("run", make_vars, tty=True)
+    run_make(Target.RUN, make_vars, tty=True)
 
 
 @app.command()
@@ -42,4 +43,4 @@ def shell(
         make_vars["MEMORY"] = memory
     if name:
         make_vars["NAME"] = name
-    run_make("shell", make_vars, tty=True)
+    run_make(Target.SHELL, make_vars, tty=True)
diff --git a/app/cli/src/container_cli/targets.py b/app/cli/src/container_cli/targets.py
new file mode 100644
index 0000000..69e8def
--- /dev/null
+++ b/app/cli/src/container_cli/targets.py
@@ -0,0 +1,45 @@
+"""Single source of truth for the config/Makefile targets the q CLI invokes.
+
+Centralising the target names here turns the implicit, scattered functional
+coupling between the CLI and the orchestration Makefile into one explicit
+contract: a renamed target now has a single Python counterpart to update, and
+``tests/test_targets.py`` verifies every member against the real Makefile.
+
+``Target`` is a ``StrEnum``, so each member compares equal to its string value
+and can be passed anywhere a plain target string is expected.
+"""
+
+from enum import StrEnum
+
+
+class Target(StrEnum):
+    """Names of the config/Makefile targets invoked through ``utils.run_make``."""
+
+    # Image build and cleanup.
+    BUILD = "build"
+    CLEAN = "clean"
+    CLEAN_NETWORK = "clean-network"
+    CLEAN_ALL = "clean-all"
+
+    # Bridge network.
+    NETWORK = "network"
+
+    # Interactive coordinator container.
+    RUN = "run"
+    SHELL = "shell"
+
+    # Claude agent lifecycle.
+    SPAWN = "spawn"
+    LIST_AGENTS = "list-agents"
+    LOGS_AGENT = "logs-agent"
+    FOLLOW_AGENT = "follow-agent"
+    STOP_AGENT = "stop-agent"
+    SUMMARY_AGENT = "summary-agent"
+
+    # PI agent lifecycle (local mlx_lm backend).
+    BUILD_PI = "build-pi"
+    SPAWN_PI = "spawn-pi"
+    LIST_PI_AGENTS = "list-pi-agents"
+    LOGS_PI_AGENT = "logs-pi-agent"
+    FOLLOW_PI_AGENT = "follow-pi-agent"
+    STOP_PI_AGENT = "stop-pi-agent"
diff --git a/app/cli/src/container_cli/utils.py b/app/cli/src/container_cli/utils.py
index c22fdde..8fd77b4 100644
--- a/app/cli/src/container_cli/utils.py
+++ b/app/cli/src/container_cli/utils.py
@@ -6,6 +6,8 @@
 
 import typer
 
+from container_cli.targets import Target
+
 
 def find_git_root() -> Path:
     """Return the absolute path of the repository root.
@@ -50,11 +52,14 @@ def check_token() -> None:
         raise typer.Exit(1)
 
 
-def run_make(target: str, extra_vars: dict[str, str] | None = None, *, tty: bool = False) -> None:
+def run_make(
+    target: Target, extra_vars: dict[str, str] | None = None, *, tty: bool = False
+) -> None:
     """Invoke a Makefile target inside the project's `config/` directory.
 
     Args:
-        target: The Make target name to execute.
+        target: The Makefile target to execute (a `Target` member, which is a
+            `StrEnum` and therefore usable directly as the make argument).
         extra_vars: Optional mapping of variables passed as `KEY=VALUE` to make.
         tty: If True, replace the current process with `make` so it inherits the TTY
             (used by interactive commands like `run` and `follow`).
diff --git a/app/cli/tests/test_targets.py b/app/cli/tests/test_targets.py
new file mode 100644
index 0000000..5818e9e
--- /dev/null
+++ b/app/cli/tests/test_targets.py
@@ -0,0 +1,32 @@
+from __future__ import annotations
+
+import re
+from pathlib import Path
+
+from container_cli.targets import Target
+
+_REPO_ROOT = Path(__file__).resolve().parents[3]
+_MAKEFILE = _REPO_ROOT / "config" / "Makefile"
+_TARGET_DEF = re.compile(r"^([a-zA-Z0-9_-]+):", re.MULTILINE)
+
+
+class TestTargetValues:
+    def test_every_member_value_is_kebab_case(self):
+        for target in Target:
+            assert re.fullmatch(r"[a-z][a-z-]*", target.value), target
+
+    def test_member_equals_its_string_value(self):
+        assert Target.SPAWN == "spawn"
+        assert Target.LIST_AGENTS == "list-agents"
+        assert Target.SPAWN_PI == "spawn-pi"
+
+    def test_no_duplicate_values(self):
+        values = [t.value for t in Target]
+        assert len(values) == len(set(values))
+
+
+class TestMakefileContract:
+    def test_every_target_exists_in_the_makefile(self):
+        defined = set(_TARGET_DEF.findall(_MAKEFILE.read_text()))
+        missing = {t.value for t in Target} - defined
+        assert not missing, f"Target members absent from config/Makefile: {sorted(missing)}"

From fe58b191dac7f6417d6b99319f77b2b4500c5e1b Mon Sep 17 00:00:00 2001
From: deimagjas <deimagjas@127.0.0.1>
Date: Thu, 21 May 2026 20:41:51 -0500
Subject: [PATCH 3/9] refactor(cli): consolidate worktree and status logic in
 utils.py
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

_agents_home() was duplicated byte-for-byte in agents.py and pi_agents.py,
and the status command bodies were near-identical copies that also knew the
private .agent/status.json layout. run/shell shared an identical body too.

- Add utils.agents_home() — the single definition of the worktree-root
  rule (was duplicated 2x in Python).
- Add utils.print_agent_status(branch, label=...) — the single reader of
  .agent/status.json; agents.status and pi.status now delegate to it,
  keeping their [status] / [pi-status] message tags via the label arg.
- Extract run._coordinator() so run and shell stop duplicating their body.

Behaviour is unchanged: status still reads the file directly and still
exits 1 when it is missing, so no .feature file changes. The acceptance
conftest now patches find_git_root in utils (one patch instead of two),
and the moved TestAgentsHome tests live in test_utils.py alongside new
coverage for print_agent_status.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 app/cli/src/container_cli/commands/agents.py  | 22 +-------
 .../src/container_cli/commands/pi_agents.py   | 21 +------
 app/cli/src/container_cli/commands/run.py     | 31 +++++------
 app/cli/src/container_cli/utils.py            | 36 +++++++++++-
 app/cli/tests/acceptance/conftest.py          |  3 +-
 app/cli/tests/test_agents.py                  | 17 +-----
 app/cli/tests/test_pi_agents.py               | 19 -------
 app/cli/tests/test_utils.py                   | 55 ++++++++++++++++++-
 8 files changed, 109 insertions(+), 95 deletions(-)

diff --git a/app/cli/src/container_cli/commands/agents.py b/app/cli/src/container_cli/commands/agents.py
index 0b1f706..9c1c3d5 100644
--- a/app/cli/src/container_cli/commands/agents.py
+++ b/app/cli/src/container_cli/commands/agents.py
@@ -1,26 +1,15 @@
 """Agent lifecycle commands (spawn, list, logs, follow, stop, status, summary)."""
 
-import json
-import os
-from pathlib import Path
 from typing import Annotated
 
 import typer
 
 from container_cli.targets import Target
-from container_cli.utils import check_token, find_git_root, run_make
+from container_cli.utils import check_token, print_agent_status, run_make
 
 app = typer.Typer(help="Agent lifecycle commands")
 
 
-def _agents_home() -> Path:
-    """Resolve AGENTS_HOME, falling back to sibling .worktrees/ directory."""
-    env_val = os.environ.get("AGENTS_HOME")
-    if env_val:
-        return Path(env_val)
-    return find_git_root().parent / ".worktrees"
-
-
 @app.command()
 def spawn(
     branch: Annotated[str, typer.Option("--branch", help="Git branch for the agent worktree")],
@@ -76,14 +65,7 @@ def status(
     branch: Annotated[str, typer.Option("--branch", help="Agent branch name")],
 ) -> None:
     """Show agent status from persisted status.json file."""
-    status_file = _agents_home() / branch / ".agent" / "status.json"
-    if not status_file.exists():
-        typer.echo(f"[status] No status file found for branch '{branch}'.")
-        typer.echo(f"[status] Expected at: {status_file}")
-        raise typer.Exit(1)
-
-    data = json.loads(status_file.read_text())
-    typer.echo(json.dumps(data, indent=2))
+    print_agent_status(branch, label="status")
 
 
 @app.command()
diff --git a/app/cli/src/container_cli/commands/pi_agents.py b/app/cli/src/container_cli/commands/pi_agents.py
index f624dfa..09bb976 100644
--- a/app/cli/src/container_cli/commands/pi_agents.py
+++ b/app/cli/src/container_cli/commands/pi_agents.py
@@ -10,27 +10,16 @@
 
 from __future__ import annotations
 
-import json
-import os
-from pathlib import Path
 from typing import Annotated
 
 import typer
 
 from container_cli.targets import Target
-from container_cli.utils import find_git_root, run_make
+from container_cli.utils import print_agent_status, run_make
 
 app = typer.Typer(help="PI agent lifecycle (local mlx_lm.server backend)")
 
 
-def _agents_home() -> Path:
-    """Resolve AGENTS_HOME, falling back to sibling .worktrees/ directory."""
-    env_val = os.environ.get("AGENTS_HOME")
-    if env_val:
-        return Path(env_val)
-    return find_git_root().parent / ".worktrees"
-
-
 @app.command()
 def build(
     image: Annotated[str | None, typer.Option("--image", help="PI image tag")] = None,
@@ -126,10 +115,4 @@ def status(
     branch: Annotated[str, typer.Option("--branch", help="PI agent branch name")],
 ) -> None:
     """Show PI agent status from persisted status.json file."""
-    status_file = _agents_home() / branch / ".agent" / "status.json"
-    if not status_file.exists():
-        typer.echo(f"[pi-status] No status file found for branch '{branch}'.")
-        typer.echo(f"[pi-status] Expected at: {status_file}")
-        raise typer.Exit(1)
-    data = json.loads(status_file.read_text())
-    typer.echo(json.dumps(data, indent=2))
+    print_agent_status(branch, label="pi-status")
diff --git a/app/cli/src/container_cli/commands/run.py b/app/cli/src/container_cli/commands/run.py
index 12dc3ba..6d853b8 100644
--- a/app/cli/src/container_cli/commands/run.py
+++ b/app/cli/src/container_cli/commands/run.py
@@ -10,13 +10,8 @@
 app = typer.Typer(help="Run coordinator container")
 
 
-@app.command()
-def run(
-    cpus: Annotated[int | None, typer.Option("--cpus", help="CPU count")] = None,
-    memory: Annotated[str | None, typer.Option("--memory", help="Memory limit (e.g. 12G)")] = None,
-    name: Annotated[str | None, typer.Option("--name", help="Container name")] = None,
-) -> None:
-    """Run an interactive coordinator shell (hands off TTY)."""
+def _coordinator(target: Target, cpus: int | None, memory: str | None, name: str | None) -> None:
+    """Validate the token and launch the coordinator container via `target`."""
     check_token()
     make_vars: dict[str, str] = {}
     if cpus is not None:
@@ -25,7 +20,17 @@ def run(
         make_vars["MEMORY"] = memory
     if name:
         make_vars["NAME"] = name
-    run_make(Target.RUN, make_vars, tty=True)
+    run_make(target, make_vars, tty=True)
+
+
+@app.command()
+def run(
+    cpus: Annotated[int | None, typer.Option("--cpus", help="CPU count")] = None,
+    memory: Annotated[str | None, typer.Option("--memory", help="Memory limit (e.g. 12G)")] = None,
+    name: Annotated[str | None, typer.Option("--name", help="Container name")] = None,
+) -> None:
+    """Run an interactive coordinator shell (hands off TTY)."""
+    _coordinator(Target.RUN, cpus, memory, name)
 
 
 @app.command()
@@ -35,12 +40,4 @@ def shell(
     name: Annotated[str | None, typer.Option("--name", help="Container name")] = None,
 ) -> None:
     """Alias for run — open an interactive shell in the coordinator container."""
-    check_token()
-    make_vars: dict[str, str] = {}
-    if cpus is not None:
-        make_vars["CPUS"] = str(cpus)
-    if memory:
-        make_vars["MEMORY"] = memory
-    if name:
-        make_vars["NAME"] = name
-    run_make(Target.SHELL, make_vars, tty=True)
+    _coordinator(Target.SHELL, cpus, memory, name)
diff --git a/app/cli/src/container_cli/utils.py b/app/cli/src/container_cli/utils.py
index 8fd77b4..b004724 100644
--- a/app/cli/src/container_cli/utils.py
+++ b/app/cli/src/container_cli/utils.py
@@ -1,5 +1,6 @@
-"""Shared helpers: git root resolution, Makefile runner, and token validation."""
+"""Shared helpers: git/worktree paths, the Makefile runner, token and status I/O."""
 
+import json
 import os
 import subprocess
 from pathlib import Path
@@ -37,6 +38,19 @@ def makefile_dir() -> Path:
     return find_git_root() / "config"
 
 
+def agents_home() -> Path:
+    """Return the directory that holds agent worktrees.
+
+    Returns:
+        The `AGENTS_HOME` environment variable when set, otherwise a sibling
+        `.worktrees/` directory next to the repository root.
+    """
+    env_val = os.environ.get("AGENTS_HOME")
+    if env_val:
+        return Path(env_val)
+    return find_git_root().parent / ".worktrees"
+
+
 def check_token() -> None:
     """Verify that the Claude container OAuth token is exported.
 
@@ -76,3 +90,23 @@ def run_make(
         result = subprocess.run(cmd)
         if result.returncode != 0:
             raise typer.Exit(result.returncode)
+
+
+def print_agent_status(branch: str, *, label: str) -> None:
+    """Print the persisted `status.json` for an agent worktree branch.
+
+    Args:
+        branch: The agent branch whose status should be displayed.
+        label: Tag used in the not-found messages (e.g. `status`, `pi-status`).
+
+    Raises:
+        typer.Exit: With code 1 when no status file exists for the branch.
+    """
+    status_file = agents_home() / branch / ".agent" / "status.json"
+    if not status_file.exists():
+        typer.echo(f"[{label}] No status file found for branch '{branch}'.")
+        typer.echo(f"[{label}] Expected at: {status_file}")
+        raise typer.Exit(1)
+
+    data = json.loads(status_file.read_text())
+    typer.echo(json.dumps(data, indent=2))
diff --git a/app/cli/tests/acceptance/conftest.py b/app/cli/tests/acceptance/conftest.py
index 6d00b06..7406de6 100644
--- a/app/cli/tests/acceptance/conftest.py
+++ b/app/cli/tests/acceptance/conftest.py
@@ -42,8 +42,7 @@ def invocation_context(
         patch("container_cli.commands.run.run_make") as m_run,
         patch("container_cli.commands.network.run_make") as m_network,
         patch("container_cli.commands.pi_agents.run_make") as m_pi,
-        patch("container_cli.commands.agents.find_git_root", return_value=repo),
-        patch("container_cli.commands.pi_agents.find_git_root", return_value=repo),
+        patch("container_cli.utils.find_git_root", return_value=repo),
     ):
         ctx = InvocationContext(
             runner=CliRunner(),
diff --git a/app/cli/tests/test_agents.py b/app/cli/tests/test_agents.py
index c229235..719e9c8 100644
--- a/app/cli/tests/test_agents.py
+++ b/app/cli/tests/test_agents.py
@@ -7,7 +7,6 @@
 import typer
 
 from container_cli.commands.agents import (
-    _agents_home,
     follow,
     list_agents,
     logs,
@@ -98,22 +97,8 @@ def test_reads_status_file(self, tmp_path, monkeypatch):
     def test_agents_home_fallback(self, monkeypatch):
         monkeypatch.delenv("AGENTS_HOME", raising=False)
         with (
-            patch("container_cli.commands.agents.find_git_root", return_value=Path("/fake/repo")),
+            patch("container_cli.utils.find_git_root", return_value=Path("/fake/repo")),
             pytest.raises(typer.Exit) as exc_info,
         ):
             status(branch="nonexistent-branch")
         assert exc_info.value.exit_code == 1
-
-
-class TestAgentsHome:
-    def test_uses_env_var(self, tmp_path, monkeypatch):
-        monkeypatch.setenv("AGENTS_HOME", str(tmp_path))
-        assert _agents_home() == tmp_path
-
-    def test_fallback_path_name(self, monkeypatch):
-        monkeypatch.delenv("AGENTS_HOME", raising=False)
-        with patch(
-            "container_cli.commands.agents.find_git_root", return_value=Path("/home/user/repo")
-        ):
-            result = _agents_home()
-        assert result == Path("/home/user/.worktrees")
diff --git a/app/cli/tests/test_pi_agents.py b/app/cli/tests/test_pi_agents.py
index 5193ad4..21b10d2 100644
--- a/app/cli/tests/test_pi_agents.py
+++ b/app/cli/tests/test_pi_agents.py
@@ -6,14 +6,10 @@
 
 from __future__ import annotations
 
-from pathlib import Path
-from unittest.mock import patch
-
 import pytest
 import typer
 
 from container_cli.commands.pi_agents import (
-    _agents_home,
     build,
     follow,
     list_agents,
@@ -148,18 +144,3 @@ def test_reads_status_file(self, tmp_path, monkeypatch):
         status_file.parent.mkdir(parents=True)
         status_file.write_text('{"phase": "completed", "agent_kind": "pi"}')
         status(branch="pi/x")
-
-
-class TestAgentsHome:
-    def test_uses_env_var(self, tmp_path, monkeypatch):
-        monkeypatch.setenv("AGENTS_HOME", str(tmp_path))
-        assert _agents_home() == tmp_path
-
-    def test_fallback_path_name(self, monkeypatch):
-        monkeypatch.delenv("AGENTS_HOME", raising=False)
-        with patch(
-            "container_cli.commands.pi_agents.find_git_root",
-            return_value=Path("/home/user/repo"),
-        ):
-            result = _agents_home()
-        assert result == Path("/home/user/.worktrees")
diff --git a/app/cli/tests/test_utils.py b/app/cli/tests/test_utils.py
index 554fe8a..efa26e8 100644
--- a/app/cli/tests/test_utils.py
+++ b/app/cli/tests/test_utils.py
@@ -7,7 +7,14 @@
 import pytest
 import typer
 
-from container_cli.utils import check_token, find_git_root, makefile_dir, run_make
+from container_cli.utils import (
+    agents_home,
+    check_token,
+    find_git_root,
+    makefile_dir,
+    print_agent_status,
+    run_make,
+)
 
 # ---------- find_git_root ----------
 
@@ -132,3 +139,49 @@ def test_default_tty_uses_subprocess(self, fake_git_root: Path):
             run_make("build")
             m_sub.assert_called_once()
             m_exec.assert_not_called()
+
+
+# ---------- agents_home ----------
+
+
+class TestAgentsHome:
+    def test_uses_env_var(self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch):
+        monkeypatch.setenv("AGENTS_HOME", str(tmp_path))
+        assert agents_home() == tmp_path
+
+    def test_fallback_path_name(self, monkeypatch: pytest.MonkeyPatch):
+        monkeypatch.delenv("AGENTS_HOME", raising=False)
+        with patch("container_cli.utils.find_git_root", return_value=Path("/home/user/repo")):
+            assert agents_home() == Path("/home/user/.worktrees")
+
+
+# ---------- print_agent_status ----------
+
+
+class TestPrintAgentStatus:
+    def test_exits_when_status_file_missing(
+        self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys
+    ):
+        monkeypatch.setenv("AGENTS_HOME", str(tmp_path))
+        with pytest.raises(typer.Exit) as exc_info:
+            print_agent_status("feat-x", label="status")
+        assert exc_info.value.exit_code == 1
+        assert "[status] No status file found" in capsys.readouterr().out
+
+    def test_prints_json_when_status_file_exists(
+        self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys
+    ):
+        monkeypatch.setenv("AGENTS_HOME", str(tmp_path))
+        status_file = tmp_path / "feat-x" / ".agent" / "status.json"
+        status_file.parent.mkdir(parents=True)
+        status_file.write_text('{"phase": "completed"}')
+        print_agent_status("feat-x", label="status")
+        assert "completed" in capsys.readouterr().out
+
+    def test_label_tags_the_not_found_message(
+        self, tmp_path: Path, monkeypatch: pytest.MonkeyPatch, capsys
+    ):
+        monkeypatch.setenv("AGENTS_HOME", str(tmp_path))
+        with pytest.raises(typer.Exit):
+            print_agent_status("feat-x", label="pi-status")
+        assert "[pi-status]" in capsys.readouterr().out

From 94c7c37be9941d9718e2b1a70e7e33f45ad80c29 Mon Sep 17 00:00:00 2001
From: deimagjas <deimagjas@127.0.0.1>
Date: Thu, 21 May 2026 20:44:26 -0500
Subject: [PATCH 4/9] refactor(cli): drop the dead Typer apps from
 build/network/run
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

build.py, network.py and run.py each created a typer.Typer() app and
decorated their functions with @app.command(), but main.py never mounted
those apps — it registers the bare functions directly. The app objects
were dead code that misled readers into expecting a mounted sub-app.

Remove the unused typer.Typer() and @app.command() decorators from the
three modules; their functions are now plain `def`s, registered by
main.py exactly as before. agents.py and pi_agents.py keep their Typer
app because they genuinely are mounted as sub-apps — that becomes the one
consistent pattern. The q CLI surface is unchanged.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 app/cli/src/container_cli/commands/build.py   | 11 ++++-------
 app/cli/src/container_cli/commands/network.py |  8 ++++----
 app/cli/src/container_cli/commands/run.py     |  9 ++++-----
 3 files changed, 12 insertions(+), 16 deletions(-)

diff --git a/app/cli/src/container_cli/commands/build.py b/app/cli/src/container_cli/commands/build.py
index d1bab21..3d3bb89 100644
--- a/app/cli/src/container_cli/commands/build.py
+++ b/app/cli/src/container_cli/commands/build.py
@@ -1,4 +1,7 @@
-"""Image build and cleanup commands."""
+"""Image build and cleanup commands.
+
+These functions are registered as top-level commands by `container_cli.main`.
+"""
 
 from typing import Annotated
 
@@ -7,10 +10,7 @@
 from container_cli.targets import Target
 from container_cli.utils import run_make
 
-app = typer.Typer(help="Image build commands")
-
 
-@app.command()
 def build(
     image: Annotated[str | None, typer.Option("--image", help="Image tag")] = None,
     dockerfile: Annotated[str | None, typer.Option("--dockerfile", help="Dockerfile path")] = None,
@@ -24,19 +24,16 @@ def build(
     run_make(Target.BUILD, make_vars)
 
 
-@app.command()
 def clean() -> None:
     """Remove the container image."""
     run_make(Target.CLEAN)
 
 
-@app.command(name="clean-network")
 def clean_network() -> None:
     """Remove the bridge network."""
     run_make(Target.CLEAN_NETWORK)
 
 
-@app.command(name="clean-all")
 def clean_all() -> None:
     """Remove image and network."""
     run_make(Target.CLEAN_ALL)
diff --git a/app/cli/src/container_cli/commands/network.py b/app/cli/src/container_cli/commands/network.py
index 714b60f..a39374f 100644
--- a/app/cli/src/container_cli/commands/network.py
+++ b/app/cli/src/container_cli/commands/network.py
@@ -1,4 +1,7 @@
-"""Bridge network management commands."""
+"""Bridge network management command.
+
+Registered as a top-level command by `container_cli.main`.
+"""
 
 from typing import Annotated
 
@@ -7,10 +10,7 @@
 from container_cli.targets import Target
 from container_cli.utils import run_make
 
-app = typer.Typer(help="Network management commands")
-
 
-@app.command()
 def network(
     subnet: Annotated[str | None, typer.Option("--subnet", help="Subnet CIDR")] = None,
     network_name: Annotated[str | None, typer.Option("--network-name", help="Network name")] = None,
diff --git a/app/cli/src/container_cli/commands/run.py b/app/cli/src/container_cli/commands/run.py
index 6d853b8..5c9da95 100644
--- a/app/cli/src/container_cli/commands/run.py
+++ b/app/cli/src/container_cli/commands/run.py
@@ -1,4 +1,7 @@
-"""Interactive coordinator container (`run` and `shell` aliases)."""
+"""Interactive coordinator container (`run` and `shell` aliases).
+
+Registered as top-level commands by `container_cli.main`.
+"""
 
 from typing import Annotated
 
@@ -7,8 +10,6 @@
 from container_cli.targets import Target
 from container_cli.utils import check_token, run_make
 
-app = typer.Typer(help="Run coordinator container")
-
 
 def _coordinator(target: Target, cpus: int | None, memory: str | None, name: str | None) -> None:
     """Validate the token and launch the coordinator container via `target`."""
@@ -23,7 +24,6 @@ def _coordinator(target: Target, cpus: int | None, memory: str | None, name: str
     run_make(target, make_vars, tty=True)
 
 
-@app.command()
 def run(
     cpus: Annotated[int | None, typer.Option("--cpus", help="CPU count")] = None,
     memory: Annotated[str | None, typer.Option("--memory", help="Memory limit (e.g. 12G)")] = None,
@@ -33,7 +33,6 @@ def run(
     _coordinator(Target.RUN, cpus, memory, name)
 
 
-@app.command()
 def shell(
     cpus: Annotated[int | None, typer.Option("--cpus", help="CPU count")] = None,
     memory: Annotated[str | None, typer.Option("--memory", help="Memory limit (e.g. 12G)")] = None,

From 6511dec48e46ab523d286a86cc572337e3c1ceb4 Mon Sep 17 00:00:00 2001
From: deimagjas <deimagjas@127.0.0.1>
Date: Fri, 22 May 2026 07:14:23 -0500
Subject: [PATCH 5/9] fix(cli): locate config/Makefile by walking up in the
 contract test
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

test_targets.py resolved the repo root with a fixed parent index, which
pointed at the wrong directory when the test ran from the mutmut copy
under mutants/ (one extra path level). Walk up from the test file until a
config/Makefile is found instead — correct both in tests/ and mutants/tests/.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 app/cli/tests/test_targets.py | 18 ++++++++++++++++--
 1 file changed, 16 insertions(+), 2 deletions(-)

diff --git a/app/cli/tests/test_targets.py b/app/cli/tests/test_targets.py
index 5818e9e..971ffbb 100644
--- a/app/cli/tests/test_targets.py
+++ b/app/cli/tests/test_targets.py
@@ -5,11 +5,25 @@
 
 from container_cli.targets import Target
 
-_REPO_ROOT = Path(__file__).resolve().parents[3]
-_MAKEFILE = _REPO_ROOT / "config" / "Makefile"
 _TARGET_DEF = re.compile(r"^([a-zA-Z0-9_-]+):", re.MULTILINE)
 
 
+def _find_makefile() -> Path:
+    """Locate config/Makefile by walking up from this test file.
+
+    Walking up (rather than a fixed parent index) keeps the lookup correct when
+    the test runs from the mutmut copy under ``mutants/``.
+    """
+    for parent in Path(__file__).resolve().parents:
+        candidate = parent / "config" / "Makefile"
+        if candidate.is_file():
+            return candidate
+    raise FileNotFoundError("config/Makefile not found in any parent directory")
+
+
+_MAKEFILE = _find_makefile()
+
+
 class TestTargetValues:
     def test_every_member_value_is_kebab_case(self):
         for target in Target:

From 125e5b1c15ada8d964d8f5511b4da92de7055ea4 Mon Sep 17 00:00:00 2001
From: deimagjas <deimagjas@127.0.0.1>
Date: Fri, 22 May 2026 07:14:23 -0500
Subject: [PATCH 6/9] docs: document the E2E tests and refresh the CLI
 reference

- Add docs/agents/e2e-tests.md: prerequisites, how to run `make e2e-test`,
  what each round-trip test asserts, and the cost/CI caveats.
- cli.md: add targets.py to the package structure, note the Target
  contract and the tests/e2e/ suite.
- CLAUDE.md: list `make e2e-test` in the Testing section.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 CLAUDE.md                |  3 ++
 docs/agents/cli.md       | 12 +++++--
 docs/agents/e2e-tests.md | 77 ++++++++++++++++++++++++++++++++++++++++
 3 files changed, 89 insertions(+), 3 deletions(-)
 create mode 100644 docs/agents/e2e-tests.md

diff --git a/CLAUDE.md b/CLAUDE.md
index 96508f2..5c4be47 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -53,6 +53,9 @@ uv run pytest -k test_spawn         # single test by name
 # Acceptance tests only (Gherkin/BDD, local — no real containers)
 make acceptance-test
 
+# End-to-end tests (real containers, opt-in, local-only — see docs/agents/e2e-tests.md)
+make e2e-test
+
 # Mutation testing gate (≥ 70% kill rate)
 make mutation-ci-threshold
 
diff --git a/docs/agents/cli.md b/docs/agents/cli.md
index e91bc61..b5d5dc0 100644
--- a/docs/agents/cli.md
+++ b/docs/agents/cli.md
@@ -235,16 +235,22 @@ app/cli/
 ├── pyproject.toml                    ← uv project, entry point: q
 └── src/
     └── container_cli/
-        ├── main.py                   ← registers all commands + agents sub-app
-        ├── utils.py                  ← find_git_root, makefile_dir, check_token, run_make
+        ├── main.py                   ← registers all commands + agents/pi sub-apps
+        ├── targets.py                ← Target enum: the config/Makefile target contract
+        ├── utils.py                  ← git/worktree paths, run_make, check_token, print_agent_status
         └── commands/
             ├── build.py              ← build, clean, clean-network, clean-all
             ├── network.py            ← network
             ├── run.py                ← run, shell
-            ├── agents.py             ← spawn, list, logs, follow, stop
+            ├── agents.py             ← spawn, list, logs, follow, stop, status, summary
             └── pi_agents.py          ← pi build/spawn/list/logs/follow/stop/status
 ```
 
+> Target names are not free strings: every Makefile target the CLI invokes is a
+> member of the `Target` enum in `targets.py`, and `tests/test_targets.py`
+> verifies each one against `config/Makefile`. Real-container round-trip tests
+> live in `tests/e2e/` — see [e2e-tests.md](./e2e-tests.md).
+
 ---
 
 ## CLI ↔ Makefile mapping
diff --git a/docs/agents/e2e-tests.md b/docs/agents/e2e-tests.md
new file mode 100644
index 0000000..39b8302
--- /dev/null
+++ b/docs/agents/e2e-tests.md
@@ -0,0 +1,77 @@
+# End-to-end tests — orchestrator ↔ agent
+
+The E2E tests spawn **real Apple Container CLI containers** and validate the
+full round-trip between the orchestrator (the `q` CLI) and an agent running in
+a container. They complement — they do not replace — the mocked acceptance
+suite: acceptance tests are the fast contract, E2E is the real integration
+smoke.
+
+**Location:** `app/cli/tests/e2e/`
+
+## Why opt-in and local-only
+
+The tests need Apple Container CLI (macOS 26+, ARM64), which GitHub Actions
+does not have — exactly like the acceptance suite. They are also slow and
+billable. So they are gated:
+
+- Collection is guarded by `STACKAI_E2E=1`. Without it, pytest never imports
+  the test modules, so the default `uv run pytest` and the `mutmut` suite
+  ignore them entirely.
+- `make e2e-test` (from `app/cli/`) exports the variable and runs them.
+- They are excluded from CI and from `make local-qa` by design.
+
+## Prerequisites
+
+Each test skips itself, with a reason, when a prerequisite is missing.
+
+| Test | Needs |
+|---|---|
+| both | Apple Container CLI on `PATH`; the bridge network (created automatically) |
+| E2E-1 (Claude) | `CLAUDE_CONTAINER_OAUTH_TOKEN` exported; `claude-agent:wolfi` built (`q build`) |
+| E2E-2 (PI) | `claude-pi:ubuntu` built (`q pi build`); `mlx_lm.server` reachable (`uv run iac server start`) |
+
+> **Cost:** E2E-1 spends real Claude credits — it runs a live headless agent.
+> E2E-2 uses the local model, so it is free but needs the model downloaded and
+> the server running.
+
+The PI server URL defaults to `http://localhost:8080/v1/models`; override the
+reachability check with `STACKAI_E2E_MLX_URL` if your server listens elsewhere.
+
+## How to run
+
+```bash
+cd app/cli
+make e2e-test          # STACKAI_E2E=1 uv run pytest tests/e2e -v -m e2e
+```
+
+Run a single test:
+
+```bash
+STACKAI_E2E=1 uv run pytest tests/e2e/test_claude_agent_e2e.py -v
+```
+
+## What each test asserts
+
+Both tests dispatch a deterministic task, wait for the agent to reach a
+terminal phase, then assert the orchestrator receives the report through every
+channel. Each test runs in an isolated `AGENTS_HOME` (a temp directory) and
+tears down its container, worktree and branch afterwards.
+
+### E2E-1 — Claude agent (`test_claude_agent_e2e.py`)
+
+1. `q network`, then `q spawn` with a task that creates `E2E_OK.txt` and commits.
+2. `status.json` reaches `phase=completed`, `exit_code=0`, `commits>=1`.
+3. The committed file exists in the shared worktree.
+4. `q agents status`, `q agents logs` and `q agents summary` surface the
+   report (status JSON, persisted log, `[agent:status]` markers).
+
+### E2E-2 — PI agent (`test_pi_agent_e2e.py`)
+
+1. `q pi spawn` against the local `mlx_lm.server` backend (no Claude token).
+2. `status.json` reaches `phase=completed` and is tagged `agent_kind=pi`.
+3. `q pi status`, `q pi list` and `q pi logs` surface the PI report; `q pi list`
+   filters this worktree in by `agent_kind=pi`.
+
+`MAX_PI_AGENTS` enforcement is not E2E-tested — it would need a racy
+spawn-while-running attempt; it is covered by the Makefile logic and the mocked
+acceptance suite.

From 74747651e3725590c4d128ff735383aa6fff235b Mon Sep 17 00:00:00 2001
From: deimagjas <deimagjas@127.0.0.1>
Date: Fri, 22 May 2026 07:20:20 -0500
Subject: [PATCH 7/9] add: modularity review

---
 .../2026-05-21/modularity-review.html         | 473 ++++++++++++++++++
 .../2026-05-21/modularity-review.md           | 128 +++++
 2 files changed, 601 insertions(+)
 create mode 100644 docs/modularity-review/2026-05-21/modularity-review.html
 create mode 100644 docs/modularity-review/2026-05-21/modularity-review.md

diff --git a/docs/modularity-review/2026-05-21/modularity-review.html b/docs/modularity-review/2026-05-21/modularity-review.html
new file mode 100644
index 0000000..95eb179
--- /dev/null
+++ b/docs/modularity-review/2026-05-21/modularity-review.html
@@ -0,0 +1,473 @@
+<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<title>Modularity Review</title>
+<style>
+  :root {
+    --font-sans: -apple-system, BlinkMacSystemFont, 'Segoe UI', Helvetica, Arial, sans-serif;
+    --font-mono: 'SF Mono', 'Fira Code', 'Fira Mono', Menlo, Consolas, monospace;
+    --max-width: 820px;
+  }
+
+  :root[data-theme="light"] {
+    --bg: #ffffff;
+    --bg-secondary: #f6f8fa;
+    --bg-tertiary: #eef1f5;
+    --text: #1a1a1a;
+    --text-secondary: #4a5568;
+    --text-muted: #718096;
+    --border: #e2e8f0;
+    --accent: #2b6cb0;
+    --accent-light: #ebf4ff;
+    --severity-critical: #c53030;
+    --severity-critical-bg: #fff5f5;
+    --severity-significant: #c05621;
+    --severity-significant-bg: #fffaf0;
+    --severity-minor: #2b6cb0;
+    --severity-minor-bg: #ebf8ff;
+    --table-stripe: #f7fafc;
+    --link: #2b6cb0;
+    --link-hover: #1a4971;
+    --shadow: rgba(0, 0, 0, 0.06);
+  }
+
+  :root[data-theme="dark"] {
+    --bg: #1a1a1a;
+    --bg-secondary: #242424;
+    --bg-tertiary: #2e2e2e;
+    --text: #e2e2e2;
+    --text-secondary: #a0a0a0;
+    --text-muted: #717171;
+    --border: #333333;
+    --accent: #b0b0b0;
+    --accent-light: #2a2a2a;
+    --severity-critical: #fc8181;
+    --severity-critical-bg: #2d1b1b;
+    --severity-significant: #f6ad55;
+    --severity-significant-bg: #2d2415;
+    --severity-minor: #a0a0a0;
+    --severity-minor-bg: #282828;
+    --table-stripe: #222222;
+    --link: #e0c97f;
+    --link-hover: #f0dca0;
+    --shadow: rgba(0, 0, 0, 0.2);
+  }
+
+  *, *::before, *::after {
+    box-sizing: border-box;
+  }
+
+  body {
+    font-family: var(--font-sans);
+    background: var(--bg);
+    color: var(--text);
+    line-height: 1.7;
+    margin: 0;
+    padding: 2rem 1.5rem;
+    transition: background 0.3s ease, color 0.3s ease;
+  }
+
+  .container {
+    max-width: var(--max-width);
+    margin: 0 auto;
+  }
+
+  /* Theme toggle */
+  .theme-toggle {
+    position: fixed;
+    top: 1.5rem;
+    right: 1.5rem;
+    background: var(--bg-secondary);
+    border: 1px solid var(--border);
+    border-radius: 50%;
+    width: 40px;
+    height: 40px;
+    cursor: pointer;
+    display: flex;
+    align-items: center;
+    justify-content: center;
+    font-size: 1.1rem;
+    color: var(--text);
+    transition: background 0.2s ease, border-color 0.2s ease, color 0.2s ease;
+    z-index: 100;
+  }
+
+  .theme-toggle:hover {
+    background: var(--bg-tertiary);
+  }
+
+  /* Typography */
+  h1 {
+    font-size: 2rem;
+    font-weight: 700;
+    letter-spacing: -0.02em;
+    margin: 0 0 0.5rem 0;
+    color: var(--text);
+  }
+
+  h2 {
+    font-size: 1.4rem;
+    font-weight: 600;
+    letter-spacing: -0.01em;
+    margin: 2.5rem 0 1rem 0;
+    padding-bottom: 0.4rem;
+    border-bottom: 1px solid var(--border);
+    color: var(--text);
+  }
+
+  h3 {
+    font-size: 1.1rem;
+    font-weight: 600;
+    margin: 1.8rem 0 0.6rem 0;
+    color: var(--text);
+  }
+
+  p {
+    margin: 0 0 1rem 0;
+    color: var(--text-secondary);
+  }
+
+  a {
+    color: var(--link);
+    text-decoration: none;
+    border-bottom: 1px solid transparent;
+    transition: border-color 0.2s ease;
+  }
+
+  a:hover {
+    color: var(--link-hover);
+    border-bottom-color: var(--link-hover);
+  }
+
+  /* Metadata */
+  .meta {
+    color: var(--text-muted);
+    font-size: 0.95rem;
+    margin-bottom: 2.5rem;
+  }
+
+  .meta strong {
+    color: var(--text-secondary);
+  }
+
+  /* Tables */
+  table {
+    width: 100%;
+    border-collapse: collapse;
+    margin: 1.5rem 0;
+    font-size: 0.9rem;
+  }
+
+  th {
+    background: var(--bg-secondary);
+    font-weight: 600;
+    text-align: left;
+    padding: 0.75rem 1rem;
+    border: 1px solid var(--border);
+    color: var(--text);
+  }
+
+  td {
+    padding: 0.65rem 1rem;
+    border: 1px solid var(--border);
+    color: var(--text-secondary);
+  }
+
+  tr:nth-child(even) td {
+    background: var(--table-stripe);
+  }
+
+  /* Severity badges */
+  .severity {
+    display: inline-block;
+    font-size: 0.8rem;
+    font-weight: 600;
+    padding: 0.15rem 0.6rem;
+    border-radius: 4px;
+    text-transform: uppercase;
+    letter-spacing: 0.04em;
+  }
+
+  .severity-critical {
+    color: var(--severity-critical);
+    background: var(--severity-critical-bg);
+  }
+
+  .severity-significant {
+    color: var(--severity-significant);
+    background: var(--severity-significant-bg);
+  }
+
+  .severity-minor {
+    color: var(--severity-minor);
+    background: var(--severity-minor-bg);
+  }
+
+  /* Issue cards */
+  .issue {
+    background: var(--bg-secondary);
+    border: 1px solid var(--border);
+    border-radius: 8px;
+    padding: 1.5rem 2rem;
+    margin: 1.5rem 0;
+    box-shadow: 0 1px 3px var(--shadow);
+  }
+
+  .issue h2 {
+    margin-top: 0;
+    border-bottom: none;
+    padding-bottom: 0;
+  }
+
+  .issue h3 {
+    font-size: 1rem;
+  }
+
+  .issue-meta {
+    font-size: 0.9rem;
+    color: var(--text-muted);
+    margin-bottom: 1rem;
+  }
+
+  /* Lists */
+  ul, ol {
+    padding-left: 1.5rem;
+    color: var(--text-secondary);
+  }
+
+  li {
+    margin-bottom: 0.35rem;
+  }
+
+  /* Code */
+  code {
+    font-family: var(--font-mono);
+    font-size: 0.88em;
+    background: var(--bg-tertiary);
+    padding: 0.15rem 0.4rem;
+    border-radius: 3px;
+  }
+
+  /* Horizontal rule */
+  hr {
+    border: none;
+    border-top: 1px solid var(--border);
+    margin: 3rem 0 1.5rem 0;
+  }
+
+  /* Footer */
+  .footer {
+    margin-top: 3rem;
+    padding-top: 1.5rem;
+    border-top: 1px solid var(--border);
+    font-size: 0.9rem;
+    color: var(--text-muted);
+    font-style: italic;
+  }
+
+  .footer a {
+    color: var(--text-muted);
+    border-bottom-color: var(--border);
+  }
+
+  .footer a:hover {
+    color: var(--link);
+  }
+
+  /* Print */
+  @media print {
+    .theme-toggle { display: none; }
+    body { padding: 0; }
+    .issue { break-inside: avoid; }
+  }
+</style>
+</head>
+<body>
+<button class="theme-toggle" onclick="toggleTheme()" aria-label="Toggle theme">
+  <span id="theme-icon"></span>
+</button>
+
+<div class="container">
+  <h1>Modularity Review</h1>
+<p class="meta"><strong>Scope</strong>: The <code>q</code> Python CLI (<code>app/cli/</code>) and its integration boundary with <code>config/Makefile</code><br>
+<strong>Date</strong>: 2026-05-21</p>
+
+<h2>Executive Summary</h2>
+<p>stackai orchestrates parallel Claude Code agents in sandboxed Apple Containers; the <code>q</code> CLI is a typed Typer front end that wraps the <code>config/Makefile</code> orchestration targets. In the small the CLI is healthy &mdash; <code>utils.run_make</code> cleanly encapsulates the <em>mechanism</em> of invoking <code>make</code>, and every module is short and single-purpose. The <a href="https://coupling.dev/posts/core-concepts/modularity/">modularity</a> weakness is concentrated at one boundary: the CLI's knowledge of <em>which</em> Makefile targets exist and <em>what variables they accept</em> is <a href="https://coupling.dev/posts/dimensions-of-coupling/integration-strength/">functionally coupled</a> and <strong>implicit</strong> &mdash; encoded as bare string literals scattered across five command modules with no contract to make the dependency explicit or fail loudly. Because the Makefile is the <a href="https://coupling.dev/posts/dimensions-of-coupling/volatility/">core subdomain</a> of the product and evolves continuously, this is the most important finding: the CLI silently breaks at runtime when a target is renamed. A second, self-inflicted issue compounds it &mdash; the <code>status</code> command reaches <em>past</em> the Makefile into the container's private <code>.agent/status.json</code> file, reproducing an existing <code>status-agent</code> target with <a href="https://coupling.dev/posts/dimensions-of-coupling/integration-strength/">intrusive coupling</a>.</p>
+<p>Overall status: <strong>needs attention</strong>. There is no critical distributed-monolith failure, but two unbalanced-and-volatile integrations will cause avoidable pain as the Makefile evolves, and two smaller cohesion defects add cognitive friction.</p>
+
+<h2>Coupling Overview</h2>
+<table>
+<thead>
+<tr>
+<th>Integration</th>
+<th><a href="https://coupling.dev/posts/dimensions-of-coupling/integration-strength/">Strength</a></th>
+<th><a href="https://coupling.dev/posts/dimensions-of-coupling/distance/">Distance</a></th>
+<th><a href="https://coupling.dev/posts/dimensions-of-coupling/volatility/">Volatility</a></th>
+<th><a href="https://coupling.dev/posts/core-concepts/balance/">Balanced?</a></th>
+</tr>
+</thead>
+<tbody>
+<tr>
+<td><code>commands/*</code> &rarr; <code>config/Makefile</code> (target &amp; variable names)</td>
+<td><a href="https://coupling.dev/posts/dimensions-of-coupling/integration-strength/">Functional</a>, implicit</td>
+<td>High &mdash; separate language, separate directory, subprocess</td>
+<td>High &mdash; orchestration is the core subdomain, targets evolve</td>
+<td><strong>No</strong> &mdash; unbalanced &amp; volatile</td>
+</tr>
+<tr>
+<td><code>agents.status</code> / <code>pi.status</code> &rarr; container <code>.agent/status.json</code></td>
+<td><a href="https://coupling.dev/posts/dimensions-of-coupling/integration-strength/">Intrusive</a></td>
+<td>High &mdash; bash entrypoint, runs inside the container</td>
+<td>Medium&ndash;High &mdash; monitoring actively developed</td>
+<td><strong>No</strong> &mdash; unbalanced &amp; volatile</td>
+</tr>
+<tr>
+<td><code>agents._agents_home</code> &equiv; <code>pi_agents._agents_home</code></td>
+<td><a href="https://coupling.dev/posts/dimensions-of-coupling/integration-strength/">Functional</a>, duplicated</td>
+<td>Low &mdash; sibling modules, one package</td>
+<td>Low&ndash;Medium</td>
+<td><strong>No</strong> &mdash; duplication mistaken for cohesion</td>
+</tr>
+<tr>
+<td><code>_agents_home()</code> fallback &equiv; Makefile <code>AGENTS_HOME</code></td>
+<td><a href="https://coupling.dev/posts/dimensions-of-coupling/integration-strength/">Functional</a>, duplicated</td>
+<td>High &mdash; cross-language</td>
+<td>Low</td>
+<td>Tolerable &mdash; low volatility neutralizes</td>
+</tr>
+<tr>
+<td><code>main.py</code> &rarr; <code>commands/*</code> (registration)</td>
+<td><a href="https://coupling.dev/posts/dimensions-of-coupling/integration-strength/">Model</a>, inconsistent</td>
+<td>Low &mdash; one package</td>
+<td>Low</td>
+<td>Borderline &mdash; low cohesion</td>
+</tr>
+<tr>
+<td><code>check_token()</code> &equiv; Makefile <code>_check-token</code></td>
+<td><a href="https://coupling.dev/posts/dimensions-of-coupling/integration-strength/">Functional</a>, duplicated</td>
+<td>High &mdash; cross-language</td>
+<td>Low</td>
+<td>Yes &mdash; low volatility neutralizes</td>
+</tr>
+</tbody>
+</table>
+<p>The last row is included deliberately as a contrast: it <em>is</em> duplicated, unbalanced coupling, but <a href="https://coupling.dev/posts/dimensions-of-coupling/volatility/">volatility</a> is low enough that the <a href="https://coupling.dev/posts/core-concepts/balance/">balance rule</a> &mdash; <code>(STRENGTH XOR DISTANCE) OR NOT VOLATILE</code> &mdash; is satisfied. Not every imbalance is a defect; only the volatile ones below are.</p>
+
+<div class="issue">
+<h2>Issue: Makefile interface is an implicit, scattered contract</h2>
+<p class="issue-meta"><strong>Integration</strong>: <code>container_cli/commands/*</code> &rarr; <code>config/Makefile</code><br>
+<strong>Severity</strong>: <span class="severity severity-significant">Significant</span></p>
+
+<h3>Knowledge Leakage</h3>
+<p>Every command function knows, as hard-coded strings, two things about the Makefile: the <strong>name of a target</strong> (<code>"spawn"</code>, <code>"list-agents"</code>, <code>"spawn-pi"</code>, <code>"summary-agent"</code>, &hellip;) and the <strong>names of the variables that target accepts</strong> (<code>BRANCH</code>, <code>TASK</code>, <code>CPUS</code>, <code>MEMORY</code>, <code>IMAGE</code>, <code>DOCKERFILE</code>, <code>SUBNET</code>, <code>NETWORK</code>, <code>PI_IMAGE</code>, <code>PI_DOCKERFILE</code>, <code>PI_BASE_URL</code>, <code>PI_MODEL_ID</code>, <code>NAME</code>). That is the Makefile's <em>functional requirements</em> &mdash; what it does and how it must be invoked &mdash; replicated inside Python. The shared knowledge is <strong>implicit</strong>: there is no enum, no schema, no generated mapping. The only place the correspondence is written down is a prose table in <code>docs/agents/cli.md</code>, which nothing keeps honest. <code>run_make</code> centralizes the <em>mechanism</em> (<code>make -C config &lt;target&gt; KEY=VALUE</code>) but not the <em>vocabulary</em>, so each of the five <code>commands/*</code> modules independently owns a fragment of an undeclared contract.</p>
+<p>This is <a href="https://coupling.dev/posts/dimensions-of-coupling/integration-strength/">functional coupling</a> &mdash; the components must change together when a target's interface changes &mdash; and it is implicit functional coupling, which the model singles out as particularly dangerous: there is no compiler, type checker, or test that links <code>agents.py:33</code>'s <code>"BRANCH"</code> to the Makefile's <code>BRANCH ?=</code>.</p>
+
+<h3>Complexity Impact</h3>
+<p>A developer changing a Makefile target name or variable cannot see, <em>from the Makefile</em>, that Python depends on it &mdash; the dependency is invisible at the point of change. To safely rename one target the developer must simultaneously hold in mind: the Makefile target, the <code>run_make</code> call site in <code>commands/*</code>, the <code>cli.md</code> mapping table, and the acceptance test. That is knowledge spread across four artifacts in two languages for what should be a trivial rename. It exceeds the 4&plusmn;1 working-memory budget, which is precisely what turns a change from <a href="https://coupling.dev/posts/core-concepts/modularity/">modular</a> (predictable outcome) into <a href="https://coupling.dev/posts/core-concepts/complexity/">complex</a> (outcome discovered only by running it).</p>
+
+<h3>Cascading Changes</h3>
+<ul>
+<li><strong>Renaming <code>list-agents</code> &rarr; <code>agents-list</code> in the Makefile</strong>: the Python literal <code>"list-agents"</code> still type-checks, still passes <code>ruff</code>, still imports cleanly. The break surfaces only when a user runs <code>q agents list</code> and <code>make</code> reports <code>No rule to make target</code>. The acceptance tests mock <code>run_make</code>, so they assert <em>which string was passed</em>, not that the target <em>exists</em> &mdash; there is no test-time signal either.</li>
+<li><strong>Adding a variable to an existing target</strong> (e.g. a <code>--no-cache</code> option for <code>build</code>): requires coordinated edits in <code>build.py</code>, the Makefile, and <code>cli.md</code>, with nothing enforcing that the three agree.</li>
+<li>The coupling is genuinely <a href="https://coupling.dev/posts/core-concepts/balance/">tight</a>: high <a href="https://coupling.dev/posts/dimensions-of-coupling/integration-strength/">integration strength</a> (functional) across high <a href="https://coupling.dev/posts/dimensions-of-coupling/distance/">distance</a> &mdash; separate language, separate top-level directory, subprocess boundary. The Makefile is a <a href="https://coupling.dev/posts/dimensions-of-coupling/volatility/">core subdomain</a>: the orchestration logic <em>is</em> the product, and git history shows targets added continuously (PI agents, <code>apply-sensors</code>). High strength + high distance + high <a href="https://coupling.dev/posts/dimensions-of-coupling/volatility/">volatility</a> fails the balance rule.</li>
+</ul>
+<p>Why <strong>Significant</strong> and not Critical: the same person maintains both sides, so <a href="https://coupling.dev/posts/dimensions-of-coupling/distance/">socio-technical distance</a> is low &mdash; there is no cross-team coordination cost &mdash; and Makefile change is mostly <em>additive</em> (new targets), so existing literals rarely break. The danger is real but not yet frequent.</p>
+
+<h3>Recommended Improvement</h3>
+<p>Introduce a single <a href="https://coupling.dev/posts/dimensions-of-coupling/integration-strength/">contract</a> module &mdash; e.g. <code>container_cli/makefile.py</code> &mdash; as the <em>only</em> place target names and their permitted variables are written down. One small structure per target (target name + allowed variable keys) converts the implicit functional coupling into explicit <a href="https://coupling.dev/posts/dimensions-of-coupling/integration-strength/">contract coupling</a> localized in one file; command modules then reference <code>Targets.SPAWN</code> instead of the literal <code>"spawn"</code>. Optionally add one test that runs <code>make -C config -n &lt;target&gt;</code> (dry-run) for every target in the registry, so a renamed or removed target fails CI instead of a user's terminal.</p>
+<p>This deliberately does <strong>not</strong> reduce distance &mdash; the CLI must wrap the Makefile, that is its purpose, and decomposing further would only raise distance. It reduces <em>strength</em> by encapsulating the contract, and it concentrates the volatile knowledge so a Makefile change has exactly one Python counterpart to update. Trade-off: a thin indirection layer plus the discipline of keeping the registry current &mdash; cheap relative to silent runtime breakage in the product's most-changed area.</p>
+</div>
+
+<div class="issue">
+<h2>Issue: <code>status</code> command bypasses the Makefile and reads the container's private file</h2>
+<p class="issue-meta"><strong>Integration</strong>: <code>agents.status</code> / <code>pi_agents.status</code> &rarr; container worktree <code>.agent/status.json</code><br>
+<strong>Severity</strong>: <span class="severity severity-significant">Significant</span></p>
+
+<h3>Knowledge Leakage</h3>
+<p><code>agents.py:78</code> and <code>pi_agents.py:128</code> build the path <code>$AGENTS_HOME/&lt;branch&gt;/.agent/status.json</code> and read it directly off disk. <code>.agent/</code> is internal bookkeeping created by the container entrypoint (<code>config/entrypoint.sh</code> / <code>entrypoint-pi.sh</code>) for its own use; the CLI treats that private layout as if it were a public interface. This is <a href="https://coupling.dev/posts/dimensions-of-coupling/integration-strength/">intrusive coupling</a> &mdash; the highest <a href="https://coupling.dev/posts/dimensions-of-coupling/integration-strength/">integration strength</a> &mdash; because the CLI depends on an implementation detail of a different component.</p>
+<p>The leak is twofold: the <em>directory layout</em> (<code>.agent/status.json</code>) and even the <em>fallback message text</em> (<code>[status] No status file found&hellip;</code>, <code>[status] Expected at: &hellip;</code>) are reproduced in Python. They are reproduced because the Makefile <strong>already has</strong> a <code>status-agent</code> target (<code>Makefile:179</code>) that performs the identical read and prints near-identical messages. So this is also <a href="https://coupling.dev/posts/dimensions-of-coupling/integration-strength/">functional coupling</a> through duplication &mdash; the knowledge is copied, not shared.</p>
+
+<h3>Complexity Impact</h3>
+<p>There are now two implementations of "show agent status" &mdash; <code>status-agent</code> in Make and <code>status()</code> in Python &mdash; and nothing keeps them consistent. <code>q agents summary</code> routes through <code>make summary-agent</code>, but <code>q agents status</code> does <em>not</em> route through <code>make status-agent</code>; a reader cannot predict from the CLI surface which commands delegate and which reimplement. The Python file read looks like ordinary I/O, not an integration point, so it is easy to miss when reasoning about what depends on the entrypoint's layout &mdash; the integration is hidden, which is the hallmark of <a href="https://coupling.dev/posts/core-concepts/complexity/">complexity</a>.</p>
+
+<h3>Cascading Changes</h3>
+<ul>
+<li><strong>The entrypoint relocates <code>status.json</code></strong> (say to <code>.agent/state/status.json</code>): <code>status-agent</code>, <code>status-pi-agent</code>, <code>agents.status</code>, and <code>pi_agents.status</code> all break &mdash; four sites, two languages &mdash; and the two Python ones give no signal until invoked.</li>
+<li><strong>The "no status file" UX is reworded in the Makefile only</strong>: <code>q &hellip; status</code> silently keeps the old wording; the two surfaces drift apart.</li>
+<li><code>status.json</code> is part of actively-developed structured monitoring (recent commits #7 and #9), so this sits in a <a href="https://coupling.dev/posts/dimensions-of-coupling/volatility/">volatile</a> area &mdash; the duplication <em>will</em> be exercised. Intrusive strength + high distance + volatility is unbalanced and not neutralized.</li>
+</ul>
+
+<h3>Recommended Improvement</h3>
+<p>Route <code>status</code> through the target that already exists: <code>run_make("status-agent", {"BRANCH": branch})</code> &mdash; exactly as <code>summary</code> already does. This removes the intrusive coupling <em>and</em> the duplication in one move: the Makefile becomes the single component that knows the <code>.agent/</code> layout, and the CLI holds only a target name (governed by the contract module proposed in the first issue). The documented rationale for the direct read &mdash; "works after the container has exited" &mdash; is <em>already</em> satisfied by <code>status-agent</code>, which is a pure file read with no live-container dependency, so no capability is lost.</p>
+<p>Trade-off: the Python <code>status</code> currently raises <code>typer.Exit(1)</code> when the file is missing, whereas the Makefile target prints a message and exits <code>0</code>. If a non-zero exit code is contractually required, add an explicit <code>exit 1</code> to the missing-file branch of <code>status-agent</code> rather than reimplementing the read in Python. The cost is one line of Make; the gain is eliminating an intrusive, duplicated integration in a volatile area.</p>
+</div>
+
+<div class="issue">
+<h2>Issue: Worktree-path logic duplicated across command modules</h2>
+<p class="issue-meta"><strong>Integration</strong>: <code>agents._agents_home</code> &equiv; <code>pi_agents._agents_home</code> (and both &equiv; Makefile <code>AGENTS_HOME</code>)<br>
+<strong>Severity</strong>: <span class="severity severity-minor">Minor</span></p>
+
+<h3>Knowledge Leakage</h3>
+<p><code>_agents_home()</code> is byte-for-byte identical in <code>agents.py:15</code> and <code>pi_agents.py:25</code>: "use <code>$AGENTS_HOME</code> if set, else <code>&lt;git-root&gt;/../.worktrees</code>". The same rule is the Makefile's <code>AGENTS_HOME ?= $(dirname GIT_ROOT)/.worktrees</code> (<code>Makefile:54</code>). One business rule &mdash; where agent worktrees live &mdash; implemented three times. This is <a href="https://coupling.dev/posts/dimensions-of-coupling/integration-strength/">functional coupling</a> through duplicated knowledge. The <code>run</code>/<code>shell</code> pair in <code>run.py</code> is a smaller instance of the same pattern: two functions with identical bodies that differ only in the target string passed to <code>run_make</code>.</p>
+
+<h3>Complexity Impact</h3>
+<p>The two Python copies sit at low <a href="https://coupling.dev/posts/dimensions-of-coupling/distance/">distance</a> &mdash; sibling modules in one package. High strength at low distance would normally read as <a href="https://coupling.dev/posts/core-concepts/balance/">high cohesion</a>, which is modular. But this is duplication, not cohesion: the copies are not one deliberate unit, they are two units that <em>secretly</em> share a rule. The <code>STRENGTH XOR DISTANCE</code> formula says "balanced" only if you mistake the duplication for cohesion. The real cost is the hidden obligation every developer carries &mdash; to remember that editing one copy means editing the twin.</p>
+
+<h3>Cascading Changes</h3>
+<ul>
+<li><strong>Changing the fallback</strong> (e.g. <code>.worktrees</code> &rarr; <code>.agent-worktrees</code>) requires editing both Python copies; missing one makes <code>q agents &hellip;</code> and <code>q pi &hellip;</code> resolve <em>different</em> worktree roots &mdash; a confusing, hard-to-spot bug.</li>
+<li>The Python &harr; Makefile copy is unbalanced (high strength, high cross-language distance), but <a href="https://coupling.dev/posts/dimensions-of-coupling/volatility/">volatility</a> is low &mdash; the worktree convention rarely changes. By the <a href="https://coupling.dev/posts/core-concepts/balance/">balance rule</a>, low volatility neutralizes it: tolerable <a href="https://coupling.dev/posts/core-concepts/balance/">technical debt</a>, not urgent.</li>
+</ul>
+
+<h3>Recommended Improvement</h3>
+<p>Hoist <code>_agents_home()</code> into <code>utils.py</code>, beside <code>find_git_root</code>, and have both command modules import it &mdash; one definition, genuine <a href="https://coupling.dev/posts/core-concepts/balance/">high cohesion</a> instead of duplication. Collapse <code>run</code>/<code>shell</code> the same way, into one shared helper parameterised by target name. Both are pure DRY fixes at low distance, with no trade-off.</p>
+<p>Leave the Python-vs-Makefile <code>AGENTS_HOME</code> duplication as-is. Its low volatility makes the imbalance acceptable, and unifying it would mean either the CLI parsing the Makefile or the Makefile shelling into Python &mdash; both add more coupling than they remove. This is the model's pragmatism in action: not every imbalance is worth fixing.</p>
+</div>
+
+<div class="issue">
+<h2>Issue: Two inconsistent command-registration patterns</h2>
+<p class="issue-meta"><strong>Integration</strong>: <code>main.py</code> &rarr; <code>commands/*</code><br>
+<strong>Severity</strong>: <span class="severity severity-minor">Minor</span></p>
+
+<h3>Knowledge Leakage</h3>
+<p><code>main.py</code> mounts <code>agents</code> and <code>pi_agents</code> as Typer sub-apps via <code>add_typer(agents.app, &hellip;)</code>, but registers <code>build</code>, <code>network</code>, and <code>run</code> by cherry-picking individual functions: <code>app.command("build")(build.build)</code>. Meanwhile <code>build.py</code>, <code>network.py</code>, and <code>run.py</code> each still create a <code>typer.Typer()</code> <code>app</code> (e.g. <code>build.py:9</code>) and decorate their functions with <code>@app.command()</code> &mdash; yet that <code>app</code> object is never mounted anywhere. <code>main.py</code> carries an inconsistent <a href="https://coupling.dev/posts/dimensions-of-coupling/integration-strength/">model</a> of what a "command module" is: sometimes a sub-app, sometimes a bag of functions.</p>
+
+<h3>Complexity Impact</h3>
+<p>This is a <a href="https://coupling.dev/posts/core-concepts/balance/">low cohesion</a> signal: the <code>app</code> object in <code>build.py</code>/<code>network.py</code>/<code>run.py</code> is dead code that actively misleads. A reader who has seen <code>agents.app</code> mounted reasonably assumes <code>build.app</code> is mounted too, and must trace <code>main.py</code> to discover it is not. The cost is small and contained to one short file, but every new command module forces an unstated choice between two patterns &mdash; accidental complexity with no functional purpose.</p>
+
+<h3>Cascading Changes</h3>
+<p>Low. <code>main.py</code> and <code>commands/*</code> are at minimal <a href="https://coupling.dev/posts/dimensions-of-coupling/distance/">distance</a> &mdash; one package &mdash; and registration changes are infrequent. There is no volatile cascade here; the harm is purely cognitive friction, which is why this is Minor.</p>
+
+<h3>Recommended Improvement</h3>
+<p>Pick one pattern and apply it everywhere. The simplest: every command module exposes a Typer <code>app</code>, and <code>main.py</code> only ever calls <code>add_typer</code>. If flat top-level commands like <code>q build</code> must be preserved (rather than <code>q build build</code>), then drop the unused <code>typer.Typer()</code> from <code>build.py</code>, <code>network.py</code>, and <code>run.py</code> so their functions are honestly just functions, registered directly. Either way the rule "a command module looks like <em>X</em>" becomes true everywhere, and the dead <code>app</code> objects disappear. The only cost is a one-time edit; the gain is one fewer decision per future command module.</p>
+</div>
+
+<hr>
+<p class="footer"><em>This analysis was performed using the <a href="https://coupling.dev">Balanced Coupling</a> model by <a href="https://vladikk.com">Vlad Khononov</a>.</em></p>
+</div>
+
+<script>
+  function getDefaultTheme() {
+    const hour = new Date().getHours();
+    return (hour >= 7 && hour < 19) ? 'light' : 'dark';
+  }
+
+  function setTheme(theme) {
+    document.documentElement.setAttribute('data-theme', theme);
+    localStorage.setItem('modularity-review-theme', theme);
+    document.getElementById('theme-icon').textContent = theme === 'light' ? '☾' : '☀';
+  }
+
+  function toggleTheme() {
+    const current = document.documentElement.getAttribute('data-theme');
+    setTheme(current === 'light' ? 'dark' : 'light');
+  }
+
+  const saved = localStorage.getItem('modularity-review-theme');
+  setTheme(saved || getDefaultTheme());
+</script>
+</body>
+</html>
diff --git a/docs/modularity-review/2026-05-21/modularity-review.md b/docs/modularity-review/2026-05-21/modularity-review.md
new file mode 100644
index 0000000..d80b520
--- /dev/null
+++ b/docs/modularity-review/2026-05-21/modularity-review.md
@@ -0,0 +1,128 @@
+# Modularity Review
+
+**Scope**: The `q` Python CLI (`app/cli/`) and its integration boundary with `config/Makefile`
+**Date**: 2026-05-21
+
+## Executive Summary
+
+stackai orchestrates parallel Claude Code agents in sandboxed Apple Containers; the `q` CLI is a typed Typer front end that wraps the `config/Makefile` orchestration targets. In the small the CLI is healthy — `utils.run_make` cleanly encapsulates the *mechanism* of invoking `make`, and every module is short and single-purpose. The [modularity](https://coupling.dev/posts/core-concepts/modularity/) weakness is concentrated at one boundary: the CLI's knowledge of *which* Makefile targets exist and *what variables they accept* is [functionally coupled](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) and **implicit** — encoded as bare string literals scattered across five command modules with no contract to make the dependency explicit or fail loudly. Because the Makefile is the [core subdomain](https://coupling.dev/posts/dimensions-of-coupling/volatility/) of the product and evolves continuously, this is the most important finding: the CLI silently breaks at runtime when a target is renamed. A second, self-inflicted issue compounds it — the `status` command reaches *past* the Makefile into the container's private `.agent/status.json` file, reproducing an existing `status-agent` target with [intrusive coupling](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/).
+
+Overall status: **needs attention**. There is no critical distributed-monolith failure, but two unbalanced-and-volatile integrations will cause avoidable pain as the Makefile evolves, and two smaller cohesion defects add cognitive friction.
+
+## Coupling Overview
+
+| Integration | [Strength](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) | [Distance](https://coupling.dev/posts/dimensions-of-coupling/distance/) | [Volatility](https://coupling.dev/posts/dimensions-of-coupling/volatility/) | [Balanced?](https://coupling.dev/posts/core-concepts/balance/) |
+| --- | --- | --- | --- | --- |
+| `commands/*` → `config/Makefile` (target & variable names) | [Functional](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/), implicit | High — separate language, separate directory, subprocess | High — orchestration is the core subdomain, targets evolve | **No** — unbalanced & volatile |
+| `agents.status` / `pi.status` → container `.agent/status.json` | [Intrusive](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) | High — bash entrypoint, runs inside the container | Medium–High — monitoring actively developed | **No** — unbalanced & volatile |
+| `agents._agents_home` ≡ `pi_agents._agents_home` | [Functional](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/), duplicated | Low — sibling modules, one package | Low–Medium | **No** — duplication mistaken for cohesion |
+| `_agents_home()` fallback ≡ Makefile `AGENTS_HOME` | [Functional](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/), duplicated | High — cross-language | Low | Tolerable — low volatility neutralizes |
+| `main.py` → `commands/*` (registration) | [Model](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/), inconsistent | Low — one package | Low | Borderline — low cohesion |
+| `check_token()` ≡ Makefile `_check-token` | [Functional](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/), duplicated | High — cross-language | Low | Yes — low volatility neutralizes |
+
+The last row is included deliberately as a contrast: it *is* duplicated, unbalanced coupling, but [volatility](https://coupling.dev/posts/dimensions-of-coupling/volatility/) is low enough that the [balance rule](https://coupling.dev/posts/core-concepts/balance/) — `(STRENGTH XOR DISTANCE) OR NOT VOLATILE` — is satisfied. Not every imbalance is a defect; only the volatile ones below are.
+
+## Issue: Makefile interface is an implicit, scattered contract
+
+**Integration**: `container_cli/commands/*` → `config/Makefile`
+**Severity**: Significant
+
+### Knowledge Leakage
+
+Every command function knows, as hard-coded strings, two things about the Makefile: the **name of a target** (`"spawn"`, `"list-agents"`, `"spawn-pi"`, `"summary-agent"`, …) and the **names of the variables that target accepts** (`BRANCH`, `TASK`, `CPUS`, `MEMORY`, `IMAGE`, `DOCKERFILE`, `SUBNET`, `NETWORK`, `PI_IMAGE`, `PI_DOCKERFILE`, `PI_BASE_URL`, `PI_MODEL_ID`, `NAME`). That is the Makefile's *functional requirements* — what it does and how it must be invoked — replicated inside Python. The shared knowledge is **implicit**: there is no enum, no schema, no generated mapping. The only place the correspondence is written down is a prose table in `docs/agents/cli.md`, which nothing keeps honest. `run_make` centralizes the *mechanism* (`make -C config <target> KEY=VALUE`) but not the *vocabulary*, so each of the five `commands/*` modules independently owns a fragment of an undeclared contract.
+
+This is [functional coupling](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) — the components must change together when a target's interface changes — and it is implicit functional coupling, which the model singles out as particularly dangerous: there is no compiler, type checker, or test that links `agents.py:33`'s `"BRANCH"` to the Makefile's `BRANCH ?=`.
+
+### Complexity Impact
+
+A developer changing a Makefile target name or variable cannot see, *from the Makefile*, that Python depends on it — the dependency is invisible at the point of change. To safely rename one target the developer must simultaneously hold in mind: the Makefile target, the `run_make` call site in `commands/*`, the `cli.md` mapping table, and the acceptance test. That is knowledge spread across four artifacts in two languages for what should be a trivial rename. It exceeds the 4±1 working-memory budget, which is precisely what turns a change from [modular](https://coupling.dev/posts/core-concepts/modularity/) (predictable outcome) into [complex](https://coupling.dev/posts/core-concepts/complexity/) (outcome discovered only by running it).
+
+### Cascading Changes
+
+- **Renaming `list-agents` → `agents-list` in the Makefile**: the Python literal `"list-agents"` still type-checks, still passes `ruff`, still imports cleanly. The break surfaces only when a user runs `q agents list` and `make` reports `No rule to make target`. The acceptance tests mock `run_make`, so they assert *which string was passed*, not that the target *exists* — there is no test-time signal either.
+- **Adding a variable to an existing target** (e.g. a `--no-cache` option for `build`): requires coordinated edits in `build.py`, the Makefile, and `cli.md`, with nothing enforcing that the three agree.
+- The coupling is genuinely [tight](https://coupling.dev/posts/core-concepts/balance/): high [integration strength](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) (functional) across high [distance](https://coupling.dev/posts/dimensions-of-coupling/distance/) — separate language, separate top-level directory, subprocess boundary. The Makefile is a [core subdomain](https://coupling.dev/posts/dimensions-of-coupling/volatility/): the orchestration logic *is* the product, and git history shows targets added continuously (PI agents, `apply-sensors`). High strength + high distance + high [volatility](https://coupling.dev/posts/dimensions-of-coupling/volatility/) fails the balance rule.
+
+Why **Significant** and not Critical: the same person maintains both sides, so [socio-technical distance](https://coupling.dev/posts/dimensions-of-coupling/distance/) is low — there is no cross-team coordination cost — and Makefile change is mostly *additive* (new targets), so existing literals rarely break. The danger is real but not yet frequent.
+
+### Recommended Improvement
+
+Introduce a single [contract](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) module — e.g. `container_cli/makefile.py` — as the *only* place target names and their permitted variables are written down. One small structure per target (target name + allowed variable keys) converts the implicit functional coupling into explicit [contract coupling](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) localized in one file; command modules then reference `Targets.SPAWN` instead of the literal `"spawn"`. Optionally add one test that runs `make -C config -n <target>` (dry-run) for every target in the registry, so a renamed or removed target fails CI instead of a user's terminal.
+
+This deliberately does **not** reduce distance — the CLI must wrap the Makefile, that is its purpose, and decomposing further would only raise distance. It reduces *strength* by encapsulating the contract, and it concentrates the volatile knowledge so a Makefile change has exactly one Python counterpart to update. Trade-off: a thin indirection layer plus the discipline of keeping the registry current — cheap relative to silent runtime breakage in the product's most-changed area.
+
+## Issue: `status` command bypasses the Makefile and reads the container's private file
+
+**Integration**: `agents.status` / `pi_agents.status` → container worktree `.agent/status.json`
+**Severity**: Significant
+
+### Knowledge Leakage
+
+`agents.py:78` and `pi_agents.py:128` build the path `$AGENTS_HOME/<branch>/.agent/status.json` and read it directly off disk. `.agent/` is internal bookkeeping created by the container entrypoint (`config/entrypoint.sh` / `entrypoint-pi.sh`) for its own use; the CLI treats that private layout as if it were a public interface. This is [intrusive coupling](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) — the highest [integration strength](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) — because the CLI depends on an implementation detail of a different component.
+
+The leak is twofold: the *directory layout* (`.agent/status.json`) and even the *fallback message text* (`[status] No status file found…`, `[status] Expected at: …`) are reproduced in Python. They are reproduced because the Makefile **already has** a `status-agent` target (`Makefile:179`) that performs the identical read and prints near-identical messages. So this is also [functional coupling](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) through duplication — the knowledge is copied, not shared.
+
+### Complexity Impact
+
+There are now two implementations of "show agent status" — `status-agent` in Make and `status()` in Python — and nothing keeps them consistent. `q agents summary` routes through `make summary-agent`, but `q agents status` does *not* route through `make status-agent`; a reader cannot predict from the CLI surface which commands delegate and which reimplement. The Python file read looks like ordinary I/O, not an integration point, so it is easy to miss when reasoning about what depends on the entrypoint's layout — the integration is hidden, which is the hallmark of [complexity](https://coupling.dev/posts/core-concepts/complexity/).
+
+### Cascading Changes
+
+- **The entrypoint relocates `status.json`** (say to `.agent/state/status.json`): `status-agent`, `status-pi-agent`, `agents.status`, and `pi_agents.status` all break — four sites, two languages — and the two Python ones give no signal until invoked.
+- **The "no status file" UX is reworded in the Makefile only**: `q … status` silently keeps the old wording; the two surfaces drift apart.
+- `status.json` is part of actively-developed structured monitoring (recent commits #7 and #9), so this sits in a [volatile](https://coupling.dev/posts/dimensions-of-coupling/volatility/) area — the duplication *will* be exercised. Intrusive strength + high distance + volatility is unbalanced and not neutralized.
+
+### Recommended Improvement
+
+Route `status` through the target that already exists: `run_make("status-agent", {"BRANCH": branch})` — exactly as `summary` already does. This removes the intrusive coupling *and* the duplication in one move: the Makefile becomes the single component that knows the `.agent/` layout, and the CLI holds only a target name (governed by the contract module proposed in the first issue). The documented rationale for the direct read — "works after the container has exited" — is *already* satisfied by `status-agent`, which is a pure file read with no live-container dependency, so no capability is lost.
+
+Trade-off: the Python `status` currently raises `typer.Exit(1)` when the file is missing, whereas the Makefile target prints a message and exits `0`. If a non-zero exit code is contractually required, add an explicit `exit 1` to the missing-file branch of `status-agent` rather than reimplementing the read in Python. The cost is one line of Make; the gain is eliminating an intrusive, duplicated integration in a volatile area.
+
+## Issue: Worktree-path logic duplicated across command modules
+
+**Integration**: `agents._agents_home` ≡ `pi_agents._agents_home` (and both ≡ Makefile `AGENTS_HOME`)
+**Severity**: Minor
+
+### Knowledge Leakage
+
+`_agents_home()` is byte-for-byte identical in `agents.py:15` and `pi_agents.py:25`: "use `$AGENTS_HOME` if set, else `<git-root>/../.worktrees`". The same rule is the Makefile's `AGENTS_HOME ?= $(dirname GIT_ROOT)/.worktrees` (`Makefile:54`). One business rule — where agent worktrees live — implemented three times. This is [functional coupling](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) through duplicated knowledge. The `run`/`shell` pair in `run.py` is a smaller instance of the same pattern: two functions with identical bodies that differ only in the target string passed to `run_make`.
+
+### Complexity Impact
+
+The two Python copies sit at low [distance](https://coupling.dev/posts/dimensions-of-coupling/distance/) — sibling modules in one package. High strength at low distance would normally read as [high cohesion](https://coupling.dev/posts/core-concepts/balance/), which is modular. But this is duplication, not cohesion: the copies are not one deliberate unit, they are two units that *secretly* share a rule. The `STRENGTH XOR DISTANCE` formula says "balanced" only if you mistake the duplication for cohesion. The real cost is the hidden obligation every developer carries — to remember that editing one copy means editing the twin.
+
+### Cascading Changes
+
+- **Changing the fallback** (e.g. `.worktrees` → `.agent-worktrees`) requires editing both Python copies; missing one makes `q agents …` and `q pi …` resolve *different* worktree roots — a confusing, hard-to-spot bug.
+- The Python ↔ Makefile copy is unbalanced (high strength, high cross-language distance), but [volatility](https://coupling.dev/posts/dimensions-of-coupling/volatility/) is low — the worktree convention rarely changes. By the [balance rule](https://coupling.dev/posts/core-concepts/balance/), low volatility neutralizes it: tolerable [technical debt](https://coupling.dev/posts/core-concepts/balance/), not urgent.
+
+### Recommended Improvement
+
+Hoist `_agents_home()` into `utils.py`, beside `find_git_root`, and have both command modules import it — one definition, genuine [high cohesion](https://coupling.dev/posts/core-concepts/balance/) instead of duplication. Collapse `run`/`shell` the same way, into one shared helper parameterised by target name. Both are pure DRY fixes at low distance, with no trade-off.
+
+Leave the Python-vs-Makefile `AGENTS_HOME` duplication as-is. Its low volatility makes the imbalance acceptable, and unifying it would mean either the CLI parsing the Makefile or the Makefile shelling into Python — both add more coupling than they remove. This is the model's pragmatism in action: not every imbalance is worth fixing.
+
+## Issue: Two inconsistent command-registration patterns
+
+**Integration**: `main.py` → `commands/*`
+**Severity**: Minor
+
+### Knowledge Leakage
+
+`main.py` mounts `agents` and `pi_agents` as Typer sub-apps via `add_typer(agents.app, …)`, but registers `build`, `network`, and `run` by cherry-picking individual functions: `app.command("build")(build.build)`. Meanwhile `build.py`, `network.py`, and `run.py` each still create a `typer.Typer()` `app` (e.g. `build.py:9`) and decorate their functions with `@app.command()` — yet that `app` object is never mounted anywhere. `main.py` carries an inconsistent [model](https://coupling.dev/posts/dimensions-of-coupling/integration-strength/) of what a "command module" is: sometimes a sub-app, sometimes a bag of functions.
+
+### Complexity Impact
+
+This is a [low cohesion](https://coupling.dev/posts/core-concepts/balance/) signal: the `app` object in `build.py`/`network.py`/`run.py` is dead code that actively misleads. A reader who has seen `agents.app` mounted reasonably assumes `build.app` is mounted too, and must trace `main.py` to discover it is not. The cost is small and contained to one short file, but every new command module forces an unstated choice between two patterns — accidental complexity with no functional purpose.
+
+### Cascading Changes
+
+Low. `main.py` and `commands/*` are at minimal [distance](https://coupling.dev/posts/dimensions-of-coupling/distance/) — one package — and registration changes are infrequent. There is no volatile cascade here; the harm is purely cognitive friction, which is why this is Minor.
+
+### Recommended Improvement
+
+Pick one pattern and apply it everywhere. The simplest: every command module exposes a Typer `app`, and `main.py` only ever calls `add_typer`. If flat top-level commands like `q build` must be preserved (rather than `q build build`), then drop the unused `typer.Typer()` from `build.py`, `network.py`, and `run.py` so their functions are honestly just functions, registered directly. Either way the rule "a command module looks like *X*" becomes true everywhere, and the dead `app` objects disappear. The only cost is a one-time edit; the gain is one fewer decision per future command module.
+
+---
+
+_This analysis was performed using the [Balanced Coupling](https://coupling.dev) model by [Vlad Khononov](https://vladikk.com)._

From 1401f19bd5ffea775f7a3546236df96143ec11f6 Mon Sep 17 00:00:00 2001
From: deimagjas <deimagjas@127.0.0.1>
Date: Fri, 22 May 2026 21:24:18 -0500
Subject: [PATCH 8/9] feat(agents): add --model flag (default opus), persist
 lifecycle markers, bump bundled claude CLI

Add a --model option to `q spawn` so the orchestrator governs the headless
agent's Claude model instead of inheriting the host's interactive
~/.claude/settings.json preference (which can belong to a different account
than the container's OAuth token). The flag flows CLI -> Makefile
(MODEL ?= opus) -> -e AGENT_MODEL -> entrypoint (`claude --model "$AGENT_MODEL"`),
default opus.

Fixes a second bug that the original failure masked: emit_marker() echoed
[agent:status] events only to the container's stdout, so once the --rm
container was gone `q agents summary` fell back to agent.log and printed
"(no structured events found)". emit_marker now also appends to agent.log
(group redirect so the open error is suppressed cleanly in the shellspec
mock env), and tee -a preserves the early starting/working markers across
the claude run.

Bumps the bundled claude CLI in the Wolfi image. The previous version
(2.1.138) had a bug rejecting Opus with `400 role 'system' is not supported
on this model`. The Dockerfile now ends the install RUN with
`&& claude --version`, both as a build-time trace and as a cache-busting
tail so a plain rebuild picks up newer CLIs without forcing --no-cache.

Tests and docs:
- new acceptance scenario for `q spawn --model opus`
- unit tests for MODEL presence/absence in make_vars
- shellspec asserts default opus and AGENT_MODEL=sonnet override
- SKILL.md, docs/agents/{cli,container-agent,spawn-agent-skill}.md

Gates after rebuild to claude 2.1.149: ruff, 100 unit/acceptance, 59
shellspec, mutation 95.2%, and make e2e-test (Claude round-trip) all
green; PI e2e remains opt-in (local mlx server).

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 .claude/skills/spawn-agent/SKILL.md           | 12 ++++++++++
 app/cli/src/container_cli/commands/agents.py  |  6 +++++
 .../tests/acceptance/features/spawn.feature   |  6 +++++
 app/cli/tests/test_agents.py                  | 13 +++++++++-
 config/Dockerfile.wolfi                       | 11 +++++++--
 config/Makefile                               |  8 +++++++
 config/entrypoint.sh                          | 24 ++++++++++++++++---
 config/spec/entrypoint_spec.sh                | 16 ++++++++++++-
 docs/agents/cli.md                            |  8 +++++++
 docs/agents/container-agent.md                |  6 +++--
 docs/agents/spawn-agent-skill.md              |  2 ++
 11 files changed, 103 insertions(+), 9 deletions(-)

diff --git a/.claude/skills/spawn-agent/SKILL.md b/.claude/skills/spawn-agent/SKILL.md
index af3dd7e..ec4b3aa 100644
--- a/.claude/skills/spawn-agent/SKILL.md
+++ b/.claude/skills/spawn-agent/SKILL.md
@@ -46,8 +46,17 @@ PROJECT_NAME=$(basename "$GIT_ROOT")          # e.g. stackai
 WORKTREES_DIR="${AGENTS_HOME}"                # from env var, e.g. ~/agents
 NETWORK=claude-agent-net
 IMAGE=claude-agent:wolfi
+AGENT_MODEL=opus                              # model the headless agent runs
 ```
 
+**Agent model.** The headless agent runs whatever `AGENT_MODEL` is passed via
+`-e AGENT_MODEL`; it defaults to `opus` so the agent runs with the most capable
+model out of the box. Do **not** rely on the host's `~/.claude/settings.json`
+`model` preference — it is copied into the container but a headless agent must
+not inherit a personal interactive setting, and the container's OAuth token may
+belong to a different plan. Override per spawn when a lighter task warrants it
+(`q spawn --model sonnet`, or `-e AGENT_MODEL=sonnet` on a raw `container run`).
+
 Container names follow the pattern: `<project>-<sanitized-branch>`
 Branch sanitization — each `/`, `_`, or space becomes a single `-`, lowercased:
 ```bash
@@ -143,6 +152,7 @@ container run -d --rm \
   -v "${HOME}/.claude:/root/.claudenew:ro" \
   -v "${HOME}/.claude.json:/root/.claudenew.json:ro" \
   -e CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1 \
+  -e "AGENT_MODEL=${AGENT_MODEL:-opus}" \
   -e "CLAUDE_CODE_OAUTH_TOKEN=${CLAUDE_CONTAINER_OAUTH_TOKEN}" \
   claude-agent:wolfi \
   --worktree "${BRANCH}" --task "${TASK}"
@@ -405,6 +415,8 @@ Full docs: https://github.com/apple/container/blob/main/docs/command-reference.m
   the container exits so you can review the agent's work.
 - **CLAUDE_CONTAINER_OAUTH_TOKEN** must be set — containers authenticate with this,
   not the host Claude session.
+- **AGENT_MODEL** controls the agent's Claude model (`opus` by default). The agent
+  does not inherit the host's `settings.json` model preference — pass it explicitly.
 - The image `claude-agent:wolfi` must exist. If not: `cd <git-root>/config && make build`
 - Multiple agents run in parallel — each gets a unique container + worktree.
 - Branch names with `/` (e.g., `feat/auth`) are valid for git; sanitized for container
diff --git a/app/cli/src/container_cli/commands/agents.py b/app/cli/src/container_cli/commands/agents.py
index 9c1c3d5..c99c850 100644
--- a/app/cli/src/container_cli/commands/agents.py
+++ b/app/cli/src/container_cli/commands/agents.py
@@ -17,6 +17,10 @@ def spawn(
     cpus: Annotated[int | None, typer.Option("--cpus", help="CPU count")] = None,
     memory: Annotated[str | None, typer.Option("--memory", help="Memory limit (e.g. 12G)")] = None,
     image: Annotated[str | None, typer.Option("--image", help="Image tag")] = None,
+    model: Annotated[
+        str | None,
+        typer.Option("--model", help="Claude model the agent runs (e.g. sonnet, opus, haiku)"),
+    ] = None,
 ) -> None:
     """Spawn a detached headless agent container."""
     check_token()
@@ -27,6 +31,8 @@ def spawn(
         make_vars["MEMORY"] = memory
     if image:
         make_vars["IMAGE"] = image
+    if model:
+        make_vars["MODEL"] = model
     run_make(Target.SPAWN, make_vars)
 
 
diff --git a/app/cli/tests/acceptance/features/spawn.feature b/app/cli/tests/acceptance/features/spawn.feature
index 36d98cc..37e6cb1 100644
--- a/app/cli/tests/acceptance/features/spawn.feature
+++ b/app/cli/tests/acceptance/features/spawn.feature
@@ -24,3 +24,9 @@ Feature: Spawn agent containers
     When I run "q spawn --branch feat/bar --task work --cpus 8 --memory 16G --image custom:tag"
     Then the command exits successfully
     And the make vars include CPUS="8" and MEMORY="16G" and IMAGE="custom:tag"
+
+  Scenario: Spawn with an explicit agent model
+    Given the CLAUDE_CONTAINER_OAUTH_TOKEN is set
+    When I run "q spawn --branch feat/baz --task work --model opus"
+    Then the command exits successfully
+    And the make vars include MODEL="opus"
diff --git a/app/cli/tests/test_agents.py b/app/cli/tests/test_agents.py
index 719e9c8..b6374dd 100644
--- a/app/cli/tests/test_agents.py
+++ b/app/cli/tests/test_agents.py
@@ -42,8 +42,18 @@ def test_spawn_with_image(self, mock_run_make, mock_check_token):
         call_vars = mock_run_make["agents"].call_args[0][1]
         assert call_vars["IMAGE"] == "my-img:latest"
 
+    def test_spawn_with_model(self, mock_run_make, mock_check_token):
+        spawn(branch="b", task="t", cpus=None, memory=None, image=None, model="opus")
+        call_vars = mock_run_make["agents"].call_args[0][1]
+        assert call_vars["MODEL"] == "opus"
+
+    def test_spawn_omits_model_when_unset(self, mock_run_make, mock_check_token):
+        spawn(branch="b", task="t", cpus=None, memory=None, image=None, model=None)
+        call_vars = mock_run_make["agents"].call_args[0][1]
+        assert "MODEL" not in call_vars
+
     def test_spawn_all_optional_params(self, mock_run_make, mock_check_token):
-        spawn(branch="b", task="t", cpus=2, memory="4G", image="img")
+        spawn(branch="b", task="t", cpus=2, memory="4G", image="img", model="opus")
         call_vars = mock_run_make["agents"].call_args[0][1]
         assert call_vars == {
             "BRANCH": "b",
@@ -51,6 +61,7 @@ def test_spawn_all_optional_params(self, mock_run_make, mock_check_token):
             "CPUS": "2",
             "MEMORY": "4G",
             "IMAGE": "img",
+            "MODEL": "opus",
         }
 
 
diff --git a/config/Dockerfile.wolfi b/config/Dockerfile.wolfi
index 7a17aa1..1bed1c8 100644
--- a/config/Dockerfile.wolfi
+++ b/config/Dockerfile.wolfi
@@ -80,10 +80,17 @@ COPY --from=builder /root/.cargo/bin/dust   /usr/local/bin/dust
 COPY --from=builder /root/.cargo/bin/procs  /usr/local/bin/procs
 COPY --from=builder /root/.cargo/bin/btm    /usr/local/bin/btm
 
-# Claude Code CLI
+# Claude Code CLI — pin the install at build time. CLAUDE_CODE_DISABLE_AUTOUPDATE=1
+# in this image means the CLI never updates inside the container, so the image
+# ships exactly the version the installer returns at build time. `claude --version`
+# at the end serves two purposes: log the version into the build output and act
+# as a cache-busting tail so a `--no-cache-filter` rebuild picks up a newer CLI
+# (e.g. 2.1.148+ for the Opus `role 'system'` fix) without forcing a full
+# --no-cache that would rebuild the slow Rust builder stage.
 RUN curl -fsSL https://claude.ai/install.sh -o /tmp/claude-install.sh \
     && yes | bash /tmp/claude-install.sh \
-    && rm -f /tmp/claude-install.sh
+    && rm -f /tmp/claude-install.sh \
+    && claude --version
 
 # OpenCode
 RUN curl -fsSL https://opencode.ai/install | bash \
diff --git a/config/Makefile b/config/Makefile
index ae31d69..d56ba76 100644
--- a/config/Makefile
+++ b/config/Makefile
@@ -44,6 +44,11 @@ MEMORY     ?= 3G
 # Agent variables
 BRANCH     ?= agent-$(shell date +%s)
 TASK       ?= Explore the codebase and report what you find.
+# MODEL: Claude model the headless agent runs with. Defaults to opus — the
+# agent doesn't inherit the host's interactive `model` preference (see the
+# entrypoint); a fixed default keeps the choice in the orchestrator. Override
+# per spawn, e.g. `make spawn ... MODEL=sonnet`.
+MODEL      ?= opus
 
 # Derived paths
 GIT_ROOT      := $(shell git -C $(CURDIR) rev-parse --show-toplevel)
@@ -109,6 +114,7 @@ spawn: network _check-token
 	@echo "[spawn] Launching agent: $(PROJECT_NAME)-$(CONTAINER_BRANCH)"
 	@echo "[spawn] Worktree: $(WORKTREES_DIR)/$(BRANCH)"
 	@echo "[spawn] Task: $(TASK)"
+	@echo "[spawn] Model: $(MODEL)"
 	container run -d --rm \
 		--name $(PROJECT_NAME)-$(CONTAINER_BRANCH) \
 		--network $(NETWORK) \
@@ -120,6 +126,7 @@ spawn: network _check-token
 		-v $(HOME)/.claude:/root/.claudenew:ro \
 		-v $(HOME)/.claude.json:/root/.claudenew.json:ro \
 		-e CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1 \
+		-e AGENT_MODEL=$(MODEL) \
 		-e CLAUDE_CODE_OAUTH_TOKEN=$${$(HOST_TOKEN_VAR)} \
 		$(IMAGE) \
 		--worktree "$(BRANCH)" --task "$(TASK)"
@@ -392,6 +399,7 @@ help:
 	@printf "    %-20s %s\n" "CPUS"           "$(CPUS)"
 	@printf "    %-20s %s\n" "MEMORY"         "$(MEMORY)"
 	@printf "    %-20s %s\n" "BRANCH"         "$(BRANCH)"
+	@printf "    %-20s %s\n" "MODEL"          "$(MODEL)"
 	@printf "    %-20s %s\n" "PROJECT_NAME"   "$(PROJECT_NAME)"
 	@printf "    %-20s %s\n" "GIT_ROOT"       "$(GIT_ROOT)"
 	@printf "    %-20s %s\n" "AGENTS_HOME"    "$(AGENTS_HOME)"
diff --git a/config/entrypoint.sh b/config/entrypoint.sh
index cbbfd82..7f4ba60 100644
--- a/config/entrypoint.sh
+++ b/config/entrypoint.sh
@@ -22,6 +22,12 @@ AGENT_TASK=""
 PROJECT_NAME=""
 PASSTHROUGH_ARGS=()
 
+# Model the headless agent runs with. The orchestrator passes it via
+# -e AGENT_MODEL; it defaults to opus so the agent runs with the most capable
+# model by default and never inherits the host user's personal `model`
+# preference from the copied settings.json (which can be a different account).
+AGENT_MODEL="${AGENT_MODEL:-opus}"
+
 # ── Functions ─────────────────────────────────────────────────────────────────
 
 parse_args() {
@@ -70,6 +76,7 @@ create_worktree() {
 setup_agent_perms() {
     echo "[entrypoint] Starting Claude agent (headless)..."
     echo "[entrypoint] Task: ${AGENT_TASK}"
+    echo "[entrypoint] Model: ${AGENT_MODEL}"
     echo "---"
     # Make claude's install path traversable for non-root users (installed under /root/)
     # go+x required: agent is in group root (gid=0), so group bits apply, not others bits
@@ -94,7 +101,12 @@ write_status() {
 
 emit_marker() {
     local phase="$1"; shift
-    echo "[agent:status] PHASE=${phase} BRANCH=${WORKTREE_BRANCH} $*"
+    local line="[agent:status] PHASE=${phase} BRANCH=${WORKTREE_BRANCH} $*"
+    # Echo to the container's stdout (read live via `container logs`) and append
+    # to the persisted log so `summary-agent` still finds the markers once the
+    # --rm container is gone and `container logs` no longer works.
+    echo "$line"
+    { echo "$line" >> "${AGENT_DIR}/agent.log"; } 2>/dev/null || true
 }
 
 run_agent() {
@@ -102,6 +114,11 @@ run_agent() {
     mkdir -p "$AGENT_DIR"
     chown -R agent:agent "$AGENT_DIR"
 
+    # Start the persisted log empty so a re-spawn on an existing worktree does
+    # not accumulate a previous run's output. Markers and the claude output
+    # below both append to it.
+    { : > "${AGENT_DIR}/agent.log"; } 2>/dev/null || true
+
     # Add .agent/ to worktree .gitignore (safe if dir doesn't exist yet)
     if ! grep -qxF '.agent/' "${WORKTREE_PATH}/.gitignore" 2>/dev/null; then
         echo '.agent/' >> "${WORKTREE_PATH}/.gitignore" 2>/dev/null || true
@@ -117,10 +134,11 @@ run_agent() {
     write_status "working"
     emit_marker "working"
 
-    # Run claude with tee to persist logs; capture exit code through pipe
+    # Run claude with tee to persist logs; capture exit code through pipe.
+    # `tee -a` appends so the "starting"/"working" markers above survive.
     set +e
     su-exec agent env HOME=/home/agent claude --dangerously-skip-permissions \
-        -p "$AGENT_TASK" 2>&1 | tee "$AGENT_DIR/agent.log"
+        --model "$AGENT_MODEL" -p "$AGENT_TASK" 2>&1 | tee -a "$AGENT_DIR/agent.log"
     local exit_code=${PIPESTATUS[0]}
     set -e
 
diff --git a/config/spec/entrypoint_spec.sh b/config/spec/entrypoint_spec.sh
index 0770c0e..c7485c8 100644
--- a/config/spec/entrypoint_spec.sh
+++ b/config/spec/entrypoint_spec.sh
@@ -342,7 +342,21 @@ WRAPPER_EOF
 
     It "runs su-exec with correct claude command"
       When run run_entrypoint --worktree agent-br --task "do work"
-      The output should include "[MOCK] su-exec agent env HOME=/home/agent claude --dangerously-skip-permissions -p do work"
+      The output should include "[MOCK] su-exec agent env HOME=/home/agent claude --dangerously-skip-permissions --model opus -p do work"
+      The status should equal 0
+    End
+
+    It "defaults the agent model to opus"
+      When run run_entrypoint --worktree agent-br --task "do work"
+      The output should include "[entrypoint] Model: opus"
+      The status should equal 0
+    End
+
+    It "honors AGENT_MODEL when the orchestrator sets it"
+      export AGENT_MODEL=sonnet
+      When run run_entrypoint --worktree agent-br --task "do work"
+      The output should include "claude --dangerously-skip-permissions --model sonnet -p do work"
+      The output should include "[entrypoint] Model: sonnet"
       The status should equal 0
     End
 
diff --git a/docs/agents/cli.md b/docs/agents/cli.md
index b5d5dc0..f4e1ae8 100644
--- a/docs/agents/cli.md
+++ b/docs/agents/cli.md
@@ -69,6 +69,7 @@ Spawns a detached headless agent in an isolated git worktree.
 ```bash
 q spawn --branch feat/oauth2 --task "Implement OAuth2 with JWT tokens"
 q spawn --branch test/payments --task "Write unit tests for payments module" --cpus 4
+q spawn --branch feat/quick-fix --task "Tighten the readme" --model sonnet
 ```
 
 | Option | Required | Description |
@@ -78,9 +79,16 @@ q spawn --branch test/payments --task "Write unit tests for payments module" --c
 | `--cpus` | no | CPU count override |
 | `--memory` | no | Memory limit override (e.g. `8G`) |
 | `--image` | no | Image tag override |
+| `--model` | no | Claude model the agent runs (`opus` default; e.g. `sonnet`, `haiku`) |
 
 Requires `CLAUDE_CONTAINER_OAUTH_TOKEN` to be set.
 
+> **Agent model:** the agent runs `opus` unless `--model` says otherwise. It
+> does **not** inherit the host's `~/.claude/settings.json` `model` preference —
+> that file is copied into the container for credentials only, and a headless
+> agent must not depend on a personal interactive setting. Drop to `sonnet` for
+> lighter tasks. The Makefile passes the choice through as `-e AGENT_MODEL`.
+
 > **Branch naming:** avoid `/` in branch names — they create nested subdirectories in `$AGENTS_HOME` which the entrypoint cannot handle. Use flat names like `feat-oauth2` instead of `feat/oauth2`.
 
 ---
diff --git a/docs/agents/container-agent.md b/docs/agents/container-agent.md
index 1af685c..d6f33c6 100644
--- a/docs/agents/container-agent.md
+++ b/docs/agents/container-agent.md
@@ -127,10 +127,12 @@ container run -d --rm claude-agent:wolfi \
 2. `git -C /workspace worktree add /worktrees/<branch> -b <branch>`
    - If the branch already exists: `git worktree add /worktrees/<branch> <branch>`
 3. `cd /worktrees/<branch>`
-4. `claude --dangerously-skip-permissions -p "<task>"`
+4. `claude --dangerously-skip-permissions --model "$AGENT_MODEL" -p "<task>"`
 
 **Why `--dangerously-skip-permissions`:** In headless mode there is no interactive user to approve permissions. The container is a sandboxed environment with access only to the mounted worktree, so it is safe to skip confirmations.
 
+**Why `--model "$AGENT_MODEL"`:** The host's `~/.claude` is copied into the container for credentials, and that copy includes the host user's personal `settings.json` — whose `model` preference must not silently govern a headless agent (the container's OAuth token may belong to a different plan). The entrypoint pins the model explicitly: `AGENT_MODEL` is passed via `-e AGENT_MODEL` and defaults to `opus`. Override per spawn with `q spawn --model <model>` or `make spawn ... MODEL=<model>`.
+
 **Why run as `agent` (non-root):** Claude CLI blocks `--dangerously-skip-permissions` when the process runs as `root` (uid 0). The entrypoint uses `su-exec` to drop to the `agent` user before executing Claude.
 
 IMPORTANT: **Why the worktree is created inside the container:** Git needs access to the repository to register the worktree in `.git/worktrees/`. Since the repo is mounted at `/workspace` inside the container, the worktree must be created from there. If it were created directly from the host, the path registered in git would be the host path (`/Users/...`), which would not exist inside the container.
@@ -366,7 +368,7 @@ RUN addgroup -S agent \
        │
        └─► /home/agent/.claude/   (copied + chown → agent)
                   │
-           su-exec agent env HOME=/home/agent claude --dangerously-skip-permissions -p "..."
+           su-exec agent env HOME=/home/agent claude --dangerously-skip-permissions --model "$AGENT_MODEL" -p "..."
 ```
 
 The entrypoint:
diff --git a/docs/agents/spawn-agent-skill.md b/docs/agents/spawn-agent-skill.md
index f7b20d7..78acdfd 100644
--- a/docs/agents/spawn-agent-skill.md
+++ b/docs/agents/spawn-agent-skill.md
@@ -85,6 +85,7 @@ export CLAUDE_CONTAINER_OAUTH_TOKEN=<your-oauth-token>
 5. container run -d --rm  ← detached (non-blocking)
    • -v $GIT_ROOT:/workspace          ← full repo (read/write)
    • -v $AGENTS_HOME:/worktrees       ← worktree destination
+   • -e AGENT_MODEL=$MODEL            → claude --model $MODEL (default opus)
    • --worktree $BRANCH               → entrypoint creates the worktree
    • --task "$TASK"                   → claude -p "$TASK" in the worktree
          │
@@ -245,6 +246,7 @@ container run -d --rm \
   -v "${HOME}/.claude:/root/.claudenew:ro" \
   -v "${HOME}/.claude.json:/root/.claudenew.json:ro" \
   -e CLAUDE_CODE_EXPERIMENTAL_AGENT_TEAMS=1 \
+  -e "AGENT_MODEL=${AGENT_MODEL:-opus}" \
   -e "CLAUDE_CODE_OAUTH_TOKEN=${CLAUDE_CONTAINER_OAUTH_TOKEN}" \
   claude-agent:wolfi \
   --worktree "feat/oauth2" --task "${TASK}"

From 10ed154185d2dfeec0fd1d662ee7471be40ba289 Mon Sep 17 00:00:00 2001
From: deimagjas <deimagjas@127.0.0.1>
Date: Fri, 22 May 2026 22:01:52 -0500
Subject: [PATCH 9/9] test(e2e): opt-in mlx_lm.server autostart fixture for the
 PI round-trip
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Add a session-scoped fixture in tests/e2e/conftest.py that — when
STACKAI_E2E_AUTOSTART_MLX=1 is exported — spawns mlx_lm.server itself,
waits for /v1/models to respond, yields to the tests, and tears the
subprocess down on session teardown. Without the env var, or when a
server is already reachable on the URL, the fixture is a no-op (a
manually-started server is honoured and not touched).

The invocation is the exact set of flags the PI agent expects (model
mlx-community/gemma-4-26b-a4b-it-4bit, port 8080, --temp 0.9,
--top-p 0.95, 6 GB prompt cache, etc.) — no coupling to the iac CLI:
the e2e suite must run standalone. Boot timeout is configurable via
STACKAI_E2E_MLX_BOOT_TIMEOUT (default 600s) to absorb first-run model
downloads, and the subprocess log lands in $TMPDIR/stackai-e2e-mlx.log
for diagnosis.

Verified end-to-end on a warm cache: both the Claude and PI
round-trips pass in ~33s.

Co-Authored-By: Claude Opus 4.7 <noreply@anthropic.com>
---
 app/cli/tests/e2e/conftest.py | 123 +++++++++++++++++++++++++++++++---
 docs/agents/e2e-tests.md      |  26 +++++++
 2 files changed, 140 insertions(+), 9 deletions(-)

diff --git a/app/cli/tests/e2e/conftest.py b/app/cli/tests/e2e/conftest.py
index 66ec2d8..eb65fce 100644
--- a/app/cli/tests/e2e/conftest.py
+++ b/app/cli/tests/e2e/conftest.py
@@ -18,6 +18,7 @@
 import shutil
 import subprocess
 import sys
+import tempfile
 import time
 import urllib.error
 import urllib.request
@@ -96,16 +97,120 @@ def require_pi_image() -> None:
         pytest.skip("image claude-pi:ubuntu not built — run `q pi build` first")
 
 
-@pytest.fixture()
-def require_mlx_server() -> None:
-    """Skip the test unless the local mlx_lm.server is reachable on the host."""
-    url = os.environ.get("STACKAI_E2E_MLX_URL", "http://localhost:8080/v1/models")
+def _http_ok(url: str, *, timeout: float) -> bool:
+    """Tiny probe — True when the URL responds 200."""
     try:
-        with urllib.request.urlopen(url, timeout=5) as resp:
-            if resp.status != 200:
-                pytest.skip(f"mlx_lm.server at {url} returned HTTP {resp.status}")
-    except (urllib.error.URLError, OSError) as exc:
-        pytest.skip(f"mlx_lm.server unreachable at {url} ({exc}) — run `uv run iac server start`")
+        with urllib.request.urlopen(url, timeout=timeout) as resp:
+            return resp.status == 200
+    except (urllib.error.URLError, OSError):
+        return False
+
+
+def _mlx_url() -> str:
+    return os.environ.get("STACKAI_E2E_MLX_URL", "http://localhost:8080/v1/models")
+
+
+@pytest.fixture(scope="session")
+def _mlx_server_lifecycle() -> Iterator[subprocess.Popen[bytes] | None]:
+    """Optionally spawn mlx_lm.server for the duration of the e2e session.
+
+    Activated by exporting ``STACKAI_E2E_AUTOSTART_MLX=1``. Without that flag,
+    or when a server is already reachable on the expected URL, this is a no-op
+    — tests still see whatever process the user manages out of band.
+
+    The exact invocation mirrors the parameters the PI agent expects (no
+    coupling to the `iac` CLI: the e2e suite must run standalone). First-run
+    cost is significant — model download + warmup can take several minutes;
+    raise the wait ceiling with ``STACKAI_E2E_MLX_BOOT_TIMEOUT`` if needed.
+    """
+    if os.environ.get("STACKAI_E2E_AUTOSTART_MLX") != "1":
+        yield None
+        return
+
+    url = _mlx_url()
+    if _http_ok(url, timeout=2):
+        # Pre-existing server (e.g. started via `uv run iac server start`) —
+        # leave it alone so manual setups survive teardown.
+        yield None
+        return
+
+    # conftest.py lives at app/cli/tests/e2e/ → repo root is parents[4].
+    iac_dir = Path(__file__).resolve().parents[4] / "iac"
+    cmd = [
+        "uv", "run", "--directory", str(iac_dir), "mlx_lm.server",
+        "--model", "mlx-community/gemma-4-26b-a4b-it-4bit",
+        "--host", "0.0.0.0",
+        "--port", "8080",
+        "--prompt-cache-size", "5",
+        "--prompt-cache-bytes", "6GB",
+        "--decode-concurrency", "4",
+        "--prompt-concurrency", "2",
+        "--prefill-step-size", "1024",
+        "--temp", "0.9",
+        "--top-p", "0.95",
+        "--top-k", "40",
+        "--min-p", "0.0",
+        "--max-tokens", "2048",
+        "--use-default-chat-template",
+        "--log-level", "INFO",
+    ]
+
+    log_path = Path(tempfile.gettempdir()) / "stackai-e2e-mlx.log"
+    log_handle = log_path.open("ab")
+    print(f"\n[mlx] starting mlx_lm.server (logs: {log_path})", flush=True)
+    proc = subprocess.Popen(  # noqa: S603 — fixed argv, no shell.
+        cmd,
+        stdout=log_handle,
+        stderr=subprocess.STDOUT,
+        stdin=subprocess.DEVNULL,
+        start_new_session=True,
+    )
+
+    try:
+        timeout = float(os.environ.get("STACKAI_E2E_MLX_BOOT_TIMEOUT", "600"))
+        deadline = time.monotonic() + timeout
+        while time.monotonic() < deadline:
+            if proc.poll() is not None:
+                raise RuntimeError(
+                    f"mlx_lm.server exited during startup (code {proc.returncode});"
+                    f" see {log_path}"
+                )
+            if _http_ok(url, timeout=2):
+                break
+            time.sleep(3)
+        else:
+            proc.terminate()
+            raise TimeoutError(
+                f"mlx_lm.server did not become reachable at {url} within"
+                f" {timeout}s; see {log_path}"
+            )
+        print(f"[mlx] ready on {url}", flush=True)
+        yield proc
+    finally:
+        log_handle.close()
+        if proc.poll() is None:
+            proc.terminate()
+            try:
+                proc.wait(timeout=10)
+            except subprocess.TimeoutExpired:
+                proc.kill()
+                proc.wait(timeout=5)
+
+
+@pytest.fixture()
+def require_mlx_server(_mlx_server_lifecycle) -> None:
+    """Skip the test unless the local mlx_lm.server is reachable on the host.
+
+    Composes with ``_mlx_server_lifecycle`` so when the autostart env var is
+    set, the server is already up by the time this check runs.
+    """
+    url = _mlx_url()
+    if _http_ok(url, timeout=5):
+        return
+    pytest.skip(
+        f"mlx_lm.server unreachable at {url} — run `uv run iac server start`"
+        " or set STACKAI_E2E_AUTOSTART_MLX=1 to let the suite spawn it"
+    )
 
 
 @pytest.fixture()
diff --git a/docs/agents/e2e-tests.md b/docs/agents/e2e-tests.md
index 39b8302..7d2c687 100644
--- a/docs/agents/e2e-tests.md
+++ b/docs/agents/e2e-tests.md
@@ -37,6 +37,25 @@ Each test skips itself, with a reason, when a prerequisite is missing.
 The PI server URL defaults to `http://localhost:8080/v1/models`; override the
 reachability check with `STACKAI_E2E_MLX_URL` if your server listens elsewhere.
 
+### Auto-launching `mlx_lm.server` from the suite
+
+Export `STACKAI_E2E_AUTOSTART_MLX=1` and the e2e session bootstraps the server
+itself — there's no need to start it out of band. The session-scoped fixture
+in `tests/e2e/conftest.py`:
+
+1. Skips if the URL is already reachable (an existing server you launched
+   manually is honoured and not torn down).
+2. Otherwise spawns the exact `mlx_lm.server` invocation the PI agent
+   expects (`mlx-community/gemma-4-26b-a4b-it-4bit`, port 8080,
+   `--temp 0.9 --top-p 0.95`, `--prompt-cache-bytes 6GB`, etc.).
+3. Polls `/v1/models` until ready, then yields to the tests.
+4. Terminates the subprocess at session teardown.
+
+Logs land in `$TMPDIR/stackai-e2e-mlx.log`. The first run downloads the model
+(~16 GB) and warms the prompt cache, so the boot can take several minutes —
+override the wait ceiling with `STACKAI_E2E_MLX_BOOT_TIMEOUT` (seconds,
+default 600).
+
 ## How to run
 
 ```bash
@@ -44,6 +63,13 @@ cd app/cli
 make e2e-test          # STACKAI_E2E=1 uv run pytest tests/e2e -v -m e2e
 ```
 
+With auto-launched mlx_lm.server (PI test runs end-to-end too):
+
+```bash
+cd app/cli
+STACKAI_E2E_AUTOSTART_MLX=1 make e2e-test
+```
+
 Run a single test:
 
 ```bash