From 222c030fd7b3ebb46c51a2a71a63c1adcab931b1 Mon Sep 17 00:00:00 2001
From: Alex Kroman <alex@assemblyai.com>
Date: Wed, 17 Jun 2026 21:19:38 -0700
Subject: [PATCH 1/3] Fix `assembly code` TUI: CLI-style approval, voice-mode
 banner, mic robustness, unique sessions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Approval prompt: replace the Textual Button row with a plain y/a/n
  keyboard-hint line, so it reads like a CLI prompt rather than chrome.
- Voice mode banner: defer the first mic open until after the splash paints
  (call_after_refresh) — opening PortAudio inline on mount raced Textual's
  initial render and left the banner blank until a resize/focus repaint.
- Mic open: redirect PortAudio's C-level stderr noise (which corrupted the
  TUI screen) via a safe-by-construction stdio.suppress_native_stderr; and on
  a mono open failure, reopen at the device's real channel count and downmix
  to mono, with a clear permission error when the device exposes 0 channels.
- Sessions: give each `assembly code` run a unique thread id instead of
  reusing a fixed "default" thread (which silently resumed prior chats);
  `--session NAME` still resumes a named one.

Gates verified pre-commit: ruff, pyright, mypy, full pytest suite, 100% patch
coverage, mutation gate. Full check.sh not run at user request.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 aai_cli/code_agent/store.py                   | 15 +++
 aai_cli/code_agent/tui.py                     | 32 +++----
 aai_cli/commands/code/__init__.py             | 11 ++-
 aai_cli/core/microphone.py                    | 96 ++++++++++++++++---
 aai_cli/core/stdio.py                         | 52 +++++++++-
 .../test_snapshots_help_run.ambr              |  5 +-
 tests/test_code_agent.py                      |  7 ++
 tests/test_code_command.py                    | 15 ++-
 tests/test_code_tui.py                        | 29 ++----
 tests/test_microphone.py                      | 79 +++++++++++++++
 tests/test_stdio.py                           | 57 +++++++++++
 11 files changed, 342 insertions(+), 56 deletions(-)
diff --git a/aai_cli/code_agent/store.py b/aai_cli/code_agent/store.py
index 7c0b2975..01b218da 100644
--- a/aai_cli/code_agent/store.py
+++ b/aai_cli/code_agent/store.py
@@ -8,6 +8,7 @@
 
 from __future__ import annotations
 
+import uuid
 from pathlib import Path
 from typing import TYPE_CHECKING
 
@@ -18,6 +19,20 @@
 
 _APP = "assemblyai"
 
+# Length of a generated session id — short enough to read off the splash and retype as
+# ``--session <id>`` to resume, with ample uniqueness for one user's sessions.
+_SESSION_ID_LEN = 12
+
+
+def new_session_id() -> str:
+    """A fresh, unique session id so each run starts a clean conversation by default.
+
+    `assembly code` no longer reuses a fixed ``"default"`` thread (which silently resumed the
+    previous conversation); each run gets its own id unless ``--session NAME`` names one to
+    resume. Shown on the splash as ``Thread: <id>`` so it can be resumed later.
+    """
+    return uuid.uuid4().hex[:_SESSION_ID_LEN]
+
 
 def sessions_db_path() -> Path:
     """Path to the SQLite file holding persisted coding sessions (dir created)."""
diff --git a/aai_cli/code_agent/tui.py b/aai_cli/code_agent/tui.py
index cb699cf1..264d64c9 100644
--- a/aai_cli/code_agent/tui.py
+++ b/aai_cli/code_agent/tui.py
@@ -19,7 +19,7 @@
 from textual.app import App, ComposeResult
 from textual.containers import Horizontal, Vertical
 from textual.screen import ModalScreen
-from textual.widgets import Button, Input, Label, RichLog, Static
+from textual.widgets import Input, Label, RichLog, Static
 from textual.worker import Worker
 
 from aai_cli.code_agent import banner
@@ -60,11 +60,6 @@ def _spinner_text(elapsed_s: int, frame: str) -> str:
     return f"{frame} Working… ({elapsed_s}s)"
 
 
-def _approval_decision(button_id: str | None) -> str:
-    """Map a pressed approval button's id to a decision, defaulting to reject if unset."""
-    return button_id or "reject"
-
-
 def _abbrev_home(path: Path) -> str:
     """Render ``path`` with the home directory collapsed to ``~``."""
     try:
@@ -97,8 +92,10 @@ def _status_text(cwd: Path, *, auto_approve: bool) -> str:
 class ApprovalScreen(ModalScreen[str]):
     """A compact, bottom-docked prompt to approve/auto-approve/reject one tool call.
 
-    The transparent screen background leaves the transcript visible above (no full-screen
-    takeover); the decision is one of ``"approve"``, ``"auto"``, or ``"reject"``.
+    Keyboard-only — a plain one-line ``y / a / n`` hint instead of clickable buttons, so it
+    reads like a CLI prompt rather than a chrome-heavy dialog. The transparent screen
+    background leaves the transcript visible above (no full-screen takeover); the decision is
+    one of ``"approve"``, ``"auto"``, or ``"reject"``.
     """
 
     DEFAULT_CSS = """
@@ -108,8 +105,6 @@ class ApprovalScreen(ModalScreen[str]):
         border: round #f59e0b; background: #000000; padding: 0 1; margin: 0 1 1 1;
     }
     ApprovalScreen #approvalbox Label { height: auto; }
-    ApprovalScreen #approvalbox Horizontal { height: auto; }
-    ApprovalScreen #approvalbox Button { margin: 0 1 0 0; }
     """
     BINDINGS: ClassVar = [
         ("y", "approve", "Approve"),
@@ -128,13 +123,10 @@ def compose(self) -> ComposeResult:
                 f"Run tool [b]{escape(self._tool_name)}[/b]?  "
                 f"[dim]{escape(_format_args(self._args))}[/dim]"
             )
-            with Horizontal():
-                yield Button("Approve (y)", id="approve", variant="success")
-                yield Button("Auto-approve (a)", id="auto", variant="primary")
-                yield Button("Reject (n)", id="reject", variant="error")
-
-    def on_button_pressed(self, event: Button.Pressed) -> None:
-        self.dismiss(_approval_decision(event.button.id))
+            yield Label(
+                f"[b #22c55e]y[/] approve   [b {banner.BRAND_HEX}]a[/] auto-approve   "
+                "[b #f04438]n[/] reject"
+            )
 
     def action_approve(self) -> None:
         self.dismiss("approve")
@@ -270,7 +262,11 @@ def on_mount(self) -> None:
         if self._initial:
             self._submit(self._initial)
         else:
-            self._begin_listening()  # in voice mode, capture the first spoken turn
+            # Defer the first mic open until *after* the splash has painted. Opening PortAudio
+            # is a GIL-holding C call; run inline on mount it races Textual's initial render and
+            # the banner never flushes — it stays blank until a resize/focus forces a full
+            # repaint. call_after_refresh runs once the screen is on-screen, so the splash wins.
+            self.call_after_refresh(self._begin_listening)  # in voice mode, capture first turn
 
     # --- event rendering (always called on the UI thread) ---------------------
 
diff --git a/aai_cli/commands/code/__init__.py b/aai_cli/commands/code/__init__.py
index b37052d0..9d71d404 100644
--- a/aai_cli/commands/code/__init__.py
+++ b/aai_cli/commands/code/__init__.py
@@ -6,6 +6,7 @@
 
 from aai_cli import command_registry, help_panels
 from aai_cli.app.context import run_with_options
+from aai_cli.code_agent import store
 from aai_cli.code_agent.prompt import DEFAULT_MODEL
 from aai_cli.commands.code import _exec as code_exec
 from aai_cli.core import llm as gateway
@@ -62,8 +63,10 @@ def code(
     memory: bool = typer.Option(
         True, "--memory/--no-memory", help="Load and persist the agent's long-term memory"
     ),
-    session: str = typer.Option(
-        "default", "--session", help="Conversation session name (reuse to resume it)"
+    session: str | None = typer.Option(
+        None,
+        "--session",
+        help="Resume a named session. Default: a new unique session each run",
     ),
     persist: bool = typer.Option(
         True, "--persist/--fresh", help="Persist the session to disk (--fresh: ephemeral)"
@@ -98,7 +101,9 @@ def code(
         skills=skills,
         web=web,
         memory=memory,
-        session=session,
+        # No --session given -> a fresh unique id, so each run starts a clean conversation
+        # instead of silently resuming the previous one.
+        session=session if session is not None else store.new_session_id(),
         persist=persist,
         tui=tui,
         voice=voice,
diff --git a/aai_cli/core/microphone.py b/aai_cli/core/microphone.py
index 755858f1..e75576d4 100644
--- a/aai_cli/core/microphone.py
+++ b/aai_cli/core/microphone.py
@@ -6,6 +6,7 @@
 from types import ModuleType
 from typing import Any, Protocol, cast
 
+from aai_cli.core import stdio
 from aai_cli.core.errors import CLIError
 
 with warnings.catch_warnings():
@@ -17,6 +18,8 @@
 
 # Used when the device's native rate can't be determined (e.g. headless CI).
 _FALLBACK_RATE = 48000
+# Channel count for the multichannel-input fallback: capture stereo, then downmix to mono.
+_STEREO_CHANNELS = 2
 
 
 class _RawInputStream(Protocol):
@@ -82,7 +85,11 @@ def default_rate(kind: str, device: int | None = None) -> int:
     """
     sd = _sounddevice()
     try:
-        raw_rate = sd.query_devices(device, kind).get("default_samplerate", _FALLBACK_RATE)
+        # query_devices triggers PortAudio's lazy init, which prints device-probe noise to
+        # the C-level stderr; suppress it so a TUI mic-open can't corrupt the rendered screen.
+        with stdio.suppress_native_stderr():
+            devices = sd.query_devices(device, kind)
+        raw_rate = devices.get("default_samplerate", _FALLBACK_RATE)
         if not isinstance(raw_rate, str | int | float):
             return _FALLBACK_RATE
         rate = int(float(raw_rate))
@@ -104,35 +111,100 @@ def resample_pcm16(chunk: bytes, state: Any, *, src_rate: int, dst_rate: int) ->
 class _SoundDeviceMic:
     """Iterator of PCM16 byte chunks from a sounddevice raw input stream.
 
-    Yields ~100 ms blocks; closeable so MicrophoneSource can tear it down.
+    Yields ~100 ms blocks; closeable so MicrophoneSource can tear it down. When opened with
+    ``channels=2`` (the multichannel-input fallback below), each interleaved stereo block is
+    downmixed to mono so downstream — resampling and the STT stream — always sees one channel.
     """
 
-    def __init__(self, stream: _RawInputStream, blocksize: int) -> None:
+    def __init__(self, stream: _RawInputStream, blocksize: int, *, channels: int = 1) -> None:
         self._stream = stream
         self._blocksize = blocksize
+        self._channels = channels
 
     def __iter__(self) -> Iterator[bytes]:
         return self
 
     def __next__(self) -> bytes:
         data, _overflowed = self._stream.read(self._blocksize)
-        return bytes(data)
+        pcm = bytes(data)
+        if self._channels == _STEREO_CHANNELS:
+            # Average L/R into a single channel (width=2 → int16).
+            pcm = audioop.tomono(pcm, 2, 0.5, 0.5)
+        return pcm
 
     def close(self) -> None:
         self._stream.stop()
         self._stream.close()
 
 
+def _open_input_stream(
+    sd: _SoundDeviceModule, *, sample_rate: int, device: int | None, channels: int, blocksize: int
+) -> _RawInputStream:
+    """Open and start a started PCM16 input stream at ``channels`` channels.
+
+    Wrapped in ``suppress_native_stderr`` because opening/starting is PortAudio's stderr-noisy
+    moment — kept off the terminal so a TUI mic-open can't corrupt the rendered screen.
+    """
+    with stdio.suppress_native_stderr():
+        stream = sd.RawInputStream(
+            samplerate=sample_rate,
+            device=device,
+            channels=channels,
+            dtype="int16",
+            blocksize=blocksize,
+        )
+        stream.start()
+    return stream
+
+
+def _max_input_channels(sd: _SoundDeviceModule, device: int | None) -> int:
+    """The device's advertised input-channel count (0 when it exposes no input)."""
+    with stdio.suppress_native_stderr():
+        info = sd.query_devices(device, "input")
+    raw = info.get("max_input_channels", 0)
+    return raw if isinstance(raw, int) else 0
+
+
 def _default_mic_stream(*, sample_rate: int, device: int | None) -> Iterator[bytes]:
-    """A sounddevice-backed PCM16 mic stream (imported lazily to keep startup fast)."""
-    sd = _sounddevice()
+    """A sounddevice-backed PCM16 mono mic stream (imported lazily to keep startup fast).
 
+    Tries a mono open first. PortAudio rejects ``channels=1`` (``-9998``) when the device
+    exposes no usable mono input: either it has zero input channels (no mic permission, or the
+    default input isn't a microphone) — which no channel count can fix, so we raise an
+    actionable error — or it's a multichannel-only input, which we reopen at stereo and
+    downmix. Devices that already do mono never reach the fallback.
+    """
+    sd = _sounddevice()
     blocksize = max(1, sample_rate // 10)  # ~100 ms per read
-    stream = sd.RawInputStream(
-        samplerate=sample_rate, device=device, channels=1, dtype="int16", blocksize=blocksize
-    )
-    stream.start()
-    return _SoundDeviceMic(stream, blocksize)
+    try:
+        return _SoundDeviceMic(
+            _open_input_stream(
+                sd, sample_rate=sample_rate, device=device, channels=1, blocksize=blocksize
+            ),
+            blocksize,
+        )
+    except Exception:
+        max_in = _max_input_channels(sd, device)
+        if max_in < 1:
+            raise CLIError(
+                "The default microphone reports no input channels.",
+                error_type="mic_error",
+                exit_code=1,
+                suggestion=(
+                    "Grant microphone access to your terminal in System Settings > Privacy & "
+                    "Security > Microphone, or pick another input with --device."
+                ),
+            ) from None
+        if max_in < _STEREO_CHANNELS:
+            raise  # a 1-channel device should accept mono; surface the real PortAudio error
+        stream = _open_input_stream(
+            sd,
+            sample_rate=sample_rate,
+            device=device,
+            channels=_STEREO_CHANNELS,
+            blocksize=blocksize,
+        )
+        return _SoundDeviceMic(stream, blocksize, channels=_STEREO_CHANNELS)
 
 
 class MicrophoneSource:
@@ -172,6 +244,8 @@ def __iter__(self) -> Iterator[bytes]:
             stream: Any = self._factory(sample_rate=self._capture_rate, device=self.device)
         except ImportError as exc:
             raise audio_missing_error() from exc
+        except CLIError:
+            raise  # the factory already raised an actionable error; don't bury it in a re-wrap
         except Exception as exc:
             # "device None" reads like a bug; name the default mic in plain words.
             target = (
diff --git a/aai_cli/core/stdio.py b/aai_cli/core/stdio.py
index 05db2ef3..f6905644 100644
--- a/aai_cli/core/stdio.py
+++ b/aai_cli/core/stdio.py
@@ -3,7 +3,57 @@
 import contextlib
 import os
 import sys
-from collections.abc import Iterator
+from collections.abc import Generator, Iterator
+
+# The OS-level stderr descriptor. Used as a literal rather than ``sys.stderr.fileno()``
+# because inside a Textual app ``sys.stderr`` is swapped for a redirector whose ``fileno()``
+# returns an unusable fd — duping that raised EBADF and broke the very mic open we were
+# trying to quiet. fd 2 is the real inherited stderr, which Textual never touches.
+_STDERR_FD = 2
+
+
+@contextlib.contextmanager
+def suppress_native_stderr() -> Generator[None]:
+    """Send OS-level stderr to /dev/null for the block, then restore it.
+
+    Catches diagnostics that C extensions write straight to fd 2 via ``fprintf`` —
+    PortAudio/CoreAudio/ALSA print device-probe noise there on the first audio call,
+    *below* Python's logging, so silencing loggers can't reach it. Inside a full-screen
+    TUI (which draws to stdout and never repaints stderr) those raw writes scribble over
+    the rendered screen; the mic-open path wraps its PortAudio calls in this so they land
+    in the void instead.
+
+    Safe by construction: if the descriptor can't be duplicated/redirected for any reason,
+    the block runs with stderr untouched rather than raising — suppression is cosmetic and
+    must never break the operation it wraps. Exceptions from the body propagate normally
+    (only the fd is redirected, not raised errors).
+    """
+    saved_fd: int | None = None
+    devnull_fd: int | None = None
+    try:
+        saved_fd = os.dup(_STDERR_FD)
+        devnull_fd = os.open(os.devnull, os.O_WRONLY)
+        os.dup2(devnull_fd, _STDERR_FD)
+    except OSError:
+        # Couldn't redirect — abandon suppression, never break the caller.
+        _close_quietly(saved_fd)
+        _close_quietly(devnull_fd)
+        yield
+        return
+    try:
+        yield
+    finally:
+        with contextlib.suppress(OSError):
+            os.dup2(saved_fd, _STDERR_FD)  # restore the real stderr
+        _close_quietly(saved_fd)
+        _close_quietly(devnull_fd)
+
+
+def _close_quietly(fd: int | None) -> None:
+    """Close ``fd`` if it was opened, ignoring an already-closed/invalid descriptor."""
+    if fd is not None:
+        with contextlib.suppress(OSError):
+            os.close(fd)
 
 
 def silence_stdout() -> None:
diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr
index 2879f6f9..a36aa130 100644
--- a/tests/__snapshots__/test_snapshots_help_run.ambr
+++ b/tests/__snapshots__/test_snapshots_help_run.ambr
@@ -416,9 +416,8 @@
   │ --memory       --no-memory               Load and persist the agent's        │
   │                                          long-term memory                    │
   │                                          [default: memory]                   │
-  │ --session                     TEXT       Conversation session name (reuse to │
-  │                                          resume it)                          │
-  │                                          [default: default]                  │
+  │ --session                     TEXT       Resume a named session. Default: a  │
+  │                                          new unique session each run         │
   │ --persist      --fresh                   Persist the session to disk         │
   │                                          (--fresh: ephemeral)                │
   │                                          [default: persist]                  │
diff --git a/tests/test_code_agent.py b/tests/test_code_agent.py
index 739285bf..76af37af 100644
--- a/tests/test_code_agent.py
+++ b/tests/test_code_agent.py
@@ -166,6 +166,13 @@ def test_checkpointer_in_memory_vs_sqlite(tmp_path, monkeypatch):  # untyped: to
     saver.conn.close()
 
 
+def test_new_session_id_is_unique_and_short() -> None:
+    a = store.new_session_id()
+    b = store.new_session_id()
+    assert a != b  # each run gets its own thread id (no silent resume of a shared default)
+    assert len(a) == 12 and a.isalnum()  # short hex, readable off the splash to resume later
+
+
 def test_cli_tool_invokes_runner_with_args() -> None:
     captured: list[list[str]] = []
 
diff --git a/tests/test_code_command.py b/tests/test_code_command.py
index a4384db9..b548cd6e 100644
--- a/tests/test_code_command.py
+++ b/tests/test_code_command.py
@@ -41,7 +41,20 @@ def test_command_parses_flags_into_options(monkeypatch):
     opts = captured["o"]
     assert opts.prompt == "build a thing"
     assert opts.auto is True and opts.web is False
-    assert opts.session == "s1" and opts.persist is False
+    assert opts.session == "s1" and opts.persist is False  # an explicit --session is honored
+
+
+def test_command_defaults_to_a_fresh_unique_session_each_run(monkeypatch):
+    # No --session: each invocation gets its own id (so a run never silently resumes the
+    # previous conversation), and two runs differ.
+    seen = []
+    monkeypatch.setattr(
+        _exec, "run_code", lambda opts, state, *, json_mode: seen.append(opts.session)
+    )
+    assert runner.invoke(app, ["code"]).exit_code == 0
+    assert runner.invoke(app, ["code"]).exit_code == 0
+    assert seen[0] != "default"  # not the old shared, auto-resumed thread
+    assert seen[0] and seen[1] and seen[0] != seen[1]  # a distinct id per run
 
 
 def test_run_code_dispatches_to_tui_with_voice_by_default_when_tty(monkeypatch):
diff --git a/tests/test_code_tui.py b/tests/test_code_tui.py
index df36ed0d..8abeee08 100644
--- a/tests/test_code_tui.py
+++ b/tests/test_code_tui.py
@@ -14,7 +14,7 @@
 
 import pytest
 from langchain_core.messages import AIMessage, HumanMessage
-from textual.widgets import Input, RichLog, Static
+from textual.widgets import Input, Label, RichLog, Static
 
 from aai_cli.code_agent import tui
 from aai_cli.code_agent.events import AssistantText, ErrorText, ToolCall, ToolResult
@@ -51,14 +51,6 @@ def test_format_args_and_abbrev_home() -> None:
     assert tui._abbrev_home(outside) == str(outside)
 
 
-def test_approval_decision_defaults_to_reject() -> None:
-    assert tui._approval_decision("approve") == "approve"
-    assert tui._approval_decision("auto") == "auto"
-    # A button with no id (Textual allows None) is treated as a rejection, not approval.
-    assert tui._approval_decision(None) == "reject"
-    assert tui._approval_decision("") == "reject"
-
-
 def test_git_branch_and_status(tmp_path: Path) -> None:
     assert tui._git_branch(tmp_path) is None  # no .git
     (tmp_path / ".git").mkdir()
@@ -206,23 +198,22 @@ async def go() -> None:
     _run(go())
 
 
-def test_approval_button_press_dismisses() -> None:
-    # Covers ApprovalScreen.on_button_pressed (the click path; key paths are covered
-    # by the approve/reject modal tests above). The bracketed name/args also guard the
-    # compose() escape() — without it, Label markup parsing would raise on mount.
-    results: list[str | None] = []
-
+def test_approval_prompt_renders_keyboard_hint() -> None:
+    # The prompt is a plain y/a/n keyboard hint, not clickable buttons — assert each
+    # option's copy renders so dropping one is caught. The bracketed name/args also guard
+    # the compose() escape(): without it, Label markup parsing would raise on mount.
     async def go() -> None:
         app = CodeAgentApp(agent=FakeAgent([]))
         async with app.run_test(size=(100, 30)) as pilot:
             await pilot.pause()
-            app.push_screen(ApprovalScreen("exec[", {"cmd": "[ls"}), results.append)
-            await pilot.pause()
-            await pilot.click("#reject")
+            app.push_screen(ApprovalScreen("exec[", {"cmd": "[ls"}))
             await pilot.pause()
+            rendered = " ".join(str(label.render()) for label in app.screen.query(Label))
+            assert "approve" in rendered
+            assert "auto-approve" in rendered
+            assert "reject" in rendered
 
     _run(go())
-    assert results == ["reject"]
 
 
 def test_approval_box_is_compact_and_bottom_docked() -> None:
diff --git a/tests/test_microphone.py b/tests/test_microphone.py
index d215e187..207f6b4a 100644
--- a/tests/test_microphone.py
+++ b/tests/test_microphone.py
@@ -286,3 +286,82 @@ def test_default_mic_stream_missing_sounddevice_raises_mic_missing(monkeypatch):
         _default_mic_stream(sample_rate=16000, device=None)
     assert exc.value.error_type == "mic_missing"
     assert exc.value.exit_code == 2
+
+
+class _FakeStereoStream(_FakeRawStream):
+    """A 2-channel input stream: one interleaved stereo frame (L=256, R=768)."""
+
+    def __init__(self, **kwargs):
+        super().__init__(**kwargs)
+        # int16 LE: L=256 (b"\x00\x01"), R=768 (b"\x00\x03"), interleaved one frame.
+        self._chunks = [(b"\x00\x01\x00\x03", False)]
+
+
+def test_sounddevice_mic_downmixes_stereo_to_mono():
+    # channels=2 averages L/R per frame: (256 + 768) / 2 == 512 (b"\x00\x02").
+    mic = _SoundDeviceMic(_FakeStereoStream(), blocksize=1, channels=2)
+    assert next(iter(mic)) == b"\x00\x02"
+
+
+def _fake_sd_rejecting_mono(max_input_channels: int, opened: list[int]) -> Any:
+    """A sounddevice whose mono open fails with -9998; query reports ``max_input_channels``."""
+
+    def raw_input_stream(**kwargs):
+        opened.append(kwargs["channels"])
+        if kwargs["channels"] == 1:
+            raise OSError("Error opening RawInputStream: Invalid number of channels [-9998]")
+        return _FakeStereoStream(**kwargs)
+
+    fake_sd: Any = types.ModuleType("sounddevice")
+    fake_sd.RawInputStream = raw_input_stream
+    fake_sd.query_devices = lambda device, kind: {"max_input_channels": max_input_channels}
+    return fake_sd
+
+
+def test_default_mic_stream_falls_back_to_stereo_downmix(monkeypatch):
+    # A multichannel-only input (mono rejected, but >=2 channels available) is reopened at
+    # stereo and downmixed to mono — so voice works on devices that won't open as mono.
+    opened: list[int] = []
+    monkeypatch.setitem(sys.modules, "sounddevice", _fake_sd_rejecting_mono(2, opened))
+    stream = _default_mic_stream(sample_rate=16000, device=None)
+    assert opened == [1, 2]  # tried mono, then reopened stereo
+    assert next(iter(stream)) == b"\x00\x02"  # yields downmixed mono
+
+
+def test_default_mic_stream_zero_input_channels_raises_permission_error(monkeypatch):
+    # 0 input channels can't be salvaged (no mic permission / wrong default device): raise an
+    # actionable error pointing at the macOS Microphone privacy setting, not the cryptic code.
+    opened: list[int] = []
+    monkeypatch.setitem(sys.modules, "sounddevice", _fake_sd_rejecting_mono(0, opened))
+    with pytest.raises(CLIError) as exc:
+        _default_mic_stream(sample_rate=16000, device=None)
+    assert opened == [1]  # only the mono attempt; no pointless stereo retry
+    assert exc.value.error_type == "mic_error"
+    assert "no input channels" in exc.value.message.lower()
+    assert exc.value.suggestion is not None
+    assert "Microphone" in exc.value.suggestion
+
+
+def test_default_mic_stream_single_channel_failure_reraises_original(monkeypatch):
+    # A genuine 1-channel device should accept mono; if it still failed, the channel fallback
+    # can't help, so surface the real PortAudio error rather than masking it.
+    opened: list[int] = []
+    monkeypatch.setitem(sys.modules, "sounddevice", _fake_sd_rejecting_mono(1, opened))
+    with pytest.raises(OSError, match="Invalid number of channels"):
+        _default_mic_stream(sample_rate=16000, device=None)
+    assert opened == [1]  # no stereo retry on a 1-channel device
+
+
+def test_microphone_source_passes_through_factory_clierror():
+    # An actionable CLIError from the factory (e.g. the zero-channel case) must propagate
+    # intact, not get re-wrapped into the generic "Could not open" message.
+    err = CLIError("no input channels", error_type="mic_error", exit_code=1, suggestion="grant it")
+
+    def boom(**_kwargs):
+        raise err
+
+    mic = MicrophoneSource(capture_rate=16000, stream_factory=boom)
+    with pytest.raises(CLIError) as exc:
+        list(mic)
+    assert exc.value is err  # passed through unchanged
+    assert exc.value.suggestion == "grant it"
diff --git a/tests/test_stdio.py b/tests/test_stdio.py
index 9b0e2fc0..8a9a58f7 100644
--- a/tests/test_stdio.py
+++ b/tests/test_stdio.py
@@ -1,4 +1,5 @@
 import io
+import os
 
 from aai_cli.core import stdio
 
@@ -121,3 +122,59 @@ def boom(*_a, **_k):
     # Raising inside the suppressed block must not propagate.
     monkeypatch.setattr("os.open", boom)
     stdio.silence_stdout()
+
+
+def test_suppress_native_stderr_redirects_during_block_then_restores(monkeypatch):
+    # The fd dance: dup the real stderr (fd 2 itself — never sys.stderr.fileno(), which is
+    # an unusable redirector inside a TUI), point it at /dev/null for the body, then restore
+    # and close both temporaries. The body must run *while* redirected (between the dup2s).
+    events: list[object] = []
+    monkeypatch.setattr("os.dup", lambda fd: events.append(("dup", fd)) or 50)
+    monkeypatch.setattr("os.open", lambda path, flags: events.append(("open", path)) or 99)
+    monkeypatch.setattr("os.dup2", lambda src, dst: events.append(("dup2", src, dst)))
+    monkeypatch.setattr("os.close", lambda fd: events.append(("close", fd)))
+
+    with stdio.suppress_native_stderr():
+        events.append("body")
+
+    assert events == [
+        ("dup", 2),  # save the real stderr fd (literal 2)
+        ("open", os.devnull),  # open /dev/null
+        ("dup2", 99, 2),  # point stderr at /dev/null
+        "body",  # the block runs while stderr is redirected
+        ("dup2", 50, 2),  # restore the saved fd
+        ("close", 50),
+        ("close", 99),
+    ]
+
+
+def test_suppress_native_stderr_runs_body_when_redirect_fails(monkeypatch):
+    # Safe by construction: if the fd can't be duplicated, the block still runs (suppression
+    # is cosmetic and must never break the wrapped mic open) and stderr is never redirected.
+    def boom(_fd: int) -> int:
+        raise OSError("cannot dup")
+
+    redirected: list[tuple[int, int]] = []
+    monkeypatch.setattr("os.dup", boom)
+    monkeypatch.setattr("os.dup2", lambda src, dst: redirected.append((src, dst)))
+    ran: list[bool] = []
+
+    with stdio.suppress_native_stderr():
+        ran.append(True)
+
+    assert ran == [True]  # body ran despite the dup failure
+    assert redirected == []  # never redirected -> nothing left to restore
+
+
+def test_suppress_native_stderr_swallows_close_failure(monkeypatch):
+    # A teardown close hitting an already-closed/invalid fd must not escape the block.
+    def boom(_fd: int) -> None:
+        raise OSError("already closed")
+
+    monkeypatch.setattr("os.dup", lambda _fd: 50)
+    monkeypatch.setattr("os.open", lambda _path, _flags: 99)
+    monkeypatch.setattr("os.dup2", lambda _src, _dst: None)
+    monkeypatch.setattr("os.close", boom)
+
+    with stdio.suppress_native_stderr():
+        pass  # exits cleanly even though both teardown closes raise

From af0d3a8844914ea762961435e3a257721dfb28fe Mon Sep 17 00:00:00 2001
From: Alex Kroman <12372+alexkroman@users.noreply.github.com>
Date: Thu, 18 Jun 2026 09:20:59 -0700
Subject: [PATCH 2/3] Refactor TUI into modular components; rename
 agent-cascade to live (#240)

## Summary

This PR refactors the coding-agent TUI into smaller, more maintainable
modules while keeping the main `CodeAgentApp` class intact. It also
renames the `agent-cascade` command to `live` for clarity. The changes
improve code organization without altering user-facing behavior.

## Key Changes

**TUI Refactoring:**
- **Split `tui.py`** into focused modules to stay under the 500-line
file-length gate:
- `modals.py`: `ApprovalScreen` and `AskScreen` modal dialogs with voice
support
- `messages.py`: Transcript widget classes (`UserMessage`,
`AssistantMessage`, `ToolOutput`, etc.)
- `voice_ui.py`: Voice capture/readback mechanics (`_VoiceIO` protocol,
`_VoiceLegs` mixin)
- `tui_status.py`: Pure text helpers for status line and spinner
(`_spinner_text`, `_status_text`, etc.)
- `summarize.py`: Tool activity summaries shared by TUI and Rich
fallback
  - `risk.py`: Risk heuristics for tool approval prompts

- **Extracted helper modules:**
- `agent_cascade/brain.py`: Deepagents graph builder for the live
cascade (system prompt, tool guidance, completer)
- Moved `approval_from_speech` mapping to `modals.py` for
voice-answerable approval

**Command Rename:**
- Renamed `agent-cascade` command to `live` throughout (command
registration, help text, tests, docs, templates)

**Test Reorganization:**
- Split large test files to stay under the gate:
  - `test_code_modals.py`: Modal screen tests with voice doubles
  - `test_code_messages.py`: Transcript widget rendering tests
  - `test_code_tui_voice.py`: Voice toggle and readback tests (expanded)
  - `test_code_tui_status.py`: Pure status/spinner text helpers
  - `test_code_summarize.py`: Tool summarization tests
  - `test_code_session_stream.py`: Streaming and cancellation tests
  - `test_code_risk.py`: Risk heuristic tests
  - `test_agent_cascade_brain.py`: Deepagents graph builder tests

**Installer Improvements:**
- Enhanced `install.sh` with dev mode support (`--install-method git` /
`--dev`)
- Added usage help and environment variable overrides
- Supports both release (published) and editable (development) installs

**Events & Session:**
- Added `AssistantDelta` event for per-token streaming (frozen,
hashable)
- Updated `CodeSession` to handle dual-mode streaming (`values` +
`messages`)

## Implementation Details

- **Voice modals** speak prompts and listen for spoken replies off the
UI thread (daemon threads), marshaling back via `call_from_thread`
- **Risk warnings** are pure functions (no Textual imports) so they
unit-test cleanly
- **Summarizers** clip long tool args/output to keep the transcript
scannable (mirroring deepagents-code's collapsible rows)
- **Deepagents brain** builds system prompts that advertise only
available tools, preventing the agent from narrating actions it can't
take
- All refactored modules maintain the same public APIs; `CodeAgentApp`
remains the single entry point

## Testing

- New test files cover the extracted modules with real Textual app
headless tests and pure function unit tests
- Snapshot tests updated for the `live` command rename
- CI workflow enhanced with end-to-end install.sh validation

https://claude.ai/code/session_01Ad72JciKrsz4TKG7ZY9GR6

---------

Co-authored-by: Claude <noreply@anthropic.com>
---
 .github/workflows/ci.yml                      |  37 ++
 README.md                                     |  37 +-
 REFERENCE.md                                  |   2 +-
 aai_cli/agent_cascade/brain.py                | 189 ++++++++
 aai_cli/agent_cascade/engine.py               |  15 +-
 aai_cli/code_agent/events.py                  |  29 +-
 aai_cli/code_agent/messages.py                | 110 +++++
 aai_cli/code_agent/modals.py                  | 202 ++++++++
 aai_cli/code_agent/model.py                   |  17 +-
 aai_cli/code_agent/render.py                  |  19 +-
 aai_cli/code_agent/risk.py                    |  68 +++
 aai_cli/code_agent/session.py                 |  39 +-
 aai_cli/code_agent/summarize.py               |  96 ++++
 aai_cli/code_agent/tui.py                     | 438 +++++++++---------
 aai_cli/code_agent/tui_status.py              |  51 ++
 aai_cli/code_agent/voice_ui.py                | 107 +++++
 aai_cli/code_agent/web_search.py              |   4 +
 aai_cli/code_gen/agent_cascade.py             |   4 +-
 aai_cli/commands/agent/__init__.py            |   2 +-
 aai_cli/commands/agent_cascade/__init__.py    |  34 +-
 aai_cli/commands/agent_cascade/_exec.py       |   4 +-
 aai_cli/core/microphone.py                    |  40 ++
 install.sh                                    | 188 +++++++-
 pyproject.toml                                |   7 +-
 pyrightconfig.tests.json                      |   3 +-
 scripts/generated_code_compile_gate.py        |   4 +-
 .../test_snapshots_help_root.ambr             |  83 ++--
 .../test_snapshots_help_run.ambr              | 233 +++++-----
 tests/test_agent_cascade_brain.py             | 235 ++++++++++
 tests/test_agent_cascade_command.py           |  51 +-
 tests/test_agent_cascade_show_code.py         |  24 +-
 tests/test_code_agent.py                      |  43 --
 tests/test_code_messages.py                   | 149 ++++++
 tests/test_code_modals.py                     | 236 ++++++++++
 tests/test_code_risk.py                       |  46 ++
 tests/test_code_session_stream.py             | 157 +++++++
 tests/test_code_summarize.py                  |  93 ++++
 tests/test_code_tui.py                        | 115 +++--
 tests/test_code_tui_status.py                 |  49 ++
 tests/test_code_tui_voice.py                  | 166 ++++++-
 tests/test_microphone.py                      | 101 +++-
 tests/test_sandbox_access.py                  |   6 +-
 tests/test_smoke.py                           |   2 +-
 43 files changed, 2930 insertions(+), 605 deletions(-)
 create mode 100644 aai_cli/agent_cascade/brain.py
 create mode 100644 aai_cli/code_agent/messages.py
 create mode 100644 aai_cli/code_agent/modals.py
 create mode 100644 aai_cli/code_agent/risk.py
 create mode 100644 aai_cli/code_agent/summarize.py
 create mode 100644 aai_cli/code_agent/tui_status.py
 create mode 100644 aai_cli/code_agent/voice_ui.py
 create mode 100644 tests/test_agent_cascade_brain.py
 create mode 100644 tests/test_code_messages.py
 create mode 100644 tests/test_code_modals.py
 create mode 100644 tests/test_code_risk.py
 create mode 100644 tests/test_code_session_stream.py
 create mode 100644 tests/test_code_summarize.py
 create mode 100644 tests/test_code_tui_status.py

diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index a181d476..7a38cff6 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -316,3 +316,40 @@ jobs:
           python -m pip install -e . pip-audit
           # Append `--ignore-vuln <ID>` to accept an unfixable transitive advisory.
           python -m pip_audit
+
+  # End-to-end check that install.sh actually installs a working `assembly`. Runs
+  # the script in dev mode (--install-method git) so it installs *this* checkout
+  # editable via uv — exercising both the installer and the PR's own code — then
+  # smoke-tests the resulting CLI. Catches install.sh regressions (arg parsing,
+  # the uv/pipx selection, the editable path) that shellcheck alone can't.
+  install-script:
+    name: install script smoke
+    runs-on: ubuntu-latest
+    timeout-minutes: 15
+    steps:
+      - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3
+        with:
+          persist-credentials: false # no job pushes; don't leave the token in .git/config
+          fetch-depth: 0 # hatch-vcs derives the version from git history for the editable build
+      # Provide uv so install.sh takes its preferred (uv) path rather than
+      # bootstrapping it over the network.
+      - uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # v8.2.0
+        with:
+          enable-cache: true
+          cache-dependency-glob: uv.lock
+
+      # PortAudio + ffmpeg so `assembly --help` (which imports the full command
+      # tree) loads cleanly; also lets install.sh's dep check find them present.
+      - name: System deps (PortAudio + ffmpeg)
+        run: sudo apt-get update && sudo apt-get install -y libportaudio2 ffmpeg
+
+      - name: Run install.sh (editable, from this checkout)
+        run: ./install.sh --install-method git
+
+      - name: Smoke-test the installed CLI
+        run: |
+          # uv tool installs land in ~/.local/bin; put it on PATH for this step.
+          export PATH="$HOME/.local/bin:$PATH"
+          assembly --version
+          help_out="$(assembly --help)"
+          echo "$help_out" | grep -q transcribe
diff --git a/README.md b/README.md
index ed70dd87..674a9818 100644
--- a/README.md
+++ b/README.md
@@ -17,10 +17,10 @@ Learn more about the platform in the [AssemblyAI docs](https://www.assemblyai.co
 Install on macOS or Linux with one command:
 
 ```sh
-curl -LsSf https://raw.githubusercontent.com/AssemblyAI/cli/main/install.sh | sh
+curl -LsSf https://raw.githubusercontent.com/AssemblyAI/cli/main/install.sh | bash
 ```
 
-This installs [uv](https://docs.astral.sh/uv/) if needed, then installs `assembly` as a uv tool.
+This installs `assembly` with [uv](https://docs.astral.sh/uv/) (or pipx), bootstrapping uv if needed.
 
 Sign in (stores your API key in the OS keyring) and run your first transcription:
 
@@ -36,7 +36,7 @@ That's it. Run `assembly onboard` for a guided tour, or see [Installation](#-ins
 - **🎯 One command for everything**: transcription, real-time streaming, voice agents, LLM prompts, and WER benchmarking — no SDK boilerplate.
 - **🔌 Built for pipelines**: data goes to stdout, errors to stderr, `--json` gives stable machine-readable output, and `-` reads audio from stdin.
 - **🔐 Secure by default**: your API key lives in the OS keyring, never in a dotfile — and run commands have no `--api-key` flag, so keys can't leak into `ps` or shell history.
-- **🛠️ From demo to deployed app**: `assembly init` scaffolds a runnable FastAPI starter, `assembly dev` / `share` / `deploy` run, tunnel, and ship it, and `--show-code` prints the equivalent Python SDK script for any run command (`transcribe` / `stream` / `agent` / `agent-cascade`).
+- **🛠️ From demo to deployed app**: `assembly init` scaffolds a runnable FastAPI starter, `assembly dev` / `share` / `deploy` run, tunnel, and ship it, and `--show-code` prints the equivalent Python SDK script for any run command (`transcribe` / `stream` / `agent` / `live`).
 - **🤖 Agent-ready**: `assembly setup install` wires your coding agent up with the AssemblyAI docs MCP server and skills.
 - **📖 Open source**: MIT licensed.
 
@@ -48,7 +48,7 @@ That's it. Run `assembly onboard` for a guided tour, or see [Installation](#-ins
 | `assembly stream` | Real-time transcription from your microphone, a file, or a URL — on macOS it can capture system audio too |
 | `assembly dictate` | Signal-driven dictation: records immediately, send SIGTERM for instant text — scriptable from hotkey tools like Hammerspoon (Sync STT API, up to 120 s per utterance) |
 | `assembly agent` | Full-duplex spoken conversation with a voice agent, right in your terminal |
-| `assembly agent-cascade` | Same live conversation, but wired client-side from Streaming STT + the LLM Gateway + streaming TTS, like the `agent-cascade` starter (sandbox-only) |
+| `assembly live` | Talk live to a tool-using voice agent, wired client-side from Streaming STT + a deepagents brain on the LLM Gateway + streaming TTS — it can web-search, fetch URLs, and read the docs mid-conversation, like the `agent-cascade` starter (sandbox-only) |
 | `assembly speak` | Synthesize text to speech over the streaming-TTS WebSocket (sandbox-only) |
 | `assembly llm` | Prompt the LLM Gateway over a transcript, files, stdin, or a live stream |
 | `assembly code` | Terminal coding agent (deepagents SDK) backed only by the LLM Gateway — reads/writes/edits files, runs shell, searches the docs MCP, and can invoke the `assembly` CLI itself; mutating actions ask for approval. Defaults to voice in a terminal (speak your request, replies read back via streaming TTS in the sandbox); pass `--no-voice` for the keyboard TUI |
@@ -63,7 +63,7 @@ That's it. Run `assembly onboard` for a guided tour, or see [Installation](#-ins
 | `assembly transcripts` / `sessions` | Browse and fetch past transcripts and streaming sessions |
 | `assembly keys` / `balance` / `usage` / `limits` / `audit` | Account self-service via browser login |
 
-Add `--show-code` to `transcribe` / `stream` / `agent` / `agent-cascade` to print the equivalent Python SDK script instead of running — the built-in path from CLI experiment to SDK code.
+Add `--show-code` to `transcribe` / `stream` / `agent` / `live` to print the equivalent Python SDK script instead of running — the built-in path from CLI experiment to SDK code.
 
 ## ✨ Things you can do with it
 
@@ -194,7 +194,7 @@ assembly transcripts list --json --limit 5 \
 assembly agent --voice ivy --system-prompt "you're a helpful interviewer"
 ```
 
-**Graduate to the SDK** — `--show-code` prints the equivalent Python script for any `transcribe`/`stream`/`agent`/`agent-cascade` run instead of executing it:
+**Graduate to the SDK** — `--show-code` prints the equivalent Python script for any `transcribe`/`stream`/`agent`/`live` run instead of executing it:
 
 ```sh
 assembly agent --system-prompt "you're a story generator" --show-code > story.py
@@ -231,12 +231,26 @@ Requires Python 3.12+ (Homebrew brings its own; for pipx/uv see the `--python` h
 ### Install script (recommended — macOS / Linux)
 
 ```sh
-curl -LsSf https://raw.githubusercontent.com/AssemblyAI/cli/main/install.sh | sh
+curl -LsSf https://raw.githubusercontent.com/AssemblyAI/cli/main/install.sh | bash
 ```
 
-The [`install.sh`](install.sh) script bootstraps [uv](https://docs.astral.sh/uv/) if it
-isn't already present, then runs `uv tool install` to put `assembly` on your `PATH`. Re-run
-it any time to update to the latest version.
+The [`install.sh`](install.sh) script installs `assembly` with whichever tool installer you
+already have — [uv](https://docs.astral.sh/uv/) if present, otherwise [pipx](https://pipx.pypa.io) —
+and bootstraps uv only when neither is found. It then installs the optional live-audio system
+dependencies via [Homebrew](https://brew.sh) when `brew` is available, or prints the right
+install command for your platform otherwise. Re-run it any time to update to the latest version.
+
+For a **development install** — an editable checkout so local source edits take effect without
+reinstalling (`uv tool install -e .`) — pass `--install-method git` (or `--dev`). It reuses the
+checkout you run it from, or clones the repo to `~/.local/share/assembly-cli` (override with
+`--dir`):
+
+```sh
+# from a clone you already have
+./install.sh --dev
+# or fetch + editable-install in one shot
+curl -LsSf https://raw.githubusercontent.com/AssemblyAI/cli/main/install.sh | bash -s -- --install-method git
+```
 
 ### Homebrew (macOS / Linux)
 
@@ -266,7 +280,8 @@ Only the live-audio commands need anything extra: `stream`, `dictate`, and `agen
 microphone capture and [`ffmpeg`](https://ffmpeg.org) on `PATH` to stream non-WAV audio; `assembly share`
 uses [`cloudflared`](https://github.com/cloudflare/cloudflared) for its public tunnel.
 Plain `transcribe` uploads your file directly and needs none of them. The
-[`install.sh`](install.sh) script checks for these and prints the right install command when any are missing.
+[`install.sh`](install.sh) script checks for these and installs them via Homebrew when `brew` is
+available, otherwise printing the right install command for your platform.
 
 - Debian/Ubuntu: `sudo apt-get install libportaudio2 ffmpeg`
 - Fedora: `sudo dnf install portaudio ffmpeg`
diff --git a/REFERENCE.md b/REFERENCE.md
index bf9f3d8e..09216ef4 100644
--- a/REFERENCE.md
+++ b/REFERENCE.md
@@ -94,7 +94,7 @@ each carrying a `"type"` field to dispatch on:
 | ------- | ----------- |
 | `assembly stream --json` | `begin`, `turn`, `termination` (with `--from-stdin`, a `source` event precedes each file's events) |
 | `assembly agent --json` | `session.ready`, `transcript.user.delta`, `transcript.user`, `reply.started`, `transcript.agent`, `reply.done` |
-| `assembly agent-cascade --json` | `session.ready`, `transcript.user.delta`, `transcript.user`, `reply.started`, `transcript.agent`, `reply.done` |
+| `assembly live --json` | `session.ready`, `transcript.user.delta`, `transcript.user`, `reply.started`, `transcript.agent`, `reply.done` |
 | `assembly dictate --json` | `utterance` |
 | `assembly llm --follow --json` | `answer` |
 | `assembly transcribe <batch> --json` | `result` (one per source), then `reduce` if `--llm-reduce` is set |
diff --git a/aai_cli/agent_cascade/brain.py b/aai_cli/agent_cascade/brain.py
new file mode 100644
index 00000000..966e3e68
--- /dev/null
+++ b/aai_cli/agent_cascade/brain.py
@@ -0,0 +1,189 @@
+"""Deepagents-powered reply brain for the live voice cascade.
+
+`assembly live` answers each spoken turn with a deepagents graph instead of a single
+LLM completion, so the agent can transparently reach for tools — web search, URL
+fetch, the AssemblyAI docs — mid-conversation, mimicking a live multimodal assistant
+(the "talk to Gemini Live" experience). The graph is built once per session
+(:func:`build_graph`) and invoked statelessly per turn with the running history the
+cascade already keeps (:func:`build_completer`); tools are read-only and auto-approved,
+because a spoken turn can't pause for a keyboard confirmation, and the system prompt
+keeps every reply short and speakable.
+
+The graph is the only network seam: :func:`build_completer` accepts an injected graph,
+so the per-turn orchestration is unit-tested against a fake with no sockets — the same
+seam the rest of the cascade uses for its STT/LLM/TTS legs.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Callable, Sequence
+from typing import TYPE_CHECKING
+
+from aai_cli.agent_cascade.config import CascadeConfig
+from aai_cli.code_agent.agent import CompiledAgent
+from aai_cli.code_agent.fetch_tool import FETCH_TOOL_NAME
+from aai_cli.code_agent.web_search import WEB_SEARCH_TOOL_NAME
+
+if TYPE_CHECKING:
+    from langchain_core.tools import BaseTool
+    from openai.types.chat import ChatCompletionMessageParam
+
+# Closes every guidance variant: the reply is spoken, so it must stay short and plain.
+_SPOKEN_TAIL = (
+    "Your reply is read aloud, so keep it short and spoken — no markdown, lists, code, or raw URLs."
+)
+
+# When the session has *no* tools wired (e.g. no web search and the docs host is
+# unreachable), the model must answer from its own knowledge — and crucially must not
+# promise an action it can't take. Without this, telling it "you can search the web" while
+# no search tool is bound makes it narrate "I'll search for that…" and then stop, so the
+# answer never comes (the tool it announced was never actually available to call).
+_NO_TOOLS_GUIDANCE = (
+    "You have no external tools available, so answer from your own knowledge. Never say "
+    "you will search the web, look something up, or fetch a page — you can't do any of "
+    "that, so don't promise it; if a question needs information you don't have, say so "
+    f"briefly instead. {_SPOKEN_TAIL}"
+)
+
+
+def _join_clause(parts: list[str]) -> str:
+    """Join capability phrases into a readable clause: ``a``, ``a and b``, ``a, b, and c``."""
+    *initial, last = parts
+    if not initial:
+        return last
+    # Oxford comma only once there are three-or-more items (two or more lead the last).
+    joiner = ", and " if initial[1:] else " and "
+    return f"{', '.join(initial)}{joiner}{last}"
+
+
+def _tool_capabilities(tools: Sequence[BaseTool]) -> list[str]:
+    """The spoken-capability phrases backed by an actually-present tool.
+
+    Derived from the resolved tool names so the prompt never advertises a capability the
+    agent can't perform: web search is present only with a ``TAVILY_API_KEY``, and the docs
+    tools are best-effort (absent when the docs host is unreachable).
+    """
+    names = {tool.name for tool in tools}
+    capabilities: list[str] = []
+    if WEB_SEARCH_TOOL_NAME in names:
+        capabilities.append("search the web for current or unfamiliar facts")
+    if FETCH_TOOL_NAME in names:
+        capabilities.append("fetch a specific URL")
+    if names - {WEB_SEARCH_TOOL_NAME, FETCH_TOOL_NAME}:
+        capabilities.append("look up the AssemblyAI documentation")
+    return capabilities
+
+
+def build_system_prompt(persona: str, *, tools: Sequence[BaseTool]) -> str:
+    """The live agent's system prompt: the user's persona plus tool guidance.
+
+    The guidance is tailored to ``tools`` so the model is only told about capabilities it
+    actually has — advertising a missing tool (web search without a ``TAVILY_API_KEY``) made
+    the agent announce an action it then couldn't take, leaving the turn hanging with no
+    answer. With no tools at all the model is told to answer from its own knowledge.
+    """
+    capabilities = _tool_capabilities(tools)
+    if not capabilities:
+        return f"{persona}\n\n{_NO_TOOLS_GUIDANCE}"
+    guidance = (
+        f"You can use tools to help answer: {_join_clause(capabilities)}. Reach for a "
+        "tool when a question needs fresh or external information; answer directly and "
+        "instantly when you already know. Only offer to do what these tools allow — don't "
+        f"say you'll search the web or look something up unless it's listed here. {_SPOKEN_TAIL}"
+    )
+    return f"{persona}\n\n{guidance}"
+
+
+def build_live_tools() -> list[BaseTool]:
+    """The live agent's read-only toolset: URL fetch, web search (if keyed), and docs.
+
+    All three are reused from the coding agent's tool modules. Unlike there they are
+    *not* approval-gated — a spoken turn can't wait for a keyboard confirmation, so the
+    live agent only gets read-only tools and runs them automatically. Web search is
+    present only when ``TAVILY_API_KEY`` is set; the docs MCP is best-effort (an empty
+    list when the host is unreachable), so neither blocks a session.
+    """
+    from aai_cli.code_agent.docs_mcp import load_docs_tools
+    from aai_cli.code_agent.fetch_tool import build_fetch_tool
+    from aai_cli.code_agent.web_search import build_web_search_tool
+
+    tools: list[BaseTool] = [build_fetch_tool()]
+    search = build_web_search_tool()
+    if search is not None:
+        tools.append(search)
+    tools.extend(load_docs_tools())
+    return tools
+
+
+def build_graph(
+    api_key: str, config: CascadeConfig, *, tools: Sequence[BaseTool] | None = None
+) -> CompiledAgent:
+    """Compile the deepagents graph for one live session over the gateway model.
+
+    Reuses the coding agent's gateway-bound ``ChatOpenAI`` (so the live agent can only
+    ever reach AssemblyAI), threading the cascade's ``--max-tokens``/``--llm-config``
+    through it. ``tools`` defaults to :func:`build_live_tools`; tests pass an explicit
+    (possibly empty) list to skip the network-touching docs probe.
+    """
+    from deepagents import create_deep_agent
+
+    from aai_cli.code_agent.model import build_model
+
+    model = build_model(
+        api_key, model=config.model, max_tokens=config.max_tokens, extra=config.llm_extra
+    )
+    resolved = build_live_tools() if tools is None else list(tools)
+    return create_deep_agent(
+        model=model,
+        tools=resolved,
+        system_prompt=build_system_prompt(config.system_prompt, tools=resolved),
+    )
+
+
+def build_completer(
+    api_key: str, config: CascadeConfig, *, graph: CompiledAgent | None = None
+) -> Callable[[list[ChatCompletionMessageParam]], str]:
+    """A ``complete_reply`` for the cascade engine backed by the deepagents graph.
+
+    The cascade prepends its own ``system`` message to the history each turn; the graph
+    already owns the system prompt, so we drop it before invoking. The graph runs the
+    full tool loop and we return its final spoken text. ``graph`` is injected in tests
+    so the per-turn wiring runs against a fake with no network.
+    """
+    resolved = build_graph(api_key, config) if graph is None else graph
+
+    def complete_reply(messages: list[ChatCompletionMessageParam]) -> str:
+        conversation = [message for message in messages if message.get("role") != "system"]
+        return _reply_text(resolved.invoke({"messages": conversation}))
+
+    return complete_reply
+
+
+def _reply_text(result: dict[str, object]) -> str:
+    """The agent's final spoken reply: the last assistant message that carries text.
+
+    A tool-using turn ends in an ``AIMessage`` whose ``content`` is the spoken answer,
+    but earlier ``AIMessage``\\s in the same turn (the tool-call requests) have empty
+    text — so we scan from the end for the last one with non-empty content.
+    """
+    messages = result.get("messages")
+    if not isinstance(messages, list):
+        return ""
+    for message in reversed(messages):
+        if type(message).__name__ != "AIMessage":
+            continue
+        text = _content_text(getattr(message, "content", "")).strip()
+        if text:
+            return text
+    return ""
+
+
+def _content_text(content: object) -> str:
+    """Coerce a message's content (a string, or a list of content blocks) to plain text."""
+    if isinstance(content, str):
+        return content
+    if isinstance(content, list):
+        return "".join(
+            block.get("text", "") if isinstance(block, dict) else str(block) for block in content
+        )
+    return str(content)
diff --git a/aai_cli/agent_cascade/engine.py b/aai_cli/agent_cascade/engine.py
index 9c400657..af52f15a 100644
--- a/aai_cli/agent_cascade/engine.py
+++ b/aai_cli/agent_cascade/engine.py
@@ -18,9 +18,10 @@
 from dataclasses import dataclass, field
 from typing import TYPE_CHECKING, Protocol
 
+from aai_cli.agent_cascade import brain
 from aai_cli.agent_cascade.config import CascadeConfig
 from aai_cli.agent_cascade.text import split_sentences, trim_history
-from aai_cli.core import client, llm
+from aai_cli.core import client
 from aai_cli.core.errors import CLIError
 from aai_cli.tts import session as tts_session
 from aai_cli.tts.session import SpeakConfig
@@ -121,15 +122,9 @@ def real(
         def run_stt(on_turn: Callable[[object], None]) -> None:
             client.stream_audio(api_key, audio, params=stt_params, on_turn=on_turn)
 
-        def complete_reply(messages: list[ChatCompletionMessageParam]) -> str:
-            response = llm.complete(
-                api_key,
-                model=config.model,
-                messages=messages,
-                max_tokens=config.max_tokens,
-                extra=dict(config.llm_extra) or None,
-            )
-            return llm.content_of(response)
+        # The LLM leg is a deepagents graph (web search / URL fetch / docs tools), not a
+        # single completion, so a spoken turn can transparently use tools.
+        complete_reply = brain.build_completer(api_key, config)
 
         def synthesize(text: str) -> bytes:
             spec = SpeakConfig(
diff --git a/aai_cli/code_agent/events.py b/aai_cli/code_agent/events.py
index 4ee9136f..ed480bbd 100644
--- a/aai_cli/code_agent/events.py
+++ b/aai_cli/code_agent/events.py
@@ -21,6 +21,19 @@ class AssistantText:
     text: str
 
 
+@dataclass(frozen=True)
+class AssistantDelta:
+    """One streamed token of the in-progress reply, shown live then superseded by AssistantText.
+
+    Emitted from langgraph's per-token ``messages`` stream so the front-end can render the
+    reply as it's generated; the authoritative full text still arrives as an AssistantText
+    when the step lands, so a consumer that ignores deltas (the headless renderer) loses
+    nothing.
+    """
+
+    text: str
+
+
 @dataclass(frozen=True)
 class ToolCall:
     """The agent's request to run a tool (announced when not gated by approval)."""
@@ -44,7 +57,21 @@ class ErrorText:
     text: str
 
 
-Event = AssistantText | ToolCall | ToolResult | ErrorText
+Event = AssistantText | AssistantDelta | ToolCall | ToolResult | ErrorText
+
+
+def assistant_delta(payload: object) -> AssistantDelta | None:
+    """Extract a streaming assistant-text token from a ``messages``-mode stream payload.
+
+    langgraph's ``messages`` mode yields ``(message_chunk, metadata)``; we surface only the
+    AI message's text tokens (tool-call requests and tool results carry no prose, and other
+    message kinds aren't the assistant talking), so the live region streams just the reply.
+    """
+    chunk = payload[0] if isinstance(payload, tuple) and payload else payload
+    if type(chunk).__name__ not in ("AIMessage", "AIMessageChunk"):
+        return None
+    text = _text_of(getattr(chunk, "content", ""))
+    return AssistantDelta(text) if text else None
 
 
 def _text_of(content: object) -> str:
diff --git a/aai_cli/code_agent/messages.py b/aai_cli/code_agent/messages.py
new file mode 100644
index 00000000..8bb1ad2d
--- /dev/null
+++ b/aai_cli/code_agent/messages.py
@@ -0,0 +1,110 @@
+"""Mounted transcript widgets for the coding-agent TUI.
+
+The transcript is a ``VerticalScroll`` of these widgets rather than an append-only ``RichLog``,
+which buys two things deepagents-code has: the assistant reply updates *in place* as it streams
+(no separate live region), and a tool's output is a collapsible row — a clipped preview that
+expands to the full output on Ctrl+O or a click.
+
+Dynamic content (model/tool/user strings) is wrapped in ``rich.text.Text`` so it's shown
+literally — Text doesn't parse console markup, so a stray ``[`` can't raise or inject styling.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+
+from rich.markdown import Markdown
+from rich.text import Text
+from textual.widgets import Static
+
+from aai_cli.code_agent.summarize import summarize_call, summarize_result
+
+_DIM = "#8a8f98"  # muted gray for tool lines / notes
+_ERROR = "#f04438"
+
+
+class Note(Static):
+    """A dim one-line transcript aside (``cancelling…``, ``copied…``, ``voice off…``)."""
+
+    def __init__(self, text: str) -> None:
+        super().__init__(Text(text, style=_DIM))
+
+
+class UserMessage(Static):
+    """The echoed user prompt, with a top margin so each turn is visually separated."""
+
+    DEFAULT_CSS = "UserMessage { margin-top: 1; }"
+
+    def __init__(self, text: str) -> None:
+        super().__init__(Text(f"» {text}", style="bold #38bdf8"))
+
+
+class AssistantMessage(Static):
+    """The assistant's reply: streams plain text token-by-token, then renders as Markdown."""
+
+    def __init__(self) -> None:
+        super().__init__()
+        self._tokens: list[str] = []  # accumulate tokens, not str +=, to avoid quadratic growth
+
+    @property
+    def text(self) -> str:
+        """The reply text streamed so far (used to finalize a cancelled generation)."""
+        return "".join(self._tokens)
+
+    def stream(self, delta: str) -> None:
+        """Append a streamed token and repaint as plain text (cheap; no per-token markdown)."""
+        self._tokens.append(delta)
+        self.update(Text(self.text))
+
+    def finalize(self, text: str) -> None:
+        """Replace the streamed text with the authoritative reply, rendered as Markdown."""
+        self._tokens = [text]
+        self.update(Markdown(text))
+
+
+class ToolCallLine(Static):
+    """A compact tool-call line, e.g. ``→ write_file(app.py)``."""
+
+    def __init__(self, name: str, args: Mapping[str, object]) -> None:
+        super().__init__(Text(f"→ {summarize_call(name, args)}", style=_DIM))
+
+
+class ErrorMessage(Static):
+    """A failed turn, shown instead of crashing the UI."""
+
+    def __init__(self, text: str) -> None:
+        super().__init__(Text(f"✗ {text}", style=_ERROR))
+
+
+class ToolOutput(Static):
+    """A tool's output: a clipped preview that expands to the full content (Ctrl+O / click)."""
+
+    def __init__(self, name: str, content: str) -> None:
+        super().__init__()
+        self._name = name
+        self._full = content.strip()
+        self._preview = summarize_result(content)
+        self._expandable = self._preview != self._full  # nothing to expand when it fits already
+        self._expanded = False
+
+    def on_mount(self) -> None:
+        self._repaint()
+
+    def on_click(self) -> None:
+        self.toggle()
+
+    def toggle(self) -> None:
+        """Flip between the clipped preview and the full output (no-op when it all fits)."""
+        if not self._expandable:
+            return
+        self._expanded = not self._expanded
+        self._repaint()
+
+    def _repaint(self) -> None:
+        body = self._full if self._expanded else self._preview
+        line = Text(f"  {self._name}: ", style=_DIM)
+        line.append(body, style=_DIM)
+        if self._expandable:
+            hint = " (Ctrl+O to collapse)" if self._expanded else " (Ctrl+O to expand)"
+            line.append(hint, style=f"{_DIM} italic")
+        self.update(line)
diff --git a/aai_cli/code_agent/modals.py b/aai_cli/code_agent/modals.py
new file mode 100644
index 00000000..25c54a7c
--- /dev/null
+++ b/aai_cli/code_agent/modals.py
@@ -0,0 +1,202 @@
+"""Bottom-docked modal screens for the coding-agent TUI: tool approval and agent questions.
+
+Split out of `tui.py` to keep each module under the file-length gate. Both are transparent
+``ModalScreen``s docked at the bottom, so the transcript stays visible above them (see the
+``ModalScreen { background: transparent }`` rule in :class:`~aai_cli.code_agent.tui.CodeAgentApp`).
+
+In voice mode each modal is also **spoken and voice-answerable**: when constructed with a
+``voice`` IO it speaks the prompt and listens for a spoken reply (approve / auto / reject, or a
+free-text answer), off the UI thread. The keyboard path always stays available as a fallback.
+"""
+
+from __future__ import annotations
+
+import re
+import threading
+from typing import TYPE_CHECKING, ClassVar
+
+from rich.markup import escape
+from textual.app import ComposeResult
+from textual.containers import Vertical
+from textual.screen import ModalScreen
+from textual.widgets import Input, Label
+
+from aai_cli.code_agent import banner, risk
+from aai_cli.code_agent.summarize import describe_args, full_args
+from aai_cli.core import errors
+
+if TYPE_CHECKING:
+    from collections.abc import Callable, Mapping
+
+    from aai_cli.code_agent.voice_ui import _VoiceIO
+
+
+def _spawn(target: Callable[[], None]) -> None:
+    """Run ``target`` on a daemon thread — the voice legs block, so they stay off the UI thread."""
+    threading.Thread(target=target, daemon=True).start()  # pragma: no mutate
+
+
+# Spoken-answer vocabulary. "auto" wins first (it implies approval); an unclear answer falls
+# back to "reject" — the same safe default as the keyboard, so a tool never runs on a guess.
+_REJECT_WORDS = frozenset({"no", "reject", "deny", "stop", "cancel", "nope", "nah"})
+_APPROVE_WORDS = frozenset({"yes", "approve", "yeah", "yep", "yup", "sure", "ok", "okay"})
+
+
+def approval_from_speech(text: str) -> str:
+    """Map a spoken reply to ``"approve"`` / ``"auto"`` / ``"reject"`` (unclear → reject)."""
+    lowered = text.lower()
+    words = set(re.findall(r"[a-z]+", lowered))
+    if "auto" in lowered or "always" in lowered:
+        return "auto"
+    if words & _REJECT_WORDS or "don't" in lowered or "do not" in lowered:
+        return "reject"
+    if words & _APPROVE_WORDS or "go ahead" in lowered or "do it" in lowered:
+        return "approve"
+    return "reject"
+
+
+class ApprovalScreen(ModalScreen[str]):
+    """A compact, bottom-docked prompt to approve/auto-approve/reject one tool call.
+
+    Keyboard ``y / a / n`` (and ``e`` to expand the args); in voice mode it also speaks the
+    prompt and accepts a spoken approve/auto/reject. The transparent background leaves the
+    transcript visible, and a risky call (``rm -rf``, an internal fetch) carries a warning.
+    """
+
+    DEFAULT_CSS = """
+    ApprovalScreen { align: center bottom; background: transparent; }
+    ApprovalScreen #approvalbox {
+        dock: bottom; width: 1fr; height: auto;
+        border: round #f59e0b; background: #000000; padding: 0 1; margin: 0 1 1 1;
+    }
+    ApprovalScreen #approvalbox Label { height: auto; }
+    """
+    BINDINGS: ClassVar = [
+        ("y", "approve", "Approve"),
+        ("a", "auto", "Auto-approve"),
+        ("n", "reject", "Reject"),
+        ("e", "expand", "Expand"),
+    ]
+
+    def __init__(
+        self, name: str, args: Mapping[str, object], *, voice: _VoiceIO | None = None
+    ) -> None:
+        super().__init__()
+        self._tool_name = name  # not _name: that shadows Textual Widget's str|None attr
+        self._args = args
+        self._expanded = False  # toggled by `e`; collapsed (one-line) by default
+        self._voice = voice  # when set, the prompt is spoken and a spoken answer is accepted
+        self._answered = False  # guards against a voice answer and a keypress both dismissing
+
+    def compose(self) -> ComposeResult:
+        with Vertical(id="approvalbox"):
+            warning = risk.risk_warning(self._tool_name, self._args)
+            if warning:
+                yield Label(f"[b #f04438]⚠ {escape(warning)}[/]", id="approvalwarn")
+            yield Label(self._detail_markup(), id="approvaldetail")
+            yield Label(
+                f"[b #22c55e]y[/] approve   [b {banner.BRAND_HEX}]a[/] auto-approve   "
+                "[b #f04438]n[/] reject   [b]e[/] expand"
+            )
+
+    def on_mount(self) -> None:
+        if (voice := self._voice) is not None:  # drive the decision by voice, off the UI thread
+            _spawn(lambda: self._drive_by_voice(voice))
+
+    def _drive_by_voice(self, voice: _VoiceIO) -> None:
+        """Speak the prompt and accept a spoken approve/auto/reject (keyboard still works)."""
+        try:
+            voice.speak(self._spoken_prompt())
+            transcript = voice.listen()
+        except errors.CLIError:
+            return  # mic/STT failed: leave the keyboard hint as the way to answer
+        if transcript:  # silence (None) must not auto-reject a tool — wait for speech or a key
+            self.app.call_from_thread(self._decide, approval_from_speech(transcript))
+
+    def _spoken_prompt(self) -> str:
+        """The read-aloud version of the prompt: the tool, its arg, any warning, the options."""
+        parts = [f"Run {self._tool_name}."]
+        detail = describe_args(self._args)
+        if detail:
+            parts.append(f"{detail}.")
+        warning = risk.risk_warning(self._tool_name, self._args)
+        if warning:
+            parts.append(f"Warning: {warning}")
+        parts.append("Say approve, auto-approve, or reject.")
+        return " ".join(parts)
+
+    def _decide(self, decision: str) -> None:
+        """Dismiss once, whether the answer came by spoken reply or keypress."""
+        if self._answered:
+            return
+        self._answered = True
+        self.dismiss(decision)
+
+    def _detail_markup(self) -> str:
+        """The 'Run tool X?' line — the compact arg, or the full args when expanded."""
+        args = full_args(self._args) if self._expanded else describe_args(self._args)
+        return f"Run tool [b]{escape(self._tool_name)}[/b]?  [dim]{escape(args)}[/dim]"
+
+    def action_expand(self) -> None:
+        """Toggle between the compact identifying arg and the full args (``e``)."""
+        self._expanded = not self._expanded
+        self.query_one("#approvaldetail", Label).update(self._detail_markup())
+
+    def action_approve(self) -> None:
+        self._decide("approve")
+
+    def action_auto(self) -> None:
+        self._decide("auto")
+
+    def action_reject(self) -> None:
+        self._decide("reject")
+
+
+class AskScreen(ModalScreen[str]):
+    """A bottom-docked prompt that relays a question from the agent and returns the answer.
+
+    In voice mode it speaks the question and takes a spoken answer; otherwise the user types.
+    """
+
+    DEFAULT_CSS = """
+    AskScreen { align: center bottom; background: transparent; }
+    AskScreen #askbox {
+        dock: bottom; width: 1fr; height: auto;
+        border: round #3a3f55; background: #000000; padding: 0 1; margin: 0 1 1 1;
+    }
+    """
+
+    def __init__(self, question: str, *, voice: _VoiceIO | None = None) -> None:
+        super().__init__()
+        self._question = question
+        self._voice = voice
+        self._answered = False
+
+    def compose(self) -> ComposeResult:
+        with Vertical(id="askbox"):
+            yield Label(f"[b]The agent asks:[/b] {escape(self._question)}")
+            yield Input(id="answer", placeholder="Type your answer and press Enter…")
+
+    def on_mount(self) -> None:
+        if (voice := self._voice) is not None:
+            _spawn(lambda: self._drive_by_voice(voice))
+
+    def _drive_by_voice(self, voice: _VoiceIO) -> None:
+        """Speak the question and submit a spoken answer (typing still works)."""
+        try:
+            voice.speak(f"The agent asks: {self._question}")
+            transcript = voice.listen()
+        except errors.CLIError:
+            return
+        if transcript:
+            self.app.call_from_thread(self._answer, transcript)
+
+    def _answer(self, text: str) -> None:
+        """Dismiss once with the answer, whether spoken or typed."""
+        if self._answered:
+            return
+        self._answered = True
+        self.dismiss(text)
+
+    def on_input_submitted(self, event: Input.Submitted) -> None:
+        self._answer(event.value)
diff --git a/aai_cli/code_agent/model.py b/aai_cli/code_agent/model.py
index bdb6a4a2..716af2fc 100644
--- a/aai_cli/code_agent/model.py
+++ b/aai_cli/code_agent/model.py
@@ -8,6 +8,7 @@
 
 from __future__ import annotations
 
+from collections.abc import Mapping
 from typing import TYPE_CHECKING
 
 from aai_cli.core import environments
@@ -37,7 +38,13 @@ def _flatten_content(messages: object) -> None:
             )
 
 
-def build_model(api_key: str, *, model: str) -> BaseChatModel:
+def build_model(
+    api_key: str,
+    *,
+    model: str,
+    max_tokens: int | None = None,
+    extra: Mapping[str, object] | None = None,
+) -> BaseChatModel:
     """A ChatOpenAI bound to the active environment's LLM Gateway.
 
     ``use_responses_api=False`` keeps it on the chat-completions endpoint the gateway
@@ -45,6 +52,12 @@ def build_model(api_key: str, *, model: str) -> BaseChatModel:
     Responses API that langchain would otherwise prefer for ``openai:`` models. The
     subclass also flattens content-parts arrays the gateway rejects (see
     :func:`_flatten_content`).
+
+    ``max_tokens`` caps the per-reply length (the live voice agent passes a small cap to
+    keep spoken replies short and fast); ``extra`` passes any additional gateway request
+    fields through as ``extra_body`` (so they reach the request body verbatim, like
+    `aai_cli.core.llm`'s ``extra``). Both default to off so the coding agent's call is
+    unchanged.
     """
     from langchain_openai import ChatOpenAI
     from pydantic import SecretStr
@@ -64,4 +77,6 @@ def _get_request_payload(
         base_url=environments.active().llm_gateway_base,
         api_key=SecretStr(api_key),
         use_responses_api=False,
+        max_tokens=max_tokens,
+        extra_body=dict(extra) if extra else None,
     )
diff --git a/aai_cli/code_agent/render.py b/aai_cli/code_agent/render.py
index 499cd4e9..e0e7d639 100644
--- a/aai_cli/code_agent/render.py
+++ b/aai_cli/code_agent/render.py
@@ -9,20 +9,14 @@
 
 from collections.abc import Callable
 
+from rich.markdown import Markdown
 from rich.markup import escape
 
 from aai_cli.code_agent.events import AssistantText, ErrorText, Event, ToolCall, ToolResult
 from aai_cli.code_agent.session import Approver
+from aai_cli.code_agent.summarize import summarize_call, summarize_result
 from aai_cli.ui import output
 
-# Tool output can be long; clip it for the inline transcript.
-_RESULT_PREVIEW = 2000
-
-
-def _format_args(args: dict[str, object]) -> str:
-    """A compact one-line view of a tool call's arguments."""
-    return ", ".join(f"{key}={value!r}" for key, value in args.items())
-
 
 class RichRenderer:
     """An :data:`~aai_cli.code_agent.session.EventSink` that prints to the Rich console."""
@@ -31,13 +25,16 @@ def __call__(self, event: Event) -> None:
         # escape() dynamic content so a model/tool string with "[" can't inject Rich
         # markup or raise MarkupError (matches the inline-escape convention in output.py).
         if isinstance(event, AssistantText):
-            output.console.print(escape(event.text))
+            # Render as Markdown so fenced code blocks are syntax-highlighted (and lists/
+            # headings format) instead of showing raw ``` markers — Markdown parses its own
+            # syntax, not console markup, so no escape()/injection concern.
+            output.console.print(Markdown(event.text))
         elif isinstance(event, ToolCall):
             output.console.print(
-                f"[aai.muted]→ {escape(event.name)}({escape(_format_args(event.args))})[/aai.muted]"
+                f"[aai.muted]→ {escape(summarize_call(event.name, event.args))}[/aai.muted]"
             )
         elif isinstance(event, ToolResult):
-            preview = escape(event.content.strip()[:_RESULT_PREVIEW])
+            preview = escape(summarize_result(event.content))
             output.console.print(f"[aai.muted]  {escape(event.name)}: {preview}[/aai.muted]")
         elif isinstance(event, ErrorText):
             output.error_console.print(output.fail(escape(event.text)))
diff --git a/aai_cli/code_agent/risk.py b/aai_cli/code_agent/risk.py
new file mode 100644
index 00000000..6c7b7e8e
--- /dev/null
+++ b/aai_cli/code_agent/risk.py
@@ -0,0 +1,68 @@
+"""Heuristic risk flags for tool calls, surfaced on the approval prompt.
+
+The approval modal already shows *what* a tool will do; for the genuinely dangerous calls it
+also shows *why to look twice* — a one-line warning, the way deepagents-code badges suspicious
+shell commands and URLs. Purely advisory (the real SSRF guard lives in ``fetch_tool``); this
+only nudges the human reviewing a manual approval. Pure functions so they unit-test cleanly.
+"""
+
+from __future__ import annotations
+
+import re
+from collections.abc import Mapping
+
+from aai_cli.code_agent.fetch_tool import FETCH_TOOL_NAME
+
+# Shell fragments that can destroy data, escalate privileges, or pipe a remote script straight
+# into a shell — the classic "are you sure?" cases. Word-ish boundaries avoid matching inside
+# innocuous longer tokens (e.g. ``format`` should not trip ``mkfs``).
+_DANGEROUS_SHELL = (
+    (re.compile(r"\brm\s+(-\w*\s+)*-\w*[rf]", re.I), "deletes files recursively/forcibly"),
+    (re.compile(r"\bsudo\b", re.I), "runs with elevated privileges"),
+    (re.compile(r"\bmkfs\b|\bdd\s+if=", re.I), "can overwrite a disk or filesystem"),
+    (re.compile(r":\s*\(\)\s*\{.*\|.*&\s*\}\s*;"), "looks like a fork bomb"),
+    (
+        re.compile(r"\b(curl|wget)\b[^|]*\|\s*(sudo\s+)?(ba)?sh\b", re.I),
+        "pipes a download into a shell",
+    ),
+    (re.compile(r">\s*/dev/(sd|disk|nvme)", re.I), "writes directly to a block device"),
+)
+# URL hosts that mean a fetch is reaching a local/internal target rather than the public web.
+_LOCAL_HOST = re.compile(
+    r"^(localhost|127\.|0\.0\.0\.0|10\.|192\.168\.|169\.254\.|172\.(1[6-9]|2\d|3[01])\.|\[?::1\]?)",
+    re.I,
+)
+
+
+def _shell_warning(command: str) -> str | None:
+    for pattern, reason in _DANGEROUS_SHELL:
+        if pattern.search(command):
+            return f"This command {reason}."
+    return None
+
+
+def _url_warning(url: str) -> str | None:
+    stripped = url.strip()
+    if stripped.lower().startswith("file:"):
+        return "This URL reads a local file (file://)."
+    host = re.sub(r"^[a-z]+://", "", stripped, flags=re.I)
+    if _LOCAL_HOST.match(host):
+        return "This URL targets a local/internal address."
+    return None
+
+
+def risk_warning(name: str, args: Mapping[str, object]) -> str | None:
+    """A one-line caution for a risky tool call, or ``None`` when nothing stands out.
+
+    Flags destructive/privileged shell commands (``execute``) and fetches aimed at local or
+    ``file://`` targets; everything else returns ``None``.
+    """
+    if name == "execute":
+        command = args.get("command")
+        if isinstance(command, str):
+            return _shell_warning(command)
+    elif name == FETCH_TOOL_NAME:
+        url = args.get("url")
+        if isinstance(url, str):
+            return _url_warning(url)
+    return None
diff --git a/aai_cli/code_agent/session.py b/aai_cli/code_agent/session.py
index 64b6c904..b3ce738f 100644
--- a/aai_cli/code_agent/session.py
+++ b/aai_cli/code_agent/session.py
@@ -18,6 +18,7 @@
 from aai_cli.code_agent.events import (
     ErrorText,
     Event,
+    assistant_delta,
     interrupt_request,
     message_events,
     new_messages,
@@ -43,9 +44,18 @@ class _SupportsStream(Protocol):
     """
 
     def stream(
-        self, graph_input: object, config: Mapping[str, object] | None, *, stream_mode: str
-    ) -> Iterator[dict[str, object]]:
-        """Yield the running state (incl. the growing ``messages``) after each super-step."""
+        self,
+        graph_input: object,
+        config: Mapping[str, object] | None,
+        *,
+        stream_mode: list[str],
+    ) -> Iterator[tuple[str, object]]:
+        """Yield ``(mode, payload)`` pairs — ``"values"`` state snapshots and ``"messages"`` deltas.
+
+        With a *list* ``stream_mode`` langgraph tags each yield with its mode, so the caller
+        can render off the per-super-step ``"values"`` state while still seeing the frequent
+        per-token ``"messages"`` deltas (used only as a fine-grained cancellation checkpoint).
+        """
 
 
 @dataclass
@@ -97,17 +107,28 @@ def send(self, text: str) -> None:
     def _run(self, graph_input: object, config: dict[str, object]) -> dict[str, object]:
         """Drive one graph segment, emitting events as each step completes; return the end state.
 
-        Streaming (``stream_mode="values"``) renders intermediate tool calls/results live and
-        lets :meth:`request_cancel` break the loop between steps. A double that only implements
-        ``invoke`` (the TUI/REPL test fakes) emits once at the end instead.
+        We render the finished messages from the per-super-step ``"values"`` snapshots, and
+        stream the ``"messages"`` (per-token) deltas alongside them for two reasons: a live
+        front-end shows the reply as it's generated (emitted as ``AssistantDelta``), and the
+        frequent deltas give :meth:`request_cancel` a checkpoint *within* a long step — a
+        single model generation is one super-step, so a values-only loop couldn't break until
+        the whole reply landed. A double that only implements ``invoke`` (the TUI/REPL test
+        fakes) emits once at the end instead.
         """
         if isinstance(self.agent, _SupportsStream):
             last: dict[str, object] = {}
-            for chunk in self.agent.stream(graph_input, config, stream_mode="values"):
+            for mode, payload in self.agent.stream(
+                graph_input, config, stream_mode=["values", "messages"]
+            ):
                 if self._cancel.is_set():
                     break
-                self._emit_new(chunk)
-                last = chunk
+                if mode == "values" and isinstance(payload, dict):
+                    self._emit_new(payload)
+                    last = payload
+                elif mode == "messages":
+                    delta = assistant_delta(payload)
+                    if delta is not None:
+                        self.sink(delta)
             return last
         result = self.agent.invoke(graph_input, config)
         self._emit_new(result)
diff --git a/aai_cli/code_agent/summarize.py b/aai_cli/code_agent/summarize.py
new file mode 100644
index 00000000..ecb4a0c7
--- /dev/null
+++ b/aai_cli/code_agent/summarize.py
@@ -0,0 +1,96 @@
+"""Compact one-line summaries of tool activity, shared by both front-ends.
+
+A coding agent's tool args and output are routinely whole files or long command output.
+Dumping them verbatim into the transcript buries the conversation — and, because args go
+through ``repr``, renders literal ``\\n`` escapes. Both the Textual TUI (`tui.py`) and the
+Rich fallback (`render.py`) route tool calls/results through these helpers so the
+transcript stays scannable, mirroring how deepagents-code's collapsible tool rows show
+just the identifying arg (a filename / command) and a short output preview with a
+"+N more lines" tail rather than the full payload.
+"""
+
+from __future__ import annotations
+
+from collections.abc import Mapping
+
+# Output preview budget (deepagents-code previews tool output at 4 lines / 300 chars behind
+# an expand toggle; our append-only log has no expander, so we clip and tag the remainder).
+_PREVIEW_LINES = 4
+_PREVIEW_CHARS = 300
+# Per-arg and arg-count caps so one giant value (a file's contents) can't flood the line.
+_MAX_ARG_VALUE = 60
+_MAX_ARGS = 3
+# Per-value cap for the *expanded* approval view: values shown whole (newlines kept) but bounded
+# so a multi-megabyte file can't make the modal unbounded.
+_EXPANDED_VALUE = 1000
+# Args that identify a call on their own — show only this and elide bulky siblings (content).
+_IDENTITY_ARGS = ("file_path", "path", "filename", "command", "url", "query", "pattern")
+
+
+def _one_line(value: object, *, limit: int) -> str:
+    """Collapse ``value`` to a single clipped line (newlines → spaces, ellipsis if long)."""
+    text = " ".join(str(value).split())
+    return text if len(text) <= limit else text[: limit - 1] + "…"
+
+
+def describe_args(args: Mapping[str, object]) -> str:
+    """The compact arg view shared by the transcript line and the approval prompt.
+
+    Prefers a single identifying arg (a path/command/URL) so a ``write_file`` reads as
+    ``app.py`` instead of inlining the file being written; otherwise shows up to a few
+    short ``key=value`` args, each clipped, with a trailing ``…`` when more were elided.
+    """
+    for key in _IDENTITY_ARGS:
+        if key in args:
+            return _one_line(args[key], limit=_MAX_ARG_VALUE)
+    shown = list(args.items())[:_MAX_ARGS]
+    body = ", ".join(f"{key}={_one_line(value, limit=_MAX_ARG_VALUE)}" for key, value in shown)
+    if len(args) > _MAX_ARGS:
+        body = f"{body}, …" if body else "…"
+    return body
+
+
+def summarize_call(name: str, args: Mapping[str, object]) -> str:
+    """A compact ``name(key arg)`` view of a tool call for the transcript."""
+    return f"{name}({describe_args(args)})"
+
+
+def full_args(args: Mapping[str, object]) -> str:
+    """The full ``key=value`` arg view shown when the approval prompt is expanded (``e``).
+
+    Values are shown whole (newlines preserved) but each is capped at ``_EXPANDED_VALUE`` so a
+    huge file can't make the modal unbounded; :func:`describe_args` is the collapsed view.
+    """
+    lines = []
+    for key, value in args.items():
+        text = str(value)
+        if len(text) > _EXPANDED_VALUE:
+            text = (
+                f"{text[:_EXPANDED_VALUE].rstrip()} … (+{len(text) - _EXPANDED_VALUE} more chars)"
+            )
+        lines.append(f"{key}={text}")
+    return "\n".join(lines)
+
+
+def summarize_result(content: str) -> str:
+    """A short preview of tool output: the first few lines, clipped, with a hidden-count tail.
+
+    Returns at most ``_PREVIEW_LINES`` lines and ``_PREVIEW_CHARS`` characters; when the
+    output was longer, appends ``… (+N more lines)`` (or ``… (+N more chars)`` when a single
+    long line was clipped) so the elision is visible rather than silent.
+    """
+    text = content.strip()
+    if not text:
+        return ""
+    lines = text.splitlines()
+    preview_lines = lines[:_PREVIEW_LINES]
+    preview = "\n".join(preview_lines)
+    hidden_lines = len(lines) - len(preview_lines)
+    if len(preview) > _PREVIEW_CHARS:
+        kept = preview[:_PREVIEW_CHARS].rstrip()
+        hidden_chars = len(preview) - len(kept)
+        tail = f"+{hidden_lines} more lines" if hidden_lines else f"+{hidden_chars} more chars"
+        return f"{kept} … ({tail})"
+    if hidden_lines > 0:
+        return f"{preview} … (+{hidden_lines} more lines)"
+    return preview
diff --git a/aai_cli/code_agent/tui.py b/aai_cli/code_agent/tui.py
index 264d64c9..cc5a4010 100644
--- a/aai_cli/code_agent/tui.py
+++ b/aai_cli/code_agent/tui.py
@@ -13,169 +13,77 @@
 import threading
 import time
 from pathlib import Path
-from typing import TYPE_CHECKING, ClassVar, Protocol
+from typing import TYPE_CHECKING, ClassVar
 
 from rich.markup import escape
-from textual.app import App, ComposeResult
-from textual.containers import Horizontal, Vertical
+from textual.app import ComposeResult
+from textual.containers import Horizontal, VerticalScroll
 from textual.screen import ModalScreen
-from textual.widgets import Input, Label, RichLog, Static
+from textual.widgets import Input, Static
 from textual.worker import Worker
 
 from aai_cli.code_agent import banner
 from aai_cli.code_agent.agent import CompiledAgent
 from aai_cli.code_agent.ask_tool import AskBridge
-from aai_cli.code_agent.events import AssistantText, ErrorText, Event, ToolCall, ToolResult
+from aai_cli.code_agent.events import (
+    AssistantDelta,
+    AssistantText,
+    ErrorText,
+    Event,
+    ToolCall,
+    ToolResult,
+)
+from aai_cli.code_agent.messages import (
+    AssistantMessage,
+    ErrorMessage,
+    Note,
+    ToolCallLine,
+    ToolOutput,
+    UserMessage,
+)
+from aai_cli.code_agent.modals import ApprovalScreen, AskScreen
 from aai_cli.code_agent.session import CodeSession
-from aai_cli.code_agent.voice import spoken_summary
-from aai_cli.core import errors
+from aai_cli.code_agent.tui_status import _spinner_text, _status_text
+from aai_cli.code_agent.voice_ui import _VoiceIO, _VoiceLegs
 
 if TYPE_CHECKING:
-    from collections.abc import Callable, Mapping
-
     from textual.timer import Timer
 
 # Glyphs cycled by the working indicator's animation (purely cosmetic).
 _SPIN_FRAMES = "✶✷✸✹✺"  # pragma: no mutate
 # Seconds the Ctrl-C "press again to quit" hint stays armed (deepagents-code uses 3s too).
 _QUIT_HINT_SECONDS = 3  # pragma: no mutate
+# Animated meter for the voice bar — a 3-cell block-char pulse (BMP, single-width, no emoji).
+_VOICE_FRAMES = ("▁▃▅", "▃▅▇", "▅▇▆", "▆▇▅", "▇▅▃", "▅▃▁")  # pragma: no mutate
+# The three voice phases the bar distinguishes, each (label, accent color).
+_VOICE_PHASES: dict[str, tuple[str, str]] = {
+    "listening": ("Listening — speak your request", banner.BRAND_HEX),
+    "thinking": ("Thinking…", "#f59e0b"),
+    "speaking": ("Speaking…", "#22c55e"),
+}
 
 
-class _VoiceIO(Protocol):
-    """The speak-to-it / read-back slice the TUI drives; :class:`VoiceSession` satisfies it."""
-
-    def listen(self) -> str | None:
-        """Capture one spoken turn and return its transcript (``None`` on no speech)."""
-
-    def speak(self, text: str) -> None:
-        """Read ``text`` back aloud (a no-op when readback is unavailable)."""
-
-
-def _format_args(args: Mapping[str, object]) -> str:
-    return ", ".join(f"{key}={value!r}" for key, value in args.items())
-
-
-def _spinner_text(elapsed_s: int, frame: str) -> str:
-    """The working-indicator line: a spinner glyph and the elapsed seconds."""
-    return f"{frame} Working… ({elapsed_s}s)"
-
-
-def _abbrev_home(path: Path) -> str:
-    """Render ``path`` with the home directory collapsed to ``~``."""
-    try:
-        return f"~/{path.relative_to(Path.home())}"
-    except ValueError:
-        return str(path)
-
-
-def _git_branch(start: Path) -> str | None:
-    """The current git branch for ``start`` (walking up to the repo root), or None."""
-    for directory in (start, *start.parents):
-        head = directory / ".git" / "HEAD"
-        if head.is_file():
-            ref = head.read_text(encoding="utf-8").strip()
-            return ref.removeprefix("ref: refs/heads/") if ref.startswith("ref: ") else ref[:8]
-    return None
-
-
-def _status_text(cwd: Path, *, auto_approve: bool) -> str:
-    """The bottom status line: a mode badge, the working directory, and the git branch."""
-    mode = "auto" if auto_approve else "manual"
-    badge = f"[black on #f59e0b] {mode} [/]"
-    parts = [badge, f"[dim]{_abbrev_home(cwd)}[/dim]"]
-    branch = _git_branch(cwd)
-    if branch:
-        parts.append(f"[dim]↗ {branch}[/dim]")
-    return " ".join(parts)
-
-
-class ApprovalScreen(ModalScreen[str]):
-    """A compact, bottom-docked prompt to approve/auto-approve/reject one tool call.
-
-    Keyboard-only — a plain one-line ``y / a / n`` hint instead of clickable buttons, so it
-    reads like a CLI prompt rather than a chrome-heavy dialog. The transparent screen
-    background leaves the transcript visible above (no full-screen takeover); the decision is
-    one of ``"approve"``, ``"auto"``, or ``"reject"``.
-    """
-
-    DEFAULT_CSS = """
-    ApprovalScreen { align: center bottom; background: transparent; }
-    ApprovalScreen #approvalbox {
-        dock: bottom; width: 1fr; height: auto;
-        border: round #f59e0b; background: #000000; padding: 0 1; margin: 0 1 1 1;
-    }
-    ApprovalScreen #approvalbox Label { height: auto; }
-    """
-    BINDINGS: ClassVar = [
-        ("y", "approve", "Approve"),
-        ("a", "auto", "Auto-approve"),
-        ("n", "reject", "Reject"),
-    ]
-
-    def __init__(self, name: str, args: Mapping[str, object]) -> None:
-        super().__init__()
-        self._tool_name = name  # not _name: that shadows Textual Widget's str|None attr
-        self._args = args
-
-    def compose(self) -> ComposeResult:
-        with Vertical(id="approvalbox"):
-            yield Label(
-                f"Run tool [b]{escape(self._tool_name)}[/b]?  "
-                f"[dim]{escape(_format_args(self._args))}[/dim]"
-            )
-            yield Label(
-                f"[b #22c55e]y[/] approve   [b {banner.BRAND_HEX}]a[/] auto-approve   "
-                "[b #f04438]n[/] reject"
-            )
-
-    def action_approve(self) -> None:
-        self.dismiss("approve")
-
-    def action_auto(self) -> None:
-        self.dismiss("auto")
-
-    def action_reject(self) -> None:
-        self.dismiss("reject")
-
-
-class AskScreen(ModalScreen[str]):
-    """A bottom-docked prompt that relays a question from the agent and returns the answer."""
-
-    DEFAULT_CSS = """
-    AskScreen { align: center bottom; background: transparent; }
-    AskScreen #askbox {
-        dock: bottom; width: 1fr; height: auto;
-        border: round #3a3f55; background: #000000; padding: 0 1; margin: 0 1 1 1;
-    }
-    """
-
-    def __init__(self, question: str) -> None:
-        super().__init__()
-        self._question = question
-
-    def compose(self) -> ComposeResult:
-        with Vertical(id="askbox"):
-            yield Label(f"[b]The agent asks:[/b] {escape(self._question)}")
-            yield Input(id="answer", placeholder="Type your answer and press Enter…")
-
-    def on_input_submitted(self, event: Input.Submitted) -> None:
-        self.dismiss(event.value)
-
-
-class CodeAgentApp(App[None]):
+class CodeAgentApp(_VoiceLegs):
     """The coding-agent TUI: conversation transcript + prompt + approval/ask modals."""
 
     # Flat pure-black canvas — no panel fills/gray, just the bordered prompt and a status
     # line, matching the deepagents-code look (wordmark in the AssemblyAI brand blue).
     CSS = f"""
     Screen {{ background: #000000; }}
-    #log {{
-        height: 1fr; border: none; background: #000000; padding: 1 2;
-        scrollbar-size-vertical: 0;
-    }}
+    /* The approval/ask modals must stay see-through so the transcript shows above their
+       docked prompt. Their own DEFAULT_CSS sets `background: transparent`, but app CSS beats
+       a widget's DEFAULT_CSS — without this rule the `Screen` canvas above paints the modal
+       opaque black (it matches every Screen subclass) and blanks the transcript behind it. */
+    ModalScreen {{ background: transparent; }}
+    /* The transcript is a scroll container of mounted message widgets (not a RichLog), so the
+       reply streams in place and tool output can expand/collapse. */
+    #log {{ height: 1fr; border: none; background: #000000; padding: 1 2; }}
     #promptbar {{ dock: bottom; height: 3; background: #000000; border: round #3a3f55; margin: 1 1; }}
     #promptmark {{ width: 3; color: {banner.BRAND_HEX}; content-align: center middle; }}
     #prompt {{ border: none; background: #000000; padding: 0; }}
+    /* Shown in place of the prompt while voice capture is on (Ctrl-V brings the prompt back). */
+    #voicebar {{ dock: bottom; height: 3; background: #000000; border: round {banner.BRAND_HEX};
+        margin: 1 1; content-align: center middle; display: none; }}
     /* In normal flow below the 1fr log, so it sits just above the docked prompt bar. */
     #spinner {{ height: 1; background: #000000; padding: 0 2;
         color: {banner.BRAND_HEX}; display: none; }}
@@ -191,6 +99,8 @@ class CodeAgentApp(App[None]):
         ("ctrl+c", "quit_or_interrupt", "Interrupt / Quit"),
         ("ctrl+q", "quit", "Quit"),
         ("ctrl+y", "copy_last", "Copy last reply"),
+        ("ctrl+v", "toggle_voice", "Toggle voice"),
+        ("ctrl+o", "toggle_output", "Expand/collapse output"),
     ]
 
     def __init__(
@@ -212,6 +122,12 @@ def __init__(
         self._initial = initial
         self._voice = voice  # when set, spoken turns drive the prompt and replies are read back
         self._voice_typed = False  # flips once the mic is ruled out; then input is typed only
+        self._voice_paused = False  # user-toggled off via Ctrl-V (distinct from a mic failure)
+        self._voice_phase = "listening"  # listening / thinking / speaking, shown in the voice bar
+        self._voice_frames = itertools.cycle(_VOICE_FRAMES)
+        self._voice_timer: Timer | None = None  # animates the voice-bar meter while it's shown
+        self._streaming_msg: AssistantMessage | None = None  # the reply widget tokens stream into
+        self._last_tool_output: ToolOutput | None = None  # the row Ctrl+O expands/collapses
         self._session_name = thread_id  # not _thread_id: that shadows Textual App's int
         self._cwd = cwd if cwd is not None else Path.cwd()
         self._web_note = web_note
@@ -231,34 +147,55 @@ def __init__(
     def compose(self) -> ComposeResult:
         # No Header/Footer chrome — the splash is the title and the bottom status line
         # the only footer, so the screen stays a flat dark canvas.
-        yield RichLog(id="log", wrap=True, markup=True)
+        yield VerticalScroll(id="log")
         # Docked before the prompt bar, so the working indicator sits just above the input.
         yield Static("", id="spinner")
         with Horizontal(id="promptbar"):
             yield Static(">", id="promptmark")
             yield Input(id="prompt", placeholder="Ask the agent to build something…")
-        yield Static(_status_text(self._cwd, auto_approve=self._auto_approve), id="status")
-
-    def _write_splash(self, log: RichLog) -> None:
-        for row in banner.wordmark():
-            log.write(f"[bold {banner.BRAND_HEX}]{row}[/]")
-        log.write(f"[dim]{banner.version()}[/dim]")
-        log.write("")
-        log.write(f"[dim]Thread: {self._session_name}[/dim]")
-        log.write("")
-        log.write(f"[{banner.BRAND_HEX}]{banner.READY_LINE}[/]")
-        log.write(f"[dim]{banner.TIP_LINE}[/dim]")
+        yield Static("", id="voicebar")  # filled by _render_voicebar when voice mode is shown
+        yield Static(
+            _status_text(
+                self._cwd, auto_approve=self._auto_approve, voice_state=self._voice_state()
+            ),
+            id="status",
+        )
+
+    def _write_splash(self) -> None:
+        # The whole splash is fixed copy except the session name, so this markup is safe to
+        # parse (only the session name — a --session value — is escaped).
+        rows = [f"[bold {banner.BRAND_HEX}]{row}[/]" for row in banner.wordmark()]
+        rows += [
+            f"[dim]{banner.version()}[/dim]",
+            "",
+            f"[dim]Thread: {escape(self._session_name)}[/dim]",
+            "",
+            f"[{banner.BRAND_HEX}]{banner.READY_LINE}[/]",
+            f"[dim]{banner.TIP_LINE}[/dim]",
+        ]
+        self._mount("\n".join(rows))
+
+    def _mount(self, widget: Static | str) -> None:
+        """Append a transcript widget (or a markup string) and scroll it into view."""
+        log = self.query_one("#log", VerticalScroll)
+        log.mount(Static(widget) if isinstance(widget, str) else widget)
+        log.scroll_end(animate=False)  # pragma: no mutate — cosmetic; animate flag is unassertable
+
+    def _note(self, text: str) -> None:
+        """Append a dim transcript aside (cancelling / copied / voice-off)."""
+        self._mount(Note(text))
 
     def on_mount(self) -> None:
         # Route the agent's ask_user tool through a modal (the bridge is shared with
         # the tool built before this app existed).
         self._ask_bridge.handler = self._ask
-        self._write_splash(self.query_one("#log", RichLog))
+        self._write_splash()
         if self._web_note:
             self.notify(self._web_note, title="Web search disabled", severity="warning", timeout=10)
         # Put the cursor in the prompt so the user can type immediately (RichLog would
         # otherwise hold focus and swallow keystrokes).
         self.query_one("#prompt", Input).focus()
+        self._sync_input_mode()  # in voice mode, swap the prompt for the listening affordance
         if self._initial:
             self._submit(self._initial)
         else:
@@ -275,18 +212,34 @@ def _emit_event(self, event: Event) -> None:
         self.call_from_thread(self._write_event, event)
 
     def _write_event(self, event: Event) -> None:
-        log = self.query_one("#log", RichLog)
-        # Escape dynamic content: a model/tool string containing "[" would otherwise be
-        # parsed as Rich markup and raise MarkupError (crashing the turn), or inject styling.
-        if isinstance(event, AssistantText):
-            self._last_reply = event.text
-            log.write(escape(event.text))
+        if isinstance(event, AssistantDelta):
+            # Stream the token into the live reply widget (mounting one on the first token),
+            # updated in place until the authoritative AssistantText finalizes it below.
+            if self._streaming_msg is None:
+                self._streaming_msg = AssistantMessage()
+                self._mount(self._streaming_msg)
+            self._streaming_msg.stream(event.text)
+            self.query_one("#log", VerticalScroll).scroll_end(animate=False)  # pragma: no mutate
+        elif isinstance(event, AssistantText):
+            self._last_reply = event.text  # keep the raw text for clipboard copy
+            self._finalize_reply(event.text)
         elif isinstance(event, ToolCall):
-            log.write(f"[dim]→ {escape(event.name)}({escape(_format_args(event.args))})[/dim]")
+            self._mount(ToolCallLine(event.name, event.args))
         elif isinstance(event, ToolResult):
-            log.write(f"[dim]  {escape(event.name)}: {escape(event.content.strip()[:2000])}[/dim]")
+            self._last_tool_output = ToolOutput(event.name, event.content)
+            self._mount(self._last_tool_output)
         elif isinstance(event, ErrorText):
-            log.write(f"[#F04438]✗ {escape(event.text)}[/#F04438]")
+            self._mount(ErrorMessage(event.text))
+
+    def _finalize_reply(self, text: str) -> None:
+        """Commit the reply: finalize the streamed widget in place, or mount a fresh one."""
+        if self._streaming_msg is not None:
+            self._streaming_msg.finalize(text)
+            self._streaming_msg = None
+        else:
+            msg = AssistantMessage()
+            self._mount(msg)
+            msg.finalize(text)
 
     def action_copy_last(self) -> None:
         """Copy the most recent assistant reply to the system clipboard."""
@@ -294,7 +247,12 @@ def action_copy_last(self) -> None:
 
         if self._last_reply:
             pyperclip.copy(self._last_reply)
-            self.query_one("#log", RichLog).write("[dim](copied last reply to clipboard)[/dim]")
+            self._note("(copied last reply to clipboard)")
+
+    def action_toggle_output(self) -> None:
+        """Ctrl-O: expand/collapse the most recent tool output (a no-op if there's none)."""
+        if self._last_tool_output is not None:
+            self._last_tool_output.toggle()
 
     # --- approval / ask (called on the worker thread) -------------------------
 
@@ -320,7 +278,8 @@ def _approve(self, name: str, args: dict[str, object]) -> bool:
         """
         if self._auto_approve:
             return True
-        decision = self._modal_result(ApprovalScreen(name, args), default="reject")
+        screen = ApprovalScreen(name, args, voice=self._modal_voice())
+        decision = self._modal_result(screen, default="reject")
         if decision == "auto":
             self._enable_auto_approve()
             return True
@@ -333,14 +292,80 @@ def _enable_auto_approve(self) -> None:
         self.call_from_thread(self._refresh_status)
 
     def _refresh_status(self) -> None:
-        """Re-render the bottom status line (e.g. after the mode flips to auto)."""
+        """Re-render the bottom status line (e.g. after the mode flips to auto or voice toggles)."""
         self.query_one("#status", Static).update(
-            _status_text(self._cwd, auto_approve=self._auto_approve)
+            _status_text(
+                self._cwd, auto_approve=self._auto_approve, voice_state=self._voice_state()
+            )
         )
 
+    def _voice_state(self) -> str | None:
+        """``"on"``/``"off"`` for the status badge, or ``None`` when voice isn't wired up."""
+        if self._voice is None:
+            return None
+        return "on" if self._voice_active() else "off"
+
+    def action_toggle_voice(self) -> None:
+        """Ctrl-V: turn spoken input/readback on or off for the session.
+
+        A no-op notice when no voice front-end exists (e.g. a piped/typed run). Re-enabling
+        kicks off listening again unless a turn is mid-flight (the post-turn followup will).
+        """
+        if self._voice is None:
+            self.notify("Voice isn't available in this session", severity="warning")
+            return
+        self._voice_paused = not self._voice_paused
+        self._refresh_status()
+        self._sync_input_mode()  # show/hide the text box vs. the listening affordance
+        if self._voice_paused:
+            self.notify("Voice off — type your request")
+        elif not self._turn_running():
+            self.notify("Voice on — listening")
+            self._begin_listening()
+
+    def _sync_input_mode(self) -> None:
+        """Swap the text prompt for the 'listening' affordance while voice capture is active.
+
+        The Input stays mounted either way (it still holds the spoken transcript and the
+        turn-running ``disabled`` flag); only the bars' visibility flips. The prompt regains
+        focus whenever it's the visible input.
+        """
+        listening = self._voice_active()
+        self.query_one("#promptbar", Horizontal).display = not listening
+        self.query_one("#voicebar", Static).display = listening
+        if listening:
+            self._render_voicebar()
+            if self._voice_timer is None:  # animate the meter only while the bar is shown
+                self._voice_timer = self.set_interval(0.3, self._tick_voice)  # pragma: no mutate
+        else:
+            if self._voice_timer is not None:
+                self._voice_timer.stop()
+                self._voice_timer = None
+            self.query_one("#prompt", Input).focus()
+
+    def _set_voice_phase(self, phase: str) -> None:
+        """Switch the voice bar between listening / thinking / speaking and repaint it."""
+        self._voice_phase = phase
+        self._render_voicebar()
+
+    def _render_voicebar(self) -> None:
+        """Paint the voice bar for the current phase: an animated meter, label, and accent."""
+        label, color = _VOICE_PHASES[self._voice_phase]
+        meter = next(self._voice_frames)
+        hint = "   [dim](Ctrl-V to type)[/dim]" if self._voice_phase == "listening" else ""
+        self.query_one("#voicebar", Static).update(f"[{color}]{meter}[/] {escape(label)}{hint}")
+
+    def _tick_voice(self) -> None:
+        """Advance the voice-bar meter one frame (the animation timer's callback)."""
+        self._render_voicebar()
+
     def _ask(self, question: str) -> str:
         """Block the worker on a modal input screen and return the user's answer."""
-        return self._modal_result(AskScreen(question), default="")
+        return self._modal_result(AskScreen(question, voice=self._modal_voice()), default="")
+
+    def _modal_voice(self) -> _VoiceIO | None:
+        """The voice IO to drive a modal by speech, or ``None`` when voice isn't active."""
+        return self._voice if self._voice_active() else None
 
     # --- interrupt / quit -----------------------------------------------------
     # Mirrors deepagents-code: Escape interrupts a running turn; Ctrl-C interrupts a running
@@ -361,7 +386,7 @@ def _cancel_turn(self) -> bool:
         if not self._turn_running():
             return False
         self._session.request_cancel()
-        self.query_one("#log", RichLog).write("[dim](cancelling…)[/dim]")
+        self._note("cancelling…")
         return True
 
     def action_interrupt(self) -> None:
@@ -396,9 +421,9 @@ def on_input_submitted(self, event: Input.Submitted) -> None:
             self._submit(text)
 
     def _submit(self, text: str) -> None:
-        log = self.query_one("#log", RichLog)
-        log.write(f"[b cyan]» {escape(text)}[/b cyan]")
+        self._mount(UserMessage(text))
         self.query_one("#prompt", Input).disabled = True
+        self._set_voice_phase("thinking")  # voice bar reflects the turn (no-op when bar hidden)
         self._start_spinner()
         self._run_turn(text)
 
@@ -410,8 +435,14 @@ def _run_turn(self, text: str) -> Worker[None]:
     # --- working indicator (spinner + elapsed) --------------------------------
 
     def _start_spinner(self) -> None:
-        """Show the working indicator and animate it while the turn runs."""
+        """Show the working indicator and animate it while the turn runs.
+
+        Skipped in voice mode — the voice bar already shows a "Thinking…" state, so a second
+        spinner would just be redundant chrome.
+        """
         self._turn_started = time.monotonic()
+        if self._voice_active():
+            return
         self.query_one("#spinner", Static).display = True
         self._tick()
         self._spin_timer = self.set_interval(0.25, self._tick)  # pragma: no mutate
@@ -430,65 +461,16 @@ def _stop_spinner(self) -> None:
 
     def on_worker_state_changed(self, event: Worker.StateChanged) -> None:
         if event.worker.is_finished:
-            self._stop_spinner()
-            prompt = self.query_one("#prompt", Input)
-            prompt.disabled = False
-            prompt.focus()
-            self._voice_followup()  # read a spoken summary back, then listen for the next turn
-
-    # --- voice (speak-to-it / read-summary-back; the legs run off the UI thread) ----
-
-    def _voice_active(self) -> bool:
-        """Voice capture is on: a session exists and the mic hasn't been ruled out yet."""
-        return self._voice is not None and not self._voice_typed
-
-    def _spawn(self, target: Callable[[], None]) -> None:
-        """Run ``target`` on a daemon thread — voice legs block, so they stay off the UI thread."""
-        threading.Thread(target=target, daemon=True).start()  # pragma: no mutate
-
-    def _begin_listening(self) -> None:
-        """Capture the next spoken turn on a background thread (no-op when voice is off)."""
-        if not self._voice_active():
-            return
-        self._spawn(self._capture_voice_turn)
-
-    def _voice_followup(self) -> None:
-        """After a turn finishes: read back a spoken summary, then listen for the next turn."""
-        voice = self._voice
-        if voice is None:
-            return
-        self._spawn(lambda: self._speak_then_listen(voice))
-
-    def _speak_then_listen(self, voice: _VoiceIO) -> None:
-        """Read a summary of the last reply aloud (no code), then capture the next spoken turn."""
-        voice.speak(spoken_summary(self._last_reply))
-        self._capture_voice_turn()
-
-    def _capture_voice_turn(self) -> None:
-        """Listen for one spoken turn; enter it into the prompt, or degrade to typing."""
-        voice = self._voice
-        if voice is None or self._voice_typed:
-            return
-        try:
-            transcript = voice.listen()
-        except errors.CLIError as exc:
-            # A capture failure (no mic, STT error) drops voice for the rest of the session
-            # rather than wedging it — the user just types instead.
-            self._voice_typed = True
-            self.call_from_thread(self._notice_voice_off, exc.message)
-            return
-        if transcript:
-            self.call_from_thread(self._enter_and_submit, transcript)
-
-    def _notice_voice_off(self, detail: str) -> None:
-        """Tell the user voice input stopped and that input is now typed (UI thread)."""
-        self.query_one("#log", RichLog).write(
-            f"[dim](voice input off: {escape(detail)}; type your request instead)[/dim]"
-        )
-
-    def _enter_and_submit(self, text: str) -> None:
-        """Show the spoken text in the prompt, then submit it as a turn (UI thread)."""
-        prompt = self.query_one("#prompt", Input)
-        prompt.value = text
-        self._submit(text)
-        prompt.value = ""
+            self._finish_turn()
+
+    def _finish_turn(self) -> None:
+        """Wind down a completed turn: stop the spinner, re-enable input, resume voice."""
+        self._stop_spinner()
+        if self._streaming_msg is not None:  # a cancelled generation: keep what streamed in
+            self._finalize_reply(self._streaming_msg.text)
+        self.query_one("#prompt", Input).disabled = False
+        self._sync_input_mode()  # focus the prompt (text mode) or show the listening bar
+        self._voice_followup()  # read a spoken summary back, then listen for the next turn
+
+    # The off-thread voice legs (_voice_active, _begin_listening, _capture_voice_turn, …) are
+    # inherited from _VoiceLegs; the render/toggle side stays above.
diff --git a/aai_cli/code_agent/tui_status.py b/aai_cli/code_agent/tui_status.py
new file mode 100644
index 00000000..5e385b55
--- /dev/null
+++ b/aai_cli/code_agent/tui_status.py
@@ -0,0 +1,51 @@
+"""Pure text helpers for the coding-agent TUI's status line and working indicator.
+
+Split out of `tui.py` (to keep it under the file-length gate) and free of any Textual
+imports, so they unit-test as plain functions.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+
+def _spinner_text(elapsed_s: int, frame: str) -> str:
+    """The working-indicator line: a spinner glyph and the elapsed seconds."""
+    return f"{frame} Working… ({elapsed_s}s)"
+
+
+def _abbrev_home(path: Path) -> str:
+    """Render ``path`` with the home directory collapsed to ``~``."""
+    try:
+        return f"~/{path.relative_to(Path.home())}"
+    except ValueError:
+        return str(path)
+
+
+def _git_branch(start: Path) -> str | None:
+    """The current git branch for ``start`` (walking up to the repo root), or None."""
+    for directory in (start, *start.parents):
+        head = directory / ".git" / "HEAD"
+        if head.is_file():
+            ref = head.read_text(encoding="utf-8").strip()
+            return ref.removeprefix("ref: refs/heads/") if ref.startswith("ref: ") else ref[:8]
+    return None
+
+
+def _status_text(cwd: Path, *, auto_approve: bool, voice_state: str | None = None) -> str:
+    """The bottom status line: a mode badge, the working directory, git branch, and voice state.
+
+    ``voice_state`` is ``"on"``/``"off"`` when the session has a voice front-end (so the
+    Ctrl-V toggle shows its effect), or ``None`` when voice isn't wired up at all.
+    """
+    mode = "auto" if auto_approve else "manual"
+    badge = f"[black on #f59e0b] {mode} [/]"
+    parts = [badge, f"[dim]{_abbrev_home(cwd)}[/dim]"]
+    branch = _git_branch(cwd)
+    if branch:
+        parts.append(f"[dim]↗ {branch}[/dim]")
+    if voice_state is not None:
+        # A filled/hollow dot (BMP glyphs, like the rest of the UI — no double-width emoji).
+        glyph, color = ("●", "#22c55e") if voice_state == "on" else ("○", "#6b7280")
+        parts.append(f"[{color}]{glyph} voice {voice_state}[/]")
+    return " ".join(parts)
diff --git a/aai_cli/code_agent/voice_ui.py b/aai_cli/code_agent/voice_ui.py
new file mode 100644
index 00000000..cdac1e29
--- /dev/null
+++ b/aai_cli/code_agent/voice_ui.py
@@ -0,0 +1,107 @@
+"""The voice front-end legs for the coding-agent TUI, split out to keep `tui.py` small.
+
+These are the speak-to-it / read-back mechanics that run *off* the UI thread (mic capture and
+TTS readback block), marshaling back via ``call_from_thread``. They live in a mixin that
+:class:`~aai_cli.code_agent.tui.CodeAgentApp` inherits, so the app stays one ``App`` with the
+voice methods folded in. The render/toggle side (the voice bar, Ctrl-V) stays in `tui.py`.
+"""
+
+from __future__ import annotations
+
+import threading
+from typing import TYPE_CHECKING, Protocol
+
+from textual.app import App
+from textual.widgets import Input
+
+from aai_cli.code_agent.voice import spoken_summary
+from aai_cli.core import errors
+
+if TYPE_CHECKING:
+    from collections.abc import Callable
+
+
+class _VoiceIO(Protocol):
+    """The speak-to-it / read-back slice the TUI drives; :class:`VoiceSession` satisfies it."""
+
+    def listen(self) -> str | None:
+        """Capture one spoken turn and return its transcript (``None`` on no speech)."""
+
+    def speak(self, text: str) -> None:
+        """Read ``text`` back aloud (a no-op when readback is unavailable)."""
+
+
+class _VoiceLegs(App[None]):
+    """Mixin holding the off-thread voice capture/readback legs for ``CodeAgentApp``.
+
+    Extends ``App`` so the inherited ``query_one``/``call_from_thread`` are typed; the voice
+    state and the few app methods it leans on (``_set_voice_phase``/``_sync_input_mode``/
+    ``_submit``) are provided by the concrete app and declared here for the type checker.
+    """
+
+    if TYPE_CHECKING:  # provided by CodeAgentApp (state set in __init__, methods defined there)
+        _voice: _VoiceIO | None
+        _voice_typed: bool
+        _voice_paused: bool
+        _last_reply: str
+
+        def _set_voice_phase(self, phase: str) -> None: ...
+        def _sync_input_mode(self) -> None: ...
+        def _submit(self, text: str) -> None: ...
+        def _note(self, text: str) -> None: ...
+
+    def _voice_active(self) -> bool:
+        """Voice capture is on: a session exists, the mic isn't ruled out, and it isn't paused."""
+        return self._voice is not None and not self._voice_typed and not self._voice_paused
+
+    def _spawn(self, target: Callable[[], None]) -> None:
+        """Run ``target`` on a daemon thread — voice legs block, so they stay off the UI thread."""
+        threading.Thread(target=target, daemon=True).start()  # pragma: no mutate
+
+    def _begin_listening(self) -> None:
+        """Capture the next spoken turn on a background thread (no-op when voice is off)."""
+        if not self._voice_active():
+            return
+        self._spawn(self._capture_voice_turn)
+
+    def _voice_followup(self) -> None:
+        """After a turn finishes: read back a spoken summary, then listen for the next turn."""
+        voice = self._voice
+        if voice is None or self._voice_paused:  # paused via Ctrl-V: no readback, no listen
+            return
+        self._spawn(lambda: self._speak_then_listen(voice))
+
+    def _speak_then_listen(self, voice: _VoiceIO) -> None:
+        """Read a summary of the last reply aloud (no code), then capture the next spoken turn."""
+        self.call_from_thread(self._set_voice_phase, "speaking")
+        voice.speak(spoken_summary(self._last_reply))
+        self._capture_voice_turn()
+
+    def _capture_voice_turn(self) -> None:
+        """Listen for one spoken turn; enter it into the prompt, or degrade to typing."""
+        voice = self._voice
+        if voice is None or self._voice_typed or self._voice_paused:
+            return
+        self.call_from_thread(self._set_voice_phase, "listening")
+        try:
+            transcript = voice.listen()
+        except errors.CLIError as exc:
+            # A capture failure (no mic, STT error) drops voice for the rest of the session
+            # rather than wedging it — the user just types instead.
+            self._voice_typed = True
+            self.call_from_thread(self._notice_voice_off, exc.message)
+            return
+        if transcript:
+            self.call_from_thread(self._enter_and_submit, transcript)
+
+    def _notice_voice_off(self, detail: str) -> None:
+        """Tell the user voice input stopped and that input is now typed (UI thread)."""
+        self._note(f"voice input off: {detail}; type your request instead")
+        self._sync_input_mode()  # mic ruled out -> bring the text box back
+
+    def _enter_and_submit(self, text: str) -> None:
+        """Show the spoken text in the prompt, then submit it as a turn (UI thread)."""
+        prompt = self.query_one("#prompt", Input)
+        prompt.value = text
+        self._submit(text)
+        prompt.value = ""
diff --git a/aai_cli/code_agent/web_search.py b/aai_cli/code_agent/web_search.py
index d06af999..71ed2bff 100644
--- a/aai_cli/code_agent/web_search.py
+++ b/aai_cli/code_agent/web_search.py
@@ -19,6 +19,10 @@
 # agent a tool that will fail on first use for lack of a key.
 TAVILY_API_KEY_ENV = "TAVILY_API_KEY"
 
+# The name ``TavilySearch`` registers itself under. Callers (e.g. the live agent's prompt
+# builder) detect web-search availability by this name, so a test pins it against the tool.
+WEB_SEARCH_TOOL_NAME = "tavily_search"
+
 # A small result cap keeps search responses inside the model's context budget.
 _DEFAULT_MAX_RESULTS = 5
 
diff --git a/aai_cli/code_gen/agent_cascade.py b/aai_cli/code_gen/agent_cascade.py
index 0a861911..5f5306f0 100644
--- a/aai_cli/code_gen/agent_cascade.py
+++ b/aai_cli/code_gen/agent_cascade.py
@@ -16,9 +16,11 @@
 # which is never formatted — so no brace has to be doubled.
 _HEADER = """\
 # Live voice cascade: Streaming STT -> LLM Gateway -> streaming TTS, wired client-side.
-# This is what `assembly --sandbox agent-cascade` runs: it transcribes your speech,
+# The basic cascade behind `assembly --sandbox live`: it transcribes your speech,
 # sends each finalized turn to the LLM Gateway, and speaks the reply through streaming
 # TTS — the same three primitives the agent-cascade init template wires server-side.
+# (The `live` command adds a tool-using agent on the LLM leg; this snippet is the
+# plain single-completion version to build from.)
 # Requires audio + websockets:  pip install sounddevice websockets openai
 # Tip: use headphones — the mic stays open while the agent speaks, so on speakers it
 # would hear itself and loop.
diff --git a/aai_cli/commands/agent/__init__.py b/aai_cli/commands/agent/__init__.py
index f535b54c..b20dfc2a 100644
--- a/aai_cli/commands/agent/__init__.py
+++ b/aai_cli/commands/agent/__init__.py
@@ -84,7 +84,7 @@ def agent(
         help="Print the equivalent Python SDK code and exit (does not start a session)",
     ),
 ) -> None:
-    """Hold a live two-way voice conversation with a voice agent
+    """Hold a live two-way voice conversation with the Voice Agent API
 
     Use headphones: the mic stays open while the agent speaks, so on
     speakers it would hear itself and loop. Pass an audio file/URL (or
diff --git a/aai_cli/commands/agent_cascade/__init__.py b/aai_cli/commands/agent_cascade/__init__.py
index 3e99f146..b17e85e8 100644
--- a/aai_cli/commands/agent_cascade/__init__.py
+++ b/aai_cli/commands/agent_cascade/__init__.py
@@ -31,7 +31,7 @@
 SPEC = command_registry.CommandModuleSpec(
     panel=help_panels.TRANSCRIPTION,
     order=45,  # pragma: no mutate -- sparse rank; a +-1 shift is order-equivalent
-    commands=("agent-cascade",),
+    commands=("live",),
 )
 
 
@@ -43,28 +43,28 @@ def _emit_voice_list(_state: AppState, json_mode: bool) -> None:
 
 
 @app.command(
-    name="agent-cascade",
+    name="live",
     rich_help_panel=help_panels.TRANSCRIPTION,
     epilog=examples_epilog(
         [
-            ("Start a live cascade conversation", "assembly --sandbox agent-cascade"),
+            ("Start a live voice conversation", "assembly --sandbox live"),
             (
                 "Pick a voice and opening line",
-                'assembly --sandbox agent-cascade --voice michael --greeting "Hi there"',
+                'assembly --sandbox live --voice michael --greeting "Hi there"',
             ),
             (
                 "Give the agent a persona",
-                'assembly --sandbox agent-cascade --system-prompt "You are a terse pirate."',
+                'assembly --sandbox live --system-prompt "You are a terse pirate."',
             ),
-            ("See available voices", "assembly --sandbox agent-cascade --list-voices"),
+            ("See available voices", "assembly --sandbox live --list-voices"),
             (
                 "Print equivalent Python instead of running",
-                "assembly --sandbox agent-cascade --show-code",
+                "assembly --sandbox live --show-code",
             ),
         ]
     ),
 )
-def agent_cascade(
+def live(
     ctx: typer.Context,
     source: str | None = typer.Argument(
         None, help="Audio file path or URL to speak to the agent. Omit to use the microphone."
@@ -169,14 +169,15 @@ def agent_cascade(
         help="Print the equivalent Python SDK code and exit (does not start a session)",
     ),
 ) -> None:
-    """\\[sandbox] Hold a live voice conversation through a self-wired cascade
+    """\\[sandbox] Talk live to a tool-using voice agent
 
-    Like 'assembly agent', but instead of AssemblyAI's Voice Agent endpoint this
-    wires the three primitives together itself — Streaming STT, the LLM Gateway,
-    and streaming TTS — exactly like the 'agent-cascade' init template does
-    server-side. Because it uses streaming TTS it only runs in the sandbox: run
-    it as 'assembly --sandbox agent-cascade' (--sandbox goes before the
-    subcommand).
+    A real-time spoken conversation, wired client-side from three primitives —
+    Streaming STT, a deepagents brain on the LLM Gateway, and streaming TTS. Unlike
+    'assembly agent' (the Voice Agent API), the brain here is an agent that can use
+    tools mid-conversation — web search, URL fetch, and the AssemblyAI docs — so it
+    answers like a live multimodal assistant. Because it uses streaming TTS it only
+    runs in the sandbox: run it as 'assembly --sandbox live' (--sandbox goes before
+    the subcommand).
 
     Use headphones: the mic stays open while the agent speaks, so on speakers it
     would hear itself and loop. Pass an audio file/URL (or --sample) to speak a
@@ -185,6 +186,9 @@ def agent_cascade(
 
     This only runs a conversation in the terminal — it writes no code. To build
     an agent-cascade app, run 'assembly init agent-cascade' instead.
+
+    Web search needs a TAVILY_API_KEY in the environment; without it the agent
+    keeps its URL-fetch and docs tools.
     """
 
     if list_voices:
diff --git a/aai_cli/commands/agent_cascade/_exec.py b/aai_cli/commands/agent_cascade/_exec.py
index 0b97e230..af466c56 100644
--- a/aai_cli/commands/agent_cascade/_exec.py
+++ b/aai_cli/commands/agent_cascade/_exec.py
@@ -169,9 +169,9 @@ def _print_show_code(opts: AgentCascadeOptions, system_prompt_text: str) -> None
 def run_agent_cascade(opts: AgentCascadeOptions, state: AppState, *, json_mode: bool) -> None:
     """Execute one `assembly agent-cascade` cascade from already-parsed flags."""
     text_mode, json_mode = resolve_output_modes(opts.output_field, json_mode=json_mode)
-    validate_voice(opts.voice, voices.VOICE_NAMES, command="agent-cascade")
+    validate_voice(opts.voice, voices.VOICE_NAMES, command="live")
     # Streaming TTS has no production host, so the whole cascade is sandbox-only.
-    tts_session.require_available("agent-cascade")
+    tts_session.require_available("live")
     system_prompt_text = _resolve_system_prompt(opts.system_prompt, opts.system_prompt_file)
 
     if opts.show_code:
diff --git a/aai_cli/core/microphone.py b/aai_cli/core/microphone.py
index e75576d4..4ec65dda 100644
--- a/aai_cli/core/microphone.py
+++ b/aai_cli/core/microphone.py
@@ -1,5 +1,8 @@
 from __future__ import annotations
 
+import atexit
+import contextlib
+import signal
 import warnings
 from abc import abstractmethod
 from collections.abc import Callable, Iterable, Iterator, Mapping
@@ -57,6 +60,42 @@ def audio_missing_error() -> CLIError:
     )
 
 
+# Process-global once-latch. The default is only observable on the very first install
+# in a fresh process; the suite mutates this flag across tests, so the load-time value
+# can't be asserted in isolation — the check/set in _install_… are what the tests pin.
+_shutdown_interrupt_guard_installed = False  # pragma: no mutate
+
+
+def _ignore_interrupt_during_shutdown() -> None:
+    """Drop SIGINT for the remainder of interpreter shutdown.
+
+    sounddevice registers its own atexit handler that calls ``Pa_Terminate`` to tear
+    down PortAudio. A second Ctrl-C while that runs raises ``KeyboardInterrupt``
+    *inside* the atexit callback, which Python reports as a noisy "Exception ignored in
+    atexit callback" traceback — even though the first Ctrl-C already stopped the
+    session cleanly. There is nothing left to cancel once we're exiting, so ignore the
+    late interrupt.
+    """
+    # signal.signal only works on the main thread; atexit runs there, but a ValueError
+    # is still possible in odd embeddings, so guard it rather than crash the teardown.
+    with contextlib.suppress(ValueError):
+        signal.signal(signal.SIGINT, signal.SIG_IGN)
+
+
+def _install_shutdown_interrupt_guard() -> None:
+    """Register ``_ignore_interrupt_during_shutdown`` with atexit exactly once.
+
+    Registered *after* sounddevice imports so atexit's LIFO order runs our guard
+    before sounddevice's PortAudio teardown, neutralizing a second Ctrl-C that would
+    otherwise raise inside that atexit callback.
+    """
+    global _shutdown_interrupt_guard_installed
+    if _shutdown_interrupt_guard_installed:
+        return
+    atexit.register(_ignore_interrupt_during_shutdown)
+    _shutdown_interrupt_guard_installed = True
+
+
 def import_sounddevice() -> ModuleType:
     """Import sounddevice lazily, mapping an ImportError to ``audio_missing_error``.
 
@@ -68,6 +107,7 @@ def import_sounddevice() -> ModuleType:
         import sounddevice
     except ImportError as exc:
         raise audio_missing_error() from exc
+    _install_shutdown_interrupt_guard()
     module: ModuleType = sounddevice
     return module
 
diff --git a/install.sh b/install.sh
index c884e247..b87e8641 100755
--- a/install.sh
+++ b/install.sh
@@ -3,11 +3,93 @@
 set -e # Exit on any error
 
 # Canonical installer for the AssemblyAI CLI (`assembly`).
-# Installs the app as a uv tool, bootstrapping uv first if it is missing.
+#
+# Default: installs the latest published code as an isolated tool with uv (or
+#   pipx), bootstrapping uv when neither is present.
+# Dev mode (--install-method git / --dev): clones the repo (or reuses the
+#   checkout you run this from) and installs it editable (`uv tool install -e .`),
+#   so local source edits take effect without reinstalling.
+# Either way it then installs the optional system deps via Homebrew if available.
+#
+# Usage:
+#   curl -LsSf https://raw.githubusercontent.com/AssemblyAI/cli/main/install.sh | bash
+#   ./install.sh --dev                                   # editable, from a clone
+#   curl -LsSf .../install.sh | bash -s -- --install-method git
 
-PACKAGE="git+https://github.com/AssemblyAI/cli.git"
+REPO_URL="https://github.com/AssemblyAI/cli.git"
+PACKAGE="git+${REPO_URL}"
 PYTHON_VERSION="3.13"
 
+# Install method: "release" (default, publish-style) or "git" (editable clone).
+# Overridable by env or the flags parsed below.
+INSTALL_METHOD="${AAI_INSTALL_METHOD:-release}"
+GIT_DIR="${AAI_GIT_DIR:-$HOME/.local/share/assembly-cli}"
+# Passed to the installer as `-e` only in dev mode (empty array otherwise).
+EDITABLE=()
+
+usage() {
+	cat <<'EOF'
+Install the AssemblyAI CLI (assembly).
+
+Usage: install.sh [options]
+
+Options:
+  --install-method <release|git>  release (default): install the latest
+                                  published code. git: clone the repo and
+                                  install it editable (development mode).
+  --dev, -e, --editable, --git    Shortcut for --install-method git.
+  --release                       Shortcut for --install-method release.
+  --dir <path>                    Clone directory for dev mode
+                                  (default: ~/.local/share/assembly-cli).
+  -h, --help                      Show this help.
+
+Environment:
+  AAI_INSTALL_METHOD=release|git  Same as --install-method.
+  AAI_GIT_DIR=<path>              Same as --dir.
+EOF
+}
+
+while [ $# -gt 0 ]; do
+	case "$1" in
+	--install-method | --method)
+		[ $# -ge 2 ] || {
+			echo "Missing value for $1" >&2
+			exit 2
+		}
+		INSTALL_METHOD="$2"
+		shift
+		;;
+	--dev | -e | --editable | --git) INSTALL_METHOD="git" ;;
+	--release | --published) INSTALL_METHOD="release" ;;
+	--dir | --git-dir)
+		[ $# -ge 2 ] || {
+			echo "Missing value for $1" >&2
+			exit 2
+		}
+		GIT_DIR="$2"
+		shift
+		;;
+	-h | --help)
+		usage
+		exit 0
+		;;
+	*)
+		echo "Unknown option: $1" >&2
+		usage >&2
+		exit 2
+		;;
+	esac
+	shift
+done
+
+case "$INSTALL_METHOD" in
+release | git) ;;
+*)
+	echo "Invalid --install-method: $INSTALL_METHOD (use 'release' or 'git')" >&2
+	exit 2
+	;;
+esac
+
 # Best-effort check for the PortAudio shared library (no `command` to probe, so
 # look via pkg-config, the dynamic linker cache, then well-known lib paths).
 has_portaudio() {
@@ -33,17 +115,41 @@ has_portaudio() {
 	return 1
 }
 
-# Homebrew also pulls in ffmpeg, portaudio, and cloudflared. The uv install does
-# not, so detect any that are missing and print how to install them — without
-# touching the system or invoking sudo on the user's behalf.
-advise_system_deps() {
-	local missing=()
-	command -v ffmpeg >/dev/null 2>&1 || missing+=("ffmpeg")
-	has_portaudio || missing+=("portaudio")
-	command -v cloudflared >/dev/null 2>&1 || missing+=("cloudflared")
+# Populate MISSING_DEPS with the optional system deps not already on the system.
+MISSING_DEPS=()
+detect_missing_deps() {
+	MISSING_DEPS=()
+	command -v ffmpeg >/dev/null 2>&1 || MISSING_DEPS+=("ffmpeg")
+	has_portaudio || MISSING_DEPS+=("portaudio")
+	command -v cloudflared >/dev/null 2>&1 || MISSING_DEPS+=("cloudflared")
+}
+
+# Homebrew also pulls in ffmpeg, portaudio, and cloudflared. The uv/pipx installs
+# do not, so detect any that are missing. If Homebrew is available we install the
+# ones it actually carries (brew needs no sudo); for anything left we print how to
+# install it — without touching the system or invoking sudo on the user's behalf.
+install_system_deps() {
+	detect_missing_deps
+	[ ${#MISSING_DEPS[@]} -eq 0 ] && return 0
 
-	[ ${#missing[@]} -eq 0 ] && return 0
+	if command -v brew >/dev/null 2>&1; then
+		# Only ask Homebrew for formulae it actually has, so an unavailable one
+		# can't fail the whole batch; `brew info` exits non-zero for unknown names.
+		local brew_pkgs=() dep
+		for dep in "${MISSING_DEPS[@]}"; do
+			brew info --formula "$dep" >/dev/null 2>&1 && brew_pkgs+=("$dep")
+		done
+		if [ ${#brew_pkgs[@]} -gt 0 ]; then
+			echo ""
+			echo "Installing optional system dependencies with Homebrew: ${brew_pkgs[*]}"
+			brew install "${brew_pkgs[@]}" || true
+		fi
+		# Re-detect so we only advise on whatever brew couldn't provide.
+		detect_missing_deps
+		[ ${#MISSING_DEPS[@]} -eq 0 ] && return 0
+	fi
 
+	local missing=("${MISSING_DEPS[@]}")
 	echo ""
 	echo "Optional system dependencies are missing: ${missing[*]}"
 	echo "(core 'assembly transcribe' works without them)"
@@ -90,25 +196,71 @@ advise_system_deps() {
 	esac
 }
 
-if ! command -v uv &>/dev/null; then
-	echo "uv is not installed. Installing..."
+# Resolve the source for a development (editable) install: reuse the checkout we
+# are run from if it is the CLI repo, otherwise clone/update GIT_DIR. Sets PACKAGE
+# to the local path and EDITABLE so the installer passes `-e`.
+prepare_git_source() {
+	if [ -f pyproject.toml ] && grep -q '^name = "aai-cli"' pyproject.toml 2>/dev/null; then
+		PACKAGE="$(pwd)"
+		echo "Development install from current checkout: $PACKAGE"
+	else
+		if ! command -v git >/dev/null 2>&1; then
+			echo "Development install needs git to clone $REPO_URL" >&2
+			exit 1
+		fi
+		if [ -d "$GIT_DIR/.git" ]; then
+			echo "Updating existing clone at $GIT_DIR"
+			git -C "$GIT_DIR" pull --ff-only
+		else
+			echo "Cloning $REPO_URL to $GIT_DIR"
+			mkdir -p "$(dirname "$GIT_DIR")"
+			git clone "$REPO_URL" "$GIT_DIR"
+		fi
+		PACKAGE="$GIT_DIR"
+		echo "Development install from $PACKAGE"
+	fi
+	EDITABLE=(-e)
+}
+
+# Install `assembly` as an isolated tool. Prefer uv (it manages an isolated
+# Python for us), then fall back to an existing pipx, and only bootstrap uv if
+# neither is already present. EDITABLE is empty for a release install and `-e`
+# for a dev install.
+install_with_uv() {
+	# "$1" is the uv executable to invoke.
+	"$1" tool install -U "${EDITABLE[@]}" "$PACKAGE" --python "$PYTHON_VERSION"
+}
+
+[ "$INSTALL_METHOD" = "git" ] && prepare_git_source
+
+if command -v uv >/dev/null 2>&1; then
+	# `uv self update` errors out when uv was installed via an external package
+	# manager (Homebrew, apt, …) — it can't replace a binary it doesn't own. That
+	# is not fatal to us: a managed uv is already kept current by its manager, so
+	# swallow the failure and proceed straight to installing the CLI.
+	uv self update 2>/dev/null || true
+	install_with_uv uv
+elif command -v pipx >/dev/null 2>&1; then
+	# --force makes a re-run upgrade in place: the git source's version may not
+	# change between commits, so a plain `pipx install` would refuse as "already
+	# installed" and never pick up new code.
+	pipx install --force "${EDITABLE[@]}" "$PACKAGE"
+else
+	echo "Neither uv nor pipx found. Installing uv..."
 	curl -LsSf https://astral.sh/uv/install.sh | sh
 	echo "uv installation complete!"
 	echo ""
 
 	if [ -x "$HOME/.local/bin/uv" ]; then
-		"$HOME/.local/bin/uv" tool install -U "$PACKAGE" --python "$PYTHON_VERSION"
+		install_with_uv "$HOME/.local/bin/uv"
 	else
 		echo "Please restart your shell and run this script again"
 		echo ""
 		exit 0
 	fi
-else
-	uv self update
-	uv tool install -U "$PACKAGE" --python "$PYTHON_VERSION"
 fi
 
-advise_system_deps || true
+install_system_deps || true
 
 echo ""
 echo "For help and support, see the AssemblyAI CLI repository"
diff --git a/pyproject.toml b/pyproject.toml
index a7eb9e4e..e2da9d8a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -258,9 +258,10 @@ module = [
     "aai_cli.code_agent.store",
     "aai_cli.code_agent.model",
     "aai_cli.commands.code._exec",
+    "aai_cli.agent_cascade.brain",
 ]
 disallow_any_generics = false
-disable_error_code = ["return-value", "arg-type", "type-arg"]
+disable_error_code = ["return-value", "arg-type", "type-arg", "call-arg"]
 
 [tool.pyright]
 # Second type checker alongside mypy: pyright catches a different class of
@@ -279,7 +280,7 @@ exclude = ["**/node_modules", "**/__pycache__", "**/.*"]
 # Unknown*/invariance diagnostics our precise signatures can't satisfy. mypy still
 # type-checks these modules (with the targeted overrides above) as the safety net, so
 # we suppress pyright diagnostics here rather than littering per-line `# pyright: ignore`.
-ignore = ["aai_cli/code_agent", "aai_cli/commands/code"]
+ignore = ["aai_cli/code_agent", "aai_cli/commands/code", "aai_cli/agent_cascade/brain.py"]
 pythonVersion = "3.12"
 typeCheckingMode = "strict"
 # Third-party deps (assemblyai, sounddevice) ship no type stubs.
@@ -419,6 +420,8 @@ max-statements = 40
 "aai_cli/core/environments.py" = ["PLW0603"]
 # Verbosity is process-global startup state by design (mirrors environments.py).
 "aai_cli/core/debuglog.py" = ["PLW0603"]
+# The "shutdown SIGINT guard installed" latch is process-global once-only state.
+"aai_cli/core/microphone.py" = ["PLW0603"]
 # BaseHTTPRequestHandler.log_message requires a parameter named `format`.
 "aai_cli/auth/loopback.py" = ["A002"]
 # Template constants include URL path names such as TOKEN_PATH, not credentials.
diff --git a/pyrightconfig.tests.json b/pyrightconfig.tests.json
index 1ea7be4a..f9dbdf0e 100644
--- a/pyrightconfig.tests.json
+++ b/pyrightconfig.tests.json
@@ -3,7 +3,8 @@
   "ignore": [
     "tests/test_code_agent.py",
     "tests/test_code_command.py",
-    "tests/test_code_tui.py"
+    "tests/test_code_tui.py",
+    "tests/test_agent_cascade_brain.py"
   ],
   "pythonVersion": "3.12",
   "typeCheckingMode": "standard",
diff --git a/scripts/generated_code_compile_gate.py b/scripts/generated_code_compile_gate.py
index 8d258efe..bd71efdf 100644
--- a/scripts/generated_code_compile_gate.py
+++ b/scripts/generated_code_compile_gate.py
@@ -118,10 +118,10 @@ def main() -> int:
         ),
         (
             # Sandbox-only: streaming TTS has no prod host, so --sandbox makes the URLs valid.
-            "agent-cascade-basic",
+            "live-basic",
             (
                 "--sandbox",
-                "agent-cascade",
+                "live",
                 "--voice",
                 "jane",
                 "--greeting",
diff --git a/tests/__snapshots__/test_snapshots_help_root.ambr b/tests/__snapshots__/test_snapshots_help_root.ambr
index 82cc9dc9..2bb0f987 100644
--- a/tests/__snapshots__/test_snapshots_help_root.ambr
+++ b/tests/__snapshots__/test_snapshots_help_root.ambr
@@ -32,60 +32,59 @@
   │                                                    exit.                     │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   ╭─ Quick Start ────────────────────────────────────────────────────────────────╮
-  │ onboard        Guided setup: sign in and run your first transcription        │
+  │ onboard      Guided setup: sign in and run your first transcription          │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   ╭─ Coding Agent ───────────────────────────────────────────────────────────────╮
-  │ code           Run a terminal coding agent backed by the AssemblyAI LLM      │
-  │                Gateway                                                       │
+  │ code         Run a terminal coding agent backed by the AssemblyAI LLM        │
+  │              Gateway                                                         │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   ╭─ Build an App ───────────────────────────────────────────────────────────────╮
-  │ init           Scaffold a new app from a template and launch it              │
-  │ dev            Run the dev server for the app in the current directory       │
-  │ share          Expose the local app on a public URL via a cloudflared tunnel │
-  │ deploy         Deploy the current project to Vercel, Railway, or Fly.io      │
+  │ init         Scaffold a new app from a template and launch it                │
+  │ dev          Run the dev server for the app in the current directory         │
+  │ share        Expose the local app on a public URL via a cloudflared tunnel   │
+  │ deploy       Deploy the current project to Vercel, Railway, or Fly.io        │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   ╭─ Run AssemblyAI ─────────────────────────────────────────────────────────────╮
-  │ transcribe     Transcribe a file, URL, or YouTube/podcast link — or a whole  │
-  │                batch                                                         │
-  │ stream         Transcribe live audio in real time from a mic, file, URL, or  │
-  │                pipe                                                          │
-  │ dictate        Signal-driven dictation: record the mic, get the transcript   │
-  │                back                                                          │
-  │ agent          Hold a live two-way voice conversation with a voice agent     │
-  │ agent-cascade  [sandbox] Hold a live voice conversation through a self-wired │
-  │                cascade                                                       │
-  │ speak          [sandbox] Synthesize speech from text with AssemblyAI         │
-  │                streaming TTS                                                 │
-  │ llm            Send a prompt to AssemblyAI's LLM Gateway and print the reply │
-  │ clip           Cut clips from media by speaker, text match, LLM pick, or     │
-  │                time range                                                    │
-  │ dub            [sandbox] Dub a video or audio file into another language     │
-  │ caption        Burn always-visible captions into a video                     │
-  │ eval           Transcribe one or more datasets and score WER against their   │
-  │                reference texts                                               │
-  │ webhooks       Receive webhook deliveries on a public dev URL                │
+  │ transcribe   Transcribe a file, URL, or YouTube/podcast link — or a whole    │
+  │              batch                                                           │
+  │ stream       Transcribe live audio in real time from a mic, file, URL, or    │
+  │              pipe                                                            │
+  │ dictate      Signal-driven dictation: record the mic, get the transcript     │
+  │              back                                                            │
+  │ agent        Hold a live two-way voice conversation with the Voice Agent API │
+  │ live         [sandbox] Talk live to a tool-using voice agent                 │
+  │ speak        [sandbox] Synthesize speech from text with AssemblyAI streaming │
+  │              TTS                                                             │
+  │ llm          Send a prompt to AssemblyAI's LLM Gateway and print the reply   │
+  │ clip         Cut clips from media by speaker, text match, LLM pick, or time  │
+  │              range                                                           │
+  │ dub          [sandbox] Dub a video or audio file into another language       │
+  │ caption      Burn always-visible captions into a video                       │
+  │ eval         Transcribe one or more datasets and score WER against their     │
+  │              reference texts                                                 │
+  │ webhooks     Receive webhook deliveries on a public dev URL                  │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   ╭─ Setup & Tools ──────────────────────────────────────────────────────────────╮
-  │ doctor         Check that your environment is ready for AssemblyAI           │
-  │ setup          Set up your coding agent for AssemblyAI (docs MCP + skills)   │
-  │ config         Inspect and edit persisted CLI settings (profiles, env,       │
-  │                telemetry)                                                    │
-  │ update         Update the CLI to the latest release (brew/pipx/uv)           │
-  │ telemetry      Anonymous usage telemetry: status, enable, disable            │
+  │ doctor       Check that your environment is ready for AssemblyAI             │
+  │ setup        Set up your coding agent for AssemblyAI (docs MCP + skills)     │
+  │ config       Inspect and edit persisted CLI settings (profiles, env,         │
+  │              telemetry)                                                      │
+  │ update       Update the CLI to the latest release (brew/pipx/uv)             │
+  │ telemetry    Anonymous usage telemetry: status, enable, disable              │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   ╭─ History ────────────────────────────────────────────────────────────────────╮
-  │ transcripts    Browse and fetch past transcripts                             │
-  │ sessions       Browse your past streaming (real-time) sessions               │
+  │ transcripts  Browse and fetch past transcripts                               │
+  │ sessions     Browse your past streaming (real-time) sessions                 │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   ╭─ Account ────────────────────────────────────────────────────────────────────╮
-  │ login          Authenticate via your browser and store a CLI API key         │
-  │ logout         Clear stored credentials for the active profile               │
-  │ whoami         Show the active profile and whether its key works             │
-  │ balance        Show your remaining account balance                           │
-  │ usage          Show usage over a date range (default: last 30 days)          │
-  │ limits         Show your account's rate limits per service                   │
-  │ keys           List, create, and rename your AssemblyAI API keys             │
-  │ audit          List recent audit-log entries for your account                │
+  │ login        Authenticate via your browser and store a CLI API key           │
+  │ logout       Clear stored credentials for the active profile                 │
+  │ whoami       Show the active profile and whether its key works               │
+  │ balance      Show your remaining account balance                             │
+  │ usage        Show usage over a date range (default: last 30 days)            │
+  │ limits       Show your account's rate limits per service                     │
+  │ keys         List, create, and rename your AssemblyAI API keys               │
+  │ audit        List recent audit-log entries for your account                  │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   
    Examples
diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr
index a36aa130..23ab4f37 100644
--- a/tests/__snapshots__/test_snapshots_help_run.ambr
+++ b/tests/__snapshots__/test_snapshots_help_run.ambr
@@ -1,121 +1,10 @@
 # serializer version: 1
-# name: test_command_help_matches_snapshot[agent-cascade]
-  '''
-  
-   Usage: assembly agent-cascade [OPTIONS] [SOURCE]
-  
-   [sandbox] Hold a live voice conversation through a self-wired cascade
-  
-   Like 'assembly agent', but instead of AssemblyAI's Voice Agent endpoint this
-   wires the three primitives together itself — Streaming STT, the LLM Gateway,
-   and streaming TTS — exactly like the 'agent-cascade' init template does
-   server-side. Because it uses streaming TTS it only runs in the sandbox: run
-   it as 'assembly --sandbox agent-cascade' (--sandbox goes before the
-   subcommand).
-  
-   Use headphones: the mic stays open while the agent speaks, so on speakers it
-   would hear itself and loop. Pass an audio file/URL (or --sample) to speak a
-   recorded clip instead of the microphone; the session then ends after the
-   agent's reply.
-  
-   This only runs a conversation in the terminal — it writes no code. To build
-   an agent-cascade app, run 'assembly init agent-cascade' instead.
-  
-  ╭─ Arguments ──────────────────────────────────────────────────────────────────╮
-  │   source      [SOURCE]  Audio file path or URL to speak to the agent. Omit   │
-  │                         to use the microphone.                               │
-  ╰──────────────────────────────────────────────────────────────────────────────╯
-  ╭─ Options ────────────────────────────────────────────────────────────────────╮
-  │ --sample                                   Speak the hosted wildfires.mp3    │
-  │                                            sample to the agent               │
-  │ --system-prompt               TEXT         System prompt (the agent's        │
-  │                                            persona)                          │
-  │                                            [default: You are a friendly,     │
-  │                                            concise voice assistant. Keep     │
-  │                                            replies short and conversational. │
-  │                                            Your reply is read aloud by a     │
-  │                                            text-to-speech engine, so write   │
-  │                                            plain spoken prose — no markdown, │
-  │                                            emoji, bullet lists, or code.]    │
-  │ --system-prompt-file          FILE         Read the system prompt from a     │
-  │                                            file (overrides --system-prompt)  │
-  │ --greeting                    TEXT         Spoken greeting                   │
-  │                                            [default: Hi! I'm your AssemblyAI │
-  │                                            voice agent. What can I help you  │
-  │                                            with?]                            │
-  │ --device                      INTEGER      Microphone device index           │
-  │ --list-voices                              Print known voices and exit       │
-  │ --json                -j                   Emit newline-delimited JSON       │
-  │                                            events                            │
-  │ --output              -o      [text|json]  Output mode: text (you:/agent:    │
-  │                                            lines as plain stdout,            │
-  │                                            pipe-friendly) or json            │
-  │ --show-code                                Print the equivalent Python SDK   │
-  │                                            code and exit (does not start a   │
-  │                                            session)                          │
-  │ --help                                     Show this message and exit.       │
-  ╰──────────────────────────────────────────────────────────────────────────────╯
-  ╭─ Text-to-speech ─────────────────────────────────────────────────────────────╮
-  │ --voice             TEXT  TTS voice. See --list-voices. [default: jane]      │
-  │ --language          TEXT  TTS language (defaults to the voice's language)    │
-  │ --tts-config        TEXT  Set any extra streaming-TTS query field as         │
-  │                           KEY=VALUE (repeatable)                             │
-  ╰──────────────────────────────────────────────────────────────────────────────╯
-  ╭─ Language model ─────────────────────────────────────────────────────────────╮
-  │ --model             TEXT                  LLM Gateway model that powers the  │
-  │                                           agent's replies                    │
-  │                                           [default:                          │
-  │                                           claude-haiku-4-5-20251001]         │
-  │ --max-tokens        INTEGER RANGE [x>=1]  Max tokens per reply               │
-  │                                           [default: 8192]                    │
-  │ --llm-config        TEXT                  Set any LLM Gateway request field  │
-  │                                           as KEY=VALUE (repeatable)          │
-  ╰──────────────────────────────────────────────────────────────────────────────╯
-  ╭─ Speech-to-text ─────────────────────────────────────────────────────────────╮
-  │ --speech-model                            TEXT              Streaming speech │
-  │                                                             model            │
-  │                                                             [default:        │
-  │                                                             u3-rt-pro]       │
-  │ --format-turns       --no-format-turns                      Format           │
-  │                                                             (punctuate)      │
-  │                                                             finalized turns  │
-  │                                                             before replying  │
-  │                                                             [default:        │
-  │                                                             format-turns]    │
-  │ --turn-detection                          [aggressive|bala  Turn-detection   │
-  │                                           nced|conservativ  sensitivity      │
-  │                                           e]                preset           │
-  │ --stt-config                              TEXT              Set any          │
-  │                                                             StreamingParame… │
-  │                                                             field as         │
-  │                                                             KEY=VALUE        │
-  │                                                             (repeatable)     │
-  │ --stt-config-file                         FILE              JSON file of     │
-  │                                                             streaming fields │
-  ╰──────────────────────────────────────────────────────────────────────────────╯
-  
-   Examples
-   Start a live cascade conversation
-   $ assembly --sandbox agent-cascade
-   Pick a voice and opening line
-   $ assembly --sandbox agent-cascade --voice michael --greeting "Hi there"
-   Give the agent a persona
-   $ assembly --sandbox agent-cascade --system-prompt "You are a terse pirate."
-   See available voices
-   $ assembly --sandbox agent-cascade --list-voices
-   Print equivalent Python instead of running
-   $ assembly --sandbox agent-cascade --show-code
-  
-  
-  
-  '''
-# ---
 # name: test_command_help_matches_snapshot[agent]
   '''
   
    Usage: assembly agent [OPTIONS] [SOURCE]
   
-   Hold a live two-way voice conversation with a voice agent
+   Hold a live two-way voice conversation with the Voice Agent API
   
    Use headphones: the mic stays open while the agent speaks, so on
    speakers it would hear itself and loop. Pass an audio file/URL (or
@@ -698,6 +587,126 @@
   
   
   
+  '''
+# ---
+# name: test_command_help_matches_snapshot[live]
+  '''
+  
+   Usage: assembly live [OPTIONS] [SOURCE]
+  
+   [sandbox] Talk live to a tool-using voice agent
+  
+   A real-time spoken conversation, wired client-side from three primitives —
+   Streaming STT, a deepagents brain on the LLM Gateway, and streaming TTS.
+   Unlike
+   'assembly agent' (the Voice Agent API), the brain here is an agent that can
+   use
+   tools mid-conversation — web search, URL fetch, and the AssemblyAI docs — so
+   it
+   answers like a live multimodal assistant. Because it uses streaming TTS it
+   only
+   runs in the sandbox: run it as 'assembly --sandbox live' (--sandbox goes
+   before
+   the subcommand).
+  
+   Use headphones: the mic stays open while the agent speaks, so on speakers it
+   would hear itself and loop. Pass an audio file/URL (or --sample) to speak a
+   recorded clip instead of the microphone; the session then ends after the
+   agent's reply.
+  
+   This only runs a conversation in the terminal — it writes no code. To build
+   an agent-cascade app, run 'assembly init agent-cascade' instead.
+  
+   Web search needs a TAVILY_API_KEY in the environment; without it the agent
+   keeps its URL-fetch and docs tools.
+  
+  ╭─ Arguments ──────────────────────────────────────────────────────────────────╮
+  │   source      [SOURCE]  Audio file path or URL to speak to the agent. Omit   │
+  │                         to use the microphone.                               │
+  ╰──────────────────────────────────────────────────────────────────────────────╯
+  ╭─ Options ────────────────────────────────────────────────────────────────────╮
+  │ --sample                                   Speak the hosted wildfires.mp3    │
+  │                                            sample to the agent               │
+  │ --system-prompt               TEXT         System prompt (the agent's        │
+  │                                            persona)                          │
+  │                                            [default: You are a friendly,     │
+  │                                            concise voice assistant. Keep     │
+  │                                            replies short and conversational. │
+  │                                            Your reply is read aloud by a     │
+  │                                            text-to-speech engine, so write   │
+  │                                            plain spoken prose — no markdown, │
+  │                                            emoji, bullet lists, or code.]    │
+  │ --system-prompt-file          FILE         Read the system prompt from a     │
+  │                                            file (overrides --system-prompt)  │
+  │ --greeting                    TEXT         Spoken greeting                   │
+  │                                            [default: Hi! I'm your AssemblyAI │
+  │                                            voice agent. What can I help you  │
+  │                                            with?]                            │
+  │ --device                      INTEGER      Microphone device index           │
+  │ --list-voices                              Print known voices and exit       │
+  │ --json                -j                   Emit newline-delimited JSON       │
+  │                                            events                            │
+  │ --output              -o      [text|json]  Output mode: text (you:/agent:    │
+  │                                            lines as plain stdout,            │
+  │                                            pipe-friendly) or json            │
+  │ --show-code                                Print the equivalent Python SDK   │
+  │                                            code and exit (does not start a   │
+  │                                            session)                          │
+  │ --help                                     Show this message and exit.       │
+  ╰──────────────────────────────────────────────────────────────────────────────╯
+  ╭─ Text-to-speech ─────────────────────────────────────────────────────────────╮
+  │ --voice             TEXT  TTS voice. See --list-voices. [default: jane]      │
+  │ --language          TEXT  TTS language (defaults to the voice's language)    │
+  │ --tts-config        TEXT  Set any extra streaming-TTS query field as         │
+  │                           KEY=VALUE (repeatable)                             │
+  ╰──────────────────────────────────────────────────────────────────────────────╯
+  ╭─ Language model ─────────────────────────────────────────────────────────────╮
+  │ --model             TEXT                  LLM Gateway model that powers the  │
+  │                                           agent's replies                    │
+  │                                           [default:                          │
+  │                                           claude-haiku-4-5-20251001]         │
+  │ --max-tokens        INTEGER RANGE [x>=1]  Max tokens per reply               │
+  │                                           [default: 8192]                    │
+  │ --llm-config        TEXT                  Set any LLM Gateway request field  │
+  │                                           as KEY=VALUE (repeatable)          │
+  ╰──────────────────────────────────────────────────────────────────────────────╯
+  ╭─ Speech-to-text ─────────────────────────────────────────────────────────────╮
+  │ --speech-model                            TEXT              Streaming speech │
+  │                                                             model            │
+  │                                                             [default:        │
+  │                                                             u3-rt-pro]       │
+  │ --format-turns       --no-format-turns                      Format           │
+  │                                                             (punctuate)      │
+  │                                                             finalized turns  │
+  │                                                             before replying  │
+  │                                                             [default:        │
+  │                                                             format-turns]    │
+  │ --turn-detection                          [aggressive|bala  Turn-detection   │
+  │                                           nced|conservativ  sensitivity      │
+  │                                           e]                preset           │
+  │ --stt-config                              TEXT              Set any          │
+  │                                                             StreamingParame… │
+  │                                                             field as         │
+  │                                                             KEY=VALUE        │
+  │                                                             (repeatable)     │
+  │ --stt-config-file                         FILE              JSON file of     │
+  │                                                             streaming fields │
+  ╰──────────────────────────────────────────────────────────────────────────────╯
+  
+   Examples
+   Start a live voice conversation
+   $ assembly --sandbox live
+   Pick a voice and opening line
+   $ assembly --sandbox live --voice michael --greeting "Hi there"
+   Give the agent a persona
+   $ assembly --sandbox live --system-prompt "You are a terse pirate."
+   See available voices
+   $ assembly --sandbox live --list-voices
+   Print equivalent Python instead of running
+   $ assembly --sandbox live --show-code
+  
+  
+  
   '''
 # ---
 # name: test_command_help_matches_snapshot[llm]
diff --git a/tests/test_agent_cascade_brain.py b/tests/test_agent_cascade_brain.py
new file mode 100644
index 00000000..9f0509ac
--- /dev/null
+++ b/tests/test_agent_cascade_brain.py
@@ -0,0 +1,235 @@
+"""Tests for the deepagents reply brain behind `assembly live`.
+
+The brain's only network seam is the compiled graph, so `build_completer` is driven
+against the *real* deepagents graph wired to a fake chat model (pytest-socket stays
+armed) — no sockets. `build_live_tools` and `build_model`'s new knobs are unit-tested
+directly.
+"""
+
+from __future__ import annotations
+
+from langchain_core.language_models.chat_models import BaseChatModel
+from langchain_core.messages import AIMessage
+from langchain_core.outputs import ChatGeneration, ChatResult
+
+from aai_cli.agent_cascade import brain
+from aai_cli.agent_cascade.config import CascadeConfig
+from aai_cli.code_agent import model as model_mod
+
+
+class FakeChatModel(BaseChatModel):
+    """A chat model that replays a scripted list of AIMessages (mirrors the code agent's)."""
+
+    responses: list[AIMessage]
+    index: int = 0
+
+    @property
+    def _llm_type(self) -> str:
+        return "fake-live-model"
+
+    def bind_tools(self, tools, **kwargs):
+        del tools, kwargs
+        return self
+
+    def _generate(self, messages, stop=None, run_manager=None, **kwargs):
+        del messages, stop, run_manager, kwargs
+        message = self.responses[self.index]
+        self.index += 1
+        return ChatResult(generations=[ChatGeneration(message=message)])
+
+
+def _graph(model: BaseChatModel):
+    from deepagents import create_deep_agent
+
+    return create_deep_agent(model=model, tools=[], system_prompt="be a friendly live agent")
+
+
+# --- build_system_prompt -----------------------------------------------------
+
+
+class _NamedTool:
+    """A stand-in tool exposing just the ``.name`` the prompt builder inspects."""
+
+    def __init__(self, name: str):
+        self.name = name
+
+
+def test_system_prompt_appends_tool_guidance_for_present_tools():
+    prompt = brain.build_system_prompt(
+        "You are a pirate.",
+        tools=[_NamedTool("tavily_search"), _NamedTool("fetch_url"), _NamedTool("docs_search")],
+    )
+    # The persona is preserved, and the guidance advertises each capability that a present
+    # tool backs (the plain cascade persona never mentions tools).
+    assert prompt.startswith("You are a pirate.")
+    assert "search the web" in prompt
+    assert "fetch a specific URL" in prompt
+    assert "AssemblyAI documentation" in prompt
+
+
+def test_system_prompt_omits_web_search_when_no_search_tool():
+    # With no TAVILY_API_KEY the search tool is absent — the guidance must NOT promise web
+    # search, since announcing a missing tool makes the agent narrate "I'll search…" and
+    # then stall with no answer. The capabilities it *does* have still appear.
+    prompt = brain.build_system_prompt(
+        "persona", tools=[_NamedTool("fetch_url"), _NamedTool("docs_search")]
+    )
+    assert "search the web for current or unfamiliar facts" not in prompt
+    assert "fetch a specific URL" in prompt
+    assert "AssemblyAI documentation" in prompt
+
+
+def test_system_prompt_tells_model_not_to_promise_tools_when_none():
+    # No tools at all: the model must answer from its own knowledge and explicitly not
+    # promise to search or look anything up (the bug that left replies never coming back).
+    prompt = brain.build_system_prompt("persona", tools=[])
+    assert "search the web for current or unfamiliar facts" not in prompt
+    assert "your own knowledge" in prompt
+    assert "Never say" in prompt
+
+
+def test_join_clause_grammar():
+    # One/two/three capability phrases each render with natural conjunctions.
+    assert brain._join_clause(["a"]) == "a"
+    assert brain._join_clause(["a", "b"]) == "a and b"
+    assert brain._join_clause(["a", "b", "c"]) == "a, b, and c"
+
+
+def test_web_search_tool_name_matches_built_tool(monkeypatch):
+    # The prompt builder detects search by WEB_SEARCH_TOOL_NAME, so pin it against the real
+    # tool's registered name — if langchain_tavily renames it, detection would silently break.
+    from aai_cli.code_agent import web_search
+
+    monkeypatch.setenv(web_search.TAVILY_API_KEY_ENV, "tvly-x")
+    assert web_search.build_web_search_tool().name == web_search.WEB_SEARCH_TOOL_NAME
+
+
+# --- build_completer (driving the real graph with a fake model) --------------
+
+
+def test_completer_returns_final_spoken_text():
+    graph = _graph(FakeChatModel(responses=[AIMessage(content="Hello there.")]))
+    completer = brain.build_completer("k", CascadeConfig(), graph=graph)
+    reply = completer([{"role": "system", "content": "x"}, {"role": "user", "content": "hi"}])
+    assert reply == "Hello there."
+
+
+def test_completer_strips_system_message_before_invoking():
+    # The cascade prepends its own system message each turn, but the graph already owns
+    # the system prompt — so the completer must drop it before invoking, leaving only the
+    # conversation. We capture what the graph received to prove the system line is gone.
+    captured = {}
+
+    class _CapturingGraph:
+        def invoke(self, value):
+            captured["messages"] = value["messages"]
+            return {"messages": [AIMessage(content="ok")]}
+
+    completer = brain.build_completer("k", CascadeConfig(), graph=_CapturingGraph())
+    completer([{"role": "system", "content": "persona"}, {"role": "user", "content": "hi"}])
+    roles = [m["role"] for m in captured["messages"]]
+    assert roles == ["user"]
+
+
+# --- _reply_text / _content_text ---------------------------------------------
+
+
+def test_reply_text_skips_empty_ai_messages_and_takes_last_text():
+    # Scanning from the end, a trailing empty AIMessage (a tool-call request with no
+    # spoken text) is skipped so the reply falls back to the prior AIMessage's text,
+    # rather than coming back blank.
+    result = {
+        "messages": [
+            AIMessage(content="The answer is 42."),
+            AIMessage(content=""),
+        ]
+    }
+    assert brain._reply_text(result) == "The answer is 42."
+
+
+def test_reply_text_joins_list_content_blocks():
+    result = {"messages": [AIMessage(content=[{"type": "text", "text": "Hello "}, "world"])]}
+    assert brain._reply_text(result) == "Hello world"
+
+
+def test_reply_text_skips_non_assistant_messages():
+    from langchain_core.messages import ToolMessage
+
+    # Scanning from the end, a trailing non-assistant message (e.g. a tool result) is
+    # skipped — the spoken reply is the AIMessage before it.
+    result = {
+        "messages": [
+            AIMessage(content="hello there"),
+            ToolMessage(content="tool output", tool_call_id="c1"),
+        ]
+    }
+    assert brain._reply_text(result) == "hello there"
+
+
+def test_content_text_coerces_unexpected_content():
+    # A content that is neither a string nor a list of blocks (defensive fallback).
+    assert brain._content_text(123) == "123"
+
+
+def test_reply_text_is_empty_without_an_assistant_message():
+    assert brain._reply_text({"messages": []}) == ""
+    assert brain._reply_text({}) == ""
+
+
+# --- build_live_tools --------------------------------------------------------
+
+
+def test_build_live_tools_includes_search_when_keyed(monkeypatch):
+    search = object()
+    monkeypatch.setattr("aai_cli.code_agent.fetch_tool.build_fetch_tool", lambda: "fetch")
+    monkeypatch.setattr("aai_cli.code_agent.web_search.build_web_search_tool", lambda: search)
+    monkeypatch.setattr("aai_cli.code_agent.docs_mcp.load_docs_tools", lambda: ["docs"])
+    tools = brain.build_live_tools()
+    # Fetch + the keyed search + the docs tools, in that order.
+    assert tools == ["fetch", search, "docs"]
+
+
+def test_build_live_tools_omits_search_when_unkeyed(monkeypatch):
+    monkeypatch.setattr("aai_cli.code_agent.fetch_tool.build_fetch_tool", lambda: "fetch")
+    monkeypatch.setattr("aai_cli.code_agent.web_search.build_web_search_tool", lambda: None)
+    monkeypatch.setattr("aai_cli.code_agent.docs_mcp.load_docs_tools", list)
+    tools = brain.build_live_tools()
+    # No TAVILY_API_KEY -> no search tool, just the fetch tool.
+    assert tools == ["fetch"]
+
+
+# --- build_graph (model construction + compile, with the docs probe skipped) -
+
+
+def test_build_graph_uses_gateway_model_and_runs_offline(monkeypatch):
+    captured = {}
+
+    def fake_build_model(api_key, *, model, max_tokens, extra):
+        captured["model"] = model
+        captured["max_tokens"] = max_tokens
+        captured["extra"] = dict(extra)
+        return FakeChatModel(responses=[AIMessage(content="hi from the agent")])
+
+    monkeypatch.setattr(model_mod, "build_model", fake_build_model)
+    cfg = CascadeConfig(model="claude-x", max_tokens=128, llm_extra={"temperature": 0.2})
+    graph = brain.build_graph("k", cfg, tools=[])
+    # The cascade's model + knobs are threaded into the gateway model build.
+    assert captured == {"model": "claude-x", "max_tokens": 128, "extra": {"temperature": 0.2}}
+    # The compiled graph is a real deepagents graph that answers offline via the fake model.
+    completer = brain.build_completer("k", cfg, graph=graph)
+    assert completer([{"role": "user", "content": "hi"}]) == "hi from the agent"
+
+
+# --- build_model new knobs ---------------------------------------------------
+
+
+def test_build_model_threads_max_tokens_and_extra():
+    model = model_mod.build_model("k", model="claude-x", max_tokens=222, extra={"top_k": 5})
+    assert model.max_tokens == 222
+    assert model.extra_body == {"top_k": 5}
+
+
+def test_build_model_defaults_have_no_extra():
+    model = model_mod.build_model("k", model="claude-x")
+    assert model.max_tokens is None
+    assert model.extra_body is None
diff --git a/tests/test_agent_cascade_command.py b/tests/test_agent_cascade_command.py
index 513dc1cc..93d25a4e 100644
--- a/tests/test_agent_cascade_command.py
+++ b/tests/test_agent_cascade_command.py
@@ -1,4 +1,4 @@
-"""Command + wiring tests for `assembly agent-cascade`.
+"""Command + wiring tests for `assembly live`.
 
 Covers the argv -> options seam, the validation guards, _open_audio source
 selection, and CascadeDeps.real's three live legs (all driven against fakes).
@@ -60,14 +60,14 @@ def _opts(**overrides) -> AgentCascadeOptions:
 
 
 def test_list_voices_human_lists_catalog():
-    result = runner.invoke(app, ["agent-cascade", "--list-voices"])
+    result = runner.invoke(app, ["live", "--list-voices"])
     assert result.exit_code == 0
     assert "jane" in result.output
     assert "English:" in result.output
 
 
 def test_list_voices_json_emits_array():
-    result = runner.invoke(app, ["agent-cascade", "--list-voices", "--json"])
+    result = runner.invoke(app, ["live", "--list-voices", "--json"])
     assert result.exit_code == 0
     assert result.output.lstrip().startswith("[")
     assert '"jane"' in result.output
@@ -92,14 +92,14 @@ def test_missing_system_prompt_file_is_rejected_by_typer():
     # so the sandbox guard (the other exit-2 path) never runs. Asserting the guard's
     # message is absent kills the exists=True mutant without depending on the Rich error
     # text, which CI renders with ANSI + width ellipsis.
-    result = runner.invoke(app, ["agent-cascade", "--system-prompt-file", "/no/such/file"])
+    result = runner.invoke(app, ["live", "--system-prompt-file", "/no/such/file"])
     assert result.exit_code == 2
     assert "sandbox" not in result.output.lower()
 
 
 def test_production_env_is_rejected_with_sandbox_hint():
     # Default env is production, which has no streaming-TTS host.
-    result = runner.invoke(app, ["agent-cascade", "--voice", "jane"])
+    result = runner.invoke(app, ["live", "--voice", "jane"])
     assert result.exit_code == 2
     assert "only available in the sandbox" in result.output
 
@@ -126,7 +126,7 @@ def fake_run(opts, state, *, json_mode):
         captured["opts"] = opts
 
     monkeypatch.setattr(_exec, "run_agent_cascade", fake_run)
-    result = runner.invoke(app, ["agent-cascade", *argv])
+    result = runner.invoke(app, ["live", *argv])
     assert result.exit_code == 0
     assert captured["opts"].format_turns is expected
 
@@ -137,7 +137,7 @@ def test_stt_config_file_must_exist():
     # terminal so the "does not exist" message isn't wrapped by the 80-col error box.
     result = runner.invoke(
         app,
-        ["agent-cascade", "--stt-config-file", "/no/such/file.json"],
+        ["live", "--stt-config-file", "/no/such/file.json"],
         env={"COLUMNS": "300"},
     )
     assert result.exit_code == 2
@@ -418,36 +418,23 @@ def fake_stream_audio(api_key, source, *, params, on_turn):
     assert captured["params"] is params
 
 
-def test_deps_real_complete_reply_threads_model_tokens_and_extra(monkeypatch):
+def test_deps_real_complete_reply_is_built_by_the_deepagents_brain(monkeypatch):
+    # The LLM leg is now a deepagents graph: .real delegates to brain.build_completer,
+    # passing the api key + config, and uses whatever completer it returns. We assert the
+    # exact wiring so the brain swap (not a plain llm.complete) can't silently regress.
     captured = {}
 
-    def fake_complete(api_key, **kwargs):
-        captured.update(kwargs)
-        return "raw-response"
+    def fake_build_completer(api_key, config):
+        captured["api_key"] = api_key
+        captured["config"] = config
+        return lambda messages: f"reply to {messages[-1]['content']}"
 
-    monkeypatch.setattr(engine.llm, "complete", fake_complete)
-    monkeypatch.setattr(engine.llm, "content_of", lambda response: response.upper())
+    monkeypatch.setattr(engine.brain, "build_completer", fake_build_completer)
     cfg = CascadeConfig(model="m", max_tokens=222, llm_extra={"temperature": 0.5})
     deps = CascadeDeps.real("k", cfg, audio=[], stt_params=_stt_params())
-    assert deps.complete_reply([{"role": "user", "content": "hi"}]) == "RAW-RESPONSE"
-    assert captured["model"] == "m"
-    assert captured["max_tokens"] == 222
-    assert captured["extra"] == {"temperature": 0.5}
-
-
-def test_deps_real_complete_reply_sends_no_extra_when_unset(monkeypatch):
-    captured = {}
-
-    def fake_complete(api_key, **kwargs):
-        captured.update(kwargs)
-        return "x"
-
-    monkeypatch.setattr(engine.llm, "complete", fake_complete)
-    monkeypatch.setattr(engine.llm, "content_of", lambda response: response)
-    deps = CascadeDeps.real("k", CascadeConfig(), audio=[], stt_params=_stt_params())
-    deps.complete_reply([{"role": "user", "content": "hi"}])
-    # Empty overrides collapse to None, not an empty dict, so the gateway sees no extra body.
-    assert captured["extra"] is None
+    assert deps.complete_reply([{"role": "user", "content": "hi"}]) == "reply to hi"
+    assert captured["api_key"] == "k"
+    assert captured["config"] is cfg
 
 
 def test_deps_real_synthesize_threads_voice_language_and_extra(monkeypatch):
diff --git a/tests/test_agent_cascade_show_code.py b/tests/test_agent_cascade_show_code.py
index d05b5874..97bbe0ff 100644
--- a/tests/test_agent_cascade_show_code.py
+++ b/tests/test_agent_cascade_show_code.py
@@ -1,4 +1,4 @@
-"""`assembly agent-cascade --show-code` tests.
+"""`assembly live --show-code` tests.
 
 Split from test_agent_cascade_command.py (which holds the run-path wiring) so the
 print-only path's many invocations live in their own file. The cascade is
@@ -33,7 +33,7 @@ def _boom(**kwargs):
     )
     result = runner.invoke(
         app,
-        ["--sandbox", "agent-cascade", "--voice", "jane", "--greeting", "Hi there", "--show-code"],
+        ["--sandbox", "live", "--voice", "jane", "--greeting", "Hi there", "--show-code"],
     )
     assert result.exit_code == 0
     # Targets the sandbox the key was minted for — all three legs.
@@ -54,25 +54,23 @@ def fake_run(opts, state, *, json_mode):
         captured["opts"] = opts
 
     monkeypatch.setattr(_exec, "run_agent_cascade", fake_run)
-    assert runner.invoke(app, ["agent-cascade"]).exit_code == 0
+    assert runner.invoke(app, ["live"]).exit_code == 0
     assert captured["opts"].show_code is False
-    assert runner.invoke(app, ["agent-cascade", "--show-code"]).exit_code == 0
+    assert runner.invoke(app, ["live", "--show-code"]).exit_code == 0
     assert captured["opts"].show_code is True
 
 
 def test_show_code_injects_speech_model(monkeypatch):
     monkeypatch.setattr(_exec.engine, "run_cascade", lambda **kw: None)
-    result = runner.invoke(
-        app, ["--sandbox", "agent-cascade", "--speech-model", "u3-rt-pro", "--show-code"]
-    )
+    result = runner.invoke(app, ["--sandbox", "live", "--speech-model", "u3-rt-pro", "--show-code"])
     assert result.exit_code == 0
     assert "speech_model=u3-rt-pro" in result.stdout
 
 
 def test_show_code_reflects_no_format_turns(monkeypatch):
     monkeypatch.setattr(_exec.engine, "run_cascade", lambda **kw: None)
-    formatted = runner.invoke(app, ["--sandbox", "agent-cascade", "--show-code"])
-    bare = runner.invoke(app, ["--sandbox", "agent-cascade", "--no-format-turns", "--show-code"])
+    formatted = runner.invoke(app, ["--sandbox", "live", "--show-code"])
+    bare = runner.invoke(app, ["--sandbox", "live", "--no-format-turns", "--show-code"])
     # With formatting on the cue waits for the punctuated turn; off, a bare end-of-turn fires.
     assert "turn_is_formatted" in formatted.stdout
     assert "turn_is_formatted" not in bare.stdout
@@ -83,7 +81,7 @@ def test_show_code_threads_model_and_max_tokens(monkeypatch):
     monkeypatch.setattr(_exec.engine, "run_cascade", lambda **kw: None)
     result = runner.invoke(
         app,
-        ["--sandbox", "agent-cascade", "--model", "claude-x", "--max-tokens", "321", "--show-code"],
+        ["--sandbox", "live", "--model", "claude-x", "--max-tokens", "321", "--show-code"],
     )
     assert result.exit_code == 0
     assert "claude-x" in result.stdout
@@ -95,7 +93,7 @@ def test_show_code_file_source_warns_on_stderr(monkeypatch):
     monkeypatch.setattr(
         _exec.engine, "run_cascade", lambda **kw: (_ for _ in ()).throw(AssertionError("no run"))
     )
-    result = runner.invoke(app, ["--sandbox", "agent-cascade", "clip.wav", "--show-code"])
+    result = runner.invoke(app, ["--sandbox", "live", "clip.wav", "--show-code"])
     assert result.exit_code == 0
     assert "uses the microphone" in result.stderr
     assert "uses the microphone" not in result.stdout  # stdout stays a clean script
@@ -104,13 +102,13 @@ def test_show_code_file_source_warns_on_stderr(monkeypatch):
 
 def test_show_code_mic_emits_no_warning(monkeypatch):
     monkeypatch.setattr(_exec.engine, "run_cascade", lambda **kw: None)
-    result = runner.invoke(app, ["--sandbox", "agent-cascade", "--show-code"])
+    result = runner.invoke(app, ["--sandbox", "live", "--show-code"])
     assert result.exit_code == 0
     assert "uses the microphone" not in result.stderr  # mic script matches the run, nothing to warn
 
 
 def test_show_code_in_production_is_rejected_with_sandbox_hint():
     # --show-code still honors the sandbox-only guard, so the generated URLs are valid.
-    result = runner.invoke(app, ["agent-cascade", "--show-code"])
+    result = runner.invoke(app, ["live", "--show-code"])
     assert result.exit_code == 2
     assert "only available in the sandbox" in result.output
diff --git a/tests/test_code_agent.py b/tests/test_code_agent.py
index 76af37af..0e5d17c5 100644
--- a/tests/test_code_agent.py
+++ b/tests/test_code_agent.py
@@ -344,49 +344,6 @@ def invoke(self, *a, **k):
     assert any(isinstance(e, ErrorText) and "gateway 500" in e.text for e in seen)
 
 
-class StreamingAgent:
-    """A double exercising the streaming path: yields scripted state snapshots."""
-
-    def __init__(self, chunks: list[dict[str, object]]) -> None:
-        self._chunks = chunks
-
-    def stream(self, graph_input, config=None, *, stream_mode="values"):
-        del graph_input, config, stream_mode
-        yield from self._chunks
-
-    def invoke(self, *a, **k):  # the streaming branch is taken, so invoke is never used
-        raise AssertionError("a streaming agent must not be invoked")
-
-
-def test_send_streams_each_step_and_cancel_stops_the_loop() -> None:
-    from langchain_core.messages import HumanMessage
-
-    # Three successive graph states (messages grow by one each step); a stream_mode="values"
-    # graph yields exactly these snapshots, so the session must emit incrementally.
-    chunks: list[dict[str, object]] = [
-        {"messages": [HumanMessage("go")]},
-        {"messages": [HumanMessage("go"), AIMessage("first")]},
-        {"messages": [HumanMessage("go"), AIMessage("first"), AIMessage("second")]},
-    ]
-    seen: list[object] = []
-    session = CodeSession(
-        agent=StreamingAgent(chunks), sink=seen.append, approver=lambda n, a: True
-    )
-
-    def sink(event: object) -> None:
-        seen.append(event)
-        if isinstance(event, AssistantText) and event.text == "first":
-            session.request_cancel()  # cancel mid-stream, before the "second" chunk is consumed
-
-    session.sink = sink
-    session.send("go")
-
-    texts = [e.text for e in seen if isinstance(e, AssistantText)]
-    # "first" streamed out as its step landed; the cancel then broke the loop, so the later
-    # "second" step was never emitted — proving both incremental rendering and cancellation.
-    assert texts == ["first"]
-
-
 def test_session_propagates_keyboard_interrupt() -> None:
     class Stop:
         def invoke(self, *a, **k):
diff --git a/tests/test_code_messages.py b/tests/test_code_messages.py
new file mode 100644
index 00000000..9a1168d4
--- /dev/null
+++ b/tests/test_code_messages.py
@@ -0,0 +1,149 @@
+"""Tests for the mounted-widget transcript of the `assembly code` TUI.
+
+Drives the real Textual app (headless) and asserts on the mounted message widgets: the reply
+streams into one AssistantMessage in place and renders as Markdown, and a long tool result is
+a collapsible ToolOutput (Ctrl-O / click). Split from test_code_tui.py to stay under the
+file-length gate.
+"""
+
+from __future__ import annotations
+
+import asyncio
+
+from aai_cli.code_agent.events import AssistantDelta, AssistantText, ToolResult
+from aai_cli.code_agent.messages import AssistantMessage, ToolOutput
+from aai_cli.code_agent.tui import CodeAgentApp
+
+
+class FakeAgent:
+    """Replays scripted invoke() results so a turn can complete without a model."""
+
+    def __init__(self, results: list[dict[str, object]]) -> None:
+        self._results = results
+        self.calls = 0
+
+    def invoke(self, *args, **kwargs):
+        result = self._results[self.calls]
+        self.calls += 1
+        return result
+
+
+def _run(coro) -> None:
+    asyncio.run(coro)
+
+
+def test_assistant_reply_renders_as_markdown_widget() -> None:
+    # The reply mounts an AssistantMessage rendered as Markdown — the fence markers are
+    # consumed and the code shows; the raw text is kept for clipboard copy.
+    async def go() -> None:
+        app = CodeAgentApp(agent=FakeAgent([]))
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            reply = "Here you go:\n\n```python\nprint('hi')\n```"
+            app._write_event(AssistantText(reply))
+            await pilot.pause()
+            msg = app.query_one(AssistantMessage)
+            text = "\n".join(msg.render_line(y).text for y in range(msg.size.height))
+            assert "```" not in text  # markdown consumed the fence markers
+            assert "print('hi')" in text  # the code itself renders
+            assert app._last_reply == reply  # raw markdown kept for clipboard copy
+
+    _run(go())
+
+
+def test_assistant_deltas_stream_in_place_then_finalize() -> None:
+    # Tokens stream into a single AssistantMessage in place (no separate region); the final
+    # AssistantText finalizes that same widget rather than mounting a second one.
+    async def go() -> None:
+        app = CodeAgentApp(agent=FakeAgent([]))
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            app._write_event(AssistantDelta("Hello, "))
+            app._write_event(AssistantDelta("world!"))
+            await pilot.pause()
+            assert len(app.query(AssistantMessage)) == 1  # one widget, updated in place
+            assert app.query_one(AssistantMessage).text == "Hello, world!"
+            streaming = app._streaming_msg  # local: asserting on the attr would poison the
+            assert streaming is not None  # later `is None` check (mypy can't see the reset)
+            app._write_event(AssistantText("Hello, world!"))
+            await pilot.pause()
+            assert app._streaming_msg is None  # finalized
+            assert app._last_reply == "Hello, world!"
+            assert len(app.query(AssistantMessage)) == 1  # finalized in place, not a 2nd widget
+
+    _run(go())
+
+
+def test_finish_turn_finalizes_a_dangling_streamed_reply() -> None:
+    # A turn cancelled mid-generation leaves a streamed-but-unfinalized reply; finishing the
+    # turn commits what streamed in (so it isn't lost) and clears the streaming reference.
+    async def go() -> None:
+        app = CodeAgentApp(agent=FakeAgent([]))
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            app._write_event(AssistantDelta("partial repl"))
+            await pilot.pause()
+            streaming = app._streaming_msg  # local so the later `is None` check stays reachable
+            assert streaming is not None
+            app._finish_turn()
+            assert app._streaming_msg is None  # finalized, not left dangling
+            assert app.query_one(AssistantMessage).text == "partial repl"  # kept what streamed
+
+    _run(go())
+
+
+def test_short_tool_output_is_not_expandable() -> None:
+    # Output that already fits has no expand affordance and Ctrl-O is a no-op on it.
+    async def go() -> None:
+        app = CodeAgentApp(agent=FakeAgent([]))
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            app._write_event(ToolResult(name="execute", content="ok"))
+            await pilot.pause()
+            out = app.query_one(ToolOutput)
+            before = str(out.render())
+            assert "Ctrl+O" not in before  # nothing to expand -> no hint
+            out.toggle()
+            assert str(out.render()) == before  # toggle is a no-op when it all fits
+
+    _run(go())
+
+
+def test_tool_output_toggles_on_click_and_ctrl_o_is_safe_with_no_output() -> None:
+    async def go() -> None:
+        app = CodeAgentApp(agent=FakeAgent([]))
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            app.action_toggle_output()  # no tool output yet -> safe no-op
+            app._write_event(
+                ToolResult(name="execute", content="\n".join(f"x{i}" for i in range(20)))
+            )
+            await pilot.pause()
+            out = app.query_one(ToolOutput)
+            assert "x19" not in str(out.render())
+            out.on_click()  # clicking expands
+            assert "x19" in str(out.render())
+
+    _run(go())
+
+
+def test_tool_output_expands_and_collapses_on_ctrl_o() -> None:
+    # A long tool result mounts a collapsed ToolOutput (preview + "more lines"); Ctrl-O
+    # expands it to the full content and toggles back.
+    async def go() -> None:
+        app = CodeAgentApp(agent=FakeAgent([]))
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            app._write_event(
+                ToolResult(name="execute", content="\n".join(f"ln{i}" for i in range(20)))
+            )
+            await pilot.pause()
+            out = app.query_one(ToolOutput)
+            collapsed = str(out.render())
+            assert "ln0" in collapsed and "more lines" in collapsed and "ln19" not in collapsed
+            app.action_toggle_output()  # Ctrl-O expands the most recent output
+            assert "ln19" in str(out.render())  # full content now shown
+            app.action_toggle_output()  # toggles back to the preview
+            assert "ln19" not in str(out.render())
+
+    _run(go())
diff --git a/tests/test_code_modals.py b/tests/test_code_modals.py
new file mode 100644
index 00000000..5ad276b8
--- /dev/null
+++ b/tests/test_code_modals.py
@@ -0,0 +1,236 @@
+"""Tests for the spoken/voice-answerable approval and ask modals.
+
+The pure ``approval_from_speech`` mapping is unit-tested directly; the screen wiring (speak the
+prompt, listen, dismiss with the mapped decision) is driven through the real app headless with
+a scripted voice double — no mic, speaker, or socket.
+"""
+
+from __future__ import annotations
+
+import asyncio
+
+import pytest
+from textual.widgets import Input
+
+from aai_cli.code_agent.modals import ApprovalScreen, AskScreen, approval_from_speech
+from aai_cli.code_agent.tui import CodeAgentApp
+from aai_cli.core.errors import CLIError
+
+
+class FakeAgent:
+    def invoke(self, *a, **k):
+        return {}
+
+
+class FakeVoice:
+    """Scripted voice IO: speak() records, listen() replays one transcript (or raises)."""
+
+    def __init__(self, transcript: str | None = None, *, error: CLIError | None = None) -> None:
+        self._transcript = transcript
+        self._error = error
+        self.spoken: list[str] = []
+
+    def speak(self, text: str) -> None:
+        self.spoken.append(text)
+
+    def listen(self) -> str | None:
+        if self._error is not None:
+            raise self._error
+        return self._transcript
+
+
+def _run(coro) -> None:
+    asyncio.run(coro)
+
+
+@pytest.mark.parametrize(
+    ("said", "decision"),
+    [
+        ("yes please", "approve"),
+        ("approve that", "approve"),
+        ("go ahead", "approve"),
+        ("auto approve", "auto"),
+        ("always do this", "auto"),
+        ("no", "reject"),
+        ("reject it", "reject"),
+        ("don't", "reject"),
+        ("yes but no", "reject"),  # reject wins over approve when both are heard (safer)
+        ("uhh what", "reject"),  # unclear -> safe default
+    ],
+)
+def test_approval_from_speech(said: str, decision: str) -> None:
+    assert approval_from_speech(said) == decision
+
+
+async def _push_and_wait(app, pilot, screen) -> object:
+    box: dict[str, object] = {}
+    app.push_screen(screen, lambda result: box.update(value=result))
+    for _ in range(300):
+        await pilot.pause(0.01)
+        if "value" in box:
+            break
+    return box.get("value", "__pending__")
+
+
+def test_spoken_approval_speaks_prompt_and_maps_answer() -> None:
+    async def go() -> None:
+        app = CodeAgentApp(agent=FakeAgent())
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            voice = FakeVoice(transcript="yes go for it")
+            result = await _push_and_wait(
+                app, pilot, ApprovalScreen("execute", {"command": "rm -rf build"}, voice=voice)
+            )
+            assert result == "approve"  # spoken "yes" mapped to approve
+            prompt = voice.spoken[0]
+            assert "Run execute" in prompt and "rm -rf build" in prompt
+            assert "Warning:" in prompt  # the risky command is read aloud
+            assert "approve, auto-approve, or reject" in prompt
+
+    _run(go())
+
+
+def test_spoken_approval_rejects_on_no() -> None:
+    async def go() -> None:
+        app = CodeAgentApp(agent=FakeAgent())
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            result = await _push_and_wait(
+                app, pilot, ApprovalScreen("write_file", {"file_path": "x"}, voice=FakeVoice("no"))
+            )
+            assert result == "reject"
+
+    _run(go())
+
+
+def test_spoken_ask_speaks_question_and_returns_transcript() -> None:
+    async def go() -> None:
+        app = CodeAgentApp(agent=FakeAgent())
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            voice = FakeVoice(transcript="use port 8080")
+            result = await _push_and_wait(app, pilot, AskScreen("Which port?", voice=voice))
+            assert result == "use port 8080"  # spoken answer returned verbatim
+            assert "The agent asks: Which port?" in voice.spoken[0]
+
+    _run(go())
+
+
+def test_silence_does_not_auto_reject() -> None:
+    # No speech (listen -> None) must not auto-decide — the modal waits for speech or a keypress
+    # rather than rejecting a tool on a pause.
+    async def go() -> None:
+        app = CodeAgentApp(agent=FakeAgent())
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            box: dict[str, object] = {}
+            app.push_screen(
+                ApprovalScreen("execute", {"command": "ls"}, voice=FakeVoice(None)),
+                lambda result: box.update(value=result),
+            )
+            for _ in range(50):
+                await pilot.pause(0.01)
+            assert "value" not in box  # silence -> not dismissed
+
+    _run(go())
+
+
+def test_voice_failure_falls_back_to_keyboard() -> None:
+    # If the mic/STT fails, the modal isn't auto-dismissed — the user can still press a key.
+    async def go() -> None:
+        app = CodeAgentApp(agent=FakeAgent())
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            voice = FakeVoice(error=CLIError("no mic", error_type="mic_missing", exit_code=2))
+            box: dict[str, object] = {}
+            app.push_screen(
+                ApprovalScreen("execute", {"command": "ls"}, voice=voice),
+                lambda result: box.update(value=result),
+            )
+            for _ in range(50):
+                await pilot.pause(0.01)
+            assert "value" not in box  # voice failed -> not auto-dismissed
+            await pilot.press("n")  # keyboard still works
+            await pilot.pause()
+            assert box.get("value") == "reject"
+
+    _run(go())
+
+
+def test_ask_voice_failure_falls_back_to_typing() -> None:
+    # An ask modal whose voice fails isn't dismissed; the user types the answer instead.
+    async def go() -> None:
+        app = CodeAgentApp(agent=FakeAgent())
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            voice = FakeVoice(error=CLIError("no mic", error_type="mic_missing", exit_code=2))
+            box: dict[str, object] = {}
+            app.push_screen(AskScreen("Which port?", voice=voice), lambda r: box.update(value=r))
+            for _ in range(50):
+                await pilot.pause(0.01)
+            assert "value" not in box  # voice failed -> not auto-dismissed
+            app.screen.query_one("#answer", Input).value = "8080"
+            await pilot.press("enter")
+            await pilot.pause()
+            assert box.get("value") == "8080"
+
+    _run(go())
+
+
+def test_spoken_prompt_omits_detail_when_no_args() -> None:
+    # A tool with no identifying arg reads as just "Run <tool>. Say approve…" (no detail clause).
+    async def go() -> None:
+        app = CodeAgentApp(agent=FakeAgent())
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            voice = FakeVoice(transcript="yes")
+            result = await _push_and_wait(app, pilot, ApprovalScreen("noop", {}, voice=voice))
+            assert result == "approve"
+            assert "Run noop. Say approve" in voice.spoken[0]  # straight to the options
+
+    _run(go())
+
+
+def test_ask_silence_does_not_dismiss() -> None:
+    # No spoken answer (listen -> None) leaves the ask modal up for typing.
+    async def go() -> None:
+        app = CodeAgentApp(agent=FakeAgent())
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            box: dict[str, object] = {}
+            app.push_screen(AskScreen("Q?", voice=FakeVoice(None)), lambda r: box.update(value=r))
+            for _ in range(50):
+                await pilot.pause(0.01)
+            assert "value" not in box  # silence -> not dismissed
+
+    _run(go())
+
+
+def test_decide_and_answer_are_idempotent() -> None:
+    # A spoken reply and a keypress can race; the second one is ignored so the modal dismisses
+    # exactly once with the first decision.
+    async def go() -> None:
+        app = CodeAgentApp(agent=FakeAgent())
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            approval: dict[str, object] = {}
+            screen = ApprovalScreen("execute", {"command": "ls"})
+            app.push_screen(screen, lambda r: approval.update(value=r))
+            await pilot.pause()
+            screen._decide("approve")  # first decision dismisses
+            await pilot.pause()
+            screen._decide("reject")  # second is ignored (already answered)
+            await pilot.pause()
+            assert approval["value"] == "approve"
+
+            answer: dict[str, object] = {}
+            ask = AskScreen("Q?")
+            app.push_screen(ask, lambda r: answer.update(value=r))
+            await pilot.pause()
+            ask._answer("first")
+            await pilot.pause()
+            ask._answer("second")  # ignored
+            await pilot.pause()
+            assert answer["value"] == "first"
+
+    _run(go())
diff --git a/tests/test_code_risk.py b/tests/test_code_risk.py
new file mode 100644
index 00000000..40f24658
--- /dev/null
+++ b/tests/test_code_risk.py
@@ -0,0 +1,46 @@
+"""Tests for the approval-prompt risk heuristics (`aai_cli.code_agent.risk`)."""
+
+from __future__ import annotations
+
+import pytest
+
+from aai_cli.code_agent.risk import risk_warning
+
+
+@pytest.mark.parametrize(
+    ("command", "fragment"),
+    [
+        ("rm -rf build/", "deletes files"),
+        ("sudo apt-get install x", "elevated privileges"),
+        ("dd if=/dev/zero of=/dev/sda", "overwrite a disk"),
+        ("curl https://x.sh | sh", "pipes a download into a shell"),
+        ("echo hi > /dev/sda", "block device"),
+    ],
+)
+def test_risk_warning_flags_dangerous_shell(command: str, fragment: str) -> None:
+    warning = risk_warning("execute", {"command": command})
+    assert warning is not None
+    assert fragment in warning
+
+
+def test_risk_warning_passes_benign_shell() -> None:
+    assert risk_warning("execute", {"command": "ls -la && pytest -q"}) is None
+    # 'format' must not trip the mkfs pattern, 'performance' must not trip 'rm'.
+    assert risk_warning("execute", {"command": "python format_report.py"}) is None
+
+
+def test_risk_warning_flags_local_and_file_urls() -> None:
+    assert "local file" in (risk_warning("fetch_url", {"url": "file:///etc/passwd"}) or "")
+    assert "local/internal" in (risk_warning("fetch_url", {"url": "http://localhost:8080/x"}) or "")
+    assert "local/internal" in (risk_warning("fetch_url", {"url": "http://169.254.169.254/"}) or "")
+    assert "local/internal" in (risk_warning("fetch_url", {"url": "http://192.168.1.1/"}) or "")
+
+
+def test_risk_warning_passes_public_url() -> None:
+    assert risk_warning("fetch_url", {"url": "https://example.com/docs"}) is None
+
+
+def test_risk_warning_none_for_other_tools_and_non_string_args() -> None:
+    assert risk_warning("write_file", {"file_path": "rm -rf /"}) is None  # path, not a command
+    assert risk_warning("execute", {"command": ["rm", "-rf"]}) is None  # non-string is ignored
+    assert risk_warning("fetch_url", {"url": 123}) is None
diff --git a/tests/test_code_session_stream.py b/tests/test_code_session_stream.py
new file mode 100644
index 00000000..5c59803b
--- /dev/null
+++ b/tests/test_code_session_stream.py
@@ -0,0 +1,157 @@
+"""Tests for `CodeSession`'s dual-mode streaming and cooperative cancellation.
+
+Split from `test_code_agent.py` (which drives the real graph) to keep each file under the
+500-line gate. These exercise the streaming loop with lightweight fakes: the session renders
+from per-super-step ``"values"`` snapshots and checks the cancel flag on the frequent
+per-token ``"messages"`` deltas, so a long generation can be interrupted promptly.
+"""
+
+from __future__ import annotations
+
+from langchain_core.messages import AIMessage, HumanMessage, ToolMessage
+
+from aai_cli.code_agent.events import AssistantDelta, AssistantText, assistant_delta
+from aai_cli.code_agent.session import CodeSession
+
+
+class StreamingAgent:
+    """A double exercising the dual-mode streaming path.
+
+    Mirrors langgraph's ``stream_mode=["values", "messages"]`` contract: each scripted state
+    snapshot is yielded tagged as ``("values", snapshot)``, optionally preceded by
+    ``("messages", delta)`` per-token deltas (the fine-grained cancellation checkpoints).
+    """
+
+    def __init__(
+        self, chunks: list[dict[str, object]], *, token_deltas: tuple[str, ...] = ()
+    ) -> None:
+        self._chunks = chunks
+        self._token_deltas = token_deltas
+
+    def stream(self, graph_input, config=None, *, stream_mode=("values", "messages")):
+        del graph_input, config, stream_mode
+        for delta in self._token_deltas:
+            yield ("messages", delta)
+        for chunk in self._chunks:
+            yield ("values", chunk)
+
+    def invoke(self, *a, **k):  # the streaming branch is taken, so invoke is never used
+        raise AssertionError("a streaming agent must not be invoked")
+
+
+def test_assistant_delta_is_frozen_hashable() -> None:
+    # frozen=True makes it immutable+hashable; a non-frozen eq dataclass sets __hash__=None,
+    # so hash() would raise — this keeps the event safe to dedupe/compare and pins `frozen`.
+    assert hash(AssistantDelta("x")) == hash(AssistantDelta("x"))
+
+
+def test_assistant_delta_extracts_only_ai_text() -> None:
+    # messages-mode yields (message, metadata); only AI text becomes a delta.
+    assert assistant_delta((AIMessage("tok"), {"node": "agent"})) == AssistantDelta("tok")
+    assert assistant_delta(AIMessage("bare")) == AssistantDelta("bare")  # untupled is fine too
+    assert assistant_delta((AIMessage(""), {})) is None  # empty content (e.g. a tool-call turn)
+    assert assistant_delta((ToolMessage("result", tool_call_id="1"), {})) is None  # not assistant
+    assert assistant_delta(()) is None  # defensive: empty payload
+
+
+def test_send_emits_assistant_deltas_from_messages_stream() -> None:
+    # The per-token messages chunks are surfaced as AssistantDelta (live preview), and the
+    # values snapshot still yields the authoritative AssistantText.
+    seen: list[object] = []
+
+    class TokenAgent:
+        def stream(self, graph_input, config=None, *, stream_mode=("values", "messages")):
+            del graph_input, config, stream_mode
+            yield ("messages", (AIMessage("Hello, "), {}))
+            yield ("messages", (AIMessage("world"), {}))
+            yield ("values", {"messages": [AIMessage("Hello, world")]})
+
+        def invoke(self, *a, **k):
+            raise AssertionError("a streaming agent must not be invoked")
+
+    session = CodeSession(agent=TokenAgent(), sink=seen.append, approver=lambda n, a: True)
+    session.send("go")
+
+    deltas = [e.text for e in seen if isinstance(e, AssistantDelta)]
+    finals = [e.text for e in seen if isinstance(e, AssistantText)]
+    assert deltas == ["Hello, ", "world"]  # streamed tokens
+    assert finals == ["Hello, world"]  # authoritative full reply from the values snapshot
+
+
+def test_send_streams_each_step_and_cancel_stops_the_loop() -> None:
+    # Three successive graph states (messages grow by one each step); a stream_mode="values"
+    # graph yields exactly these snapshots, so the session must emit incrementally.
+    chunks: list[dict[str, object]] = [
+        {"messages": [HumanMessage("go")]},
+        {"messages": [HumanMessage("go"), AIMessage("first")]},
+        {"messages": [HumanMessage("go"), AIMessage("first"), AIMessage("second")]},
+    ]
+    seen: list[object] = []
+    session = CodeSession(
+        agent=StreamingAgent(chunks), sink=seen.append, approver=lambda n, a: True
+    )
+
+    def sink(event: object) -> None:
+        seen.append(event)
+        if isinstance(event, AssistantText) and event.text == "first":
+            session.request_cancel()  # cancel mid-stream, before the "second" chunk is consumed
+
+    session.sink = sink
+    session.send("go")
+
+    texts = [e.text for e in seen if isinstance(e, AssistantText)]
+    # "first" streamed out as its step landed; the cancel then broke the loop, so the later
+    # "second" step was never emitted — proving both incremental rendering and cancellation.
+    assert texts == ["first"]
+
+
+def test_cancel_within_a_step_breaks_on_a_token_delta() -> None:
+    # A single model generation is one super-step, so a values-only loop can't break until the
+    # whole reply lands. Streaming the per-token "messages" deltas alongside gives a frequent
+    # cancel checkpoint: a Ctrl-C mid-generation breaks before the reply ("late") is ever
+    # rendered. Modeled by an agent that requests cancel between two token deltas.
+    seen: list[object] = []
+
+    class TokenStreamAgent:
+        session: CodeSession
+
+        def stream(self, graph_input, config=None, *, stream_mode=("values", "messages")):
+            del graph_input, config, stream_mode
+            yield ("messages", "par")  # first token arrives — loop sees no cancel yet
+            self.session.request_cancel()  # user hits Ctrl-C mid-generation
+            yield ("messages", "tial")  # next token: the loop's top-of-iteration check breaks
+            yield ("values", {"messages": [AIMessage("late")]})  # must never be rendered
+
+        def invoke(self, *a, **k):
+            raise AssertionError("a streaming agent must not be invoked")
+
+    agent = TokenStreamAgent()
+    session = CodeSession(agent=agent, sink=seen.append, approver=lambda n, a: True)
+    agent.session = session
+    session.send("go")
+
+    texts = [e.text for e in seen if isinstance(e, AssistantText)]
+    assert texts == []  # the post-cancel "late" reply was dropped, not rendered
+
+
+def test_only_values_chunks_are_rendered_not_messages_deltas() -> None:
+    # The dual-mode stream tags each yield by mode; only "values" snapshots are rendered (the
+    # "messages" deltas exist purely as cancel checkpoints). A messages delta that happens to
+    # be a dict must NOT be emitted — guards the `mode == "values" and ...` guard against an
+    # `and`->`or` slip that would render it.
+    seen: list[object] = []
+
+    class DualModeAgent:
+        def stream(self, graph_input, config=None, *, stream_mode=("values", "messages")):
+            del graph_input, config, stream_mode
+            yield ("messages", {"messages": [AIMessage("ghost")]})  # dict, but messages-mode
+            yield ("values", {"messages": [AIMessage("real")]})
+
+        def invoke(self, *a, **k):
+            raise AssertionError("a streaming agent must not be invoked")
+
+    session = CodeSession(agent=DualModeAgent(), sink=seen.append, approver=lambda n, a: True)
+    session.send("go")
+
+    texts = [e.text for e in seen if isinstance(e, AssistantText)]
+    assert texts == ["real"]  # the messages-mode dict ("ghost") was not rendered
diff --git a/tests/test_code_summarize.py b/tests/test_code_summarize.py
new file mode 100644
index 00000000..ebf0eb24
--- /dev/null
+++ b/tests/test_code_summarize.py
@@ -0,0 +1,93 @@
+"""Tests for the shared tool-activity summarizers (`aai_cli.code_agent.summarize`).
+
+These keep the coding-agent transcript scannable: a tool call shows its identifying arg
+(not the whole file being written), and tool output is previewed with a hidden-line tail.
+"""
+
+from __future__ import annotations
+
+from aai_cli.code_agent.summarize import (
+    describe_args,
+    full_args,
+    summarize_call,
+    summarize_result,
+)
+
+
+def test_describe_args_prefers_identity_arg_and_elides_bulk() -> None:
+    # write_file's content is the bulk we must NOT inline — only the path identifies the call.
+    body = "\n".join(f"line {i}" for i in range(50))
+    assert describe_args({"file_path": "app.py", "content": body}) == "app.py"
+    # A shell command is the identity arg for execute.
+    assert describe_args({"command": "pip install flask"}) == "pip install flask"
+
+
+def test_describe_args_clips_long_identity_value() -> None:
+    out = describe_args({"command": "echo " + "x" * 200})
+    assert out.endswith("…")
+    assert len(out) == 60  # exact: clipped to the per-arg budget, ellipsis included
+
+
+def test_describe_args_without_identity_shows_capped_key_values() -> None:
+    out = describe_args({"a": 1, "b": 2, "c": 3, "d": 4})
+    # Only the first few args render, then an ellipsis marks the elided remainder.
+    assert out.startswith("a=1, b=2, c=3")
+    assert out.endswith(", …")
+    assert "d=4" not in out
+
+
+def test_describe_args_collapses_newlines_in_values() -> None:
+    # A newline-bearing value must not break the one-line transcript entry.
+    assert "\n" not in describe_args({"x": "a\nb\nc"})
+
+
+def test_summarize_call_wraps_args_in_tool_name() -> None:
+    assert (
+        summarize_call("write_file", {"file_path": "app.py", "content": "x"})
+        == "write_file(app.py)"
+    )
+
+
+def test_summarize_result_previews_and_counts_hidden_lines() -> None:
+    out = summarize_result("\n".join(f"line {i}" for i in range(20)))
+    assert "line 0" in out and "line 3" in out
+    assert "line 4" not in out  # only the first few lines are kept
+    assert "+16 more lines" in out  # the rest are counted, not dropped silently
+
+
+def test_summarize_result_shows_short_output_in_full() -> None:
+    assert summarize_result("done\n") == "done"  # no tail when nothing is hidden
+    assert summarize_result("   ") == ""  # whitespace-only collapses to empty
+
+
+def test_full_args_shows_every_arg_whole_with_newlines() -> None:
+    # The expanded view keeps content (and its newlines) that describe_args elides.
+    out = full_args({"file_path": "app.py", "content": "a\nb\nc"})
+    assert "file_path=app.py" in out
+    assert "content=a\nb\nc" in out  # full value, newlines preserved
+
+
+def test_full_args_caps_a_huge_value_with_char_count() -> None:
+    out = full_args({"content": "z" * 1500})  # over the 1000-char expanded budget
+    assert "+500 more chars" in out  # exact: 1500 minus the 1000 budget
+    assert out.startswith("content=" + "z" * 1000)
+
+
+def test_full_args_shows_a_value_at_the_budget_whole() -> None:
+    # Boundary: exactly the budget is shown whole (guards the cap's `>` against a `>=` slip).
+    out = full_args({"content": "z" * 1000})
+    assert "more chars" not in out
+    assert out == "content=" + "z" * 1000
+
+
+def test_summarize_result_counts_a_single_hidden_line() -> None:
+    # Boundary: exactly one line over the preview budget still gets a tail (guards the
+    # `hidden_lines > 0` threshold against a `> 1` slip that would silently drop it).
+    out = summarize_result("\n".join(f"line {i}" for i in range(5)))  # 4 shown, 1 hidden
+    assert out.endswith("(+1 more lines)")
+
+
+def test_summarize_result_clips_one_huge_line_with_char_count() -> None:
+    out = summarize_result("z" * 500)  # a single line longer than the char budget
+    assert "+200 more chars" in out  # exact: 500 minus the 300-char budget = 200 hidden
+    assert out.startswith("z" * 300)
diff --git a/tests/test_code_tui.py b/tests/test_code_tui.py
index 8abeee08..b536dec1 100644
--- a/tests/test_code_tui.py
+++ b/tests/test_code_tui.py
@@ -10,15 +10,15 @@
 import asyncio
 import threading
 import time
-from pathlib import Path
 
 import pytest
 from langchain_core.messages import AIMessage, HumanMessage
-from textual.widgets import Input, Label, RichLog, Static
+from textual.containers import VerticalScroll
+from textual.widgets import Input, Label, Static
 
-from aai_cli.code_agent import tui
 from aai_cli.code_agent.events import AssistantText, ErrorText, ToolCall, ToolResult
-from aai_cli.code_agent.tui import ApprovalScreen, AskScreen, CodeAgentApp
+from aai_cli.code_agent.modals import ApprovalScreen, AskScreen
+from aai_cli.code_agent.tui import CodeAgentApp
 
 
 class FakeAgent:
@@ -39,31 +39,6 @@ def __init__(self, value: dict[str, object]) -> None:
         self.value = value
 
 
-# --- pure helpers -------------------------------------------------------------
-
-
-def test_format_args_and_abbrev_home() -> None:
-    assert tui._format_args({"a": 1, "b": "x"}) == "a=1, b='x'"
-    assert tui._abbrev_home(Path.home() / "proj") == "~/proj"
-    # A path outside home renders as-is; compare to the platform-native string so this
-    # holds on Windows (where str(Path(...)) uses backslashes) as well as POSIX.
-    outside = Path("/etc/hosts")
-    assert tui._abbrev_home(outside) == str(outside)
-
-
-def test_git_branch_and_status(tmp_path: Path) -> None:
-    assert tui._git_branch(tmp_path) is None  # no .git
-    (tmp_path / ".git").mkdir()
-    (tmp_path / ".git" / "HEAD").write_text("ref: refs/heads/feature-x\n")
-    assert tui._git_branch(tmp_path) == "feature-x"
-    (tmp_path / ".git" / "HEAD").write_text("a1b2c3d4e5f6\n")  # detached
-    assert tui._git_branch(tmp_path) == "a1b2c3d4"
-
-    status = tui._status_text(tmp_path, auto_approve=True)
-    assert "auto" in status and "a1b2c3d4" in status
-    assert "manual" in tui._status_text(tmp_path, auto_approve=False)
-
-
 # --- pilot tests --------------------------------------------------------------
 
 
@@ -76,8 +51,9 @@ async def go() -> None:
         app = CodeAgentApp(agent=FakeAgent([]), web_note="no key", thread_id="t1")
         async with app.run_test(size=(100, 30)) as pilot:
             await pilot.pause()
-            log = app.query_one("#log", RichLog)
-            assert len(log.lines) > 6  # wordmark + tagline
+            log = app.query_one("#log", VerticalScroll)
+            assert len(log.children) >= 1  # the splash is mounted into the transcript
+            assert "Ready to code" in str(log.children[0].render())  # splash intro shown
             assert app.focused is app.query_one("#prompt", Input)
 
     _run(go())
@@ -216,6 +192,50 @@ async def go() -> None:
     _run(go())
 
 
+def test_approval_expands_args_on_e() -> None:
+    # Collapsed, the prompt shows only the identifying arg (the filename); pressing `e`
+    # expands it to the full args, revealing the file content that was elided.
+    async def go() -> None:
+        app = CodeAgentApp(agent=FakeAgent([]))
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            app.push_screen(
+                ApprovalScreen("write_file", {"file_path": "x.py", "content": "SECRET"})
+            )
+            await pilot.pause()
+            detail = app.screen.query_one("#approvaldetail", Label)
+            assert "SECRET" not in str(detail.render())  # collapsed: content elided
+            await pilot.press("e")
+            await pilot.pause()
+            assert "SECRET" in str(detail.render())  # expanded: full args shown
+            await pilot.press("e")  # toggles back
+            await pilot.pause()
+            assert "SECRET" not in str(detail.render())
+
+    _run(go())
+
+
+def test_approval_shows_risk_warning_for_dangerous_command() -> None:
+    # A destructive shell command carries a one-line warning above the prompt; a benign one
+    # mounts no warning label at all.
+    async def go() -> None:
+        app = CodeAgentApp(agent=FakeAgent([]))
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            app.push_screen(ApprovalScreen("execute", {"command": "rm -rf build/"}))
+            await pilot.pause()
+            warn = app.screen.query("#approvalwarn")
+            assert warn  # warning present
+            assert "deletes files" in str(warn.first().render())
+            app.pop_screen()
+            await pilot.pause()
+            app.push_screen(ApprovalScreen("execute", {"command": "ls -la"}))
+            await pilot.pause()
+            assert not app.screen.query("#approvalwarn")  # benign: no warning mounted
+
+    _run(go())
+
+
 def test_approval_box_is_compact_and_bottom_docked() -> None:
     # Regression guard: the approval prompt must not take over the whole screen — it
     # docks a short box at the bottom so the transcript stays visible above it.
@@ -233,6 +253,28 @@ async def go() -> None:
     _run(go())
 
 
+def test_modals_are_transparent_so_transcript_stays_visible() -> None:
+    # Regression guard: the app's `Screen { background: #000000 }` canvas rule matches every
+    # Screen subclass, and app CSS beats a widget's DEFAULT_CSS — so without the explicit
+    # `ModalScreen { background: transparent }` app rule, the modal paints opaque black and
+    # blanks the transcript behind it. Assert each modal resolves to a see-through background
+    # (alpha 0); an opaque modal (alpha 1.0) — the bug — fails here.
+    async def go() -> None:
+        app = CodeAgentApp(agent=FakeAgent([]))
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            app.push_screen(ApprovalScreen("write_file", {"file_path": "x.py"}))
+            await pilot.pause()
+            assert app.screen.styles.background.a == 0  # approval modal is see-through
+            app.pop_screen()
+            await pilot.pause()
+            app.push_screen(AskScreen("which port?"))
+            await pilot.pause()
+            assert app.screen.styles.background.a == 0  # ask modal is see-through
+
+    _run(go())
+
+
 def test_approval_auto_approve_flips_mode_and_skips_later_prompts() -> None:
     # Picking "Auto-approve (a)" approves this call, flips the badge manual→auto, and
     # makes every later _approve return True without ever pushing a modal.
@@ -335,11 +377,6 @@ async def go() -> None:
     _run(go())
 
 
-def test_spinner_text_formats_frame_and_elapsed() -> None:
-    assert tui._spinner_text(46, "✶") == "✶ Working… (46s)"
-    assert tui._spinner_text(0, "✷") == "✷ Working… (0s)"
-
-
 def test_spinner_starts_ticks_and_stops(monkeypatch: pytest.MonkeyPatch) -> None:
     async def go() -> None:
         app = CodeAgentApp(agent=FakeAgent([]))
@@ -352,6 +389,12 @@ async def go() -> None:
             await pilot.pause()
             assert app.query_one("#spinner", Static).display is True
             # _tick wires the elapsed seconds off the start time; pin "now" to assert it.
+            # Stop the live interval first so only this deterministic tick writes the
+            # readout — otherwise a real-time auto-tick can race the assert on a loaded
+            # runner, which flaked CI with "(6s)" vs "(7s)". update()->render() is
+            # synchronous, so no pilot.pause() is needed (and pausing here deadlocks).
+            assert app._spin_timer is not None
+            app._spin_timer.stop()
             monkeypatch.setattr(time, "monotonic", lambda: app._turn_started + 7.0)
             app._tick()
             assert "Working… (7s)" in str(app.query_one("#spinner", Static).render())
diff --git a/tests/test_code_tui_status.py b/tests/test_code_tui_status.py
new file mode 100644
index 00000000..f261a517
--- /dev/null
+++ b/tests/test_code_tui_status.py
@@ -0,0 +1,49 @@
+"""Tests for the coding-agent TUI's pure status/text helpers (`tui_status`).
+
+Split from test_code_tui.py (which drives the Textual app) to keep each file under the
+500-line gate; these need no pilot, just the plain functions.
+"""
+
+from __future__ import annotations
+
+from pathlib import Path
+
+from aai_cli.code_agent import tui_status
+
+
+def test_spinner_text_formats_frame_and_elapsed() -> None:
+    assert tui_status._spinner_text(46, "✶") == "✶ Working… (46s)"
+    assert tui_status._spinner_text(0, "✷") == "✷ Working… (0s)"
+
+
+def test_abbrev_home() -> None:
+    assert tui_status._abbrev_home(Path.home() / "proj") == "~/proj"
+    # A path outside home renders as-is; compare to the platform-native string so this
+    # holds on Windows (where str(Path(...)) uses backslashes) as well as POSIX.
+    outside = Path("/etc/hosts")
+    assert tui_status._abbrev_home(outside) == str(outside)
+
+
+def test_git_branch_and_status(tmp_path: Path) -> None:
+    assert tui_status._git_branch(tmp_path) is None  # no .git
+    (tmp_path / ".git").mkdir()
+    (tmp_path / ".git" / "HEAD").write_text("ref: refs/heads/feature-x\n")
+    assert tui_status._git_branch(tmp_path) == "feature-x"
+    (tmp_path / ".git" / "HEAD").write_text("a1b2c3d4e5f6\n")  # detached
+    assert tui_status._git_branch(tmp_path) == "a1b2c3d4"
+
+    status = tui_status._status_text(tmp_path, auto_approve=True)
+    assert "auto" in status and "a1b2c3d4" in status
+    assert "manual" in tui_status._status_text(tmp_path, auto_approve=False)
+
+
+def test_status_text_renders_voice_badge(tmp_path: Path) -> None:
+    # No voice front-end -> no voice badge (the dot glyphs are absent); on/off render the
+    # state so the Ctrl-V toggle shows. (Asserts on the dots, not the word — the tmp_path name
+    # itself can contain "voice".)
+    none = tui_status._status_text(tmp_path, auto_approve=False)
+    assert "●" not in none and "○" not in none
+    on = tui_status._status_text(tmp_path, auto_approve=False, voice_state="on")
+    off = tui_status._status_text(tmp_path, auto_approve=False, voice_state="off")
+    assert "voice on" in on and "●" in on  # filled dot when on
+    assert "voice off" in off and "○" in off  # hollow dot when off
diff --git a/tests/test_code_tui_voice.py b/tests/test_code_tui_voice.py
index 8adbaea6..88072984 100644
--- a/tests/test_code_tui_voice.py
+++ b/tests/test_code_tui_voice.py
@@ -12,7 +12,7 @@
 
 import pytest
 from langchain_core.messages import AIMessage, HumanMessage
-from textual.widgets import Input
+from textual.widgets import Input, Static
 
 from aai_cli.code_agent.tui import CodeAgentApp
 from aai_cli.core.errors import CLIError
@@ -163,3 +163,167 @@ async def go() -> None:
             assert app._voice is None
 
     _run(go())
+
+
+def test_toggle_voice_pauses_and_resumes_capture() -> None:
+    # Ctrl-V flips voice off (no capture, no readback) and back on; the state badge tracks it.
+    async def go() -> None:
+        app = CodeAgentApp(agent=FakeAgent([]), voice=FakeVoice())
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            # Assert via the methods, not the `_voice_paused` attribute: mypy narrows the
+            # attribute and can't see action_toggle_voice() flip it back, flagging the second
+            # check unreachable. The method calls reflect the same state without that trap.
+            assert app._voice_active()
+            assert app._voice_state() == "on"
+            app.action_toggle_voice()  # pause
+            assert not app._voice_active()
+            assert app._voice_state() == "off"
+            app.action_toggle_voice()  # resume
+            assert app._voice_active()
+            assert app._voice_state() == "on"
+
+    _run(go())
+
+
+def test_paused_voice_skips_followup_readback() -> None:
+    # While paused, the post-turn followup neither speaks a summary nor listens.
+    async def go() -> None:
+        voice = FakeVoice(transcripts=["ignored"])
+        app = CodeAgentApp(agent=FakeAgent([]), voice=voice)
+        app._voice_paused = True  # set before mount so on_mount never auto-listens
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            app._last_reply = "a reply"
+            app._voice_followup()
+            await pilot.pause()
+            assert voice.spoken == []  # paused: no readback
+            assert voice.listens == 0  # paused: no capture
+
+    _run(go())
+
+
+def test_voice_mode_swaps_text_input_for_listening_affordance() -> None:
+    # While voice capture is on, the text prompt is hidden and a "listening" bar shows;
+    # toggling voice off (Ctrl-V) brings the text box back. (Re-query each check so mypy
+    # doesn't narrow a stored display bool across the toggles.)
+    async def go() -> None:
+        app = CodeAgentApp(agent=FakeAgent([]), voice=FakeVoice())
+        app._voice_paused = True  # start paused so on_mount doesn't race a capture thread
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            assert app.query_one("#promptbar").display is True  # paused -> text box visible
+            assert app.query_one("#voicebar").display is False
+            app.action_toggle_voice()  # voice on
+            await pilot.pause()
+            assert app.query_one("#promptbar").display is False  # text box hidden
+            assert app.query_one("#voicebar").display is True  # listening affordance shown
+            app.action_toggle_voice()  # voice off
+            await pilot.pause()
+            assert app.query_one("#promptbar").display is True  # text box back
+            assert app.query_one("#voicebar").display is False
+
+    _run(go())
+
+
+def test_voice_capture_failure_restores_the_text_input() -> None:
+    # When the mic is ruled out mid-session, the listening bar is replaced by the text box.
+    async def go() -> None:
+        voice = FakeVoice(error=CLIError("no mic", error_type="mic_missing", exit_code=2))
+        app = CodeAgentApp(agent=FakeAgent([]), voice=voice)
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            assert await _wait_until(pilot, lambda: app._voice_typed)
+            await pilot.pause()
+            assert app.query_one("#promptbar").display is True  # text box restored on failure
+            assert app.query_one("#voicebar").display is False
+
+    _run(go())
+
+
+def test_voice_bar_distinguishes_phases() -> None:
+    # The bar shows a distinct label per phase; only the listening phase carries the type hint.
+    async def go() -> None:
+        app = CodeAgentApp(agent=FakeAgent([]), voice=FakeVoice())
+        app._voice_paused = True  # quiet the auto-listen; drive phases directly
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            app._set_voice_phase("listening")
+            bar = str(app.query_one("#voicebar", Static).render())
+            assert "Listening" in bar and "Ctrl-V to type" in bar
+            app._set_voice_phase("thinking")
+            bar = str(app.query_one("#voicebar", Static).render())
+            assert "Thinking" in bar and "Ctrl-V to type" not in bar  # hint is listening-only
+            app._set_voice_phase("speaking")
+            assert "Speaking" in str(app.query_one("#voicebar", Static).render())
+
+    _run(go())
+
+
+def test_spinner_suppressed_in_voice_mode() -> None:
+    # In voice mode the bar carries the "thinking" state, so the separate spinner stays hidden;
+    # pausing voice brings the spinner back.
+    async def go() -> None:
+        app = CodeAgentApp(agent=FakeAgent([]), voice=FakeVoice())
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            app._start_spinner()
+            assert app.query_one("#spinner", Static).display is False  # voice active -> no spinner
+            app._voice_paused = True
+            app._start_spinner()
+            assert app.query_one("#spinner", Static).display is True  # paused -> spinner shows
+
+    _run(go())
+
+
+def test_voice_bar_animation_timer_runs_and_advances() -> None:
+    # The meter animation timer runs only while the bar is shown, and a tick changes the frame.
+    async def go() -> None:
+        app = CodeAgentApp(agent=FakeAgent([]), voice=FakeVoice())
+        app._voice_paused = True
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            # Read into fresh locals each time: asserting `is None`/`is not None` on the same
+            # attribute across the opaque toggle would make mypy flag the later check unreachable.
+            paused_timer = app._voice_timer
+            assert paused_timer is None  # paused -> no animation
+            app.action_toggle_voice()  # voice on -> bar shown, timer running
+            await pilot.pause()
+            running_timer = app._voice_timer
+            assert running_timer is not None
+            before = str(app.query_one("#voicebar", Static).render())
+            app._tick_voice()
+            assert str(app.query_one("#voicebar", Static).render()) != before  # meter advanced
+            app.action_toggle_voice()  # voice off -> timer stopped
+            await pilot.pause()
+            stopped_timer = app._voice_timer
+            assert stopped_timer is None
+
+    _run(go())
+
+
+def test_submit_sets_thinking_phase() -> None:
+    async def go() -> None:
+        agent = FakeAgent([{"messages": [HumanMessage("go"), AIMessage("done")]}])
+        app = CodeAgentApp(agent=agent, voice=FakeVoice())
+        app._voice_paused = True  # keep the post-turn followup from flipping the phase
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            app._submit("go")
+            assert app._voice_phase == "thinking"  # set synchronously when the turn starts
+            await app.workers.wait_for_complete()
+
+    _run(go())
+
+
+def test_toggle_voice_without_session_notifies_and_stays_off() -> None:
+    # With no voice front-end the toggle is a no-op (notice only) and never marks a pause.
+    async def go() -> None:
+        app = CodeAgentApp(agent=FakeAgent([]))  # no voice
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            app.action_toggle_voice()
+            assert app._voice_paused is False  # nothing to pause
+            assert app._voice_state() is None  # no badge without a session
+
+    _run(go())
diff --git a/tests/test_microphone.py b/tests/test_microphone.py
index 207f6b4a..ffb2468e 100644
--- a/tests/test_microphone.py
+++ b/tests/test_microphone.py
@@ -1,5 +1,7 @@
+import signal
 import sys
 import types
+from collections.abc import Callable, Mapping
 from typing import Any
 
 import pytest
@@ -11,7 +13,12 @@
     MicrophoneSource,
     _default_mic_stream,
     _device_default_rate,
+    _ignore_interrupt_during_shutdown,
+    _install_shutdown_interrupt_guard,
+    _max_input_channels,
+    _RawInputStream,
     _SoundDeviceMic,
+    import_sounddevice,
     resample_pcm16,
 )
 
@@ -37,6 +44,29 @@ def close(self):
         self.closed = True
 
 
+class _FakeSoundDevice(types.ModuleType):
+    """A typed `_SoundDeviceModule` double: scripted device info + a RawInputStream factory.
+
+    Subclasses `ModuleType` so it can be slotted into `sys.modules` via `monkeypatch.setitem`,
+    and conforms to the protocol so it needs no escape hatches at the call sites that pass it
+    to the real `_max_input_channels` / `_default_mic_stream` code under test.
+    """
+
+    def __init__(
+        self,
+        info: Mapping[str, object],
+        raw_input_stream: Callable[..., _RawInputStream] = _FakeRawStream,
+    ) -> None:
+        super().__init__("sounddevice")
+        self._info = info
+        self.RawInputStream = raw_input_stream
+
+    def query_devices(
+        self, device: int | None = None, kind: str | None = None
+    ) -> Mapping[str, object]:
+        return self._info
+
+
 def test_audio_missing_error_has_reinstall_suggestion():
     from aai_cli.core.microphone import audio_missing_error
 
@@ -303,19 +333,24 @@ def test_sounddevice_mic_downmixes_stereo_to_mono():
     assert next(iter(mic)) == b"\x00\x02"
 
 
-def _fake_sd_rejecting_mono(max_input_channels: int, opened: list[int]) -> Any:
+def _fake_sd_rejecting_mono(max_input_channels: int, opened: list[int]) -> _FakeSoundDevice:
     """A sounddevice whose mono open fails with -9998; query reports ``max_input_channels``."""
 
-    def raw_input_stream(**kwargs):
-        opened.append(kwargs["channels"])
-        if kwargs["channels"] == 1:
+    def raw_input_stream(*, channels: int, **kwargs: object) -> _RawInputStream:
+        opened.append(channels)
+        if channels == 1:
             raise OSError("Error opening RawInputStream: Invalid number of channels [-9998]")
-        return _FakeStereoStream(**kwargs)
+        return _FakeStereoStream(channels=channels, **kwargs)
+
+    return _FakeSoundDevice({"max_input_channels": max_input_channels}, raw_input_stream)
 
-    fake_sd: Any = types.ModuleType("sounddevice")
-    fake_sd.RawInputStream = raw_input_stream
-    fake_sd.query_devices = lambda device, kind: {"max_input_channels": max_input_channels}
-    return fake_sd
+
+def test_max_input_channels_defaults_to_zero_when_absent_or_non_int():
+    # A device dict missing the key, or carrying a non-int value, must read as 0 channels (so
+    # the caller raises the actionable no-input error) rather than a truthy bogus count.
+    assert _max_input_channels(_FakeSoundDevice({}), None) == 0  # key absent -> 0, not get()'s
+    assert _max_input_channels(_FakeSoundDevice({"max_input_channels": None}), None) == 0  # non-int
+    assert _max_input_channels(_FakeSoundDevice({"max_input_channels": 2}), None) == 2  # int passes
 
 
 def test_default_mic_stream_falls_back_to_stereo_downmix(monkeypatch):
@@ -337,6 +372,7 @@ def test_default_mic_stream_zero_input_channels_raises_permission_error(monkeypa
         _default_mic_stream(sample_rate=16000, device=None)
     assert opened == [1]  # only the mono attempt; no pointless stereo retry
     assert exc.value.error_type == "mic_error"
+    assert exc.value.exit_code == 1
     assert "no input channels" in exc.value.message.lower()
     assert exc.value.suggestion is not None
     assert "Microphone" in exc.value.suggestion
@@ -365,3 +401,50 @@ def boom(**_kwargs):
         list(mic)
     assert exc.value is err  # passed through unchanged
     assert exc.value.suggestion == "grant it"
+
+
+def test_ignore_interrupt_during_shutdown_sets_sig_ign():
+    # The guard drops a second Ctrl-C during teardown so it can't raise inside
+    # sounddevice's atexit PortAudio terminate. Save/restore the global disposition.
+    before = signal.getsignal(signal.SIGINT)
+    try:
+        _ignore_interrupt_during_shutdown()
+        assert signal.getsignal(signal.SIGINT) is signal.SIG_IGN
+    finally:
+        signal.signal(signal.SIGINT, before)
+
+
+def test_install_shutdown_interrupt_guard_registers_once(monkeypatch):
+    registered = []
+    monkeypatch.setattr(microphone, "_shutdown_interrupt_guard_installed", False)
+    monkeypatch.setattr(microphone.atexit, "register", lambda fn: registered.append(fn))
+
+    _install_shutdown_interrupt_guard()
+    _install_shutdown_interrupt_guard()  # idempotent: the flag short-circuits the second call
+
+    assert registered == [_ignore_interrupt_during_shutdown]
+
+
+def test_import_sounddevice_installs_shutdown_guard(monkeypatch):
+    registered = []
+    monkeypatch.setattr(microphone, "_shutdown_interrupt_guard_installed", False)
+    monkeypatch.setattr(microphone.atexit, "register", lambda fn: registered.append(fn))
+    monkeypatch.setitem(sys.modules, "sounddevice", types.ModuleType("sounddevice"))
+
+    import_sounddevice()
+
+    assert registered == [_ignore_interrupt_during_shutdown]
+
+
+def test_import_sounddevice_missing_does_not_register_guard(monkeypatch):
+    # A broken install raises before the guard is reached, so nothing is registered.
+    registered = []
+    monkeypatch.setattr(microphone, "_shutdown_interrupt_guard_installed", False)
+    monkeypatch.setattr(microphone.atexit, "register", lambda fn: registered.append(fn))
+    monkeypatch.setitem(sys.modules, "sounddevice", None)  # import -> ImportError
+
+    with pytest.raises(CLIError) as exc:
+        import_sounddevice()
+
+    assert exc.value.error_type == "mic_missing"
+    assert registered == []
diff --git a/tests/test_sandbox_access.py b/tests/test_sandbox_access.py
index ce947ec4..6fe112de 100644
--- a/tests/test_sandbox_access.py
+++ b/tests/test_sandbox_access.py
@@ -241,7 +241,9 @@ def test_help_hides_the_sandbox_surface_from_external_accounts_and_restores_it(m
     assert "--sandbox" not in external
     assert "--env" not in external
     assert "[sandbox]" not in external
-    assert "agent-cascade" not in external
+    # The [sandbox]-only `live` command's summary is hidden too (a token unique to it,
+    # since the bare word "live" also appears in other commands' descriptions).
+    assert "tool-using" not in external
     # …but the filter is surgical: non-sandbox flags and commands stay visible (this
     # also kills the mutant that would treat every option/command as sandbox).
     assert "--profile" in external
@@ -255,4 +257,4 @@ def test_help_hides_the_sandbox_surface_from_external_accounts_and_restores_it(m
     assert "--sandbox" in internal
     assert "--env" in internal
     assert "[sandbox]" in internal
-    assert "agent-cascade" in internal
+    assert "tool-using" in internal
diff --git a/tests/test_smoke.py b/tests/test_smoke.py
index b9ba17ff..a66e2929 100644
--- a/tests/test_smoke.py
+++ b/tests/test_smoke.py
@@ -162,7 +162,7 @@ def test_help_lists_commands_in_workflow_order():
         "stream",
         "dictate",
         "agent",
-        "agent-cascade",
+        "live",
         "speak",
         "llm",
         "clip",

From 48326e7ac1e8e21233547ca8b2066176d2550aec Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Thu, 18 Jun 2026 17:54:34 +0000
Subject: [PATCH 3/3] Fix flaky Windows voice-leg thread teardown in `assembly
 code` TUI
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The voice legs run on daemon threads that call back onto the UI thread via
call_from_thread. If the app stops (a quit, or a test's run_test block exiting)
while a leg is mid-call, that callback raises RuntimeError in the daemon thread,
which pytest's threadexception plugin escalates to a failure — surfacing as a
flaky `tests (windows, py3.12)` run on test_submit_sets_thinking_phase.

Route every leg through a guarded body that swallows the callback error once the
app is no longer running (the spoken turn is moot then) while still surfacing a
genuine failure that happens while the app is live.

Co-Authored-By: Claude Opus 4.8 <noreply@anthropic.com>
Claude-Session: https://claude.ai/code/session_01Ad72JciKrsz4TKG7ZY9GR6
---
 aai_cli/code_agent/voice_ui.py | 21 +++++++++++++++++++-
 tests/test_code_tui_voice.py   | 35 ++++++++++++++++++++++++++++++++++
 2 files changed, 55 insertions(+), 1 deletion(-)

diff --git a/aai_cli/code_agent/voice_ui.py b/aai_cli/code_agent/voice_ui.py
index cdac1e29..0fcaf531 100644
--- a/aai_cli/code_agent/voice_ui.py
+++ b/aai_cli/code_agent/voice_ui.py
@@ -56,7 +56,26 @@ def _voice_active(self) -> bool:
 
     def _spawn(self, target: Callable[[], None]) -> None:
         """Run ``target`` on a daemon thread — voice legs block, so they stay off the UI thread."""
-        threading.Thread(target=target, daemon=True).start()  # pragma: no mutate
+        thread = threading.Thread(
+            target=lambda: self._run_leg(target),
+            daemon=True,  # pragma: no mutate — daemon flag only affects process exit, unassertable
+        )
+        thread.start()
+
+    def _run_leg(self, target: Callable[[], None]) -> None:
+        """Run one voice leg, dropping the callback error a torn-down app raises mid-flight.
+
+        A leg calls back onto the UI thread (``call_from_thread``); if the app stops — a quit,
+        or a test's ``run_test`` block exiting — while the leg is mid-call, that callback raises
+        ``RuntimeError`` in this daemon thread, which would otherwise surface as an unhandled
+        thread exception (a flaky Windows CI failure). The spoken turn is moot once the app is
+        gone, so swallow it then; a genuine failure while the app is still live still propagates.
+        """
+        try:
+            target()
+        except Exception:
+            if self.is_running:
+                raise
 
     def _begin_listening(self) -> None:
         """Capture the next spoken turn on a background thread (no-op when voice is off)."""
diff --git a/tests/test_code_tui_voice.py b/tests/test_code_tui_voice.py
index 88072984..e402d2d0 100644
--- a/tests/test_code_tui_voice.py
+++ b/tests/test_code_tui_voice.py
@@ -316,6 +316,41 @@ async def go() -> None:
     _run(go())
 
 
+def test_run_leg_swallows_callback_error_after_the_app_stops() -> None:
+    # A voice leg still in flight when the app tears down calls back onto a dead UI thread;
+    # the resulting RuntimeError must be dropped (the spoken turn is moot), not surface as an
+    # unhandled thread exception. This app was never started, so is_running is False.
+    app = CodeAgentApp(agent=FakeAgent([]), voice=FakeVoice())
+    assert app.is_running is False
+    ran: list[bool] = []
+
+    def boom() -> None:
+        ran.append(True)
+        raise RuntimeError("App is not running")
+
+    app._run_leg(boom)  # returns without raising — the teardown-race error is swallowed
+    assert ran == [True]  # the leg body did run; only its post-teardown error was dropped
+
+
+def test_run_leg_reraises_a_genuine_failure_while_the_app_is_live() -> None:
+    # While the app is running, a real exception in a leg is a bug and must propagate (so it's
+    # reported), not be silently swallowed like the teardown race above.
+    async def go() -> None:
+        app = CodeAgentApp(agent=FakeAgent([]), voice=FakeVoice())
+        app._voice_paused = True  # no auto-listen thread racing this assertion
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            assert app.is_running is True
+
+            def boom() -> None:
+                raise ValueError("genuine bug")
+
+            with pytest.raises(ValueError, match="genuine bug"):
+                app._run_leg(boom)
+
+    _run(go())
+
+
 def test_toggle_voice_without_session_notifies_and_stays_off() -> None:
     # With no voice front-end the toggle is a no-op (notice only) and never marks a pause.
     async def go() -> None: