From dc3fa64e62a43a803c5487bc60d58739aed1fbc6 Mon Sep 17 00:00:00 2001
From: Alex Kroman <alex@assemblyai.com>
Date: Thu, 18 Jun 2026 16:24:44 -0700
Subject: [PATCH 01/10] assembly code/live: voice-interrupt UX, modal
 dismissal, concise speech, gemini live default

- Interrupting the readback (Escape/Ctrl-C while the voice is speaking) now stops the
  talking and resumes listening instead of pausing to text mode; interrupting while
  listening still pauses to the text prompt. Ctrl-C only arms the double-press quit when
  it paused to text, not when it resumed listening.
- Escape/Ctrl-C dismiss the approval modal (declining the tool) and the ask modal
  (empty answer).
- The assembly code system prompt now steers the model to concise, speech-ready prose
  (read aloud), with code kept in fenced blocks the readback skips.
- assembly live defaults to gemini-2.5-flash-lite (low latency for spoken turns);
  assembly code stays gpt-5.1. Verified the gateway accepts it; --help snapshot updated.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 aai_cli/agent_cascade/config.py               |  8 +++----
 aai_cli/code_agent/modals.py                  |  8 +++++++
 aai_cli/code_agent/prompt.py                  |  8 +++++--
 aai_cli/code_agent/tui.py                     | 19 ++++++++--------
 .../test_snapshots_help_run.ambr              |  2 +-
 tests/test_agent_cascade_config.py            |  2 +-
 tests/test_code_agent.py                      | 12 ++++++++++
 tests/test_code_tui.py                        | 16 ++++++++++++++
 tests/test_code_tui_voice.py                  | 22 +++++++++++++++++++
 9 files changed, 80 insertions(+), 17 deletions(-)

diff --git a/aai_cli/agent_cascade/config.py b/aai_cli/agent_cascade/config.py
index efa5b43..ef8aa3d 100644
--- a/aai_cli/agent_cascade/config.py
+++ b/aai_cli/agent_cascade/config.py
@@ -13,10 +13,10 @@
 from aai_cli.agent_cascade.voices import DEFAULT_VOICE
 from aai_cli.core import llm
 
-# `assembly live` defaults to a capable gateway model (override with --model); kept a
-# literal rather than llm.DEFAULT_MODEL so the live agent's default is independent of the
-# one-shot `assembly llm` default.
-DEFAULT_MODEL = "gpt-5.1"
+# `assembly live` defaults to a fast, low-latency gateway model (override with --model) —
+# a literal rather than llm.DEFAULT_MODEL so the live agent's default is independent of the
+# one-shot `assembly llm` default. Latency matters most for a spoken back-and-forth.
+DEFAULT_MODEL = "gemini-2.5-flash-lite"
 DEFAULT_MAX_TOKENS = llm.DEFAULT_MAX_TOKENS
 # The realtime model the cascade transcribes with (same as the agent-cascade template).
 DEFAULT_SPEECH_MODEL = "u3-rt-pro"
diff --git a/aai_cli/code_agent/modals.py b/aai_cli/code_agent/modals.py
index 25c54a7..4e21a97 100644
--- a/aai_cli/code_agent/modals.py
+++ b/aai_cli/code_agent/modals.py
@@ -76,6 +76,8 @@ class ApprovalScreen(ModalScreen[str]):
         ("a", "auto", "Auto-approve"),
         ("n", "reject", "Reject"),
         ("e", "expand", "Expand"),
+        # Escape / Ctrl-C dismiss the modal — declining the tool is the safe cancel.
+        ("escape,ctrl+c", "reject", "Cancel"),
     ]
 
     def __init__(
@@ -165,6 +167,8 @@ class AskScreen(ModalScreen[str]):
         border: round #3a3f55; background: #000000; padding: 0 1; margin: 0 1 1 1;
     }
     """
+    # Escape / Ctrl-C dismiss the question with no answer.
+    BINDINGS: ClassVar = [("escape,ctrl+c", "cancel", "Cancel")]
 
     def __init__(self, question: str, *, voice: _VoiceIO | None = None) -> None:
         super().__init__()
@@ -198,5 +202,9 @@ def _answer(self, text: str) -> None:
         self._answered = True
         self.dismiss(text)
 
+    def action_cancel(self) -> None:
+        """Escape / Ctrl-C: dismiss with no answer (the agent gets an empty reply)."""
+        self._answer("")
+
     def on_input_submitted(self, event: Input.Submitted) -> None:
         self._answer(event.value)
diff --git a/aai_cli/code_agent/prompt.py b/aai_cli/code_agent/prompt.py
index 4e2a7ef..c704607 100644
--- a/aai_cli/code_agent/prompt.py
+++ b/aai_cli/code_agent/prompt.py
@@ -29,8 +29,12 @@
   for API/SDK questions, and web search for anything else. Prefer the docs for
   AssemblyAI specifics.
 
-Be concise. Make focused edits, explain what you changed, and run commands to verify
-your work when it helps. Stop and ask before destructive or far-reaching actions.\
+Be concise — and especially so out loud. Your prose is read aloud by a text-to-speech
+engine, so keep replies to a sentence or two of plain, simple spoken language: no
+markdown, lists, symbols, URLs, or code in the prose. Put any code in fenced code blocks
+(the readback skips them). Make focused edits, briefly say what you changed, and run
+commands to verify your work when it helps. Stop and ask before destructive or
+far-reaching actions.\
 """
 
 
diff --git a/aai_cli/code_agent/tui.py b/aai_cli/code_agent/tui.py
index 5d5119d..7ca036e 100644
--- a/aai_cli/code_agent/tui.py
+++ b/aai_cli/code_agent/tui.py
@@ -389,17 +389,18 @@ def _cancel_turn(self) -> bool:
         return True
 
     def _stop_voice_activity(self) -> bool:
-        """Stop in-flight voice listening/readback and go idle; True if voice was active.
+        """Stop in-flight voice; True if voice was active.
 
-        In voice mode the agent is usually listening or reading a reply back — neither is a
-        "running turn", so without this an interrupt key would skip straight to the quit hint.
-        This cancels the active leg, pauses voice (the text prompt returns, no auto re-listen),
-        and refreshes the UI, so a first Ctrl-C/Escape gives immediate feedback. Once paused
-        ``_voice_active`` is False, so a second press falls through to the quit path.
+        Interrupting the readback (speaking) stops it and resumes listening — the cancelled
+        speak() returns and the loop captures the next turn. Interrupting while listening
+        pauses voice to the text prompt, after which a second press falls through to quit.
         """
         if self._voice is None or not self._voice_active():
             return False
         self._voice.cancel()
+        if self._voice_phase == "speaking":  # stop talking, stay in voice mode -> re-listen
+            self._note("stopped — listening…")
+            return True
         self._voice_paused = True
         self._refresh_status()
         self._sync_input_mode()  # active leg stopped -> bring the text prompt back
@@ -417,7 +418,8 @@ def action_quit_or_interrupt(self) -> None:
             self._quit_pending = False
             return
         if self._stop_voice_activity():
-            self._arm_quit_pending()  # idle now; a second Ctrl-C confirms the quit
+            if self._voice_paused:  # paused to text -> a 2nd Ctrl-C quits; re-listening doesn't
+                self._arm_quit_pending()
             return
         if self._quit_pending:
             self.exit()
@@ -481,8 +483,7 @@ def _stop_spinner(self) -> None:
         self.query_one("#spinner", Static).display = False
 
     def on_worker_state_changed(self, event: Worker.StateChanged) -> None:
-        # Guard on is_running: a worker finishing *after* the app tears down (quit / test exit)
-        # would drive _finish_turn against an unmounted DOM — NoMatches on "#spinner", a flake.
+        # is_running guard: a worker finishing after teardown would hit an unmounted DOM.
         if event.worker.is_finished and self.is_running:
             self._finish_turn()
 
diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr
index c2334d9..710b3bb 100644
--- a/tests/__snapshots__/test_snapshots_help_run.ambr
+++ b/tests/__snapshots__/test_snapshots_help_run.ambr
@@ -667,7 +667,7 @@
   ╭─ Language model ─────────────────────────────────────────────────────────────╮
   │ --model             TEXT                  LLM Gateway model that powers the  │
   │                                           agent's replies                    │
-  │                                           [default: gpt-5.1]                 │
+  │                                           [default: gemini-2.5-flash-lite]   │
   │ --max-tokens        INTEGER RANGE [x>=1]  Max tokens per reply               │
   │                                           [default: 8192]                    │
   │ --llm-config        TEXT                  Set any LLM Gateway request field  │
diff --git a/tests/test_agent_cascade_config.py b/tests/test_agent_cascade_config.py
index 7514ed5..f8b06a8 100644
--- a/tests/test_agent_cascade_config.py
+++ b/tests/test_agent_cascade_config.py
@@ -19,7 +19,7 @@
 def test_default_config_values():
     config = CascadeConfig()
     assert config.voice == DEFAULT_VOICE
-    assert config.model == DEFAULT_MODEL == "gpt-5.1"  # `assembly live` defaults to gpt-5.1
+    assert config.model == DEFAULT_MODEL == "gemini-2.5-flash-lite"  # `assembly live` default
     assert config.greeting == DEFAULT_GREETING
     # The sliding-window default keeps the last 40 messages of context.
     assert config.max_history == 40
diff --git a/tests/test_code_agent.py b/tests/test_code_agent.py
index a1452bf..b9fd1d2 100644
--- a/tests/test_code_agent.py
+++ b/tests/test_code_agent.py
@@ -27,6 +27,7 @@
 )
 from aai_cli.code_agent.agent import MUTATING_TOOLS, build_agent
 from aai_cli.code_agent.events import AssistantText, ErrorText, ToolCall, ToolResult
+from aai_cli.code_agent.prompt import build_system_prompt
 from aai_cli.code_agent.render import RichRenderer, make_approver
 from aai_cli.code_agent.session import QUIT_COMMANDS, CodeSession, run_repl
 
@@ -124,6 +125,17 @@ def test_run_repl_sends_initial_then_lines_until_quit(tmp_path: Path) -> None:
     assert texts == ["a", "b"]  # initial + "second"; blank skipped, stops at /quit
 
 
+def test_system_prompt_steers_concise_speech() -> None:
+    prompt = build_system_prompt("/work")
+    assert "/work" in prompt  # anchored to the working directory
+    # The prose is read aloud, so the prompt must steer the model to concise, speech-ready
+    # replies with code kept out of the spoken text.
+    assert "read aloud" in prompt
+    assert "fenced code blocks" in prompt
+    lowered = prompt.lower()
+    assert "concise" in lowered and "spoken" in lowered
+
+
 def test_mutating_tools_include_cli_shell_and_fetch() -> None:
     assert set(MUTATING_TOOLS) == {"write_file", "edit_file", "execute", "assembly", "fetch_url"}
     assert "exit" in QUIT_COMMANDS and "/exit" in QUIT_COMMANDS
diff --git a/tests/test_code_tui.py b/tests/test_code_tui.py
index b536dec..aa34f4f 100644
--- a/tests/test_code_tui.py
+++ b/tests/test_code_tui.py
@@ -145,6 +145,22 @@ def test_ask_modal_returns_typed_answer() -> None:
     assert answer == "8080"
 
 
+def test_approval_modal_dismisses_on_escape_or_ctrl_c() -> None:
+    # Escape / Ctrl-C decline the tool (the safe cancel), like pressing "n".
+    app = CodeAgentApp(agent=FakeAgent([]))
+    assert _drive_modal(app, lambda: app._approve("execute", {"cmd": "ls"}), ["escape"]) is False
+    app2 = CodeAgentApp(agent=FakeAgent([]))
+    assert _drive_modal(app2, lambda: app2._approve("execute", {"cmd": "ls"}), ["ctrl+c"]) is False
+
+
+def test_ask_modal_dismisses_on_escape_or_ctrl_c_with_no_answer() -> None:
+    # Escape / Ctrl-C cancel the question; the agent gets an empty answer.
+    app = CodeAgentApp(agent=FakeAgent([]))
+    assert _drive_modal(app, lambda: app._ask("which port?"), ["escape"]) == ""
+    app2 = CodeAgentApp(agent=FakeAgent([]))
+    assert _drive_modal(app2, lambda: app2._ask("which port?"), ["ctrl+c"]) == ""
+
+
 def test_full_turn_with_approval_interrupt() -> None:
     async def go() -> None:
         agent = FakeAgent(
diff --git a/tests/test_code_tui_voice.py b/tests/test_code_tui_voice.py
index 2ec6448..459e3e4 100644
--- a/tests/test_code_tui_voice.py
+++ b/tests/test_code_tui_voice.py
@@ -140,6 +140,28 @@ async def go() -> None:
     assert calls == [True]  # running -> the finished turn is handled
 
 
+def test_interrupt_while_speaking_resumes_listening_not_text():  # untyped: probes app internals
+    # Ctrl-C during the readback (speaking) cancels it but stays in voice mode — the loop
+    # re-listens — and doesn't arm the quit. Interrupting while listening pauses to the text
+    # prompt and arms the double-press quit.
+    async def go():
+        app = CodeAgentApp(agent=FakeAgent([]), voice=FakeVoice())
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            app._voice_phase = "speaking"
+            app.action_quit_or_interrupt()
+            assert app._voice.cancels >= 1  # the readback was cancelled
+            assert app._voice_paused is False  # stayed in voice mode -> will re-listen
+            assert app._quit_pending is False  # stopping the readback isn't a quit step
+
+            app._voice_phase = "listening"
+            app.action_quit_or_interrupt()
+            assert app._voice_paused is True  # listening-interrupt brings the text prompt back
+            assert app._quit_pending is True  # paused to text -> a 2nd Ctrl-C quits
+
+    _run(go())
+
+
 def test_capture_voice_turn_is_a_noop_once_typed() -> None:
     async def go() -> None:
         voice = FakeVoice(transcripts=["ignored"])

From 0c8da5af077861a420d0829047321dde9c312ec3 Mon Sep 17 00:00:00 2001
From: Alex Kroman <alex@assemblyai.com>
Date: Thu, 18 Jun 2026 16:50:27 -0700
Subject: [PATCH 02/10] Strip JSON-Schema metadata keys gateway models reject
 from tool definitions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

assembly live (and assembly code) bind tools whose JSON-Schema `parameters` carry
`$schema`/`additionalProperties`/`title`. OpenAI ignores them, but Gemini's
function_declarations validation rejects them with an HTTP 400 ("Unknown name
\"$schema\""), so every tool-bound turn failed — the brain graph raised a non-CLIError,
the reply worker died silently, and the live agent never responded.

_GatewayChatOpenAI now strips those keys (recursively) from each tool's parameter
schema in the outgoing request, so a tool-bound request works on every gateway-routed
model. Verified end-to-end: the brain now replies on gemini-2.5-flash-lite. This is
what makes the gemini live default usable.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 aai_cli/code_agent/model.py | 36 +++++++++++++++++++++++++++++++++++-
 tests/test_code_model.py    | 32 ++++++++++++++++++++++++++++++++
 2 files changed, 67 insertions(+), 1 deletion(-)

diff --git a/aai_cli/code_agent/model.py b/aai_cli/code_agent/model.py
index 4e1c556..b8a8496 100644
--- a/aai_cli/code_agent/model.py
+++ b/aai_cli/code_agent/model.py
@@ -9,7 +9,7 @@
 from __future__ import annotations
 
 import json
-from collections.abc import Mapping
+from collections.abc import Iterable, Mapping
 from typing import TYPE_CHECKING
 
 from aai_cli.core import environments
@@ -158,6 +158,39 @@ def _is_empty_arguments(arguments: object) -> bool:
     return isinstance(parsed, dict) and not parsed
 
 
+# JSON-Schema metadata keys some gateway-routed models reject on tool definitions. OpenAI
+# ignores them, but Gemini's ``function_declarations`` 400 on ``$schema`` (and friends), which
+# kills any tool-bound turn — so strip them from every tool's parameter schema before sending.
+_UNSUPPORTED_SCHEMA_KEYS = ("$schema", "additionalProperties", "title")
+
+
+def _sanitize_tool_schemas(payload: object) -> None:
+    """Strip model-incompatible JSON-Schema keys from each tool's ``parameters``, in place."""
+    if not isinstance(payload, dict):
+        return
+    tools = payload.get("tools")
+    if not isinstance(tools, list):
+        return
+    for tool in tools:
+        function = tool.get("function") if isinstance(tool, dict) else None
+        if isinstance(function, dict):
+            _strip_schema_keys(function.get("parameters"))
+
+
+def _strip_schema_keys(node: object) -> None:
+    """Recursively drop :data:`_UNSUPPORTED_SCHEMA_KEYS` from a JSON-Schema-shaped structure."""
+    if isinstance(node, dict):
+        for key in _UNSUPPORTED_SCHEMA_KEYS:
+            node.pop(key, None)
+        children: Iterable[object] = list(node.values())
+    elif isinstance(node, list):
+        children = node
+    else:
+        return
+    for child in children:
+        _strip_schema_keys(child)
+
+
 def build_model(
     api_key: str,
     *,
@@ -201,6 +234,7 @@ def _get_request_payload(
             messages = payload.get("messages")
             _flatten_content(messages)
             _ensure_tool_call_arguments(messages)
+            _sanitize_tool_schemas(payload)
             return payload
 
         def _convert_chunk_to_generation_chunk(
diff --git a/tests/test_code_model.py b/tests/test_code_model.py
index 047f9bf..9899b20 100644
--- a/tests/test_code_model.py
+++ b/tests/test_code_model.py
@@ -173,6 +173,38 @@ def test_ensure_tool_call_arguments_guards() -> None:
     model_mod._ensure_tool_call_arguments([{"tool_calls": 99}])  # tool_calls not a list
 
 
+def test_sanitize_tool_schemas_strips_model_incompatible_keys() -> None:
+    # Gemini's function_declarations 400 on $schema/additionalProperties/title; strip them
+    # recursively from each tool's parameters so a tool-bound request works on every model.
+    city: dict[str, object] = {"type": "string", "title": "City"}  # held ref (nested dict)
+    any_of: list[object] = [{"$schema": "x", "type": "string"}]  # held ref (nested list)
+    params: dict[str, object] = {
+        "$schema": "https://json-schema.org/draft/2020-12/schema",
+        "type": "object",
+        "additionalProperties": False,
+        "title": "Args",
+        "properties": {"city": city},
+        "anyOf": any_of,
+    }
+    payload: dict[str, object] = {
+        "tools": [
+            None,  # non-dict tool -> skipped
+            {"type": "function", "function": 7},  # function not a dict -> skipped
+            {"type": "function", "function": {"name": "get_weather", "parameters": params}},
+        ]
+    }
+    model_mod._sanitize_tool_schemas(payload)
+    assert not ({"$schema", "additionalProperties", "title"} & set(params))  # top-level stripped
+    assert params["type"] == "object"  # real schema keys preserved
+    assert city == {"type": "string"}  # nested dict stripped
+    assert any_of == [{"type": "string"}]  # nested list stripped
+
+
+def test_sanitize_tool_schemas_guards() -> None:
+    model_mod._sanitize_tool_schemas(None)  # not a dict -> early return, no error
+    model_mod._sanitize_tool_schemas({"tools": 99})  # tools not a list -> early return
+
+
 def test_get_request_payload_fills_empty_tool_call_arguments() -> None:
     from langchain_core.messages import AIMessage, HumanMessage
     from langchain_openai import ChatOpenAI

From b22495d307301022f674c7c097da0e5786ede32b Mon Sep 17 00:00:00 2001
From: Alex Kroman <alex@assemblyai.com>
Date: Thu, 18 Jun 2026 17:04:51 -0700
Subject: [PATCH 03/10] Strip the full set of Gemini-rejected JSON-Schema
 keywords from tool definitions
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

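Expanding the earlier $schema/additionalProperties/title fix: the default MCP tools
carry more validation keywords that Gemini's function_declarations rejects
(exclusiveMinimum, exclusiveMaximum, multipleOf, patternProperties, …), each of which
fails a tool-bound turn with an HTTP 400. Strip every keyword on the expanded denylist;
keywords that are not on it (type, properties, items, required, enum, anyOf,
description, …) are left in place. Verified end-to-end: the live brain replies on
gemini-2.5-flash-lite with all 28 default MCP tools loaded.

Known limitation of this patch: the strip visits every nested dict, including
`properties` maps, so a tool parameter whose *name* is itself a denied keyword (for
example `title` or `default`) is removed along with the metadata.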

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 aai_cli/code_agent/model.py | 30 ++++++++++++++++++++---
 tests/test_code_model.py    | 49 +++++++++++++++++++++++++++----------
 2 files changed, 62 insertions(+), 17 deletions(-)

diff --git a/aai_cli/code_agent/model.py b/aai_cli/code_agent/model.py
index b8a8496..f2b7e50 100644
--- a/aai_cli/code_agent/model.py
+++ b/aai_cli/code_agent/model.py
@@ -158,10 +158,32 @@ def _is_empty_arguments(arguments: object) -> bool:
     return isinstance(parsed, dict) and not parsed
 
 
-# JSON-Schema metadata keys some gateway-routed models reject on tool definitions. OpenAI
-# ignores them, but Gemini's ``function_declarations`` 400 on ``$schema`` (and friends), which
-# kills any tool-bound turn — so strip them from every tool's parameter schema before sending.
-_UNSUPPORTED_SCHEMA_KEYS = ("$schema", "additionalProperties", "title")
+# JSON-Schema keywords some gateway-routed models reject on tool definitions. OpenAI ignores
+# them, but Gemini's ``function_declarations`` 400 on them ("Unknown name …"), which kills any
+# tool-bound turn. These are all validation/metadata keywords — stripping them leaves the
+# structural schema (type/properties/items/required/enum/anyOf/description/…) the model needs
+# to call the tool, so the call still works; only the unenforced constraints are dropped.
+_UNSUPPORTED_SCHEMA_KEYS = (
+    "$schema",
+    "$id",
+    "$comment",
+    "title",
+    "default",
+    "examples",
+    "const",
+    "additionalProperties",
+    "unevaluatedProperties",
+    "patternProperties",
+    "minProperties",
+    "maxProperties",
+    "propertyNames",
+    "exclusiveMinimum",
+    "exclusiveMaximum",
+    "multipleOf",
+    "additionalItems",
+    "unevaluatedItems",
+    "contains",
+)
 
 
 def _sanitize_tool_schemas(payload: object) -> None:
diff --git a/tests/test_code_model.py b/tests/test_code_model.py
index 9899b20..2cc8c70 100644
--- a/tests/test_code_model.py
+++ b/tests/test_code_model.py
@@ -174,17 +174,40 @@ def test_ensure_tool_call_arguments_guards() -> None:
 
 
 def test_sanitize_tool_schemas_strips_model_incompatible_keys() -> None:
-    # Gemini's function_declarations 400 on $schema/additionalProperties/title; strip them
-    # recursively from each tool's parameters so a tool-bound request works on every model.
-    city: dict[str, object] = {"type": "string", "title": "City"}  # held ref (nested dict)
-    any_of: list[object] = [{"$schema": "x", "type": "string"}]  # held ref (nested list)
+    # Gemini's function_declarations 400 on these validation/metadata keywords; strip every
+    # one (recursively) while keeping structural keys, so a tool-bound request works.
+    denied = [
+        "$schema",
+        "$id",
+        "$comment",
+        "title",
+        "default",
+        "examples",
+        "const",
+        "additionalProperties",
+        "unevaluatedProperties",
+        "patternProperties",
+        "minProperties",
+        "maxProperties",
+        "propertyNames",
+        "exclusiveMinimum",
+        "exclusiveMaximum",
+        "multipleOf",
+        "additionalItems",
+        "unevaluatedItems",
+        "contains",
+    ]
+    # Pin the shipped denylist against this list: a renamed/dropped key would silently leak an
+    # unsupported keyword to Gemini (and break a tool-bound turn).
+    assert set(model_mod._UNSUPPORTED_SCHEMA_KEYS) == set(denied)
+
+    nested: dict[str, object] = {"type": "string", **dict.fromkeys(denied, "x")}
+    inside_list: dict[str, object] = {"type": "number", **dict.fromkeys(denied, "x")}
     params: dict[str, object] = {
-        "$schema": "https://json-schema.org/draft/2020-12/schema",
         "type": "object",
-        "additionalProperties": False,
-        "title": "Args",
-        "properties": {"city": city},
-        "anyOf": any_of,
+        "properties": {"city": nested},  # nested dict
+        "anyOf": [inside_list],  # nested list
+        **dict.fromkeys(denied, "x"),
     }
     payload: dict[str, object] = {
         "tools": [
@@ -194,10 +217,10 @@ def test_sanitize_tool_schemas_strips_model_incompatible_keys() -> None:
         ]
     }
     model_mod._sanitize_tool_schemas(payload)
-    assert not ({"$schema", "additionalProperties", "title"} & set(params))  # top-level stripped
-    assert params["type"] == "object"  # real schema keys preserved
-    assert city == {"type": "string"}  # nested dict stripped
-    assert any_of == [{"type": "string"}]  # nested list stripped
+    assert not (set(denied) & set(params))  # every denied key stripped at the top level
+    assert params["type"] == "object"  # structural keys preserved
+    assert nested == {"type": "string"}  # nested dict fully stripped
+    assert inside_list == {"type": "number"}  # nested-in-list fully stripped
 
 
 def test_sanitize_tool_schemas_guards() -> None:

From b9fd7f6e7dae2986ffb16a79e0728ded9ba426c1 Mon Sep 17 00:00:00 2001
From: Alex Kroman <alex@assemblyai.com>
Date: Thu, 18 Jun 2026 17:16:31 -0700
Subject: [PATCH 04/10] assembly live: space replies in the transcript; silence
 firecrawl import warnings

- Give AssistantMessage a top margin in the live TUI so the greeting is separated from
  the splash and each reply is separated from the preceding user turn (scoped to the live
  app's CSS, so `assembly code` is unaffected).
- Suppress firecrawl-py's pydantic "Field name 'json'/'schema' shadows an attribute"
  UserWarnings at the runtime import site (pytest already filters them via pyproject);
  they otherwise leak into the user's terminal whenever a FIRECRAWL_API_KEY is set.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 aai_cli/agent_cascade/tui.py           |  2 ++
 aai_cli/code_agent/firecrawl_search.py | 10 +++++++++-
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/aai_cli/agent_cascade/tui.py b/aai_cli/agent_cascade/tui.py
index 90607d1..02b84d3 100644
--- a/aai_cli/agent_cascade/tui.py
+++ b/aai_cli/agent_cascade/tui.py
@@ -89,6 +89,8 @@ class LiveAgentApp(App[None]):
     #voicebar {{ dock: bottom; height: 3; background: #000000; border: round {banner.BRAND_HEX};
         margin: 1 1; content-align: center middle; }}
     #status {{ dock: bottom; height: 1; background: #000000; padding: 0 1; }}
+    /* Blank line above each agent reply (and the greeting), so turns don't run together. */
+    AssistantMessage {{ margin-top: 1; }}
     """
     TITLE = "AssemblyAI Live"
     ENABLE_COMMAND_PALETTE = False
diff --git a/aai_cli/code_agent/firecrawl_search.py b/aai_cli/code_agent/firecrawl_search.py
index e66be97..6358e98 100644
--- a/aai_cli/code_agent/firecrawl_search.py
+++ b/aai_cli/code_agent/firecrawl_search.py
@@ -11,6 +11,7 @@
 
 from __future__ import annotations
 
+import warnings
 from typing import TYPE_CHECKING
 
 from aai_cli.core import env
@@ -32,6 +33,13 @@ def build_web_search_tool() -> BaseTool | None:
     if not env.get(FIRECRAWL_API_KEY_ENV):
         return None
 
-    from langchain_firecrawl import FirecrawlSearch
+    with warnings.catch_warnings():
+        # firecrawl-py's pydantic models name fields ``json``/``schema``, which shadow
+        # BaseModel attributes and emit noisy UserWarnings on import. They're harmless and
+        # out of our control, so silence them at runtime (pytest filters them via pyproject).
+        warnings.filterwarnings(
+            "ignore", message="Field name .* shadows an attribute", category=UserWarning
+        )
+        from langchain_firecrawl import FirecrawlSearch
 
     return FirecrawlSearch()

From d9e4ffb03374d2d99a4a72c7246e76e441410c79 Mon Sep 17 00:00:00 2001
From: Alex Kroman <alex@assemblyai.com>
Date: Thu, 18 Jun 2026 17:28:41 -0700
Subject: [PATCH 05/10] assembly live: surface a failed reply turn instead of
 dying silently
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

When the brain graph failed mid-turn (a gateway 4xx/5xx, a tool raising, a recursion
limit), it raised a non-CLIError, _generate_reply only caught CLIError, and the reply
worker died on a daemon thread — so the agent announced an action ("I'll search…") and
then never came back, with no clue why.

brain._run_graph now converts any graph exception into a CLIError (re-raising CLIErrors
unchanged), and the cascade shows it in the transcript ("(error: …)") and records it,
instead of swallowing it. The user sees *why* a turn produced no answer.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 aai_cli/agent_cascade/brain.py     | 18 +++++++++++++++++-
 aai_cli/agent_cascade/engine.py    |  6 ++++++
 tests/test_agent_cascade_brain.py  | 29 +++++++++++++++++++++++++++++
 tests/test_agent_cascade_engine.py | 12 ++++++++----
 4 files changed, 60 insertions(+), 5 deletions(-)

diff --git a/aai_cli/agent_cascade/brain.py b/aai_cli/agent_cascade/brain.py
index 7f9b8d2..fdaa7e4 100644
--- a/aai_cli/agent_cascade/brain.py
+++ b/aai_cli/agent_cascade/brain.py
@@ -25,6 +25,7 @@
 from aai_cli.code_agent.fetch_tool import FETCH_TOOL_NAME
 from aai_cli.code_agent.firecrawl_search import WEB_SEARCH_TOOL_NAME
 from aai_cli.core import debuglog
+from aai_cli.core.errors import CLIError
 
 if TYPE_CHECKING:
     from langchain_core.tools import BaseTool
@@ -213,7 +214,22 @@ def _run_graph(
     is what makes a stalled spoken turn debuggable. The test fakes only implement
     ``invoke``, so they (and the non-verbose path) take the plain branch.
     """
-    graph_input = {"messages": conversation}
+    try:
+        return _drive_graph(graph, {"messages": conversation})
+    except CLIError:
+        raise
+    except Exception as exc:
+        # The graph can fail anywhere in the tool loop — a gateway 4xx/5xx, a tool raising,
+        # a langgraph recursion limit. Convert it to a CLIError so the cascade records and
+        # *surfaces* it (the engine shows it in the transcript) instead of the reply worker
+        # dying silently and the user getting no answer with no clue why.
+        raise CLIError(
+            f"the agent couldn't complete the turn: {exc}", error_type="agent_brain_error"
+        ) from exc
+
+
+def _drive_graph(graph: CompiledAgent, graph_input: dict[str, object]) -> dict[str, object]:
+    """Invoke the graph (or stream it under ``-v`` so :func:`_log_flow` can trace each step)."""
     if debuglog.active() and hasattr(graph, "stream"):
         last: dict[str, object] = {}
         seen = 0
diff --git a/aai_cli/agent_cascade/engine.py b/aai_cli/agent_cascade/engine.py
index af52f15..8d8118e 100644
--- a/aai_cli/agent_cascade/engine.py
+++ b/aai_cli/agent_cascade/engine.py
@@ -215,7 +215,13 @@ def _generate_reply(self) -> None:
         try:
             reply = self.deps.complete_reply(messages)
         except CLIError as exc:
+            # The reply leg failed (gateway/tool/graph error, now converted to a CLIError in
+            # brain._run_graph). Show it in the transcript so the turn doesn't just vanish —
+            # the user sees *why* there was no answer instead of silence.
             self._record_error(exc)
+            self.renderer.reply_started()
+            self.renderer.agent_transcript(f"(error: {exc.message})", interrupted=False)
+            self.renderer.reply_done(interrupted=False)
             return
         self.renderer.reply_started()
         spoken: list[str] = []
diff --git a/tests/test_agent_cascade_brain.py b/tests/test_agent_cascade_brain.py
index d8bee24..869aa02 100644
--- a/tests/test_agent_cascade_brain.py
+++ b/tests/test_agent_cascade_brain.py
@@ -10,6 +10,7 @@
 
 import logging
 
+import pytest
 from langchain_core.language_models.chat_models import BaseChatModel
 from langchain_core.messages import AIMessage, ToolMessage
 from langchain_core.outputs import ChatGeneration, ChatResult
@@ -17,6 +18,7 @@
 from aai_cli.agent_cascade import brain
 from aai_cli.agent_cascade.config import CascadeConfig
 from aai_cli.code_agent import model as model_mod
+from aai_cli.core.errors import CLIError
 
 
 class FakeChatModel(BaseChatModel):
@@ -261,6 +263,33 @@ def invoke(self, graph_input):
     assert completer([{"role": "user", "content": "hi"}]) == "from invoke"
 
 
+def test_run_graph_converts_graph_errors_to_cli_error():
+    # A graph failure (gateway 4xx/5xx, a tool raising, a recursion limit) must become a
+    # CLIError so the cascade surfaces it instead of the reply worker dying silently.
+    class _Boom:
+        def invoke(self, graph_input):
+            del graph_input
+            raise ValueError("bedrock said no")
+
+    completer = brain.build_completer("k", CascadeConfig(), graph=_Boom())
+    with pytest.raises(CLIError) as excinfo:
+        completer([{"role": "user", "content": "hi"}])
+    assert "couldn't complete the turn" in excinfo.value.message
+    assert "bedrock said no" in excinfo.value.message  # the cause is preserved for diagnosis
+
+
+def test_run_graph_passes_cli_error_through():
+    # A CLIError from the graph is already user-facing -> propagate as-is, not re-wrapped.
+    class _CliBoom:
+        def invoke(self, graph_input):
+            del graph_input
+            raise CLIError("already clean", error_type="x")
+
+    completer = brain.build_completer("k", CascadeConfig(), graph=_CliBoom())
+    with pytest.raises(CLIError, match="already clean"):
+        completer([{"role": "user", "content": "hi"}])
+
+
 def test_log_flow_ignores_non_list_messages():
     # Defensive: a snapshot without a messages list logs nothing and reports no progress.
     assert brain._log_flow({"messages": None}, 3) == 3
diff --git a/tests/test_agent_cascade_engine.py b/tests/test_agent_cascade_engine.py
index d113433..729f1e9 100644
--- a/tests/test_agent_cascade_engine.py
+++ b/tests/test_agent_cascade_engine.py
@@ -251,14 +251,18 @@ def test_generate_reply_stop_before_first_sentence_speaks_nothing():
     assert ("reply_done", True) in renderer.calls
 
 
-def test_generate_reply_llm_failure_is_recorded_and_aborts():
+def test_generate_reply_llm_failure_is_recorded_and_surfaced():
     def boom(messages):
+        del messages
         raise APIError("gateway down")
 
-    session, renderer, _player = make_session(complete_reply=boom)
+    session, renderer, player = make_session(complete_reply=boom)
     session._generate_reply()
-    assert isinstance(session.error, APIError)
-    assert ("reply_started",) not in renderer.calls  # aborted before speaking
+    assert isinstance(session.error, APIError)  # recorded for the exit path
+    # Surfaced in the transcript (not swallowed) but nothing is spoken — the turn aborts.
+    assert ("agent_transcript", "(error: gateway down)", False) in renderer.calls
+    assert ("reply_done", False) in renderer.calls  # the error line is closed off cleanly
+    assert player.enqueued == []
 
 
 def test_generate_reply_tts_failure_midway_is_recorded():

From 13c92701204b6323724420465d11e450b82ea726 Mon Sep 17 00:00:00 2001
From: Alex Kroman <alex@assemblyai.com>
Date: Thu, 18 Jun 2026 17:46:01 -0700
Subject: [PATCH 06/10] assembly code voice: never trap on Ctrl-C; assembly
 live default -> claude-haiku-4-5

- A second Ctrl-C now quits even mid-readback: the quit-pending check moved ahead of
  stopping voice, so a spoken turn can't trap you. (A Ctrl-C that cancels a running
  agent turn still takes precedence and clears the pending quit.) The first Ctrl-C (and
  Escape) still stops the readback and resumes listening; the second Ctrl-C exits.
  _stop_voice_activity returns None now (its result is no longer branched on).
- assembly live defaults to claude-haiku-4-5-20251001 (low latency for spoken turns);
  assembly code stays gpt-5.1. Config test + --help snapshot updated.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 aai_cli/agent_cascade/config.py               |  2 +-
 aai_cli/code_agent/tui.py                     | 20 +++----
 .../test_snapshots_help_run.ambr              |  3 +-
 tests/test_agent_cascade_config.py            |  2 +-
 tests/test_code_tui_voice.py                  | 55 +++++++++----------
 5 files changed, 40 insertions(+), 42 deletions(-)

diff --git a/aai_cli/agent_cascade/config.py b/aai_cli/agent_cascade/config.py
index ef8aa3d..bce18fc 100644
--- a/aai_cli/agent_cascade/config.py
+++ b/aai_cli/agent_cascade/config.py
@@ -16,7 +16,7 @@
 # `assembly live` defaults to a fast, low-latency gateway model (override with --model) —
 # a literal rather than llm.DEFAULT_MODEL so the live agent's default is independent of the
 # one-shot `assembly llm` default. Latency matters most for a spoken back-and-forth.
-DEFAULT_MODEL = "gemini-2.5-flash-lite"
+DEFAULT_MODEL = "claude-haiku-4-5-20251001"
 DEFAULT_MAX_TOKENS = llm.DEFAULT_MAX_TOKENS
 # The realtime model the cascade transcribes with (same as the agent-cascade template).
 DEFAULT_SPEECH_MODEL = "u3-rt-pro"
diff --git a/aai_cli/code_agent/tui.py b/aai_cli/code_agent/tui.py
index 7ca036e..099c64c 100644
--- a/aai_cli/code_agent/tui.py
+++ b/aai_cli/code_agent/tui.py
@@ -388,24 +388,23 @@ def _cancel_turn(self) -> bool:
         self._note("cancelling…")
         return True
 
-    def _stop_voice_activity(self) -> bool:
-        """Stop in-flight voice; True if voice was active.
+    def _stop_voice_activity(self) -> None:
+        """Stop in-flight voice (a no-op when none is active).
 
         Interrupting the readback (speaking) stops it and resumes listening — the cancelled
         speak() returns and the loop captures the next turn. Interrupting while listening
         pauses voice to the text prompt, after which a second press falls through to quit.
         """
         if self._voice is None or not self._voice_active():
-            return False
+            return
         self._voice.cancel()
         if self._voice_phase == "speaking":  # stop talking, stay in voice mode -> re-listen
             self._note("stopped — listening…")
-            return True
+            return
         self._voice_paused = True
         self._refresh_status()
         self._sync_input_mode()  # active leg stopped -> bring the text prompt back
         self._note("voice interrupted (Ctrl-V to talk again)")
-        return True
 
     def action_interrupt(self) -> None:
         """Escape: interrupt a running agent turn or in-flight voice (a no-op when idle)."""
@@ -417,14 +416,13 @@ def action_quit_or_interrupt(self) -> None:
         if self._cancel_turn():
             self._quit_pending = False
             return
-        if self._stop_voice_activity():
-            if self._voice_paused:  # paused to text -> a 2nd Ctrl-C quits; re-listening doesn't
-                self._arm_quit_pending()
-            return
+        # A second press always quits — checked before stopping voice so a spoken turn can
+        # never trap you (the first press stops the readback and arms; the second exits).
         if self._quit_pending:
             self.exit()
-        else:
-            self._arm_quit_pending()
+            return
+        self._stop_voice_activity()  # stop a readback/listen if one's active (a no-op otherwise)
+        self._arm_quit_pending()
 
     def _arm_quit_pending(self) -> None:
         """Arm Ctrl-C double-press-to-quit, showing a hint that expires after a few seconds."""
diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr
index 710b3bb..9d55009 100644
--- a/tests/__snapshots__/test_snapshots_help_run.ambr
+++ b/tests/__snapshots__/test_snapshots_help_run.ambr
@@ -667,7 +667,8 @@
   ╭─ Language model ─────────────────────────────────────────────────────────────╮
   │ --model             TEXT                  LLM Gateway model that powers the  │
   │                                           agent's replies                    │
-  │                                           [default: gemini-2.5-flash-lite]   │
+  │                                           [default:                          │
+  │                                           claude-haiku-4-5-20251001]         │
   │ --max-tokens        INTEGER RANGE [x>=1]  Max tokens per reply               │
   │                                           [default: 8192]                    │
   │ --llm-config        TEXT                  Set any LLM Gateway request field  │
diff --git a/tests/test_agent_cascade_config.py b/tests/test_agent_cascade_config.py
index f8b06a8..e722fca 100644
--- a/tests/test_agent_cascade_config.py
+++ b/tests/test_agent_cascade_config.py
@@ -19,7 +19,7 @@
 def test_default_config_values():
     config = CascadeConfig()
     assert config.voice == DEFAULT_VOICE
-    assert config.model == DEFAULT_MODEL == "gemini-2.5-flash-lite"  # `assembly live` default
+    assert config.model == DEFAULT_MODEL == "claude-haiku-4-5-20251001"  # `assembly live` default
     assert config.greeting == DEFAULT_GREETING
     # The sliding-window default keeps the last 40 messages of context.
     assert config.max_history == 40
diff --git a/tests/test_code_tui_voice.py b/tests/test_code_tui_voice.py
index 459e3e4..ddc3226 100644
--- a/tests/test_code_tui_voice.py
+++ b/tests/test_code_tui_voice.py
@@ -140,24 +140,26 @@ async def go() -> None:
     assert calls == [True]  # running -> the finished turn is handled
 
 
-def test_interrupt_while_speaking_resumes_listening_not_text():  # untyped: probes app internals
-    # Ctrl-C during the readback (speaking) cancels it but stays in voice mode — the loop
-    # re-listens — and doesn't arm the quit. Interrupting while listening pauses to the text
-    # prompt and arms the double-press quit.
+def test_interrupt_during_speaking_stops_readback_and_ctrl_c_can_always_quit():  # untyped: internals
+    # Both Escape and Ctrl-C stop the readback and re-listen (not text); Ctrl-C also arms the
+    # quit, and a SECOND Ctrl-C exits even mid-speech — so a spoken turn can never trap you.
     async def go():
         app = CodeAgentApp(agent=FakeAgent([]), voice=FakeVoice())
+        exited: list[bool] = []
+        app.exit = lambda *a, **k: exited.append(True)  # capture the quit without tearing down
         async with app.run_test(size=(100, 30)) as pilot:
             await pilot.pause()
             app._voice_phase = "speaking"
-            app.action_quit_or_interrupt()
-            assert app._voice.cancels >= 1  # the readback was cancelled
-            assert app._voice_paused is False  # stayed in voice mode -> will re-listen
-            assert app._quit_pending is False  # stopping the readback isn't a quit step
+            app.action_interrupt()  # Escape
+            assert app._voice.cancels >= 1 and app._voice_paused is False  # stopped, re-listens
+            assert app._quit_pending is False  # Escape never quits
 
-            app._voice_phase = "listening"
-            app.action_quit_or_interrupt()
-            assert app._voice_paused is True  # listening-interrupt brings the text prompt back
-            assert app._quit_pending is True  # paused to text -> a 2nd Ctrl-C quits
+            app._voice_phase = "speaking"
+            app.action_quit_or_interrupt()  # Ctrl-C
+            assert app._voice.cancels >= 2 and app._quit_pending is True  # stopped + armed
+            assert exited == []
+            app.action_quit_or_interrupt()  # second Ctrl-C
+            assert exited == [True]  # quits even mid-speech — never trapped
 
     _run(go())
 
@@ -425,25 +427,22 @@ async def go() -> None:
     _run(go())
 
 
-def test_ctrl_c_on_active_voice_interrupts_even_when_a_quit_was_pending(
+def test_ctrl_c_quits_when_a_quit_is_pending_even_with_active_voice(
     monkeypatch: pytest.MonkeyPatch,
 ) -> None:
-    # Stopping active voice takes priority over a pending quit: a Ctrl-C that lands while the
-    # agent is listening/speaking interrupts the voice and never quits, even if the quit hint
-    # was already armed from an earlier press.
+    # A pending quit takes priority over active voice: a second Ctrl-C (quit already armed)
+    # exits even while the agent is listening/speaking — otherwise a voice turn could trap
+    # the user with no way out.
     async def go() -> None:
-        voice = FakeVoice()
-        app = CodeAgentApp(agent=FakeAgent([]), voice=voice)
-        app._voice_paused = True
+        app = CodeAgentApp(agent=FakeAgent([]), voice=FakeVoice())
         async with app.run_test(size=(100, 30)) as pilot:
             await pilot.pause()
             exited: list[bool] = []
             monkeypatch.setattr(app, "exit", lambda *a, **k: exited.append(True))
-            app._voice_paused = False  # voice active (listening)
-            app._quit_pending = True  # a quit hint was already armed
-            app.action_quit_or_interrupt()  # Ctrl-C: interrupt the voice, do NOT quit
-            assert voice.cancels == 1
-            assert exited == []  # active voice is interrupted, never quit
+            app._voice_paused = False  # voice active (listening/speaking)
+            app._quit_pending = True  # a quit hint was already armed by a prior press
+            app.action_quit_or_interrupt()  # Ctrl-C: with quit armed, exit
+            assert exited == [True]  # quits — never trapped
 
     _run(go())
 
@@ -466,20 +465,20 @@ async def go() -> None:
 
 
 def test_stop_voice_activity_is_a_noop_when_voice_inactive() -> None:
-    # No voice session, or a paused one, is not "active": the interrupt defers to the quit path
-    # rather than cancelling anything.
+    # No voice session, or a paused one, is not "active": _stop_voice_activity cancels nothing
+    # (and doesn't crash on the missing session), so the interrupt defers to the quit path.
     async def go() -> None:
         no_voice = CodeAgentApp(agent=FakeAgent([]))
         async with no_voice.run_test(size=(100, 30)) as pilot:
             await pilot.pause()
-            assert no_voice._stop_voice_activity() is False  # nothing to stop
+            no_voice._stop_voice_activity()  # no voice session -> no-op, no error
 
         voice = FakeVoice()
         paused = CodeAgentApp(agent=FakeAgent([]), voice=voice)
         paused._voice_paused = True
         async with paused.run_test(size=(100, 30)) as pilot:
             await pilot.pause()
-            assert paused._stop_voice_activity() is False  # paused -> inactive
+            paused._stop_voice_activity()  # paused -> inactive
             assert voice.cancels == 0  # a paused session is never cancelled
 
     _run(go())

From 169946ca568383011a1653078ea06b78692098a9 Mon Sep 17 00:00:00 2001
From: Alex Kroman <alex@assemblyai.com>
Date: Thu, 18 Jun 2026 17:53:44 -0700
Subject: [PATCH 07/10] assembly live: bound MCP tool loading with a timeout so
 a slow server can't hang startup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Each MCP server was loaded with an unbounded asyncio.run(get_tools()); a slow/hung server
(npx/uvx cold-start, an unreachable host) blocked `assembly live` startup indefinitely,
and a Ctrl-C in that window triggered langchain-mcp-adapters' cancel-time crash. Wrap the
fetch in asyncio.wait_for(timeout=15s) — a server that won't list its tools in time is
cancelled and skipped (_safe_load turns the TimeoutError into []).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 aai_cli/agent_cascade/mcp_tools.py | 14 ++++++++++++--
 tests/test_agent_cascade_mcp.py    | 19 +++++++++++++++++++
 2 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/aai_cli/agent_cascade/mcp_tools.py b/aai_cli/agent_cascade/mcp_tools.py
index 1086f94..3c65bd9 100644
--- a/aai_cli/agent_cascade/mcp_tools.py
+++ b/aai_cli/agent_cascade/mcp_tools.py
@@ -112,13 +112,23 @@ def _to_connection(spec: ServerSpec) -> Connection:
     return {"transport": "stdio", "command": str(spec["command"]), "args": args, "env": env}
 
 
+# A server that hasn't listed its tools within this window is skipped, so a slow or hung
+# MCP server (npx/uvx cold-start, an unreachable host) can't block `assembly live` startup.
+_LOAD_TIMEOUT_S = 15.0  # pragma: no mutate — a tuning knob; ±a few seconds is equivalent
+
+
 def _load_server(name: str, conn: Connection) -> list[BaseTool]:
-    """Connect to one MCP server and return its tools (drives the async adapter)."""
+    """Connect to one MCP server and return its tools, bounded by :data:`_LOAD_TIMEOUT_S`.
+
+    The timeout is what keeps a slow/hung server from hanging startup forever — on timeout
+    the fetch is cancelled, ``asyncio.run`` raises ``TimeoutError``, and :func:`_safe_load`
+    turns that into an empty toolset (the server is simply skipped).
+    """
     from langchain_mcp_adapters.client import MultiServerMCPClient
 
     async def _fetch() -> list[BaseTool]:
         client = MultiServerMCPClient({name: conn})
-        return await client.get_tools()
+        return await asyncio.wait_for(client.get_tools(), timeout=_LOAD_TIMEOUT_S)
 
     return asyncio.run(_fetch())
 
diff --git a/tests/test_agent_cascade_mcp.py b/tests/test_agent_cascade_mcp.py
index f7d1fb0..178708d 100644
--- a/tests/test_agent_cascade_mcp.py
+++ b/tests/test_agent_cascade_mcp.py
@@ -150,6 +150,25 @@ def boom(name, conn) -> list:
     assert mcp_tools._safe_load(boom, "s", {"command": "x"}) == []
 
 
+def test_load_server_times_out_on_a_slow_server(monkeypatch):
+    # A server that won't list its tools within the timeout is cancelled, so it can't hang
+    # `assembly live` startup forever; _safe_load then turns the TimeoutError into [].
+    import asyncio
+
+    class _SlowClient:
+        def __init__(self, connections):
+            del connections
+
+        async def get_tools(self):
+            await asyncio.sleep(10)  # never finishes before the (patched) timeout
+            return []
+
+    monkeypatch.setattr("langchain_mcp_adapters.client.MultiServerMCPClient", _SlowClient)
+    monkeypatch.setattr(mcp_tools, "_LOAD_TIMEOUT_S", 0.05)
+    with pytest.raises(TimeoutError):
+        mcp_tools._load_server("slow", mcp_tools._to_connection({"command": "x"}))
+
+
 # --- _resolve_mcp_servers (the default set + --mcp-config merge) --------------
 
 

From fe9e60a5f0f42751c8af5c5554dcd3b0c9e631b9 Mon Sep 17 00:00:00 2001
From: Alex Kroman <alex@assemblyai.com>
Date: Thu, 18 Jun 2026 18:25:09 -0700
Subject: [PATCH 08/10] assembly live: slim agent to web search only + keyboard
 interrupt to pause speech
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two focused changes to the `assembly live` voice agent (still deepagents-based):

Slim the toolset to just Firecrawl web search. A low-latency spoken turn does
best with one obvious tool rather than a large menu it has to choose among — the
big toolset (URL fetch, docs MCP, and a curated 5-server default MCP set) made
the model narrate "I'll search…" without ever calling anything, and bloated
every request with tool schemas. build_live_tools now returns only the web-search
tool (when FIRECRAWL_API_KEY is set), and no MCP servers load by default
(--mcp-config stays as a strictly opt-in power-user knob; default_servers is
removed). The prompt's capability builder is trimmed to match.

Wire Escape/Ctrl-C to pause speech and return to listening. A new
CascadeSession.interrupt_reply signals the in-flight reply to stop (sets the stop
flag + flushes audio) WITHOUT joining the worker — a UI-thread join would
deadlock against the worker's call_from_thread render hops. run_cascade gains an
on_session hook so the live TUI captures the session and binds Escape (interrupt)
and Ctrl-C (interrupt while speaking, else quit); Ctrl-Q always quits as the
guaranteed escape hatch.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 REFERENCE.md                                  | 24 ++---
 aai_cli/agent_cascade/brain.py                | 50 +++++-----
 aai_cli/agent_cascade/engine.py               | 31 ++++++-
 aai_cli/agent_cascade/mcp_tools.py            | 35 ++-----
 aai_cli/agent_cascade/tui.py                  | 46 ++++++++--
 aai_cli/commands/agent_cascade/__init__.py    | 16 ++--
 aai_cli/commands/agent_cascade/_exec.py       | 34 ++++---
 aai_cli/core/llm.py                           | 26 +++++-
 .../test_snapshots_help_run.ambr              | 19 ++--
 tests/test_agent_cascade_brain.py             | 53 ++++-------
 tests/test_agent_cascade_command.py           |  8 +-
 tests/test_agent_cascade_engine.py            | 49 +++++++++-
 tests/test_agent_cascade_mcp.py               | 41 ++-------
 tests/test_live_tui.py                        | 92 ++++++++++++++++++-
 tests/test_llm_command.py                     | 31 +++++++
 15 files changed, 358 insertions(+), 197 deletions(-)

diff --git a/REFERENCE.md b/REFERENCE.md
index 801ef66..304ba45 100644
--- a/REFERENCE.md
+++ b/REFERENCE.md
@@ -143,21 +143,17 @@ The two are mutually exclusive.
 ## Live agent tools (MCP)
 
 `assembly live` answers each spoken turn with a tool-using agent, so it can reach
-external tools mid-conversation. Out of the box it loads its built-in URL fetch,
-the AssemblyAI docs, and a curated, no-auth MCP toolset: `time` and `fetch`
-(`uvx`), `memory` and `filesystem` (`npx`, the latter rooted at the working
-directory), and an NWS-backed `weather` server.
-
-Firecrawl web search also loads when a `FIRECRAWL_API_KEY` is set; without it the
-session prints a one-line notice and runs without web search (every other default
-tool needs no key).
-
-`--mcp-config FILE` adds your own servers on top of the defaults, from a standard
-`mcpServers` JSON file — the same
+external tools mid-conversation. Its toolset is deliberately small — a low-latency
+spoken turn does best with one obvious tool rather than a large menu to choose
+among — so its one built-in tool is Firecrawl web search. It loads when a
+`FIRECRAWL_API_KEY` is set; without it the session prints a one-line notice and
+runs from the model's own knowledge (no web search).
+
+`--mcp-config FILE` adds your own MCP servers (none load by default), from a
+standard `mcpServers` JSON file — the same
 `{"mcpServers": {"name": {"command": "…", "args": […]}}}` shape Claude Desktop and
-Claude Code use. Repeat the flag to merge several files; a later file (or a config
-entry sharing a default's name) wins on a clash. Remote servers use `{"url": "…"}`
-instead of `command`/`args`.
+Claude Code use. Repeat the flag to merge several files; a later file wins on a
+name clash. Remote servers use `{"url": "…"}` instead of `command`/`args`.
 
 Each server is launched independently and best-effort: one that won't start (a
 missing `npx`/`uvx`, an offline host) drops only its own tools, so a single broken
diff --git a/aai_cli/agent_cascade/brain.py b/aai_cli/agent_cascade/brain.py
index fdaa7e4..8df7b2d 100644
--- a/aai_cli/agent_cascade/brain.py
+++ b/aai_cli/agent_cascade/brain.py
@@ -1,9 +1,10 @@
 """Deepagents-powered reply brain for the live voice cascade.
 
 `assembly live` answers each spoken turn with a deepagents graph instead of a single
-LLM completion, so the agent can transparently reach for tools — web search, URL
-fetch, the AssemblyAI docs — mid-conversation, mimicking a live multimodal assistant
-(the "talk to Gemini Live" experience). The graph is built once per session
+LLM completion, so the agent can transparently reach for a tool — web search —
+mid-conversation, mimicking a live multimodal assistant (the "talk to Gemini Live"
+experience). The toolset is deliberately minimal: a low-latency spoken turn does best
+with one obvious tool rather than a menu it has to choose among. The graph is built once per session
 (:func:`build_graph`) and invoked statelessly per turn with the running history the
 cascade already keeps (:func:`build_completer`); tools are read-only and auto-approved,
 because a spoken turn can't pause for a keyboard confirmation, and the system prompt
@@ -22,7 +23,6 @@
 
 from aai_cli.agent_cascade.config import CascadeConfig
 from aai_cli.code_agent.agent import CompiledAgent
-from aai_cli.code_agent.fetch_tool import FETCH_TOOL_NAME
 from aai_cli.code_agent.firecrawl_search import WEB_SEARCH_TOOL_NAME
 from aai_cli.core import debuglog
 from aai_cli.core.errors import CLIError
@@ -71,21 +71,17 @@ def _join_clause(parts: list[str]) -> str:
 
 
 def _tool_capabilities(tools: Sequence[BaseTool]) -> list[str]:
-    """The spoken-capability phrases backed by an actually-present tool.
+    """The spoken-capability phrase backed by a present built-in tool.
 
-    Derived from the resolved tool names so the prompt never advertises a capability the
-    agent can't perform: web search is present only with a ``FIRECRAWL_API_KEY``, and the
-    docs tools are best-effort (absent when the docs host is unreachable).
+    The live agent's only built-in tool is Firecrawl web search, bound just when a
+    ``FIRECRAWL_API_KEY`` is set — so the prompt advertises web search only when the agent
+    can really do it. Advertising a tool it doesn't have made it announce an action ("I'll
+    search…") it then couldn't take, leaving the turn with no answer.
     """
     names = {tool.name for tool in tools}
-    capabilities: list[str] = []
     if WEB_SEARCH_TOOL_NAME in names:
-        capabilities.append("search the web for current or unfamiliar facts")
-    if FETCH_TOOL_NAME in names:
-        capabilities.append("fetch a specific URL")
-    if names - {WEB_SEARCH_TOOL_NAME, FETCH_TOOL_NAME}:
-        capabilities.append("look up the AssemblyAI documentation")
-    return capabilities
+        return ["search the web for current or unfamiliar facts"]
+    return []
 
 
 def _extra_capability(extra_tools: Sequence[BaseTool]) -> str | None:
@@ -129,24 +125,20 @@ def build_system_prompt(
 
 
 def build_live_tools() -> list[BaseTool]:
-    """The live agent's read-only toolset: URL fetch, web search (if keyed), and docs.
-
-    All three are reused from the coding agent's tool modules. Unlike there they are
-    *not* approval-gated — a spoken turn can't wait for a keyboard confirmation, so the
-    live agent only gets read-only tools and runs them automatically. Web search is
-    present only when ``FIRECRAWL_API_KEY`` is set; the docs MCP is best-effort (an empty
-    list when the host is unreachable), so neither blocks a session.
+    """The live agent's single read-only tool: Firecrawl web search (only when keyed).
+
+    Deliberately minimal. A low-latency spoken turn does best with one obvious tool rather
+    than a large menu it has to choose among — a big toolset made the model narrate "I'll
+    search…" without ever calling anything, and bloated every request with tool schemas.
+    Web search is the one capability worth the round-trip; everything else the agent answers
+    from its own knowledge. The tool is reused (un-approval-gated) from the coding agent and
+    is present only when ``FIRECRAWL_API_KEY`` is set, so an unkeyed session simply runs
+    tool-free. Extra tools remain strictly opt-in via ``--mcp-config``.
     """
-    from aai_cli.code_agent.docs_mcp import load_docs_tools
-    from aai_cli.code_agent.fetch_tool import build_fetch_tool
     from aai_cli.code_agent.firecrawl_search import build_web_search_tool
 
-    tools: list[BaseTool] = [build_fetch_tool()]
     search = build_web_search_tool()
-    if search is not None:
-        tools.append(search)
-    tools.extend(load_docs_tools())
-    return tools
+    return [search] if search is not None else []
 
 
 def build_graph(
diff --git a/aai_cli/agent_cascade/engine.py b/aai_cli/agent_cascade/engine.py
index 8d8118e..ea1774e 100644
--- a/aai_cli/agent_cascade/engine.py
+++ b/aai_cli/agent_cascade/engine.py
@@ -194,6 +194,23 @@ def _barge_in(self) -> None:
             self.player.flush()
         self._join_reply()
 
+    def interrupt_reply(self) -> bool:
+        """Signal an in-flight reply to stop, without waiting for it; True if one was playing.
+
+        The UI-thread-safe counterpart to a spoken barge-in: the live TUI's Escape/Ctrl-C
+        calls this to silence the agent mid-reply without the user having to talk over it.
+        Flushing the queued audio stops speech at once; the reply worker then sees the stop
+        flag, unwinds on its own, and emits ``reply_done`` so the front-end returns to
+        listening (the STT loop keeps running, so the next spoken turn is handled normally).
+        It deliberately does *not* join the worker — a join from the UI thread would deadlock
+        against the worker's own ``call_from_thread`` render hops.
+        """
+        playing = self._reply is not None and self._reply.is_alive()
+        if playing:
+            self._stop.set()
+            self.player.flush()
+        return playing
+
     def _join_reply(self) -> None:
         """Wait for the current reply worker (if any) to unwind, then drop the handle."""
         worker = self._reply
@@ -270,14 +287,24 @@ def _is_final_turn(event: object, *, format_turns: bool) -> bool:
 
 
 def run_cascade(
-    *, renderer: Renderer, player: Player, config: CascadeConfig, deps: CascadeDeps
+    *,
+    renderer: Renderer,
+    player: Player,
+    config: CascadeConfig,
+    deps: CascadeDeps,
+    on_session: Callable[[CascadeSession], None] | None = None,
 ) -> None:
     """Run one terminal cascade conversation until STT closes or the user stops.
 
     Greets, then pumps STT turns through the LLM+TTS reply path. A recorded leg
-    failure is re-raised here so the command exits with the right code.
+    failure is re-raised here so the command exits with the right code. ``on_session`` is
+    handed the freshly built session before the conversation starts, so a front-end (the
+    live TUI) can grab a handle to it — e.g. to wire a keyboard interrupt to
+    :meth:`CascadeSession.interrupt_reply`.
     """
     session = CascadeSession(deps=deps, renderer=renderer, player=player, config=config)
+    if on_session is not None:
+        on_session(session)
     player.start()
     try:
         session.greet()
diff --git a/aai_cli/agent_cascade/mcp_tools.py b/aai_cli/agent_cascade/mcp_tools.py
index 3c65bd9..a864a94 100644
--- a/aai_cli/agent_cascade/mcp_tools.py
+++ b/aai_cli/agent_cascade/mcp_tools.py
@@ -3,16 +3,13 @@
 The live voice agent's brain is a deepagents graph, so any Model Context Protocol
 server's tools can be threaded into it through ``langchain-mcp-adapters`` — the same
 adapter `docs_mcp.py` uses for the hosted AssemblyAI docs. This lets a spoken
-conversation reach real tools (clock, weather, memory, a notes folder, …), bringing
-`assembly live` toward Gemini-Live / ChatGPT-voice parity.
+conversation reach real tools (a clock, a notes folder, …), bringing `assembly live`
+toward Gemini-Live / ChatGPT-voice parity.
 
-Two entry points feed the brain:
-
-- :func:`default_servers` returns a curated, zero/low-auth set (time, fetch, memory,
-  filesystem, weather) that every live session loads out of the box.
-- :func:`parse_mcp_config` reads one or more standard ``mcpServers`` JSON files — the
-  exact shape Claude Desktop / Claude Code use — so an existing config drops in
-  unchanged and can extend or override the defaults.
+The live agent ships with only its built-in Firecrawl web-search tool; MCP servers are
+**strictly opt-in** (a low-latency spoken turn does best with a small toolset).
+:func:`parse_mcp_config` reads one or more standard ``mcpServers`` JSON files — the exact
+shape Claude Desktop / Claude Code use — so an existing config drops in unchanged.
 
 Launching a server is **best-effort per server**: a missing ``npx``/``uvx`` or an
 offline run skips that one server (the others still load) rather than aborting the
@@ -42,26 +39,6 @@
 Loader = Callable[[str, "Connection"], "list[BaseTool]"]
 
 
-def default_servers(filesystem_root: Path) -> dict[str, ServerSpec]:
-    """The curated server set every live session loads: zero/low-auth, fast, speakable.
-
-    Every entry is a published reference server runnable with no API key:
-    ``time``/``fetch`` over ``uvx`` (PyPI), ``memory``/``filesystem`` over ``npx`` (npm),
-    and an NWS-backed ``weather`` server. ``filesystem`` is rooted at ``filesystem_root``
-    (the working directory) so "summarize my notes file" stays scoped to one folder.
-    """
-    return {
-        "time": {"command": "uvx", "args": ["mcp-server-time"]},
-        "fetch": {"command": "uvx", "args": ["mcp-server-fetch"]},
-        "memory": {"command": "npx", "args": ["-y", "@modelcontextprotocol/server-memory"]},
-        "filesystem": {
-            "command": "npx",
-            "args": ["-y", "@modelcontextprotocol/server-filesystem", str(filesystem_root)],
-        },
-        "weather": {"command": "npx", "args": ["-y", "@h1deya/mcp-server-weather"]},
-    }
-
-
 def parse_mcp_config(paths: Sequence[Path]) -> dict[str, ServerSpec]:
     """Merge the ``mcpServers`` maps from one or more standard MCP config JSON files.
 
diff --git a/aai_cli/agent_cascade/tui.py b/aai_cli/agent_cascade/tui.py
index 02b84d3..ec2bcbc 100644
--- a/aai_cli/agent_cascade/tui.py
+++ b/aai_cli/agent_cascade/tui.py
@@ -35,8 +35,8 @@
 # Splash intro copy (the code agent's banner copy is code-specific, so `live` carries its own).
 _READY_LINE = "Listening… start talking when you're ready."
 _TIP_LINE = "Use headphones — the mic stays open while the agent speaks."
-# The one-line footer: a hands-free session, so the only control is quit.
-_STATUS_LINE = "Ctrl-C to quit"
+# The one-line footer: a hands-free session, so the controls are interrupt-and-quit.
+_STATUS_LINE = "Esc/Ctrl-C to interrupt · Ctrl-Q to quit"
 
 
 class _TuiRenderer:
@@ -94,10 +94,14 @@ class LiveAgentApp(App[None]):
     """
     TITLE = "AssemblyAI Live"
     ENABLE_COMMAND_PALETTE = False
-    # Ctrl-C / Ctrl-Q both stop the session; there is no turn to interrupt and nothing to type,
-    # so a single press quits (closing the audio unblocks the cascade worker).
+    # Escape and Ctrl-C interrupt a playing reply (silence it and drop back to listening),
+    # the same as talking over the agent — so you can stop a long answer without speaking.
+    # When nothing is speaking, Ctrl-C quits; Ctrl-Q always quits (the guaranteed escape
+    # hatch, so a stuck reply can never trap the session). Quitting closes the audio, which
+    # unblocks the cascade worker.
     BINDINGS: ClassVar = [
-        ("ctrl+c", "stop", "Quit"),
+        ("escape", "interrupt", "Interrupt"),
+        ("ctrl+c", "interrupt_or_quit", "Interrupt / Quit"),
         ("ctrl+q", "stop", "Quit"),
     ]
 
@@ -112,6 +116,9 @@ def __init__(
         self._run_conversation = run_conversation  # blocking; runs the cascade given a Renderer
         self._on_stop = on_stop  # closes the audio so a quit unblocks the cascade worker
         self._web_note = web_note
+        # The cascade's reply-interrupt, wired once its session exists (see set_interrupt);
+        # None until then, so an early Escape is a no-op and an early Ctrl-C just quits.
+        self._interrupt: Callable[[], bool] | None = None
         self._voice_phase = "listening"
         self._voice_frames = itertools.cycle(tui_status.VOICE_FRAMES)
         self._voice_timer: Timer | None = None
@@ -243,10 +250,35 @@ def _mount(self, widget: Static) -> None:
     def _scroll_end(self) -> None:
         self.query_one("#log", VerticalScroll).scroll_end(animate=False)  # pragma: no mutate
 
-    # --- quit -----------------------------------------------------------------
+    # --- interrupt / quit -----------------------------------------------------
+
+    def set_interrupt(self, interrupt: Callable[[], bool]) -> None:
+        """Wire the session's reply-interrupt once the cascade has built its session.
+
+        Called from the cascade worker thread (via ``run_cascade``'s ``on_session``); it only
+        stores a callable reference, so no UI hop is needed.
+        """
+        self._interrupt = interrupt
+
+    def action_interrupt(self) -> None:
+        """Escape: silence a playing reply and return to listening (a no-op when idle)."""
+        self._do_interrupt()
+
+    def action_interrupt_or_quit(self) -> None:
+        """Ctrl-C: silence a playing reply and keep listening; quit when nothing is speaking."""
+        if not self._do_interrupt():
+            self.action_stop()
+
+    def _do_interrupt(self) -> bool:
+        """Fire the session's reply-interrupt; True if a reply was playing.
+
+        The reply worker then unwinds and emits ``reply_done``, so the renderer is what
+        returns the voice bar to listening — this only has to signal the stop.
+        """
+        return self._interrupt is not None and self._interrupt()
 
     def action_stop(self) -> None:
-        """Ctrl-C / Ctrl-Q: stop the audio (unblocking the cascade worker) and exit."""
+        """Ctrl-Q (or Ctrl-C when idle): stop the audio (unblocking the worker) and exit."""
         self._teardown()
         self.exit()
 
diff --git a/aai_cli/commands/agent_cascade/__init__.py b/aai_cli/commands/agent_cascade/__init__.py
index 97fcb8f..afa46c3 100644
--- a/aai_cli/commands/agent_cascade/__init__.py
+++ b/aai_cli/commands/agent_cascade/__init__.py
@@ -58,7 +58,7 @@ def _emit_voice_list(_state: AppState, json_mode: bool) -> None:
                 'assembly --sandbox live --system-prompt "You are a terse pirate."',
             ),
             (
-                "Add your own MCP servers on top of the defaults",
+                "Add your own MCP servers (none load by default)",
                 "assembly --sandbox live --mcp-config ~/.config/mcp/servers.json",
             ),
             ("See available voices", "assembly --sandbox live --list-voices"),
@@ -162,7 +162,7 @@ def live(
     mcp_config: list[Path] | None = typer.Option(
         None,
         "--mcp-config",
-        help='Extra MCP servers config JSON ({"mcpServers": {…}}) on top of the defaults (repeatable)',
+        help='MCP servers config JSON ({"mcpServers": {…}}) to add (repeatable; none load by default)',
         exists=True,
         dir_okay=False,
         rich_help_panel=_PANEL_TOOLS,
@@ -200,12 +200,12 @@ def live(
     This only runs a conversation in the terminal — it writes no code. To build
     an agent-cascade app, run 'assembly init agent-cascade' instead.
 
-    By default the agent loads a curated, no-auth MCP toolset (time, fetch,
-    memory, filesystem, weather) alongside its built-in URL fetch and AssemblyAI
-    docs. Firecrawl web search also loads when a FIRECRAWL_API_KEY is set (you'll
-    get a one-line notice when it isn't). Add your own servers with --mcp-config,
-    pointing at any standard mcpServers JSON file. A server that won't start is
-    skipped, so one broken tool never sinks the session.
+    The agent keeps a deliberately small toolset for low-latency spoken turns: its
+    one built-in tool is Firecrawl web search, which loads when a FIRECRAWL_API_KEY
+    is set (you'll get a one-line notice when it isn't). Add your own MCP servers
+    with --mcp-config, pointing at any standard mcpServers JSON file (none load by
+    default). A server that won't start is skipped, so one broken tool never sinks
+    the session.
     """
 
     if list_voices:
diff --git a/aai_cli/commands/agent_cascade/_exec.py b/aai_cli/commands/agent_cascade/_exec.py
index 7a7df44..2bbd324 100644
--- a/aai_cli/commands/agent_cascade/_exec.py
+++ b/aai_cli/commands/agent_cascade/_exec.py
@@ -74,7 +74,7 @@ class AgentCascadeOptions:
     # Text-to-speech: language named, any other query param via --tts-config.
     language: str | None
     tts_config: tuple[str, ...]
-    # Tools: extra standard mcpServers JSON config files, on top of the default set.
+    # Tools: opt-in standard mcpServers JSON config files (none load by default).
     mcp_config: tuple[Path, ...]
     # Print the equivalent Python instead of running a conversation.
     show_code: bool
@@ -123,8 +123,9 @@ def _parse_tts_config(pairs: tuple[str, ...]) -> dict[str, str]:
 def _web_search_note() -> str | None:
     """The "web search is off" notice when no ``FIRECRAWL_API_KEY`` enables it, else ``None``.
 
-    The other default tools (URL fetch, AssemblyAI docs, and the MCP servers) need no
-    key; only Firecrawl web search does, so its absence is the one worth flagging up front.
+    Web search (Firecrawl) is the live agent's one built-in tool and the only one needing a
+    key, so its absence — which leaves the agent answering from its own knowledge alone — is
+    worth flagging up front.
     """
     if env.get(firecrawl_search.FIRECRAWL_API_KEY_ENV):
         return None
@@ -139,15 +140,13 @@ def _warn_without_web_search(*, json_mode: bool) -> None:
 
 
 def _resolve_mcp_servers(mcp_config: tuple[Path, ...]) -> dict[str, Mapping[str, object]]:
-    """The MCP servers for this run: the curated default set overlaid with any --mcp-config
-    files, so an explicit config can extend the defaults or override one by name.
+    """The MCP servers for this run: only those from ``--mcp-config`` files (none by default).
 
-    The default filesystem server is rooted at the working directory, scoping its file
-    access to one folder.
+    The live agent ships with just its Firecrawl web-search tool; extra MCP servers are
+    strictly opt-in, so a low-latency spoken turn isn't handed a large tool menu it has to
+    choose among.
     """
-    servers: dict[str, Mapping[str, object]] = dict(mcp_tools.default_servers(Path.cwd()))
-    servers.update(mcp_tools.parse_mcp_config(mcp_config))
-    return servers
+    return dict(mcp_tools.parse_mcp_config(mcp_config))
 
 
 def _open_audio(
@@ -230,13 +229,22 @@ def _run_live_tui(api_key: str, opts: AgentCascadeOptions, config: CascadeConfig
     deps = engine.CascadeDeps.real(api_key, config, audio=duplex.mic, stt_params=stt_params)
 
     def run_conversation(renderer: engine.Renderer) -> None:
-        engine.run_cascade(renderer=renderer, player=duplex.player, config=config, deps=deps)
+        # Hand the app the session's reply-interrupt so Escape/Ctrl-C can silence a reply
+        # mid-sentence and drop back to listening (the session is built inside run_cascade).
+        engine.run_cascade(
+            renderer=renderer,
+            player=duplex.player,
+            config=config,
+            deps=deps,
+            on_session=lambda session: app.set_interrupt(session.interrupt_reply),
+        )
 
-    LiveAgentApp(
+    app = LiveAgentApp(
         run_conversation=run_conversation,
         on_stop=duplex.close,
         web_note=_web_search_note(),
-    ).run(mouse=False)
+    )
+    app.run(mouse=False)
 
 
 def run_agent_cascade(opts: AgentCascadeOptions, state: AppState, *, json_mode: bool) -> None:
diff --git a/aai_cli/core/llm.py b/aai_cli/core/llm.py
index b147e89..f3009a0 100644
--- a/aai_cli/core/llm.py
+++ b/aai_cli/core/llm.py
@@ -23,18 +23,40 @@
 # is supplied. Must be exactly "{{ transcript }}" (spaces included).
 TRANSCRIPT_TAG = "{{ transcript }}"
 
-# A curated subset for `assembly llm --list-models` and help text. The gateway is the
-# source of truth for what's actually accepted, so we don't validate against this.
+# The known model ids surfaced by `assembly llm --list-models`, help text, and shell
+# completion, grouped by provider. The gateway is the source of truth for what's
+# actually accepted, so we don't validate against this — a newer id works even before
+# it lands here.
 KNOWN_MODELS = (
+    # Anthropic
     "claude-opus-4-7",
+    "claude-opus-4-6",
+    "claude-opus-4-5-20251101",
     "claude-sonnet-4-6",
+    "claude-sonnet-4-5-20250929",
     "claude-haiku-4-5-20251001",
+    # OpenAI
+    "gpt-5.5",
+    "gpt-5.2",
     "gpt-5.1",
     "gpt-5",
+    "gpt-5-mini",
+    "gpt-5-nano",
     "gpt-4.1",
+    "gpt-oss-120b",
+    "gpt-oss-20b",
+    # Google
+    "gemini-3.5-flash",
+    "gemini-3-flash-preview",
+    "gemini-3.1-flash-lite-preview",
     "gemini-2.5-pro",
     "gemini-2.5-flash",
     "gemini-2.5-flash-lite",
+    # Moonshot AI
+    "kimi-k2.5",
+    # Alibaba Cloud
+    "qwen3-next-80b-a3b",
+    "qwen3-32B",
 )
 
 
diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr
index 9d55009..96cca87 100644
--- a/tests/__snapshots__/test_snapshots_help_run.ambr
+++ b/tests/__snapshots__/test_snapshots_help_run.ambr
@@ -617,12 +617,13 @@
    This only runs a conversation in the terminal — it writes no code. To build
    an agent-cascade app, run 'assembly init agent-cascade' instead.
   
-   By default the agent loads a curated, no-auth MCP toolset (time, fetch,
-   memory, filesystem, weather) alongside its built-in URL fetch and AssemblyAI
-   docs. Firecrawl web search also loads when a FIRECRAWL_API_KEY is set (you'll
-   get a one-line notice when it isn't). Add your own servers with --mcp-config,
-   pointing at any standard mcpServers JSON file. A server that won't start is
-   skipped, so one broken tool never sinks the session.
+   The agent keeps a deliberately small toolset for low-latency spoken turns: its
+   one built-in tool is Firecrawl web search, which loads when a
+   FIRECRAWL_API_KEY
+   is set (you'll get a one-line notice when it isn't). Add your own MCP servers
+   with --mcp-config, pointing at any standard mcpServers JSON file (none load by
+   default). A server that won't start is skipped, so one broken tool never sinks
+   the session.
   
   ╭─ Arguments ──────────────────────────────────────────────────────────────────╮
   │   source      [SOURCE]  Audio file path or URL to speak to the agent. Omit   │
@@ -697,8 +698,8 @@
   │                                                             streaming fields │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   ╭─ Tools ──────────────────────────────────────────────────────────────────────╮
-  │ --mcp-config        FILE  Extra MCP servers config JSON ({"mcpServers":      │
-  │                           {…}}) on top of the defaults (repeatable)          │
+  │ --mcp-config        FILE  MCP servers config JSON ({"mcpServers": {…}}) to   │
+  │                           add (repeatable; none load by default)             │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   
    Examples
@@ -708,7 +709,7 @@
    $ assembly --sandbox live --voice michael --greeting "Hi there"
    Give the agent a persona
    $ assembly --sandbox live --system-prompt "You are a terse pirate."
-   Add your own MCP servers on top of the defaults
+   Add your own MCP servers (none load by default)
    $ assembly --sandbox live --mcp-config ~/.config/mcp/servers.json
    See available voices
    $ assembly --sandbox live --list-voices
diff --git a/tests/test_agent_cascade_brain.py b/tests/test_agent_cascade_brain.py
index 869aa02..a529ce8 100644
--- a/tests/test_agent_cascade_brain.py
+++ b/tests/test_agent_cascade_brain.py
@@ -58,33 +58,22 @@ def __init__(self, name: str):
         self.name = name
 
 
-def test_system_prompt_appends_tool_guidance_for_present_tools():
+def test_system_prompt_advertises_web_search_when_present():
     prompt = brain.build_system_prompt(
-        "You are a pirate.",
-        tools=[
-            _NamedTool(brain.WEB_SEARCH_TOOL_NAME),
-            _NamedTool("fetch_url"),
-            _NamedTool("docs_search"),
-        ],
+        "You are a pirate.", tools=[_NamedTool(brain.WEB_SEARCH_TOOL_NAME)]
     )
-    # The persona is preserved, and the guidance advertises each capability that a present
-    # tool backs (the plain cascade persona never mentions tools).
+    # The persona is preserved, and the guidance advertises the web-search capability the
+    # present tool backs (the plain cascade persona never mentions tools).
     assert prompt.startswith("You are a pirate.")
     assert "search the web" in prompt
-    assert "fetch a specific URL" in prompt
-    assert "AssemblyAI documentation" in prompt
 
 
-def test_system_prompt_omits_web_search_when_no_search_tool():
-    # With no TAVILY_API_KEY the search tool is absent — the guidance must NOT promise web
-    # search, since announcing a missing tool makes the agent narrate "I'll search…" and
-    # then stall with no answer. The capabilities it *does* have still appear.
-    prompt = brain.build_system_prompt(
-        "persona", tools=[_NamedTool("fetch_url"), _NamedTool("docs_search")]
-    )
+def test_system_prompt_omits_web_search_when_search_tool_absent():
+    # Without the Firecrawl search tool the guidance must NOT promise web search — announcing
+    # a missing tool makes the agent narrate "I'll search…" and then stall with no answer. A
+    # non-search tool name must not falsely trigger the web-search capability.
+    prompt = brain.build_system_prompt("persona", tools=[_NamedTool("some_other_tool")])
     assert "search the web for current or unfamiliar facts" not in prompt
-    assert "fetch a specific URL" in prompt
-    assert "AssemblyAI documentation" in prompt
 
 
 def test_system_prompt_tells_model_not_to_promise_tools_when_none():
@@ -365,23 +354,17 @@ def test_reply_text_is_empty_without_an_assistant_message():
 # --- build_live_tools --------------------------------------------------------
 
 
-def test_build_live_tools_includes_search_when_keyed(monkeypatch):
+def test_build_live_tools_is_just_web_search_when_keyed(monkeypatch):
     search = object()
-    monkeypatch.setattr("aai_cli.code_agent.fetch_tool.build_fetch_tool", lambda: "fetch")
     monkeypatch.setattr("aai_cli.code_agent.firecrawl_search.build_web_search_tool", lambda: search)
-    monkeypatch.setattr("aai_cli.code_agent.docs_mcp.load_docs_tools", lambda: ["docs"])
-    tools = brain.build_live_tools()
-    # Fetch + the keyed search + the docs tools, in that order.
-    assert tools == ["fetch", search, "docs"]
+    # The live agent's sole built-in tool is Firecrawl web search — no URL fetch, no docs.
+    assert brain.build_live_tools() == [search]
 
 
-def test_build_live_tools_omits_search_when_unkeyed(monkeypatch):
-    monkeypatch.setattr("aai_cli.code_agent.fetch_tool.build_fetch_tool", lambda: "fetch")
+def test_build_live_tools_is_empty_without_firecrawl_key(monkeypatch):
     monkeypatch.setattr("aai_cli.code_agent.firecrawl_search.build_web_search_tool", lambda: None)
-    monkeypatch.setattr("aai_cli.code_agent.docs_mcp.load_docs_tools", list)
-    tools = brain.build_live_tools()
-    # No TAVILY_API_KEY -> no search tool, just the fetch tool.
-    assert tools == ["fetch"]
+    # No FIRECRAWL_API_KEY -> no tool at all; the agent then runs tool-free.
+    assert brain.build_live_tools() == []
 
 
 # --- build_graph (model construction + compile, with the docs probe skipped) -
@@ -422,14 +405,14 @@ def fake_create(*, model, tools, system_prompt):
 
     monkeypatch.setattr(deepagents, "create_deep_agent", fake_create)
     monkeypatch.setattr(model_mod, "build_model", lambda *a, **k: object())
-    builtin = [_NamedTool("fetch_url")]
+    builtin = [_NamedTool(brain.WEB_SEARCH_TOOL_NAME)]
     extra = [_NamedTool("get_time")]
     graph = brain.build_graph("k", CascadeConfig(), tools=builtin, mcp_tools=extra)
     # The model is bound to both tool sets, in built-in-then-MCP order.
     assert graph == "graph"
     assert captured["tools"] == builtin + extra
-    # The prompt advertises the built-in fetch leg AND the MCP tool by name.
-    assert "fetch a specific URL" in captured["system_prompt"]
+    # The prompt advertises the built-in web-search leg AND the MCP tool by name.
+    assert "search the web" in captured["system_prompt"]
     assert "use your connected tools (get_time)" in captured["system_prompt"]
 
 
diff --git a/tests/test_agent_cascade_command.py b/tests/test_agent_cascade_command.py
index a7c2374..405a508 100644
--- a/tests/test_agent_cascade_command.py
+++ b/tests/test_agent_cascade_command.py
@@ -215,23 +215,23 @@ def test_open_audio_mic_warns_and_uses_duplex_rate(monkeypatch):
 
 
 # --- MCP servers (resolution unit-tested in test_agent_cascade_mcp.py) -------
-def test_default_mcp_servers_flow_into_cascade_config(monkeypatch):
+def test_no_mcp_servers_load_by_default(monkeypatch):
     monkeypatch.setattr(_exec.tts_session, "require_available", lambda _c: None)
     monkeypatch.setattr(config, "resolve_api_key", lambda **_: "k")
     monkeypatch.setattr(_exec, "FileSource", lambda src: types.SimpleNamespace(sample_rate=16000))
     monkeypatch.setattr(_exec.client, "resolve_audio_source", lambda source, sample: "clip.wav")
     captured = {}
 
-    # Capture config at the deps seam so the graph (and its npx/uvx servers) never builds.
+    # Capture config at the deps seam so the graph never builds.
     def fake_real(api_key, config, *, audio, stt_params):
         captured["config"] = config
         return "deps"
 
     monkeypatch.setattr(_exec.engine.CascadeDeps, "real", fake_real)
     monkeypatch.setattr(_exec.engine, "run_cascade", lambda **kwargs: None)
-    # With no flags, the default servers (e.g. weather) ride into the config the brain reads.
+    # With no --mcp-config, no MCP servers load — the agent keeps just its web-search tool.
     run_agent_cascade(_opts(source="clip.wav"), AppState(), json_mode=False)
-    assert "weather" in captured["config"].mcp_servers
+    assert captured["config"].mcp_servers == {}
 
 
 # --- run_agent_cascade wiring ----------------------------------------------
diff --git a/tests/test_agent_cascade_engine.py b/tests/test_agent_cascade_engine.py
index 729f1e9..1032135 100644
--- a/tests/test_agent_cascade_engine.py
+++ b/tests/test_agent_cascade_engine.py
@@ -302,18 +302,36 @@ def test_barge_in_cancels_and_flushes_live_worker():
     assert session._reply is None
 
 
-def test_barge_in_no_worker_does_not_flush():
+def test_barge_in_without_a_live_worker_does_not_flush():
+    # No worker, or one that already finished: nothing to cancel, so no flush.
     session, _renderer, player = make_session()
-    session._barge_in()
+    session._barge_in()  # no worker
+    session._reply = FakeWorker(alive=False)
+    session._barge_in()  # finished worker
     assert player.flushed == 0
+    assert session._reply is None
 
 
-def test_barge_in_finished_worker_does_not_flush():
+def test_interrupt_reply_signals_stop_and_flushes_without_joining():
+    # Live TUI Escape/Ctrl-C silences a playing reply: stop flag + flush, but NO join.
     session, _renderer, player = make_session()
+    worker = FakeWorker(alive=True)
+    session._reply = worker
+    assert session.interrupt_reply() is True
+    assert session._stop.is_set()
+    assert player.flushed == 1
+    assert worker.joined == 0  # not joined — the worker unwinds on its own
+    assert session._reply is worker  # still tracked; the next turn's barge-in joins it
+
+
+def test_interrupt_reply_is_a_noop_when_nothing_is_playing():
+    # No worker, or one that already finished: nothing to stop, so no flush and no stop flag.
+    session, _renderer, player = make_session()
+    assert session.interrupt_reply() is False  # no worker
     session._reply = FakeWorker(alive=False)
-    session._barge_in()
+    assert session.interrupt_reply() is False  # finished worker
     assert player.flushed == 0
-    assert session._reply is None
+    assert not session._stop.is_set()
 
 
 def test_shutdown_joins_live_worker():
@@ -408,6 +426,27 @@ def complete_reply(messages):
     assert {"role": "assistant", "content": "Welcome."} in session_box["messages"]
 
 
+def test_run_cascade_hands_the_session_to_on_session_before_greeting():
+    # on_session receives the session before the player starts, so the TUI can wire its interrupt.
+    captured = {}
+    player = FakePlayer()
+    deps = CascadeDeps(
+        run_stt=lambda on_turn: None,
+        complete_reply=lambda m: "hi",
+        synthesize=lambda text: b"",
+        spawn=_sync_spawn,
+    )
+    run_cascade(
+        renderer=FakeRenderer(),
+        player=player,
+        config=CascadeConfig(greeting=""),
+        deps=deps,
+        on_session=lambda s: captured.update(session=s, started=player.started),
+    )
+    assert isinstance(captured["session"], CascadeSession)
+    assert captured["started"] is False
+
+
 def test_run_cascade_shuts_down_inflight_worker():
     worker = FakeWorker(alive=True)
 
diff --git a/tests/test_agent_cascade_mcp.py b/tests/test_agent_cascade_mcp.py
index 178708d..015c22a 100644
--- a/tests/test_agent_cascade_mcp.py
+++ b/tests/test_agent_cascade_mcp.py
@@ -16,25 +16,6 @@
 from aai_cli.commands.agent_cascade import _exec
 from aai_cli.core.errors import UsageError
 
-# --- default_servers ---------------------------------------------------------
-
-
-def test_default_servers_curated_set_and_filesystem_root():
-    root = Path("/notes/dir")
-    servers = mcp_tools.default_servers(root)
-    # The five curated, no-auth servers, each with a real launch command.
-    assert set(servers) == {"time", "fetch", "memory", "filesystem", "weather"}
-    assert servers["time"] == {"command": "uvx", "args": ["mcp-server-time"]}
-    assert servers["memory"]["args"] == ["-y", "@modelcontextprotocol/server-memory"]
-    # The filesystem server is scoped to the passed-in root directory. Compare against
-    # str(root), not a hardcoded "/notes/dir", so it holds on Windows (backslash paths).
-    assert servers["filesystem"]["args"] == [
-        "-y",
-        "@modelcontextprotocol/server-filesystem",
-        str(root),
-    ]
-
-
 # --- parse_mcp_config --------------------------------------------------------
 
 
@@ -172,29 +153,19 @@ async def get_tools(self):
 # --- _resolve_mcp_servers (the default set + --mcp-config merge) --------------
 
 
-def test_resolve_mcp_servers_defaults_loaded_with_no_config():
-    servers = _exec._resolve_mcp_servers(mcp_config=())
-    # Every session loads the curated default set out of the box.
-    assert {"time", "weather", "memory", "fetch", "filesystem"} <= set(servers)
+def test_resolve_mcp_servers_empty_with_no_config():
+    # No --mcp-config -> no MCP servers; the live agent runs with just its web-search tool.
+    assert _exec._resolve_mcp_servers(mcp_config=()) == {}
 
 
-def test_resolve_mcp_servers_config_adds_to_defaults(tmp_path):
+def test_resolve_mcp_servers_returns_only_config_servers(tmp_path):
     path = tmp_path / "servers.json"
     path.write_text(
         '{"mcpServers": {"custom": {"command": "uvx", "args": ["x"]}}}', encoding="utf-8"
     )
     servers = _exec._resolve_mcp_servers(mcp_config=(path,))
-    # The config server is added alongside (not instead of) the defaults.
-    assert servers["custom"] == {"command": "uvx", "args": ["x"]}
-    assert "weather" in servers
-
-
-def test_resolve_mcp_servers_config_overrides_default_by_name(tmp_path):
-    path = tmp_path / "servers.json"
-    path.write_text('{"mcpServers": {"time": {"command": "my-time"}}}', encoding="utf-8")
-    servers = _exec._resolve_mcp_servers(mcp_config=(path,))
-    # An explicit config entry overrides the default server of the same name.
-    assert servers["time"] == {"command": "my-time"}
+    # Only the opt-in config server is present — there is no curated default set to merge.
+    assert servers == {"custom": {"command": "uvx", "args": ["x"]}}
 
 
 # --- _warn_without_web_search (the FIRECRAWL_API_KEY notice) ------------------
diff --git a/tests/test_live_tui.py b/tests/test_live_tui.py
index 732f9e4..0dbccac 100644
--- a/tests/test_live_tui.py
+++ b/tests/test_live_tui.py
@@ -72,8 +72,8 @@ def _voicebar(app) -> str:
 
 
 def test_splash_and_status_render() -> None:
-    # The session opens on the ASSEMBLY wordmark + ready line, and the footer shows the only
-    # control (quit) — there is no text prompt mounted (input is voice-only).
+    # The session opens on the ASSEMBLY wordmark + ready line, and the footer shows the
+    # interrupt/quit controls — there is no text prompt mounted (input is voice-only).
     async def go() -> None:
         app = _app()
         async with app.run_test(size=(100, 30)) as pilot:
@@ -81,7 +81,8 @@ async def go() -> None:
             splash = str(app.query_one("#log").children[0].render())
             assert "█" in splash and "Listening… start talking" in splash  # the wordmark splash
             assert "Listening" in _voicebar(app)  # opens in the listening phase
-            assert "Ctrl-C to quit" in str(app.query_one("#status", Static).render())
+            status = str(app.query_one("#status", Static).render())
+            assert "interrupt" in status and "Ctrl-Q to quit" in status
             assert len(app.query("#prompt")) == 0  # no text input — voice only
             assert app.ENABLE_COMMAND_PALETTE is False  # the voice UI hides the command palette
 
@@ -196,6 +197,75 @@ async def go() -> None:
     _run(go())
 
 
+def test_escape_interrupts_a_playing_reply_via_the_session_hook() -> None:
+    # Escape fires the session's reply-interrupt (set once the cascade has a session) and
+    # never quits — the worker unwinds and the renderer returns the bar to listening.
+    async def go() -> None:
+        fired: list[bool] = []
+
+        def hook() -> bool:
+            fired.append(True)
+            return True
+
+        app = _app()
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            app.set_interrupt(hook)
+            app.action_interrupt()
+            assert fired == [True]
+
+    _run(go())
+
+
+def test_ctrl_c_interrupts_a_playing_reply_without_quitting(monkeypatch) -> None:
+    # While a reply is playing (the hook returns True), Ctrl-C interrupts it and stays — it
+    # must NOT quit, so a long answer can be cut off without ending the session.
+    async def go() -> None:
+        app = _app()
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            exited: list[bool] = []
+            monkeypatch.setattr(app, "exit", lambda *a, **k: exited.append(True))
+            app.set_interrupt(lambda: True)  # a reply is playing
+            app.action_interrupt_or_quit()
+            assert exited == []  # interrupted, not quit
+
+    _run(go())
+
+
+def test_ctrl_c_quits_when_nothing_is_playing(monkeypatch) -> None:
+    # With no reply playing (the hook returns False, or none is wired yet), Ctrl-C quits.
+    async def go() -> None:
+        stops: list[bool] = []
+        app = _app(on_stop=lambda: stops.append(True))
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            exited: list[bool] = []
+            monkeypatch.setattr(app, "exit", lambda *a, **k: exited.append(True))
+            app.set_interrupt(lambda: False)  # nothing playing
+            app.action_interrupt_or_quit()
+            assert stops == [True] and exited == [True]
+
+    _run(go())
+
+
+def test_interrupt_before_a_session_is_wired_is_a_safe_noop(monkeypatch) -> None:
+    # A keypress before the cascade has built its session (no interrupt hook yet): Escape is a
+    # no-op and Ctrl-C falls through to quit, so an early press can never wedge the UI.
+    async def go() -> None:
+        app = _app()
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            exited: list[bool] = []
+            monkeypatch.setattr(app, "exit", lambda *a, **k: exited.append(True))
+            app.action_interrupt()  # no hook wired -> nothing happens, no crash
+            assert exited == []
+            app.action_interrupt_or_quit()  # nothing to interrupt -> quits
+            assert exited == [True]
+
+    _run(go())
+
+
 def test_action_stop_tears_down_audio_and_exits(monkeypatch) -> None:
     async def go() -> None:
         stops: list[bool] = []
@@ -332,10 +402,17 @@ def run(self, **kwargs):
 
 
 def test_tui_run_conversation_drives_the_cascade(monkeypatch) -> None:
-    # The closure handed to the app runs the cascade with the duplex player and the wired deps.
+    # The closure handed to the app runs the cascade with the duplex player and the wired
+    # deps, and the cascade's on_session wires the session's reply-interrupt onto the app.
     fake_duplex = _wire_tui(monkeypatch)
     captured: dict[str, object] = {}
-    monkeypatch.setattr(engine, "run_cascade", lambda **kw: captured.update(kw))
+
+    def fake_run_cascade(**kw):
+        captured.update(kw)
+        # run_cascade hands the freshly built session to on_session before the conversation.
+        kw["on_session"](types.SimpleNamespace(interrupt_reply="session-interrupt"))
+
+    monkeypatch.setattr(engine, "run_cascade", fake_run_cascade)
 
     class FakeApp:
         def __init__(self, *, run_conversation, on_stop, web_note):
@@ -344,8 +421,13 @@ def __init__(self, *, run_conversation, on_stop, web_note):
         def run(self, **kwargs):
             self._rc("renderer-sentinel")  # the app would call this on its worker thread
 
+        def set_interrupt(self, interrupt):
+            captured["interrupt"] = interrupt
+
     monkeypatch.setattr("aai_cli.agent_cascade.tui.LiveAgentApp", FakeApp)
     run_agent_cascade(_opts(), AppState(), json_mode=False)
     assert captured["player"] is fake_duplex.player
     assert captured["deps"] == "deps"
     assert captured["renderer"] == "renderer-sentinel"
+    # The session's interrupt_reply was wired onto the app (so Escape/Ctrl-C can use it).
+    assert captured["interrupt"] == "session-interrupt"
diff --git a/tests/test_llm_command.py b/tests/test_llm_command.py
index c9f0032..54257a6 100644
--- a/tests/test_llm_command.py
+++ b/tests/test_llm_command.py
@@ -37,6 +37,37 @@ def test_llm_help_lists_command():
     assert "gateway" in result.output.lower()
 
 
+def test_known_models_is_the_full_gateway_list():
+    # Pin the exact ids so a typo'd/renamed model id is caught (the --list-models
+    # tests below compare output against KNOWN_MODELS itself, so they can't).
+    assert KNOWN_MODELS == (
+        "claude-opus-4-7",
+        "claude-opus-4-6",
+        "claude-opus-4-5-20251101",
+        "claude-sonnet-4-6",
+        "claude-sonnet-4-5-20250929",
+        "claude-haiku-4-5-20251001",
+        "gpt-5.5",
+        "gpt-5.2",
+        "gpt-5.1",
+        "gpt-5",
+        "gpt-5-mini",
+        "gpt-5-nano",
+        "gpt-4.1",
+        "gpt-oss-120b",
+        "gpt-oss-20b",
+        "gemini-3.5-flash",
+        "gemini-3-flash-preview",
+        "gemini-3.1-flash-lite-preview",
+        "gemini-2.5-pro",
+        "gemini-2.5-flash",
+        "gemini-2.5-flash-lite",
+        "kimi-k2.5",
+        "qwen3-next-80b-a3b",
+        "qwen3-32B",
+    )
+
+
 def test_llm_list_models_exits_without_network(monkeypatch):
     called = {"ran": False}
     monkeypatch.setattr(

From be20e9c0df7448fd12f248c4645aafa9590f16db Mon Sep 17 00:00:00 2001
From: Alex Kroman <alex@assemblyai.com>
Date: Thu, 18 Jun 2026 18:43:41 -0700
Subject: [PATCH 09/10] assembly live: show a tool-call affordance as the agent
 works
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A spoken turn that paused to use a tool (web search) sat silent on "thinking…",
which read as a hang. The brain now feeds an on_tool sink a short, speakable
label ("Searching the web") as each tool call lands: build_completer's
complete_reply takes an optional on_tool, and the graph is streamed (rather than
invoked) whenever a sink is wired, not just under -v, so calls surface live.

The cascade engine passes the renderer's tool_call as that sink, so every
front-end shows it: the live TUI drops a dim inline "Searching the web…" note,
the line renderer prints it (stderr in piped text mode), and --json emits a new
additive tool.use event. The Renderer protocol gains tool_call.

Also extracts the shared cascade test fakes into tests/_cascade_fakes.py so the
engine/command/TUI suites share one set of doubles and stay under the 500-line gate.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 REFERENCE.md                       |   2 +-
 aai_cli/agent/events.py            |   9 +-
 aai_cli/agent/render.py            |  13 +++
 aai_cli/agent_cascade/brain.py     |  98 +++++++++++++------
 aai_cli/agent_cascade/engine.py    |   9 +-
 aai_cli/agent_cascade/tui.py       |  12 +++
 tests/_cascade_fakes.py            | 105 +++++++++++++++++++++
 tests/test_agent_cascade_brain.py  |  23 ++++-
 tests/test_agent_cascade_engine.py | 146 +++++++++--------------------
 tests/test_agent_events.py         |   4 +
 tests/test_agent_render.py         |  20 ++++
 tests/test_live_tui.py             |  17 ++++
 12 files changed, 319 insertions(+), 139 deletions(-)
 create mode 100644 tests/_cascade_fakes.py

diff --git a/REFERENCE.md b/REFERENCE.md
index 304ba45..9288fbb 100644
--- a/REFERENCE.md
+++ b/REFERENCE.md
@@ -94,7 +94,7 @@ each carrying a `"type"` field to dispatch on:
 | ------- | ----------- |
 | `assembly stream --json` | `begin`, `turn`, `termination` (with `--from-stdin`, a `source` event precedes each file's events) |
 | `assembly agent --json` | `session.ready`, `transcript.user.delta`, `transcript.user`, `reply.started`, `transcript.agent`, `reply.done` |
-| `assembly live --json` | `session.ready`, `transcript.user.delta`, `transcript.user`, `reply.started`, `transcript.agent`, `reply.done` |
+| `assembly live --json` | `session.ready`, `transcript.user.delta`, `transcript.user`, `tool.use`, `reply.started`, `transcript.agent`, `reply.done` |
 | `assembly dictate --json` | `utterance` |
 | `assembly llm --follow --json` | `answer` |
 | `assembly transcribe <batch> --json` | `result` (one per source), then `reduce` if `--llm-reduce` is set |
diff --git a/aai_cli/agent/events.py b/aai_cli/agent/events.py
index 0f917e1..3247bfc 100644
--- a/aai_cli/agent/events.py
+++ b/aai_cli/agent/events.py
@@ -52,6 +52,13 @@ class ReplyStarted(_Event):
     type: Literal["reply.started"] = "reply.started"
 
 
+class ToolUse(_Event):
+    """The agent invoked a tool mid-reply (``label`` is a short, human description)."""
+
+    type: Literal["tool.use"] = "tool.use"
+    label: str
+
+
 class AgentTranscript(_Event):
     """The agent's reply transcript (``interrupted`` when the user barged in)."""
 
@@ -67,4 +74,4 @@ class ReplyDone(_Event):
     interrupted: bool
 
 
-Event = SessionReady | UserDelta | UserFinal | ReplyStarted | AgentTranscript | ReplyDone
+Event = SessionReady | UserDelta | UserFinal | ToolUse | ReplyStarted | AgentTranscript | ReplyDone
diff --git a/aai_cli/agent/render.py b/aai_cli/agent/render.py
index 98f6a2b..7288fc8 100644
--- a/aai_cli/agent/render.py
+++ b/aai_cli/agent/render.py
@@ -75,6 +75,19 @@ def user_final(self, text: str) -> None:
         else:
             self._finalize_line(_labeled("you: ", text, style="aai.you"))
 
+    def tool_call(self, label: str) -> None:
+        """Surface that the agent is using a tool (e.g. "Searching the web") while it thinks.
+
+        JSON emits a ``tool.use`` event; piped text keeps it off stdout (transcript-only) by
+        routing to stderr; human mode shows a muted inline line.
+        """
+        if self.json_mode:
+            self._emit_event(events.ToolUse(label=label))
+        elif self.text_mode:
+            self._status(f"{label}…")
+        else:
+            self._line(_labeled("", f"{label}…", style="aai.muted"))
+
     # --- agent -------------------------------------------------------------
     def reply_started(self) -> None:
         if self.json_mode:
diff --git a/aai_cli/agent_cascade/brain.py b/aai_cli/agent_cascade/brain.py
index 8df7b2d..e814533 100644
--- a/aai_cli/agent_cascade/brain.py
+++ b/aai_cli/agent_cascade/brain.py
@@ -42,6 +42,16 @@
 # arbitrary tuning knob — a +-1 shift is behaviorally equivalent, so no test can kill it.
 _RESULT_LOG_CAP = 500  # pragma: no mutate
 
+# Human, speakable labels for the tool affordance the live UI shows while a tool runs (so a
+# spoken turn that pauses to use a tool says *why* it's working, not just spin silently).
+_TOOL_LABELS = {WEB_SEARCH_TOOL_NAME: "Searching the web"}
+
+
+def _tool_label(name: str) -> str:
+    """A short present-tense label for a tool call, shown as the live UI's tool affordance."""
+    return _TOOL_LABELS.get(name, f"Using {name}")
+
+
 # Closes every guidance variant: the reply is spoken, so it must stay short and plain.
 _SPOKEN_TAIL = (
     "Your reply is read aloud, so keep it short and spoken — no markdown, lists, code, or raw URLs."
@@ -177,37 +187,44 @@ def build_graph(
 
 def build_completer(
     api_key: str, config: CascadeConfig, *, graph: CompiledAgent | None = None
-) -> Callable[[list[ChatCompletionMessageParam]], str]:
+) -> Callable[..., str]:
     """A ``complete_reply`` for the cascade engine backed by the deepagents graph.
 
     The cascade prepends its own ``system`` message to the history each turn; the graph
-    already owns the system prompt, so we drop it before invoking. The graph runs the
-    full tool loop and we return its final spoken text. Under ``-v`` the loop is streamed
-    so each tool call/result is logged as it lands (see :func:`_run_graph`). ``graph`` is
-    injected in tests so the per-turn wiring runs against a fake with no network.
+    already owns the system prompt, so we drop it before invoking. The graph runs the full
+    tool loop and we return its final spoken text. ``on_tool`` (when given) is called with a
+    short label as each tool call lands, so the front-end can show a "Searching the web…"
+    affordance instead of sitting silent while the agent works; the loop is also streamed —
+    rather than ``invoke``-d — whenever a sink is wired or under ``-v`` (see :func:`_run_graph`).
+    ``graph`` is injected in tests so the per-turn wiring runs against a fake with no network.
     """
     resolved = build_graph(api_key, config) if graph is None else graph
 
-    def complete_reply(messages: list[ChatCompletionMessageParam]) -> str:
+    def complete_reply(
+        messages: list[ChatCompletionMessageParam],
+        on_tool: Callable[[str], None] | None = None,
+    ) -> str:
         conversation = [message for message in messages if message.get("role") != "system"]
-        return _reply_text(_run_graph(resolved, conversation))
+        return _reply_text(_run_graph(resolved, conversation, on_tool))
 
     return complete_reply
 
 
 def _run_graph(
-    graph: CompiledAgent, conversation: list[ChatCompletionMessageParam]
+    graph: CompiledAgent,
+    conversation: list[ChatCompletionMessageParam],
+    on_tool: Callable[[str], None] | None = None,
 ) -> dict[str, object]:
     """Run one turn through the graph, returning its end state.
 
-    Normally a single ``invoke`` (the whole tool loop runs internally). Under verbose
-    mode, and when the graph can stream, drive it as incremental state snapshots instead
-    so :func:`_log_flow` can surface each tool call/result on stderr as it happens — which
-    is what makes a stalled spoken turn debuggable. The test fakes only implement
-    ``invoke``, so they (and the non-verbose path) take the plain branch.
+    Normally a single ``invoke`` (the whole tool loop runs internally). When a tool sink is
+    wired (the live UI's affordance) or under verbose mode, and the graph can stream, drive
+    it as incremental state snapshots instead so :func:`_log_flow` surfaces each tool call as
+    it happens. The test fakes only implement ``invoke``, so they (and the plain path with no
+    sink) take the invoke branch.
     """
     try:
-        return _drive_graph(graph, {"messages": conversation})
+        return _drive_graph(graph, {"messages": conversation}, on_tool)
     except CLIError:
         raise
     except Exception as exc:
@@ -220,41 +237,62 @@ def _run_graph(
         ) from exc
 
 
-def _drive_graph(graph: CompiledAgent, graph_input: dict[str, object]) -> dict[str, object]:
-    """Invoke the graph (or stream it under ``-v`` so :func:`_log_flow` can trace each step)."""
-    if debuglog.active() and hasattr(graph, "stream"):
+def _drive_graph(
+    graph: CompiledAgent,
+    graph_input: dict[str, object],
+    on_tool: Callable[[str], None] | None = None,
+) -> dict[str, object]:
+    """Invoke the graph, or stream it (when a tool sink is wired or under ``-v``) so
+    :func:`_log_flow` can surface each tool call as it lands."""
+    if (on_tool is not None or debuglog.active()) and hasattr(graph, "stream"):
         last: dict[str, object] = {}
         seen = 0
         for chunk in graph.stream(graph_input, None, stream_mode="values"):
-            seen = _log_flow(chunk, seen)
+            seen = _log_flow(chunk, seen, on_tool)
             last = chunk
         return last
     return graph.invoke(graph_input)
 
 
-def _log_flow(state: dict[str, object], seen: int) -> int:
-    """Log the tool calls/results added to ``state`` since the first ``seen`` messages.
+def _log_flow(
+    state: dict[str, object], seen: int, on_tool: Callable[[str], None] | None = None
+) -> int:
+    """Surface the tool calls/results added to ``state`` since the first ``seen`` messages.
 
-    Reuses the coding agent's message→event vocabulary so the flow log knows the same
-    AIMessage/ToolMessage shapes the TUI does. Returns the new high-water message count
-    so the next snapshot only logs what it added.
+    Feeds ``on_tool`` a speakable label as each tool call lands (the live UI's affordance) and,
+    under ``-v``, logs the call/result/interim line to stderr. Reuses the coding agent's
+    message→event vocabulary so it reads the same AIMessage/ToolMessage shapes the TUI does.
+    Returns the new high-water message count so the next snapshot only re-surfaces what it added.
     """
-    from aai_cli.code_agent.events import AssistantText, ToolCall, ToolResult, message_events
+    from aai_cli.code_agent.events import message_events
 
     messages = state.get("messages")
     if not isinstance(messages, list):
         return seen
+    verbose = debuglog.active()
     for message in messages[seen:]:
         for event in message_events(message, announce_calls=True):
-            if isinstance(event, ToolCall):
-                _FLOW_LOG.info("tool call %s args=%s", event.name, event.args)
-            elif isinstance(event, ToolResult):
-                _FLOW_LOG.info("tool result %s -> %s", event.name, _clip(event.content))
-            elif isinstance(event, AssistantText):
-                _FLOW_LOG.info("llm: %s", event.text)
+            _surface_event(event, on_tool, verbose=verbose)
     return len(messages)
 
 
+def _surface_event(event: object, on_tool: Callable[[str], None] | None, *, verbose: bool) -> None:
+    """Surface one flow event: feed a tool call's label to ``on_tool``, and (under ``-v``)
+    log the call/result/interim line to stderr."""
+    from aai_cli.code_agent.events import AssistantText, ToolCall, ToolResult
+
+    if isinstance(event, ToolCall) and on_tool is not None:
+        on_tool(_tool_label(event.name))
+    if not verbose:
+        return
+    if isinstance(event, ToolCall):
+        _FLOW_LOG.info("tool call %s args=%s", event.name, event.args)
+    elif isinstance(event, ToolResult):
+        _FLOW_LOG.info("tool result %s -> %s", event.name, _clip(event.content))
+    elif isinstance(event, AssistantText):
+        _FLOW_LOG.info("llm: %s", event.text)
+
+
 def _clip(text: str) -> str:
     """Flatten a tool result onto one line and truncate it for the flow log.
 
diff --git a/aai_cli/agent_cascade/engine.py b/aai_cli/agent_cascade/engine.py
index ea1774e..60c1087 100644
--- a/aai_cli/agent_cascade/engine.py
+++ b/aai_cli/agent_cascade/engine.py
@@ -58,6 +58,9 @@ def user_partial(self, text: str) -> None:
     def user_final(self, text: str) -> None:
         """Show a finalized user transcript."""
 
+    def tool_call(self, label: str) -> None:
+        """Show that the agent is using a tool (e.g. "Searching the web") while it thinks."""
+
     def reply_started(self) -> None:
         """Mark the start of an agent reply."""
 
@@ -106,7 +109,9 @@ class CascadeDeps:
     """
 
     run_stt: Callable[[Callable[[object], None]], None]
-    complete_reply: Callable[[list[ChatCompletionMessageParam]], str]
+    # complete_reply(messages, on_tool=None) -> spoken text; on_tool is fed a label per tool
+    # call so the front-end can show a "Searching the web…" affordance (brain.build_completer).
+    complete_reply: Callable[..., str]
     synthesize: Callable[[str], bytes]
     spawn: Callable[[Callable[[], None]], _Worker] = _spawn_thread
 
@@ -230,7 +235,7 @@ def _generate_reply(self) -> None:
             *self.history,
         ]
         try:
-            reply = self.deps.complete_reply(messages)
+            reply = self.deps.complete_reply(messages, on_tool=self.renderer.tool_call)
         except CLIError as exc:
             # The reply leg failed (gateway/tool/graph error, now converted to a CLIError in
             # brain._run_graph). Show it in the transcript so the turn doesn't just vanish —
diff --git a/aai_cli/agent_cascade/tui.py b/aai_cli/agent_cascade/tui.py
index ec2bcbc..007c73d 100644
--- a/aai_cli/agent_cascade/tui.py
+++ b/aai_cli/agent_cascade/tui.py
@@ -59,6 +59,9 @@ def user_partial(self, text: str) -> None:
     def user_final(self, text: str) -> None:
         self._dispatch(self._app.show_user_final, text)
 
+    def tool_call(self, label: str) -> None:
+        self._dispatch(self._app.show_tool_call, label)
+
     def reply_started(self) -> None:
         self._dispatch(self._app.begin_reply)
 
@@ -188,6 +191,15 @@ def show_user_final(self, text: str) -> None:
         self._set_phase("thinking")
         self._scroll_end()
 
+    def show_tool_call(self, label: str) -> None:
+        """Surface the agent's tool use inline as it happens (the live tool affordance).
+
+        A spoken turn that pauses to use a tool would otherwise sit silent on "thinking…";
+        this drops a dim "Searching the web…" line so the wait reads as progress, not a hang.
+        """
+        self._mount(Note(f"{label}…"))
+        self._scroll_end()
+
     def begin_reply(self) -> None:
         """Open a fresh reply widget the agent's sentences stream into; switch to speaking."""
         self._set_phase("speaking")
diff --git a/tests/_cascade_fakes.py b/tests/_cascade_fakes.py
new file mode 100644
index 0000000..4985cf4
--- /dev/null
+++ b/tests/_cascade_fakes.py
@@ -0,0 +1,105 @@
+"""Shared fakes for the `assembly live` cascade tests (engine/command/TUI).
+
+The cascade's three network legs and its thread spawner are injected through
+``CascadeDeps``, so the suites drive the orchestration against these fakes — no
+sockets, mic, or speaker. Kept in one module so the engine, command, and TUI tests
+share one set of doubles (and so no single test file grows past the 500-line gate).
+"""
+
+from __future__ import annotations
+
+import types
+
+from aai_cli.agent_cascade.config import CascadeConfig
+from aai_cli.agent_cascade.engine import CascadeDeps, CascadeSession
+
+
+class FakeRenderer:
+    def __init__(self):
+        self.calls = []
+
+    def connected(self):
+        self.calls.append(("connected",))
+
+    def user_partial(self, text):
+        self.calls.append(("user_partial", text))
+
+    def user_final(self, text):
+        self.calls.append(("user_final", text))
+
+    def tool_call(self, label):
+        self.calls.append(("tool_call", label))
+
+    def reply_started(self):
+        self.calls.append(("reply_started",))
+
+    def agent_transcript(self, text, *, interrupted):
+        self.calls.append(("agent_transcript", text, interrupted))
+
+    def reply_done(self, *, interrupted):
+        self.calls.append(("reply_done", interrupted))
+
+
+class FakePlayer:
+    def __init__(self):
+        self.enqueued = []
+        self.flushed = 0
+        self.started = False
+        self.closed = False
+
+    def start(self):
+        self.started = True
+
+    def enqueue(self, pcm):
+        self.enqueued.append(pcm)
+
+    def flush(self):
+        self.flushed += 1
+
+    def close(self):
+        self.closed = True
+
+
+class FakeWorker:
+    def __init__(self, *, alive):
+        self._alive = alive
+        self.joined = 0
+
+    def is_alive(self):
+        return self._alive
+
+    def join(self):
+        self.joined += 1
+        self._alive = False
+
+
+def sync_spawn(target):
+    """Run the reply body inline and hand back a finished worker, so the cascade is
+    driven deterministically without real threads."""
+    target()
+    return FakeWorker(alive=False)
+
+
+def turn(text, *, end_of_turn=True, turn_is_formatted=True):
+    return types.SimpleNamespace(
+        transcript=text, end_of_turn=end_of_turn, turn_is_formatted=turn_is_formatted
+    )
+
+
+def make_session(
+    *,
+    complete_reply=lambda messages, on_tool=None: "Hello there.",
+    synthesize=lambda text: b"pcm:" + text.encode(),
+    spawn=sync_spawn,
+    run_stt=lambda on_turn: None,
+    config=None,
+):
+    deps = CascadeDeps(
+        run_stt=run_stt, complete_reply=complete_reply, synthesize=synthesize, spawn=spawn
+    )
+    renderer = FakeRenderer()
+    player = FakePlayer()
+    session = CascadeSession(
+        deps=deps, renderer=renderer, player=player, config=config or CascadeConfig()
+    )
+    return session, renderer, player
diff --git a/tests/test_agent_cascade_brain.py b/tests/test_agent_cascade_brain.py
index a529ce8..cf00351 100644
--- a/tests/test_agent_cascade_brain.py
+++ b/tests/test_agent_cascade_brain.py
@@ -230,7 +230,7 @@ def test_run_graph_streams_and_logs_flow_when_verbose(monkeypatch, caplog, prese
 
 
 def test_run_graph_invokes_when_not_verbose():
-    # Default (non-verbose): the graph is invoked once, never streamed, and nothing is logged.
+    # Default (non-verbose, no tool sink): invoked once, never streamed, nothing logged.
     graph = _StreamingGraph([{"messages": [AIMessage(content="hi")]}])
     completer = brain.build_completer("k", CascadeConfig(), graph=graph)
     assert completer([{"role": "user", "content": "hi"}]) == ""
@@ -238,6 +238,27 @@ def test_run_graph_invokes_when_not_verbose():
     assert graph.stream_kwargs is None
 
 
+def test_on_tool_sink_streams_and_reports_each_tool_call_by_label():
+    # A wired tool sink (the live UI affordance) streams the graph — even without -v — and
+    # reports each tool call by its speakable label, while still returning the final reply.
+    labels: list[str] = []
+    call = AIMessage(
+        content="", tool_calls=[{"name": brain.WEB_SEARCH_TOOL_NAME, "args": {}, "id": "c1"}]
+    )
+    snapshots = [{"messages": [call]}, {"messages": [call, AIMessage(content="Here's the news.")]}]
+    graph = _StreamingGraph(snapshots)
+    completer = brain.build_completer("k", CascadeConfig(), graph=graph)
+    reply = completer([{"role": "user", "content": "news?"}], on_tool=labels.append)
+    assert reply == "Here's the news."
+    assert labels == ["Searching the web"]
+    assert graph.stream_kwargs == "values" and graph.invoked is False  # streamed, not invoked
+
+
+def test_tool_label_maps_web_search_and_falls_back_for_others():
+    assert brain._tool_label(brain.WEB_SEARCH_TOOL_NAME) == "Searching the web"
+    assert brain._tool_label("get_time") == "Using get_time"
+
+
 def test_run_graph_invokes_when_graph_cannot_stream(monkeypatch):
     # Verbose but the (test) graph only implements invoke: fall back to invoke rather than
     # crashing on a missing .stream — the fakes and any non-streaming graph stay supported.
diff --git a/tests/test_agent_cascade_engine.py b/tests/test_agent_cascade_engine.py
index 1032135..23d7ec6 100644
--- a/tests/test_agent_cascade_engine.py
+++ b/tests/test_agent_cascade_engine.py
@@ -15,95 +15,9 @@
 from aai_cli.agent_cascade.config import CascadeConfig
 from aai_cli.agent_cascade.engine import CascadeDeps, CascadeSession, run_cascade
 from aai_cli.core.errors import APIError
-
-
-class FakeRenderer:
-    def __init__(self):
-        self.calls = []
-
-    def connected(self):
-        self.calls.append(("connected",))
-
-    def user_partial(self, text):
-        self.calls.append(("user_partial", text))
-
-    def user_final(self, text):
-        self.calls.append(("user_final", text))
-
-    def reply_started(self):
-        self.calls.append(("reply_started",))
-
-    def agent_transcript(self, text, *, interrupted):
-        self.calls.append(("agent_transcript", text, interrupted))
-
-    def reply_done(self, *, interrupted):
-        self.calls.append(("reply_done", interrupted))
-
-
-class FakePlayer:
-    def __init__(self):
-        self.enqueued = []
-        self.flushed = 0
-        self.started = False
-        self.closed = False
-
-    def start(self):
-        self.started = True
-
-    def enqueue(self, pcm):
-        self.enqueued.append(pcm)
-
-    def flush(self):
-        self.flushed += 1
-
-    def close(self):
-        self.closed = True
-
-
-class FakeWorker:
-    def __init__(self, *, alive):
-        self._alive = alive
-        self.joined = 0
-
-    def is_alive(self):
-        return self._alive
-
-    def join(self):
-        self.joined += 1
-        self._alive = False
-
-
-def _sync_spawn(target):
-    """Run the reply body inline and hand back a finished worker, so the cascade is
-    driven deterministically without real threads."""
-    target()
-    return FakeWorker(alive=False)
-
-
-def _turn(text, *, end_of_turn=True, turn_is_formatted=True):
-    return types.SimpleNamespace(
-        transcript=text, end_of_turn=end_of_turn, turn_is_formatted=turn_is_formatted
-    )
-
-
-def make_session(
-    *,
-    complete_reply=lambda messages: "Hello there.",
-    synthesize=lambda text: b"pcm:" + text.encode(),
-    spawn=_sync_spawn,
-    run_stt=lambda on_turn: None,
-    config=None,
-):
-    deps = CascadeDeps(
-        run_stt=run_stt, complete_reply=complete_reply, synthesize=synthesize, spawn=spawn
-    )
-    renderer = FakeRenderer()
-    player = FakePlayer()
-    session = CascadeSession(
-        deps=deps, renderer=renderer, player=player, config=config or CascadeConfig()
-    )
-    return session, renderer, player
-
+from tests._cascade_fakes import FakePlayer, FakeRenderer, FakeWorker, make_session
+from tests._cascade_fakes import sync_spawn as _sync_spawn
+from tests._cascade_fakes import turn as _turn
 
 # --- greeting ----------------------------------------------------------------
 
@@ -146,7 +60,7 @@ def test_on_turn_blank_transcript_ignored():
 
 
 def test_on_turn_final_renders_and_replies():
-    session, renderer, player = make_session(complete_reply=lambda m: "Sure thing.")
+    session, renderer, player = make_session(complete_reply=lambda m, on_tool=None: "Sure thing.")
     session.on_turn(_turn("what time is it"))
     assert ("user_final", "what time is it") in renderer.calls
     assert {"role": "user", "content": "what time is it"} in session.history
@@ -155,9 +69,23 @@ def test_on_turn_final_renders_and_replies():
     assert ("reply_done", False) in renderer.calls
 
 
+def test_reply_forwards_tool_calls_to_the_renderer():
+    # The reply worker hands complete_reply an on_tool sink; a tool call it makes surfaces on
+    # the renderer, so the live UI can show a "Searching the web…" affordance mid-turn.
+    def reply(messages, on_tool):
+        on_tool("Searching the web")
+        return "Found it."
+
+    session, renderer, _player = make_session(complete_reply=reply)
+    session.on_turn(_turn("what's the news"))
+    assert ("tool_call", "Searching the web") in renderer.calls
+
+
 def test_on_turn_interim_shows_partial_and_does_not_reply():
     replies = []
-    session, renderer, _player = make_session(complete_reply=lambda m: replies.append(m) or "x")
+    session, renderer, _player = make_session(
+        complete_reply=lambda m, on_tool=None: replies.append(m) or "x"
+    )
     session.on_turn(_turn("partial words", end_of_turn=False))
     assert ("user_partial", "partial words") in renderer.calls
     assert replies == []  # no reply generated for an interim turn
@@ -178,7 +106,7 @@ def test_on_turn_interim_barges_in_on_live_reply():
 def test_generate_reply_speaks_each_sentence():
     spoken = []
     session, renderer, player = make_session(
-        complete_reply=lambda m: "One. Two! Three?",
+        complete_reply=lambda m, on_tool=None: "One. Two! Three?",
         synthesize=lambda text: spoken.append(text) or text.encode(),
     )
     session._generate_reply()
@@ -193,7 +121,7 @@ def test_generate_reply_speaks_each_sentence():
 def test_generate_reply_threads_system_prompt_and_history():
     captured = {}
 
-    def capture(messages):
+    def capture(messages, on_tool=None):
         captured["messages"] = messages
         return "Ok."
 
@@ -208,7 +136,7 @@ def capture(messages):
 
 def test_generate_reply_trims_history_window():
     session, _renderer, _player = make_session(
-        complete_reply=lambda m: "a. b.", config=CascadeConfig(max_history=1)
+        complete_reply=lambda m, on_tool=None: "a. b.", config=CascadeConfig(max_history=1)
     )
     session.history.append({"role": "user", "content": "hi"})
     session._generate_reply()
@@ -219,7 +147,7 @@ def test_generate_reply_trims_history_window():
 def test_on_turn_trims_history_window():
     # An empty reply adds no assistant turn, so only on_turn's own trim caps the list.
     session, _renderer, _player = make_session(
-        complete_reply=lambda m: "", config=CascadeConfig(max_history=1)
+        complete_reply=lambda m, on_tool=None: "", config=CascadeConfig(max_history=1)
     )
     session.history.append({"role": "assistant", "content": "old"})
     session.on_turn(_turn("newest"))
@@ -232,7 +160,9 @@ def synth(text):
             session._stop.set()
         return text.encode()
 
-    session, renderer, player = make_session(complete_reply=lambda m: "One. Two. Three.")
+    session, renderer, player = make_session(
+        complete_reply=lambda m, on_tool=None: "One. Two. Three."
+    )
     session.deps.synthesize = synth
     session._generate_reply()
     # Only the first sentence finished enqueuing before the barge-in stop landed.
@@ -242,7 +172,7 @@ def synth(text):
 
 
 def test_generate_reply_stop_before_first_sentence_speaks_nothing():
-    session, renderer, player = make_session(complete_reply=lambda m: "One. Two.")
+    session, renderer, player = make_session(complete_reply=lambda m, on_tool=None: "One. Two.")
     session._stop.set()
     session._generate_reply()
     assert player.enqueued == []
@@ -252,7 +182,7 @@ def test_generate_reply_stop_before_first_sentence_speaks_nothing():
 
 
 def test_generate_reply_llm_failure_is_recorded_and_surfaced():
-    def boom(messages):
+    def boom(messages, on_tool=None):
         del messages
         raise APIError("gateway down")
 
@@ -269,7 +199,9 @@ def test_generate_reply_tts_failure_midway_is_recorded():
     def boom(text):
         raise APIError("tts down")
 
-    session, renderer, player = make_session(complete_reply=lambda m: "Hi.", synthesize=boom)
+    session, renderer, player = make_session(
+        complete_reply=lambda m, on_tool=None: "Hi.", synthesize=boom
+    )
     session._generate_reply()
     assert isinstance(session.error, APIError)
     assert player.enqueued == []
@@ -403,7 +335,7 @@ def run_stt(on_turn):
 
     session_box = {}
 
-    def complete_reply(messages):
+    def complete_reply(messages, on_tool=None):
         session_box["messages"] = messages
         return "Hi back."
 
@@ -432,7 +364,7 @@ def test_run_cascade_hands_the_session_to_on_session_before_greeting():
     player = FakePlayer()
     deps = CascadeDeps(
         run_stt=lambda on_turn: None,
-        complete_reply=lambda m: "hi",
+        complete_reply=lambda m, on_tool=None: "hi",
         synthesize=lambda text: b"",
         spawn=_sync_spawn,
     )
@@ -458,7 +390,10 @@ def run_stt(on_turn):
         on_turn(_turn("hello"))
 
     deps = CascadeDeps(
-        run_stt=run_stt, complete_reply=lambda m: "hi", synthesize=lambda t: b"", spawn=lazy_spawn
+        run_stt=run_stt,
+        complete_reply=lambda m, on_tool=None: "hi",
+        synthesize=lambda t: b"",
+        spawn=lazy_spawn,
     )
     run_cascade(
         renderer=FakeRenderer(), player=FakePlayer(), config=CascadeConfig(greeting=""), deps=deps
@@ -470,7 +405,7 @@ def test_run_cascade_reraises_recorded_leg_error():
     def run_stt(on_turn):
         on_turn(_turn("hi"))
 
-    def boom(messages):
+    def boom(messages, on_tool=None):
         raise APIError("gateway down")
 
     deps = CascadeDeps(
@@ -491,7 +426,10 @@ def run_stt(on_turn):
 
     player = FakePlayer()
     deps = CascadeDeps(
-        run_stt=run_stt, complete_reply=lambda m: "", synthesize=lambda t: b"", spawn=_sync_spawn
+        run_stt=run_stt,
+        complete_reply=lambda m, on_tool=None: "",
+        synthesize=lambda t: b"",
+        spawn=_sync_spawn,
     )
     with pytest.raises(APIError, match="stt failed"):
         run_cascade(
diff --git a/tests/test_agent_events.py b/tests/test_agent_events.py
index 676d8ef..879d1a0 100644
--- a/tests/test_agent_events.py
+++ b/tests/test_agent_events.py
@@ -18,6 +18,10 @@
         (events.UserDelta(text="typing…"), {"type": "transcript.user.delta", "text": "typing…"}),
         (events.UserFinal(text="hello"), {"type": "transcript.user", "text": "hello"}),
         (events.ReplyStarted(), {"type": "reply.started"}),
+        (
+            events.ToolUse(label="Searching the web"),
+            {"type": "tool.use", "label": "Searching the web"},
+        ),
         (
             events.AgentTranscript(text="hi back", interrupted=False),
             {"type": "transcript.agent", "text": "hi back", "interrupted": False},
diff --git a/tests/test_agent_render.py b/tests/test_agent_render.py
index e6216af..5cfc9fa 100644
--- a/tests/test_agent_render.py
+++ b/tests/test_agent_render.py
@@ -124,6 +124,26 @@ def test_human_agent_line_labeled():
     assert "the time is noon" in out
 
 
+def test_json_tool_call_emits_tool_use_event():
+    buf = io.StringIO()
+    AgentRenderer(json_mode=True, out=buf).tool_call("Searching the web")
+    assert {"type": "tool.use", "label": "Searching the web"} in _json_lines(buf)
+
+
+def test_text_tool_call_goes_to_stderr_not_stdout():
+    # The tool affordance is status, so in piped text mode it stays off stdout (transcript-only).
+    out, err = io.StringIO(), io.StringIO()
+    AgentRenderer(json_mode=False, text_mode=True, out=out, err=err).tool_call("Searching the web")
+    assert "Searching the web" in err.getvalue()
+    assert out.getvalue() == ""
+
+
+def test_human_tool_call_shows_inline_line():
+    r, buf = _human()
+    r.tool_call("Searching the web")
+    assert "Searching the web" in buf.getvalue()
+
+
 def test_human_close_commits_open_partial():
     r, buf = _human()
     r.user_partial("half a sentence")
diff --git a/tests/test_live_tui.py b/tests/test_live_tui.py
index 0dbccac..84d26ec 100644
--- a/tests/test_live_tui.py
+++ b/tests/test_live_tui.py
@@ -136,6 +136,20 @@ async def go() -> None:
     _run(go())
 
 
+def test_show_tool_call_mounts_an_inline_affordance() -> None:
+    # A tool call mid-turn drops a dim "Searching the web…" note, so the thinking pause reads
+    # as progress rather than a hang (the live tool affordance).
+    async def go() -> None:
+        app = _app()
+        async with app.run_test(size=(100, 30)) as pilot:
+            await pilot.pause()
+            app.show_tool_call("Searching the web")
+            notes = [str(n.render()) for n in app.query(Note)]
+            assert any("Searching the web" in n for n in notes)
+
+    _run(go())
+
+
 def test_agent_sentence_without_begin_reply_mounts_a_reply() -> None:
     async def go() -> None:
         app = _app()
@@ -295,6 +309,7 @@ def run_conversation(renderer) -> None:
             renderer.connected()
             renderer.user_partial("turn it")
             renderer.user_final("turn it up")
+            renderer.tool_call("Searching the web")
             renderer.reply_started()
             renderer.agent_transcript("Done.", interrupted=False)
             renderer.reply_done(interrupted=False)
@@ -314,6 +329,8 @@ def run_conversation(renderer) -> None:
             )
             assert "» turn it up" in str(app.query_one(UserMessage).render())
             assert app.query_one(AssistantMessage).text == "Done. "
+            # The tool_call leg hopped to the UI thread and surfaced the affordance note.
+            assert any("Searching the web" in str(n.render()) for n in app.query(Note))
         assert done.is_set()  # leaving the run_test context unmounted -> on_stop released it
 
     _run(go())

From 55c0018dffcb42780ec97ede4f52cb6e5561868e Mon Sep 17 00:00:00 2001
From: Alex Kroman <alex@assemblyai.com>
Date: Thu, 18 Jun 2026 19:01:00 -0700
Subject: [PATCH 10/10] assembly live: exit cleanly on Ctrl-C during TUI setup
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

A Ctrl-C during the voice TUI's setup — opening the mic, building the deepagents
graph, loading --mcp-config servers — lands before Textual captures the keyboard,
so it previously surfaced as a raw KeyboardInterrupt (and, mid asyncio.run/threading
teardown, a noisy traceback). The line-renderer path already mapped this to a
clean exit 130; the TUI dispatch did not. Extract a _launch_tui helper that wraps
_run_live_tui and maps a setup-time KeyboardInterrupt to typer.Exit(130), matching
the assembly code TUI. (In-session Ctrl-C is already a Textual binding, so it never
reaches the graph as an exception.)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 aai_cli/commands/agent_cascade/_exec.py | 16 +++++++++++++++-
 tests/test_live_tui.py                  | 15 +++++++++++++++
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/aai_cli/commands/agent_cascade/_exec.py b/aai_cli/commands/agent_cascade/_exec.py
index 2bbd324..5d43caa 100644
--- a/aai_cli/commands/agent_cascade/_exec.py
+++ b/aai_cli/commands/agent_cascade/_exec.py
@@ -247,6 +247,20 @@ def run_conversation(renderer: engine.Renderer) -> None:
     app.run(mouse=False)
 
 
+def _launch_tui(api_key: str, opts: AgentCascadeOptions, config: CascadeConfig) -> None:
+    """Run the voice-only TUI, mapping a setup-time Ctrl-C to a clean exit.
+
+    A Ctrl-C during setup — opening the mic, building the graph, loading ``--mcp-config``
+    servers — lands before Textual captures the keyboard, so it surfaces as a plain
+    ``KeyboardInterrupt`` here. Map it to exit 130 (cancel) rather than letting it dump a
+    half-initialized asyncio/threading traceback.
+    """
+    try:
+        _run_live_tui(api_key, opts, config)
+    except KeyboardInterrupt:
+        raise typer.Exit(code=errors.CANCELLED_EXIT_CODE) from None
+
+
 def run_agent_cascade(opts: AgentCascadeOptions, state: AppState, *, json_mode: bool) -> None:
     """Execute one `assembly agent-cascade` cascade from already-parsed flags."""
     text_mode, json_mode = resolve_output_modes(opts.output_field, json_mode=json_mode)
@@ -290,7 +304,7 @@ def run_agent_cascade(opts: AgentCascadeOptions, state: AppState, *, json_mode:
 
     if _should_use_tui(from_file=from_file, json_mode=json_mode, text_mode=text_mode):
         # The voice-only Textual front-end surfaces the web-search note in-app, not on stderr.
-        _run_live_tui(api_key, opts, config)
+        _launch_tui(api_key, opts, config)
         return
 
     _warn_without_web_search(json_mode=json_mode)
diff --git a/tests/test_live_tui.py b/tests/test_live_tui.py
index 84d26ec..abe49b9 100644
--- a/tests/test_live_tui.py
+++ b/tests/test_live_tui.py
@@ -13,6 +13,7 @@
 import types
 
 import pytest
+import typer
 from textual.widgets import Static
 
 from aai_cli.agent_cascade import engine
@@ -418,6 +419,20 @@ def run(self, **kwargs):
     assert captured["ran"] == {"mouse": False}  # mouse off so transcript text stays selectable
 
 
+def test_tui_setup_keyboard_interrupt_exits_clean(monkeypatch) -> None:
+    # Ctrl-C during TUI setup (mic open / graph build / --mcp-config load) lands before
+    # Textual captures the keyboard; it must exit 130, not surface a raw traceback.
+    _wire_tui(monkeypatch)
+
+    def boom(*_a, **_k):
+        raise KeyboardInterrupt
+
+    monkeypatch.setattr(_exec, "_run_live_tui", boom)
+    with pytest.raises(typer.Exit) as exc:
+        run_agent_cascade(_opts(), AppState(), json_mode=False)
+    assert exc.value.exit_code == 130
+
+
 def test_tui_run_conversation_drives_the_cascade(monkeypatch) -> None:
     # The closure handed to the app runs the cascade with the duplex player and the wired
     # deps, and the cascade's on_session wires the session's reply-interrupt onto the app.