From 2922b4618df0137b2ee65112b746eb5291710d4b Mon Sep 17 00:00:00 2001
From: Alex Kroman <alex@assemblyai.com>
Date: Thu, 18 Jun 2026 13:34:12 -0700
Subject: [PATCH 1/6] assembly code: read skills via a dedicated tool + work
 around gateway empty tool-args
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Two `assembly code` fixes uncovered while building a voice agent:

1. read_skill tool. The skills middleware loads skills from its own backend
   rooted at ~/.claude/skills, but deepagents' stock prompt tells the model to
   open each SKILL.md with `read_file` — which is bound to the cwd sandbox and
   can't reach them, so the model got `File '/aai-cli/SKILL.md' not found`. Add
   a read-only `read_skill` tool bound to the skills directory (with a traversal
   guard) and a prompt that points the model at it. build_skills() now returns
   the (middleware, tool) pair, wired together in _build_agent.

2. Empty tool-call arguments. The LLM Gateway maps OpenAI `arguments` onto
   Anthropic `tool_use.input` but drops `input` entirely when arguments are
   empty (""/"{}"), which Anthropic rejects (400, surfaced as 500 when
   streaming) — and because the failing call sits in history, every later turn
   fails too, wedging the session. _ensure_tool_call_arguments substitutes a
   minimal non-empty placeholder in the outgoing payload so the gateway emits a
   valid input. Request-only; the tool already ran locally with its real args.
   (Reported upstream for a server-side fix; this keeps the CLI resilient.)

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 aai_cli/code_agent/model.py    | 67 ++++++++++++++++++++++++---
 aai_cli/code_agent/skills.py   | 82 +++++++++++++++++++++++++++++-----
 aai_cli/commands/code/_exec.py | 22 +++++----
 tests/test_code_agent.py       | 77 +++++++++++++++++++++++++++++--
 tests/test_code_command.py     | 31 +++++++++----
 5 files changed, 242 insertions(+), 37 deletions(-)

diff --git a/aai_cli/code_agent/model.py b/aai_cli/code_agent/model.py
index 7f87f6b..9539b4c 100644
--- a/aai_cli/code_agent/model.py
+++ b/aai_cli/code_agent/model.py
@@ -8,11 +8,17 @@
 
 from __future__ import annotations
 
+import json
 from collections.abc import Mapping
 from typing import TYPE_CHECKING
 
 from aai_cli.core import environments
 
+# The gateway omits Anthropic's required ``tool_use.input`` when an OpenAI tool call's
+# ``arguments`` is empty (``""`` / ``"{}"``); substitute a minimal non-empty object so the
+# field is emitted. See :func:`_ensure_tool_call_arguments`.
+_PLACEHOLDER_ARGUMENTS = '{"_": ""}'
+
 if TYPE_CHECKING:
     from langchain_core.language_models.chat_models import BaseChatModel
     from langchain_core.outputs import ChatGenerationChunk
@@ -86,6 +92,52 @@ def _hoist_call_list(tool_calls: list[object]) -> None:
             tool_call["id"] = function.pop("id")
 
 
+def _ensure_tool_call_arguments(messages: object) -> None:
+    """Give every empty tool-call ``arguments`` a non-empty placeholder object, in place.
+
+    The AssemblyAI LLM Gateway maps each OpenAI tool call's ``arguments`` (a JSON string)
+    onto Anthropic's ``tool_use.input`` object, but drops ``input`` entirely when the
+    arguments are empty (``""`` or ``"{}"``). Anthropic *requires* ``input`` to be present,
+    so replaying any argument-less tool call is rejected (400, surfaced as a 500 while
+    streaming) — and because the failing call sits in the conversation history, every later
+    turn fails too, wedging the session. We swap in a minimal non-empty object so the gateway
+    emits a valid ``input``. This only rewrites the request we send: the tool already ran
+    locally with its real (empty) arguments, and the gateway accepts the placeholder even for
+    tools that declare ``additionalProperties: false``. (Drop this once the gateway maps empty
+    arguments to ``input: {}`` itself.)
+    """
+    if not isinstance(messages, list):
+        return
+    for message in messages:
+        tool_calls = message.get("tool_calls") if isinstance(message, dict) else None
+        if isinstance(tool_calls, list):
+            _fill_empty_arguments(tool_calls)
+
+
+def _fill_empty_arguments(tool_calls: list[object]) -> None:
+    """Replace each empty ``function.arguments`` with the placeholder (helper for the above)."""
+    for tool_call in tool_calls:
+        if not isinstance(tool_call, dict):
+            continue
+        function = tool_call.get("function")
+        if isinstance(function, dict) and _is_empty_arguments(function.get("arguments")):
+            function["arguments"] = _PLACEHOLDER_ARGUMENTS
+
+
+def _is_empty_arguments(arguments: object) -> bool:
+    """True when ``arguments`` is an OpenAI args string carrying no fields (``""``/``"{}"``)."""
+    if not isinstance(arguments, str):
+        return False
+    stripped = arguments.strip()
+    if not stripped:
+        return True
+    try:
+        parsed = json.loads(stripped)
+    except ValueError:
+        return False
+    return isinstance(parsed, dict) and not parsed
+
+
 def build_model(
     api_key: str,
     *,
@@ -114,18 +166,21 @@ def build_model(
     class _GatewayChatOpenAI(ChatOpenAI):
         """ChatOpenAI that adapts the gateway's OpenAI-incompatible quirks for langchain.
 
-        Two fix-ups, each working around a gateway response/request bug the upstream client
-        doesn't expect: flatten list-content messages the gateway 500s on (request side, see
-        :func:`_flatten_content`), and hoist each streamed tool-call ``id`` back to the
-        tool-call top level where langchain reads it (response side, see
-        :func:`_hoist_tool_call_ids`).
+        Three fix-ups, each working around a gateway request/response bug the upstream client
+        doesn't expect: flatten list-content messages the gateway 500s on and give empty
+        tool-call arguments a placeholder the gateway can map to ``tool_use.input`` (request
+        side, see :func:`_flatten_content` / :func:`_ensure_tool_call_arguments`), and hoist
+        each streamed tool-call ``id`` back to the tool-call top level where langchain reads it
+        (response side, see :func:`_hoist_tool_call_ids`).
         """
 
         def _get_request_payload(
             self, input_: object, *, stop: list[str] | None = None, **kwargs: object
         ) -> dict:
             payload = super()._get_request_payload(input_, stop=stop, **kwargs)
-            _flatten_content(payload.get("messages"))
+            messages = payload.get("messages")
+            _flatten_content(messages)
+            _ensure_tool_call_arguments(messages)
             return payload
 
         def _convert_chunk_to_generation_chunk(
diff --git a/aai_cli/code_agent/skills.py b/aai_cli/code_agent/skills.py
index eff6668..a953364 100644
--- a/aai_cli/code_agent/skills.py
+++ b/aai_cli/code_agent/skills.py
@@ -1,11 +1,16 @@
 """Import installed agent skills (notably the `assemblyai` skill) into the agent.
 
-`assembly setup` installs the `assemblyai` skill under the coding-agent config root
-(`~/.claude/skills/assemblyai/`, honoring `CLAUDE_CONFIG_DIR`). deepagents can surface
-skills to the model via progressive disclosure, but its `SkillsMiddleware` reads them
-through a backend — and our main file backend is confined to the working directory.
-So we give skills their *own* `FilesystemBackend` rooted at the skills directory and
-inject a standalone `SkillsMiddleware`, independent of the cwd-scoped file tools.
+`assembly setup` installs skills under the coding-agent config root
+(`~/.claude/skills/<skill>/SKILL.md`, honoring `CLAUDE_CONFIG_DIR`). deepagents can
+surface skills to the model via progressive disclosure, but its `SkillsMiddleware` reads
+them through a backend — and our main file backend is confined to the working directory.
+So we give skills their *own* `FilesystemBackend` rooted at the skills directory.
+
+deepagents' stock skills prompt tells the model to open each `SKILL.md` with `read_file`,
+but that tool is bound to the cwd-scoped backend and so can't reach a skill living under
+`~/.claude/skills` (the model just gets ``File '/aai-cli/SKILL.md' not found``). We close
+that gap with a dedicated read-only `read_skill` tool bound to the skills directory, and a
+prompt that points the model at it instead of `read_file`.
 """
 
 from __future__ import annotations
@@ -17,11 +22,34 @@
 
 if TYPE_CHECKING:
     from langchain.agents.middleware import AgentMiddleware
+    from langchain_core.tools import BaseTool
 
 # Mirrors aai_cli.app.coding_agent.skills_root without importing the app layer (a
 # feature slice stays below it): the agent config root, overridable for tests/agents.
 _CLAUDE_CONFIG_DIR = "CLAUDE_CONFIG_DIR"
 
+READ_SKILL_TOOL_NAME = "read_skill"
+
+# Skills prompt fragment. Must keep the three slots deepagents substitutes at runtime
+# (`{skills_locations}`, `{skills_load_warnings}`, `{skills_list}`); the constructor
+# raises if any is missing. The one behavioral change from deepagents' stock prompt is
+# steering the model to `read_skill` — skills live outside the cwd sandbox, so the
+# ordinary `read_file` tool can't open them.
+_SKILLS_PROMPT = """## Skills
+
+You have a library of skills — specialized instructions and workflows for specific tasks.
+
+{skills_locations}{skills_load_warnings}
+**Available skills:**
+
+{skills_list}
+
+**How to use a skill (progressive disclosure):** you see each skill's name, description, and
+path above, but read its full instructions only when a skill matches the task. Read it with
+the `read_skill` tool, passing the path shown above — e.g. `read_skill("/assemblyai/SKILL.md")`
+— then follow what it says. Do **not** use `read_file` for these paths: skills live outside the
+working directory, so only `read_skill` can reach them."""
+
 
 def skills_root() -> Path:
     """Directory holding installed skills (one subdir per skill, each with SKILL.md)."""
@@ -35,12 +63,41 @@ def _has_skills(root: Path) -> bool:
     return root.is_dir() and any(child.joinpath("SKILL.md").is_file() for child in root.iterdir())
 
 
-def build_skills_middleware(root: Path | None = None) -> AgentMiddleware | None:
-    """A ``SkillsMiddleware`` over the installed skills, or ``None`` if none are present.
+def _read_skill_file(root: Path, path: str) -> str:
+    """Read ``path`` (as surfaced in the skills list) from under ``root``, guarding traversal.
+
+    ``path`` is the backend-virtual path shown in the prompt (e.g. ``/assemblyai/SKILL.md``),
+    so it is resolved relative to ``root``. A path that escapes ``root`` (``..`` segments) or
+    names a missing file returns an error string the model can recover from rather than raising.
+    """
+    target = (root / path.lstrip("/")).resolve()
+    if not target.is_relative_to(root.resolve()):
+        return f"Error: '{path}' is outside the skills directory."
+    if not target.is_file():
+        return f"Error: skill file '{path}' not found."
+    return target.read_text(encoding="utf-8")
+
+
+def build_skill_reader(root: Path) -> BaseTool:
+    """Wrap :func:`_read_skill_file` as the ``read_skill`` tool, bound to ``root``."""
+    from langchain_core.tools import tool
+
+    @tool(READ_SKILL_TOOL_NAME)
+    def read_skill(path: str) -> str:
+        """Read a skill's file (e.g. its SKILL.md) by the path shown in the skills list.
+        Use this — not read_file — for any path under the skills library."""
+        return _read_skill_file(root, path)
+
+    return read_skill
+
+
+def build_skills(root: Path | None = None) -> tuple[AgentMiddleware, BaseTool] | None:
+    """The skills ``(middleware, read_skill tool)`` pair, or ``None`` if no skills are present.
 
-    Returns ``None`` (rather than an empty middleware) so the caller simply omits it
-    from the stack when the user has run no `assembly setup` — the agent then starts
-    with no skills section instead of an empty one.
+    Returns ``None`` (rather than an empty middleware) so the caller simply omits both from
+    the stack when the user has run no `assembly setup` — the agent then starts with no skills
+    section and no `read_skill` tool instead of empty ones. The tool is paired with the
+    middleware because the prompt the middleware injects directs the model to it.
     """
     root = root if root is not None else skills_root()
     if not _has_skills(root):
@@ -50,4 +107,5 @@ def build_skills_middleware(root: Path | None = None) -> AgentMiddleware | None:
     from deepagents.middleware.skills import SkillsMiddleware
 
     backend = FilesystemBackend(root_dir=str(root), virtual_mode=True)
-    return SkillsMiddleware(backend=backend, sources=["/"])
+    middleware = SkillsMiddleware(backend=backend, sources=["/"], system_prompt=_SKILLS_PROMPT)
+    return middleware, build_skill_reader(root)
diff --git a/aai_cli/commands/code/_exec.py b/aai_cli/commands/code/_exec.py
index 2985629..b86369e 100644
--- a/aai_cli/commands/code/_exec.py
+++ b/aai_cli/commands/code/_exec.py
@@ -31,7 +31,7 @@
 from aai_cli.code_agent.prompt import DEFAULT_MODEL
 from aai_cli.code_agent.render import RichRenderer, make_approver
 from aai_cli.code_agent.session import CodeSession, EventSink, run_repl
-from aai_cli.code_agent.skills import build_skills_middleware
+from aai_cli.code_agent.skills import build_skills
 from aai_cli.code_agent.store import build_checkpointer
 from aai_cli.code_agent.voice import (
     AUDIO_ERROR_TYPES,
@@ -82,12 +82,9 @@ def _assemble_tools(api_key: str, opts: CodeOptions, bridge: AskBridge) -> list[
 
 
 def _assemble_middlewares(opts: CodeOptions) -> list[AgentMiddleware]:
-    """Skills + long-term memory middleware, in load order."""
+    """The long-term memory middleware (skills are wired in :func:`_build_agent`, since the
+    skills middleware pairs with a tool)."""
     middlewares: list[AgentMiddleware] = []
-    if opts.skills:
-        skills = build_skills_middleware()
-        if skills is not None:
-            middlewares.append(skills)
     if opts.memory:
         middlewares.append(build_memory_middleware())
     return middlewares
@@ -95,11 +92,20 @@ def _assemble_middlewares(opts: CodeOptions) -> list[AgentMiddleware]:
 
 def _build_agent(api_key: str, opts: CodeOptions, bridge: AskBridge) -> CompiledAgent:
     """Wire the gateway model + tools + middlewares + checkpointer into the agent."""
+    tools = _assemble_tools(api_key, opts, bridge)
+    middlewares = _assemble_middlewares(opts)
+    # Skills add both a middleware (the skills prompt section) and the `read_skill` tool the
+    # prompt directs the model to; load the middleware ahead of memory to match prior order.
+    skills = build_skills() if opts.skills else None
+    if skills is not None:
+        middleware, reader = skills
+        middlewares.insert(0, middleware)
+        tools.append(reader)
     return build_agent(
         model=build_model(api_key, model=opts.model),
         root_dir=opts.root_dir.resolve(),
-        tools=_assemble_tools(api_key, opts, bridge),
-        middlewares=_assemble_middlewares(opts),
+        tools=tools,
+        middlewares=middlewares,
         checkpointer=build_checkpointer(persist=opts.persist),
         auto_approve=opts.auto,
     )
diff --git a/tests/test_code_agent.py b/tests/test_code_agent.py
index 28a9561..fffb90b 100644
--- a/tests/test_code_agent.py
+++ b/tests/test_code_agent.py
@@ -215,13 +215,32 @@ def boom(url):
     assert docs_mcp.load_docs_tools("https://example.invalid") == []
 
 
-def test_skills_middleware_present_and_absent(tmp_path: Path) -> None:
-    assert skills.build_skills_middleware(tmp_path) is None  # empty dir -> no skills
+def test_build_skills_present_and_absent(tmp_path: Path) -> None:
+    assert skills.build_skills(tmp_path) is None  # empty dir -> no skills, no tool
 
     skill_dir = tmp_path / "assemblyai"
     skill_dir.mkdir()
     (skill_dir / "SKILL.md").write_text("---\nname: assemblyai\ndescription: x\n---\nbody")
-    assert skills.build_skills_middleware(tmp_path) is not None
+    bundle = skills.build_skills(tmp_path)
+    assert bundle is not None  # constructing the middleware also validates the custom prompt
+    _middleware, reader = bundle
+    assert reader.name == skills.READ_SKILL_TOOL_NAME
+
+
+def test_read_skill_tool_reads_under_root_and_blocks_escape(tmp_path: Path) -> None:
+    skill_dir = tmp_path / "assemblyai"
+    skill_dir.mkdir()
+    (skill_dir / "SKILL.md").write_text("the skill body")
+    (tmp_path.parent / "secret.md").write_text("top secret")
+
+    reader = skills.build_skill_reader(tmp_path)
+    # The path is the prompt's backend-virtual form (leading slash, relative to root).
+    assert reader.invoke({"path": "/assemblyai/SKILL.md"}) == "the skill body"
+    # A traversal out of the skills dir is refused (not the neighbouring file's contents).
+    escaped = reader.invoke({"path": "/../secret.md"})
+    assert "outside the skills directory" in escaped and "top secret" not in escaped
+    # A missing skill file reports an error rather than raising.
+    assert "not found" in reader.invoke({"path": "/assemblyai/MISSING.md"})
 
 
 def test_web_search_tool_gated_on_api_key(monkeypatch: pytest.MonkeyPatch) -> None:
@@ -317,6 +336,58 @@ def test_hoist_tool_call_ids_guards() -> None:
     model_mod._hoist_tool_call_ids({"choices": 99})  # choices not a list -> early return
 
 
+def test_is_empty_arguments() -> None:
+    assert model_mod._is_empty_arguments("")  # empty string
+    assert model_mod._is_empty_arguments("   ")  # whitespace only
+    assert model_mod._is_empty_arguments("{}")  # empty object
+    assert model_mod._is_empty_arguments("{ }")  # empty object with whitespace
+    assert not model_mod._is_empty_arguments('{"path": "/"}')  # real arguments
+    assert not model_mod._is_empty_arguments("[]")  # non-dict JSON is not "empty args"
+    assert not model_mod._is_empty_arguments("{bad json")  # unparseable -> leave alone
+    assert not model_mod._is_empty_arguments(None)  # non-string -> leave alone
+    assert not model_mod._is_empty_arguments({"already": "parsed"})  # non-string -> leave alone
+
+
+def test_ensure_tool_call_arguments_fills_only_empty_calls() -> None:
+    empty_fn: dict[str, object] = {"name": "ls", "arguments": "{}"}
+    blank_fn: dict[str, object] = {"name": "ls", "arguments": "  "}
+    full_fn: dict[str, object] = {"name": "ls", "arguments": '{"path": "/"}'}
+    tool_calls: list[object] = [
+        None,  # tool_call not a dict -> skipped
+        {"id": "0", "function": 7},  # function not a dict -> skipped
+        {"id": "1", "function": empty_fn},  # empty object -> filled
+        {"id": "2", "function": blank_fn},  # whitespace -> filled
+        {"id": "3", "function": full_fn},  # real args -> untouched
+    ]
+    messages: list[object] = [
+        None,  # message not a dict -> skipped
+        {"role": "user", "content": "hi"},  # no tool_calls -> skipped
+        {"tool_calls": 99},  # tool_calls not a list -> skipped
+        {"tool_calls": tool_calls},
+    ]
+    model_mod._ensure_tool_call_arguments(messages)
+    assert empty_fn["arguments"] == model_mod._PLACEHOLDER_ARGUMENTS
+    assert blank_fn["arguments"] == model_mod._PLACEHOLDER_ARGUMENTS
+    assert full_fn["arguments"] == '{"path": "/"}'  # left untouched
+
+
+def test_ensure_tool_call_arguments_guards() -> None:
+    model_mod._ensure_tool_call_arguments(None)  # not a list -> early return, no error
+    model_mod._ensure_tool_call_arguments([{"tool_calls": 99}])  # tool_calls not a list
+
+
+def test_get_request_payload_fills_empty_tool_call_arguments() -> None:
+    from langchain_core.messages import AIMessage, HumanMessage
+
+    m = model_mod.build_model("sk-test", model="claude-sonnet-4-6")
+    # An assistant tool call with no arguments serializes to arguments="{}", which the gateway
+    # rejects (missing tool_use.input); the payload must carry the placeholder instead.
+    ai = AIMessage(content="", tool_calls=[{"name": "ls", "args": {}, "id": "t1"}])
+    payload = m._get_request_payload([HumanMessage(content="hi"), ai])
+    calls = payload["messages"][1]["tool_calls"]
+    assert calls[0]["function"]["arguments"] == model_mod._PLACEHOLDER_ARGUMENTS
+
+
 def test_convert_chunk_hoists_streamed_tool_call_id() -> None:
     from langchain_core.messages import AIMessageChunk
     from langchain_openai import ChatOpenAI
diff --git a/tests/test_code_command.py b/tests/test_code_command.py
index 044fc87..ab2c605 100644
--- a/tests/test_code_command.py
+++ b/tests/test_code_command.py
@@ -167,13 +167,12 @@ def test_assemble_tools_includes_cli_fetch_ask_and_optional_extras(monkeypatch):
     assert [t.name for t in tools] == ["assembly", "fetch_url", "ask_user"]
 
 
-def test_assemble_middlewares_skills_and_memory(monkeypatch):
-    monkeypatch.setattr(_exec, "build_skills_middleware", lambda: "SKILLS")
+def test_assemble_middlewares_memory_only(monkeypatch):
+    # Skills are wired in _build_agent now (they pair a middleware with a tool); this
+    # assembler only handles the optional memory middleware.
     monkeypatch.setattr(_exec, "build_memory_middleware", lambda: "MEM")
-    assert _exec._assemble_middlewares(_opts(skills=True, memory=True)) == ["SKILLS", "MEM"]
-
-    monkeypatch.setattr(_exec, "build_skills_middleware", lambda: None)
-    assert _exec._assemble_middlewares(_opts(skills=True, memory=False)) == []
+    assert _exec._assemble_middlewares(_opts(memory=True)) == ["MEM"]
+    assert _exec._assemble_middlewares(_opts(memory=False)) == []
 
 
 def test_build_agent_wires_model_tools_and_checkpointer(monkeypatch):
@@ -181,16 +180,32 @@ def test_build_agent_wires_model_tools_and_checkpointer(monkeypatch):
     monkeypatch.setattr(_exec, "build_model", lambda key, *, model: f"model:{model}")
     monkeypatch.setattr(_exec, "_assemble_tools", lambda key, opts, bridge: ["t"])
     monkeypatch.setattr(_exec, "_assemble_middlewares", lambda opts: ["m"])
+    # --no-skills: build_skills must not be consulted, so the sentinel never lands.
+    monkeypatch.setattr(_exec, "build_skills", lambda: ("X_MW", "X_TOOL"))
     monkeypatch.setattr(_exec, "build_checkpointer", lambda *, persist: f"ckpt:{persist}")
     monkeypatch.setattr(_exec, "build_agent", lambda **kw: seen.update(kw) or "AGENT")
 
-    agent = _exec._build_agent("k", _opts(model="gpt-5", persist=False), AskBridge())
+    agent = _exec._build_agent("k", _opts(model="gpt-5", persist=False, skills=False), AskBridge())
     assert agent == "AGENT"
     assert seen["model"] == "model:gpt-5"
-    assert seen["tools"] == ["t"] and seen["middlewares"] == ["m"]
+    assert seen["tools"] == ["t"] and seen["middlewares"] == ["m"]  # no skills sentinel added
     assert seen["checkpointer"] == "ckpt:False"
 
 
+def test_build_agent_inserts_skills_middleware_and_read_tool(monkeypatch):
+    seen = {}
+    monkeypatch.setattr(_exec, "build_model", lambda key, *, model: "model")
+    monkeypatch.setattr(_exec, "_assemble_tools", lambda key, opts, bridge: ["base"])
+    monkeypatch.setattr(_exec, "_assemble_middlewares", lambda opts: ["mem"])
+    monkeypatch.setattr(_exec, "build_skills", lambda: ("skills_mw", "read_skill_tool"))
+    monkeypatch.setattr(_exec, "build_checkpointer", lambda *, persist: "ckpt")
+    monkeypatch.setattr(_exec, "build_agent", lambda **kw: seen.update(kw) or "AGENT")
+
+    _exec._build_agent("k", _opts(skills=True), AskBridge())
+    assert seen["middlewares"] == ["skills_mw", "mem"]  # skills loaded ahead of memory
+    assert seen["tools"] == ["base", "read_skill_tool"]  # read_skill tool appended
+
+
 def test_web_note_only_without_key(monkeypatch):
     monkeypatch.delenv("FIRECRAWL_API_KEY", raising=False)
     assert _exec._web_note(_opts(web=True)) is not None

From 2b53537d08dfb09aded4b18c685bb640388111d9 Mon Sep 17 00:00:00 2001
From: Alex Kroman <alex@assemblyai.com>
Date: Thu, 18 Jun 2026 14:03:02 -0700
Subject: [PATCH 2/6] assembly code: drop the gateway's spurious blank streamed
 tool-call delta
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Every streaming turn (when tools are available) begins with an empty tool-call
delta — {"function": {"id": "", "name": "", "arguments": ""}}. On a pure-text
turn (e.g. the agent asking clarifying questions) no real tool call follows, so
langchain is left with a tool call whose name is "", deepagents dispatches it,
and the turn dies with `Error:  is not a valid tool`.

Extend the streaming normalizer to drop any tool-call delta with no name, id, or
arguments before langchain converts the chunk (this also harmlessly drops the
gateway's empty argument-continuation deltas). A real text+tool turn still yields
exactly one correct tool call; a pure-text turn yields none.

Reported upstream for a server-side fix; this keeps the CLI resilient meanwhile.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 aai_cli/code_agent/model.py | 44 +++++++++++++++++++---------
 tests/test_code_agent.py    | 57 +++++++++++++++++++++++++++++++++++++
 2 files changed, 87 insertions(+), 14 deletions(-)

diff --git a/aai_cli/code_agent/model.py b/aai_cli/code_agent/model.py
index 9539b4c..27f9980 100644
--- a/aai_cli/code_agent/model.py
+++ b/aai_cli/code_agent/model.py
@@ -46,18 +46,23 @@ def _flatten_content(messages: object) -> None:
 
 
 def _hoist_tool_call_ids(chunk: object) -> None:
-    """Move each streamed tool-call ``id`` from inside ``function`` up to the tool-call top level.
-
-    The AssemblyAI LLM Gateway's *streaming* ``/v1/chat/completions`` nests the tool-call
-    ``id`` under ``function`` — ``{"function": {"id": …, "name": …}}`` — instead of at the
-    tool-call's top level, which is where the OpenAI streaming spec (and
-    ``langchain_openai``, via ``id=rtc.get("id")``) reads it. Left alone, every streamed
-    tool call parses with a name and arguments but ``id=None``, so the reply ``ToolMessage``
-    fails Pydantic validation (``tool_call_id`` must be a string) and the whole turn errors
-    out. We move the id back up before langchain converts the chunk; the id rides only the
-    first delta of a call, so later argument-only deltas (no ``function.id``) are left
-    untouched. (The non-streaming endpoint already places the id correctly, so only the
-    streaming path needs this.)
+    """Normalize a streamed chunk's tool-call deltas: drop blank ones, hoist nested ids.
+
+    Two AssemblyAI LLM Gateway streaming quirks, both fixed in place before langchain
+    converts the chunk:
+
+    1. **Spurious blank deltas.** Every streamed turn (when tools are available) starts with
+       an empty tool-call delta — ``{"function": {"id": "", "name": "", "arguments": ""}}``.
+       On a pure-text turn no real call follows, so langchain is left with a tool call whose
+       ``name`` is ``""``; deepagents then dispatches it and the turn dies with
+       ``Error:  is not a valid tool``. We drop any delta with no name, id, or arguments
+       (which also harmlessly drops the gateway's empty argument-continuation deltas).
+    2. **Misplaced id.** The id is nested under ``function`` instead of at the tool-call top
+       level where the OpenAI spec and ``langchain_openai`` (``id=rtc.get("id")``) read it,
+       so without help every call parses with ``id=None`` and its reply ``ToolMessage`` fails
+       validation. We move it back up; the id rides only a call's first delta.
+
+    (The non-streaming endpoint has neither quirk, so only the streaming path needs this.)
     """
     if not isinstance(chunk, dict):
         return
@@ -68,11 +73,22 @@ def _hoist_tool_call_ids(chunk: object) -> None:
 
 
 def _hoist_in_choice(choice: object) -> None:
-    """Hoist tool-call ids within one streamed choice's delta (helper for ``_hoist_tool_call_ids``)."""
+    """Drop blank tool-call deltas, then hoist ids, within one streamed choice's delta."""
     delta = choice.get("delta") if isinstance(choice, dict) else None
     tool_calls = delta.get("tool_calls") if isinstance(delta, dict) else None
     if isinstance(tool_calls, list):
-        _hoist_call_list(tool_calls)
+        delta["tool_calls"] = [tc for tc in tool_calls if not _is_blank_tool_call(tc)]
+        _hoist_call_list(delta["tool_calls"])
+
+
+def _is_blank_tool_call(tool_call: object) -> bool:
+    """True for the gateway's spurious empty tool-call delta (no name, id, or arguments)."""
+    if not isinstance(tool_call, dict):
+        return False
+    function = tool_call.get("function")
+    if not isinstance(function, dict):
+        return False
+    return not function.get("name") and not function.get("id") and not function.get("arguments")
 
 
 def _hoist_call_list(tool_calls: list[object]) -> None:
diff --git a/tests/test_code_agent.py b/tests/test_code_agent.py
index fffb90b..c40f3d9 100644
--- a/tests/test_code_agent.py
+++ b/tests/test_code_agent.py
@@ -336,6 +336,63 @@ def test_hoist_tool_call_ids_guards() -> None:
     model_mod._hoist_tool_call_ids({"choices": 99})  # choices not a list -> early return
 
 
+def test_is_blank_tool_call() -> None:
+    assert model_mod._is_blank_tool_call({"function": {"id": "", "name": "", "arguments": ""}})
+    assert model_mod._is_blank_tool_call({"function": {}})  # all fields absent
+    assert not model_mod._is_blank_tool_call({"function": {"name": "x"}})  # has a name
+    assert not model_mod._is_blank_tool_call({"function": {"id": "i"}})  # has an id
+    assert not model_mod._is_blank_tool_call({"function": {"arguments": "a"}})  # has arguments
+    assert not model_mod._is_blank_tool_call({"function": 7})  # function not a dict
+    assert not model_mod._is_blank_tool_call(None)  # not a dict
+
+
+def test_hoist_tool_call_ids_drops_spurious_blank_delta() -> None:
+    # The gateway prefixes every streamed turn with an empty tool-call delta; it must be
+    # dropped (else a pure-text turn dispatches a tool call with name="").
+    real_fn: dict[str, object] = {"id": "toolu_X", "name": "get_weather", "arguments": ""}
+    real_call: dict[str, object] = {"index": 0, "function": real_fn}
+    delta: dict[str, object] = {
+        "tool_calls": [
+            {"index": 0, "function": {"id": "", "name": "", "arguments": ""}},  # spurious blank
+            real_call,
+        ]
+    }
+    chunk: dict[str, object] = {"choices": [{"delta": delta}]}
+    model_mod._hoist_tool_call_ids(chunk)
+    assert delta["tool_calls"] == [real_call]  # blank dropped, real call kept
+    assert real_call["id"] == "toolu_X"  # and its id hoisted out of function
+    assert "id" not in real_fn
+
+
+def test_convert_chunk_drops_spurious_blank_tool_call() -> None:
+    from langchain_core.messages import AIMessageChunk
+    from langchain_openai import ChatOpenAI
+
+    m = model_mod.build_model("sk-test", model="claude-sonnet-4-6")
+    assert isinstance(m, ChatOpenAI)  # narrow to the subclass that overrides the converter
+    # A pure-text turn's leading delta carries only the gateway's blank tool call — the
+    # converted chunk must surface no tool call (else deepagents dispatches a nameless tool).
+    chunk = {
+        "choices": [
+            {
+                "index": 0,
+                "delta": {
+                    "role": "assistant",
+                    "tool_calls": [
+                        {"index": 0, "function": {"id": "", "name": "", "arguments": ""}}
+                    ],
+                },
+                "finish_reason": None,
+            }
+        ]
+    }
+    gen = m._convert_chunk_to_generation_chunk(chunk, AIMessageChunk, None)
+    assert gen is not None
+    msg = gen.message
+    assert isinstance(msg, AIMessageChunk)
+    assert msg.tool_call_chunks == []  # the phantom blank tool call is gone
+
+
 def test_is_empty_arguments() -> None:
     assert model_mod._is_empty_arguments("")  # empty string
     assert model_mod._is_empty_arguments("   ")  # whitespace only

From b54eab8f6ca07f5ebe32c577674d20dd4821898f Mon Sep 17 00:00:00 2001
From: Alex Kroman <alex@assemblyai.com>
Date: Thu, 18 Jun 2026 14:28:47 -0700
Subject: [PATCH 3/6] assembly code: make voice readback interruptible on the
 daemon thread
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The coding-agent TUI plays each spoken reply on a daemon thread. Its two cancel
channels both failed there: PcmPlayer chunks writes "so a Ctrl-C lands between
them", but that relies on KeyboardInterrupt reaching the *playing* thread — true
for the foreground `assembly speak` CLI, not the TUI, where Ctrl-C is handled by
Textual on the UI thread. The only cross-thread signal, the `_cancel` event, was
checked solely between synthesizer chunks (the feed wrapper), never during
sounddevice's blocking playback. So readback was effectively uninterruptible:
Ctrl-C did nothing, and the daemon thread stayed blocked in speak() instead of
advancing to listen for the next turn.

Poll the cancel flag inside PcmPlayer's piece-write loop (abort the device and
drop the rest of the chunk when set), and have voice.speak hand the player a live
poll of `_cancel`. Cancellation is now honored within ~one 4 KiB piece (~85 ms)
regardless of which thread the interrupt arrives on. The poll is optional, so
`assembly speak` (foreground, KeyboardInterrupt-driven) is unchanged.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 aai_cli/code_agent/voice.py | 12 +++++++++---
 aai_cli/tts/audio.py        | 22 ++++++++++++++++++----
 tests/test_code_voice.py    | 30 +++++++++++++++++++++++++++++-
 tests/test_tts_audio.py     | 23 +++++++++++++++++++++++
 4 files changed, 79 insertions(+), 8 deletions(-)

diff --git a/aai_cli/code_agent/voice.py b/aai_cli/code_agent/voice.py
index 4f913cd..ffed3ef 100644
--- a/aai_cli/code_agent/voice.py
+++ b/aai_cli/code_agent/voice.py
@@ -130,8 +130,10 @@ def __enter__(self) -> Player:
     def __exit__(self, exc_type: object, *exc: object) -> object:
         """Drain on a clean exit, abort otherwise; never suppress."""
 
-    def feed(self, pcm: bytes, sample_rate: int) -> None:
-        """Play one PCM chunk, opening the output device on the first call."""
+    def feed(
+        self, pcm: bytes, sample_rate: int, *, cancelled: Callable[[], bool] | None = None
+    ) -> None:
+        """Play one PCM chunk, polling ``cancelled`` between writes to stop mid-chunk."""
 
 
 def _stt_params(sample_rate: int) -> StreamingParameters:
@@ -219,7 +221,11 @@ def speak(self, text: str) -> None:
                 def feed(pcm: bytes, sample_rate: int) -> None:
                     if self._cancel.is_set():
                         _abort_readback()
-                    player.feed(pcm, sample_rate)
+                    # Poll cancel *during* playback too: a chunk can be seconds of audio, and
+                    # in the TUI the only cancel signal is this flag set from another thread.
+                    player.feed(pcm, sample_rate, cancelled=self._cancel.is_set)
+                    if self._cancel.is_set():
+                        _abort_readback()
 
                 self.synth_fn(self.api_key, config, on_audio=feed)
         except _ReadbackInterrupted:
diff --git a/aai_cli/tts/audio.py b/aai_cli/tts/audio.py
index e4aec43..77dedea 100644
--- a/aai_cli/tts/audio.py
+++ b/aai_cli/tts/audio.py
@@ -88,11 +88,20 @@ def __init__(self, *, stream_factory: Callable[[int], _OutputStream] | None = No
     def __enter__(self) -> PcmPlayer:
         return self
 
-    def feed(self, pcm: bytes, sample_rate: int) -> None:
-        """Play one PCM chunk, opening the device on the first chunk."""
+    def feed(
+        self, pcm: bytes, sample_rate: int, *, cancelled: Callable[[], bool] | None = None
+    ) -> None:
+        """Play one PCM chunk, opening the device on the first chunk.
+
+        ``cancelled`` is polled between the small piece-writes; when it returns True the
+        stream is aborted (buffered frames discarded) and the rest of the chunk is dropped.
+        This is the only cancellation path that works when playback runs off the main thread
+        — the coding-agent TUI plays on a daemon thread, so a Ctrl-C ``KeyboardInterrupt``
+        never reaches it; only a flag set from another thread can stop it.
+        """
         if self._stream is None:
             self._stream = self._open(sample_rate)
-        self._write(self._stream, pcm)
+        self._write(self._stream, pcm, cancelled)
 
     def _open(self, sample_rate: int) -> _OutputStream:
         try:
@@ -105,11 +114,16 @@ def _open(self, sample_rate: int) -> _OutputStream:
         return stream
 
     @staticmethod
-    def _write(stream: _OutputStream, pcm: bytes) -> None:
+    def _write(
+        stream: _OutputStream, pcm: bytes, cancelled: Callable[[], bool] | None = None
+    ) -> None:
         # KeyboardInterrupt (a BaseException) passes through this Exception handler
         # to __exit__, which aborts the device; only real device errors are wrapped.
         try:
             for offset in range(0, len(pcm), _PLAYBACK_CHUNK_BYTES):
+                if cancelled is not None and cancelled():
+                    stream.abort()  # discard buffered frames and stop now, mid-chunk
+                    return
                 stream.write(pcm[offset : offset + _PLAYBACK_CHUNK_BYTES])
         except Exception as exc:
             raise _playback_error(exc) from exc
diff --git a/tests/test_code_voice.py b/tests/test_code_voice.py
index 6ffa17d..6233036 100644
--- a/tests/test_code_voice.py
+++ b/tests/test_code_voice.py
@@ -119,7 +119,7 @@ def __exit__(self, exc_type, *exc):
         self.exit_exc_type = exc_type  # records the abort path (an exception on the way out)
         return False
 
-    def feed(self, pcm, sample_rate):
+    def feed(self, pcm, sample_rate, *, cancelled=None):
         self.fed.append((pcm, sample_rate))
 
 
@@ -163,6 +163,34 @@ def fake_synth(api_key, config, *, on_audio):
     assert player.exit_exc_type is not None  # player saw the exception -> aborted, not drained
 
 
+def test_speak_hands_player_a_live_cancel_poll_for_midchunk_stop():
+    # In the TUI the readback plays on a daemon thread, so the only way to stop a chunk
+    # mid-playback is a flag set from another thread. speak() must hand the player a live
+    # poll of that flag (not just check it between synth chunks).
+    seen = {}
+    holder = {}
+
+    class PollPlayer(FakePlayer):
+        def feed(self, pcm, sample_rate, *, cancelled=None):
+            seen["poll"] = cancelled
+            seen["before"] = cancelled() if cancelled else None
+            holder["session"].cancel()  # another thread interrupts mid-playback
+            seen["after"] = cancelled() if cancelled else None
+            super().feed(pcm, sample_rate)
+
+    def fake_synth(api_key, config, *, on_audio):
+        on_audio(b"chunk", 24000)
+
+    session = VoiceSession(
+        api_key="k", readback=True, synth_fn=fake_synth, player_factory=PollPlayer
+    )
+    holder["session"] = session
+    session.speak("hello there")  # returns cleanly — the post-chunk cancel is swallowed
+    assert callable(seen["poll"])
+    assert seen["before"] is False  # not cancelled when the chunk starts playing
+    assert seen["after"] is True  # the poll reflects a cancel raised mid-playback
+
+
 def test_speak_clears_a_stale_cancel_before_playing():
     # A cancel() left set from a prior interrupt must not abort the next readback before it
     # starts — speak() clears the flag on entry, so the chunk plays normally.
diff --git a/tests/test_tts_audio.py b/tests/test_tts_audio.py
index 93c6095..10045fc 100644
--- a/tests/test_tts_audio.py
+++ b/tests/test_tts_audio.py
@@ -77,6 +77,29 @@ def test_play_pcm_writes_audio_in_bounded_chunks():
     assert b"".join(stream.writes) == pcm
 
 
+def test_feed_aborts_midchunk_when_cancelled_flag_flips():
+    # The daemon-thread readback can't see a Ctrl-C KeyboardInterrupt, so a cancel flag set
+    # from another thread must stop playback between piece-writes: abort the device and drop
+    # the rest of the chunk rather than playing it out.
+    stream = FakeStream()
+    pcm = bytes(256) * 40  # > 2 pieces, so there are several poll points
+    with audio.PcmPlayer(stream_factory=lambda rate: stream) as player:
+        # Report "cancelled" once the first piece has been written.
+        player.feed(pcm, 24000, cancelled=lambda: bool(stream.writes))
+    assert len(stream.writes) == 1  # stopped after the first piece instead of draining all
+    assert "abort" in stream.events  # discarded the buffered frames immediately
+
+
+def test_feed_without_cancel_poll_plays_every_piece():
+    # The cancel poll is optional (assembly speak passes none): all pieces still play.
+    stream = FakeStream()
+    pcm = bytes(256) * 40
+    with audio.PcmPlayer(stream_factory=lambda rate: stream) as player:
+        player.feed(pcm, 24000)
+    assert b"".join(stream.writes) == pcm  # nothing dropped
+    assert "abort" not in stream.events
+
+
 def test_play_pcm_aborts_and_propagates_on_ctrl_c():
     # Ctrl-C mid-playback must stop the device immediately (abort, not just stop)
     # and re-raise so the cancel reaches the CLI; the stream is still closed.

From c5790d49bda82e8a3557f8e80a65a898758451e4 Mon Sep 17 00:00:00 2001
From: Alex Kroman <alex@assemblyai.com>
Date: Thu, 18 Jun 2026 14:34:29 -0700
Subject: [PATCH 4/6] Default assembly code and assembly live to gpt-5.1

Switch the default LLM Gateway model for `assembly code` (code_agent/prompt.py)
and `assembly live` (agent_cascade/config.py) to gpt-5.1. The live default is now
a literal rather than llm.DEFAULT_MODEL, so it's independent of the one-shot
`assembly llm` default (still claude-haiku). Both override with --model.

Verified gpt-5.1 is accepted by the gateway. Updated the cascade config test and
regenerated the `--help` snapshot (both commands show [default: gpt-5.1]).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 aai_cli/agent_cascade/config.py                  | 5 ++++-
 aai_cli/code_agent/prompt.py                     | 2 +-
 tests/__snapshots__/test_snapshots_help_run.ambr | 5 ++---
 tests/test_agent_cascade_config.py               | 9 +++++++--
 4 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/aai_cli/agent_cascade/config.py b/aai_cli/agent_cascade/config.py
index 7589b95..efa5b43 100644
--- a/aai_cli/agent_cascade/config.py
+++ b/aai_cli/agent_cascade/config.py
@@ -13,7 +13,10 @@
 from aai_cli.agent_cascade.voices import DEFAULT_VOICE
 from aai_cli.core import llm
 
-DEFAULT_MODEL = llm.DEFAULT_MODEL
+# `assembly live` defaults to a capable gateway model (override with --model); kept a
+# literal rather than llm.DEFAULT_MODEL so the live agent's default is independent of the
+# one-shot `assembly llm` default.
+DEFAULT_MODEL = "gpt-5.1"
 DEFAULT_MAX_TOKENS = llm.DEFAULT_MAX_TOKENS
 # The realtime model the cascade transcribes with (same as the agent-cascade template).
 DEFAULT_SPEECH_MODEL = "u3-rt-pro"
diff --git a/aai_cli/code_agent/prompt.py b/aai_cli/code_agent/prompt.py
index fb6c957..4e2a7ef 100644
--- a/aai_cli/code_agent/prompt.py
+++ b/aai_cli/code_agent/prompt.py
@@ -4,7 +4,7 @@
 
 # A capable gateway model by default; override with `--model`. The gateway is the
 # source of truth for what's accepted, so this is only a sensible default.
-DEFAULT_MODEL = "claude-sonnet-4-6"
+DEFAULT_MODEL = "gpt-5.1"
 # Generous ceiling so long edits/explanations aren't clipped; the gateway only bills
 # tokens actually generated, so a high cap costs nothing on short replies.
 DEFAULT_MAX_TOKENS = 8192
diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr
index 65db349..c2334d9 100644
--- a/tests/__snapshots__/test_snapshots_help_run.ambr
+++ b/tests/__snapshots__/test_snapshots_help_run.ambr
@@ -287,7 +287,7 @@
   ╰──────────────────────────────────────────────────────────────────────────────╯
   ╭─ Options ────────────────────────────────────────────────────────────────────╮
   │ --model                       TEXT       LLM Gateway model                   │
-  │                                          [default: claude-sonnet-4-6]        │
+  │                                          [default: gpt-5.1]                  │
   │ --dir      -C                 DIRECTORY  Working directory the agent's file  │
   │                                          and shell tools operate in          │
   │                                          [default: .]                        │
@@ -667,8 +667,7 @@
   ╭─ Language model ─────────────────────────────────────────────────────────────╮
   │ --model             TEXT                  LLM Gateway model that powers the  │
   │                                           agent's replies                    │
-  │                                           [default:                          │
-  │                                           claude-haiku-4-5-20251001]         │
+  │                                           [default: gpt-5.1]                 │
   │ --max-tokens        INTEGER RANGE [x>=1]  Max tokens per reply               │
   │                                           [default: 8192]                    │
   │ --llm-config        TEXT                  Set any LLM Gateway request field  │
diff --git a/tests/test_agent_cascade_config.py b/tests/test_agent_cascade_config.py
index e366187..7514ed5 100644
--- a/tests/test_agent_cascade_config.py
+++ b/tests/test_agent_cascade_config.py
@@ -6,7 +6,12 @@
 
 import pytest
 
-from aai_cli.agent_cascade.config import DEFAULT_GREETING, DEFAULT_MAX_HISTORY, CascadeConfig
+from aai_cli.agent_cascade.config import (
+    DEFAULT_GREETING,
+    DEFAULT_MAX_HISTORY,
+    DEFAULT_MODEL,
+    CascadeConfig,
+)
 from aai_cli.agent_cascade.voices import DEFAULT_VOICE
 from aai_cli.core import llm
 
@@ -14,7 +19,7 @@
 def test_default_config_values():
     config = CascadeConfig()
     assert config.voice == DEFAULT_VOICE
-    assert config.model == llm.DEFAULT_MODEL
+    assert config.model == DEFAULT_MODEL == "gpt-5.1"  # `assembly live` defaults to gpt-5.1
     assert config.greeting == DEFAULT_GREETING
     # The sliding-window default keeps the last 40 messages of context.
     assert config.max_history == 40

From 2b410e620c54295f127f86377791de364853b222 Mon Sep 17 00:00:00 2001
From: Alex Kroman <alex@assemblyai.com>
Date: Thu, 18 Jun 2026 14:53:42 -0700
Subject: [PATCH 5/6] Split model tests into test_code_model.py; narrow types
 for the gate

Make the branch gate-clean now that the firecrawl WIP no longer blocks it:
- Move the code_agent/model.py unit tests out of test_code_agent.py (which had
  grown past the 500-line file-length gate) into a new test_code_model.py, and
  add it to pyrightconfig.tests.json's ignore list alongside the other
  langchain-boundary test files.
- Narrow `delta` to dict in _hoist_in_choice via early returns so mypy accepts
  the in-place `delta["tool_calls"]` assignment, and isinstance-narrow the chat
  model in the payload/convert-chunk tests.

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 aai_cli/code_agent/model.py |   8 +-
 pyrightconfig.tests.json    |   1 +
 tests/test_code_agent.py    | 204 ----------------------------------
 tests/test_code_model.py    | 216 ++++++++++++++++++++++++++++++++++++
 4 files changed, 223 insertions(+), 206 deletions(-)
 create mode 100644 tests/test_code_model.py

diff --git a/aai_cli/code_agent/model.py b/aai_cli/code_agent/model.py
index 27f9980..4e1c556 100644
--- a/aai_cli/code_agent/model.py
+++ b/aai_cli/code_agent/model.py
@@ -74,8 +74,12 @@ def _hoist_tool_call_ids(chunk: object) -> None:
 
 def _hoist_in_choice(choice: object) -> None:
     """Drop blank tool-call deltas, then hoist ids, within one streamed choice's delta."""
-    delta = choice.get("delta") if isinstance(choice, dict) else None
-    tool_calls = delta.get("tool_calls") if isinstance(delta, dict) else None
+    if not isinstance(choice, dict):
+        return
+    delta = choice.get("delta")
+    if not isinstance(delta, dict):
+        return
+    tool_calls = delta.get("tool_calls")
     if isinstance(tool_calls, list):
         delta["tool_calls"] = [tc for tc in tool_calls if not _is_blank_tool_call(tc)]
         _hoist_call_list(delta["tool_calls"])
diff --git a/pyrightconfig.tests.json b/pyrightconfig.tests.json
index f9dbdf0..d49b21b 100644
--- a/pyrightconfig.tests.json
+++ b/pyrightconfig.tests.json
@@ -2,6 +2,7 @@
   "include": ["tests"],
   "ignore": [
     "tests/test_code_agent.py",
+    "tests/test_code_model.py",
     "tests/test_code_command.py",
     "tests/test_code_tui.py",
     "tests/test_agent_cascade_brain.py"
diff --git a/tests/test_code_agent.py b/tests/test_code_agent.py
index c40f3d9..a1452bf 100644
--- a/tests/test_code_agent.py
+++ b/tests/test_code_agent.py
@@ -25,12 +25,10 @@
     skills,
     store,
 )
-from aai_cli.code_agent import model as model_mod
 from aai_cli.code_agent.agent import MUTATING_TOOLS, build_agent
 from aai_cli.code_agent.events import AssistantText, ErrorText, ToolCall, ToolResult
 from aai_cli.code_agent.render import RichRenderer, make_approver
 from aai_cli.code_agent.session import QUIT_COMMANDS, CodeSession, run_repl
-from aai_cli.core import environments
 
 
 class FakeChatModel(BaseChatModel):
@@ -272,208 +270,6 @@ def test_rich_renderer_smoke(capsys: pytest.CaptureFixture[str]) -> None:
 # --- slice-unit edge cases (cover the lazy bodies + error/guard branches) -----
 
 
-def test_build_model_targets_the_gateway():  # untyped: probes ChatOpenAI subclass attrs
-    m = model_mod.build_model("sk-test", model="claude-sonnet-4-6")
-    assert m.model_name == "claude-sonnet-4-6"
-    assert m.openai_api_base == environments.active().llm_gateway_base
-    assert m.use_responses_api is False
-
-
-def test_build_model_flattens_list_content_for_gateway():  # untyped: probes the payload dict
-    from langchain_core.messages import HumanMessage, SystemMessage
-
-    m = model_mod.build_model("sk-test", model="claude-sonnet-4-6")
-    # deepagents hands the model multi-block content arrays; the gateway 500s on those,
-    # so the model must flatten each message's content to a plain string before sending.
-    payload = m._get_request_payload(
-        [
-            SystemMessage(content=[{"type": "text", "text": "a"}, {"type": "text", "text": "b"}]),
-            HumanMessage(content="hi"),
-        ]
-    )
-    assert [msg["content"] for msg in payload["messages"]] == ["ab", "hi"]
-
-
-def test_flatten_content_guards() -> None:
-    model_mod._flatten_content(None)  # not a list -> early return, no error
-    items = ["raw", 123]  # non-dict members are skipped, list left untouched
-    model_mod._flatten_content(items)
-    assert items == ["raw", 123]
-
-
-def test_hoist_tool_call_ids_moves_id_out_of_function_only_when_missing() -> None:
-    # One chunk exercising every branch: each malformed variant is skipped, and only a
-    # tool call carrying a function-nested id gets hoisted. Hold references to the inner
-    # dicts so the in-place mutation is asserted with a clean type.
-    noid_fn: dict[str, object] = {"name": "b"}
-    hoist_fn: dict[str, object] = {"id": "HOIST", "name": "c", "arguments": ""}
-    noid_call: dict[str, object] = {"index": 1, "function": noid_fn}
-    hoist_call: dict[str, object] = {"index": 2, "function": hoist_fn}
-    tool_calls: list[object] = [
-        None,  # tool_call not a dict -> skipped
-        {"index": 0, "function": 7},  # function not a dict -> skipped
-        noid_call,  # function has no id -> nothing to hoist
-        hoist_call,  # the real gateway shape -> id hoisted out of function
-    ]
-    chunk: dict[str, object] = {
-        "choices": [
-            None,  # choice not a dict -> skipped
-            {"delta": None},  # delta not a dict -> skipped
-            {"delta": {"content": "hi"}},  # no tool_calls -> skipped
-            {"delta": {"tool_calls": 99}},  # tool_calls not a list -> skipped
-            {"delta": {"tool_calls": tool_calls}},
-        ]
-    }
-    model_mod._hoist_tool_call_ids(chunk)
-    assert "id" not in noid_call  # no id invented for a call that never had one
-    assert noid_fn == {"name": "b"}  # left untouched
-    assert hoist_call["id"] == "HOIST"  # hoisted to the top level where langchain reads it
-    assert "id" not in hoist_fn  # and removed from function so it isn't duplicated
-
-
-def test_hoist_tool_call_ids_guards() -> None:
-    model_mod._hoist_tool_call_ids(None)  # not a dict -> early return, no error
-    model_mod._hoist_tool_call_ids({"choices": 99})  # choices not a list -> early return
-
-
-def test_is_blank_tool_call() -> None:
-    assert model_mod._is_blank_tool_call({"function": {"id": "", "name": "", "arguments": ""}})
-    assert model_mod._is_blank_tool_call({"function": {}})  # all fields absent
-    assert not model_mod._is_blank_tool_call({"function": {"name": "x"}})  # has a name
-    assert not model_mod._is_blank_tool_call({"function": {"id": "i"}})  # has an id
-    assert not model_mod._is_blank_tool_call({"function": {"arguments": "a"}})  # has arguments
-    assert not model_mod._is_blank_tool_call({"function": 7})  # function not a dict
-    assert not model_mod._is_blank_tool_call(None)  # not a dict
-
-
-def test_hoist_tool_call_ids_drops_spurious_blank_delta() -> None:
-    # The gateway prefixes every streamed turn with an empty tool-call delta; it must be
-    # dropped (else a pure-text turn dispatches a tool call with name="").
-    real_fn: dict[str, object] = {"id": "toolu_X", "name": "get_weather", "arguments": ""}
-    real_call: dict[str, object] = {"index": 0, "function": real_fn}
-    delta: dict[str, object] = {
-        "tool_calls": [
-            {"index": 0, "function": {"id": "", "name": "", "arguments": ""}},  # spurious blank
-            real_call,
-        ]
-    }
-    chunk: dict[str, object] = {"choices": [{"delta": delta}]}
-    model_mod._hoist_tool_call_ids(chunk)
-    assert delta["tool_calls"] == [real_call]  # blank dropped, real call kept
-    assert real_call["id"] == "toolu_X"  # and its id hoisted out of function
-    assert "id" not in real_fn
-
-
-def test_convert_chunk_drops_spurious_blank_tool_call() -> None:
-    from langchain_core.messages import AIMessageChunk
-    from langchain_openai import ChatOpenAI
-
-    m = model_mod.build_model("sk-test", model="claude-sonnet-4-6")
-    assert isinstance(m, ChatOpenAI)  # narrow to the subclass that overrides the converter
-    # A pure-text turn's leading delta carries only the gateway's blank tool call — the
-    # converted chunk must surface no tool call (else deepagents dispatches a nameless tool).
-    chunk = {
-        "choices": [
-            {
-                "index": 0,
-                "delta": {
-                    "role": "assistant",
-                    "tool_calls": [
-                        {"index": 0, "function": {"id": "", "name": "", "arguments": ""}}
-                    ],
-                },
-                "finish_reason": None,
-            }
-        ]
-    }
-    gen = m._convert_chunk_to_generation_chunk(chunk, AIMessageChunk, None)
-    assert gen is not None
-    msg = gen.message
-    assert isinstance(msg, AIMessageChunk)
-    assert msg.tool_call_chunks == []  # the phantom blank tool call is gone
-
-
-def test_is_empty_arguments() -> None:
-    assert model_mod._is_empty_arguments("")  # empty string
-    assert model_mod._is_empty_arguments("   ")  # whitespace only
-    assert model_mod._is_empty_arguments("{}")  # empty object
-    assert model_mod._is_empty_arguments("{ }")  # empty object with whitespace
-    assert not model_mod._is_empty_arguments('{"path": "/"}')  # real arguments
-    assert not model_mod._is_empty_arguments("[]")  # non-dict JSON is not "empty args"
-    assert not model_mod._is_empty_arguments("{bad json")  # unparseable -> leave alone
-    assert not model_mod._is_empty_arguments(None)  # non-string -> leave alone
-    assert not model_mod._is_empty_arguments({"already": "parsed"})  # non-string -> leave alone
-
-
-def test_ensure_tool_call_arguments_fills_only_empty_calls() -> None:
-    empty_fn: dict[str, object] = {"name": "ls", "arguments": "{}"}
-    blank_fn: dict[str, object] = {"name": "ls", "arguments": "  "}
-    full_fn: dict[str, object] = {"name": "ls", "arguments": '{"path": "/"}'}
-    tool_calls: list[object] = [
-        None,  # tool_call not a dict -> skipped
-        {"id": "0", "function": 7},  # function not a dict -> skipped
-        {"id": "1", "function": empty_fn},  # empty object -> filled
-        {"id": "2", "function": blank_fn},  # whitespace -> filled
-        {"id": "3", "function": full_fn},  # real args -> untouched
-    ]
-    messages: list[object] = [
-        None,  # message not a dict -> skipped
-        {"role": "user", "content": "hi"},  # no tool_calls -> skipped
-        {"tool_calls": 99},  # tool_calls not a list -> skipped
-        {"tool_calls": tool_calls},
-    ]
-    model_mod._ensure_tool_call_arguments(messages)
-    assert empty_fn["arguments"] == model_mod._PLACEHOLDER_ARGUMENTS
-    assert blank_fn["arguments"] == model_mod._PLACEHOLDER_ARGUMENTS
-    assert full_fn["arguments"] == '{"path": "/"}'  # left untouched
-
-
-def test_ensure_tool_call_arguments_guards() -> None:
-    model_mod._ensure_tool_call_arguments(None)  # not a list -> early return, no error
-    model_mod._ensure_tool_call_arguments([{"tool_calls": 99}])  # tool_calls not a list
-
-
-def test_get_request_payload_fills_empty_tool_call_arguments() -> None:
-    from langchain_core.messages import AIMessage, HumanMessage
-
-    m = model_mod.build_model("sk-test", model="claude-sonnet-4-6")
-    # An assistant tool call with no arguments serializes to arguments="{}", which the gateway
-    # rejects (missing tool_use.input); the payload must carry the placeholder instead.
-    ai = AIMessage(content="", tool_calls=[{"name": "ls", "args": {}, "id": "t1"}])
-    payload = m._get_request_payload([HumanMessage(content="hi"), ai])
-    calls = payload["messages"][1]["tool_calls"]
-    assert calls[0]["function"]["arguments"] == model_mod._PLACEHOLDER_ARGUMENTS
-
-
-def test_convert_chunk_hoists_streamed_tool_call_id() -> None:
-    from langchain_core.messages import AIMessageChunk
-    from langchain_openai import ChatOpenAI
-
-    m = model_mod.build_model("sk-test", model="claude-sonnet-4-6")
-    assert isinstance(m, ChatOpenAI)  # narrow to the subclass that overrides the converter
-    # The gateway streams the tool-call id nested inside `function`; the override must hoist
-    # it so langchain's converted chunk carries the id (else the reply ToolMessage gets None).
-    chunk = {
-        "choices": [
-            {
-                "index": 0,
-                "delta": {
-                    "role": "assistant",
-                    "tool_calls": [
-                        {"index": 0, "function": {"id": "toolu_X", "name": "get_weather"}}
-                    ],
-                },
-                "finish_reason": None,
-            }
-        ]
-    }
-    gen = m._convert_chunk_to_generation_chunk(chunk, AIMessageChunk, None)
-    assert gen is not None
-    msg = gen.message
-    assert isinstance(msg, AIMessageChunk)
-    assert msg.tool_call_chunks[0]["id"] == "toolu_X"
-
-
 def test_fetch_url_fetches_and_truncates(monkeypatch: pytest.MonkeyPatch) -> None:
     import httpx
 
diff --git a/tests/test_code_model.py b/tests/test_code_model.py
new file mode 100644
index 0000000..047f9bf
--- /dev/null
+++ b/tests/test_code_model.py
@@ -0,0 +1,216 @@
+"""Unit tests for the `assembly code` gateway model wiring (code_agent/model.py).
+
+Split out of test_code_agent.py to stay under the 500-line file gate. These cover the
+``_GatewayChatOpenAI`` subclass and its helpers that paper over the LLM Gateway's
+OpenAI-incompatible quirks: content flattening, streamed tool-call id hoisting, dropping
+the gateway's spurious blank tool-call deltas, and filling empty tool-call arguments.
+"""
+
+from __future__ import annotations
+
+from aai_cli.code_agent import model as model_mod
+from aai_cli.core import environments
+
+
+def test_build_model_targets_the_gateway():  # untyped: probes ChatOpenAI subclass attrs
+    m = model_mod.build_model("sk-test", model="claude-sonnet-4-6")
+    assert m.model_name == "claude-sonnet-4-6"
+    assert m.openai_api_base == environments.active().llm_gateway_base
+    assert m.use_responses_api is False
+
+
+def test_build_model_flattens_list_content_for_gateway():  # untyped: probes the payload dict
+    from langchain_core.messages import HumanMessage, SystemMessage
+
+    m = model_mod.build_model("sk-test", model="claude-sonnet-4-6")
+    # deepagents hands the model multi-block content arrays; the gateway 500s on those,
+    # so the model must flatten each message's content to a plain string before sending.
+    payload = m._get_request_payload(
+        [
+            SystemMessage(content=[{"type": "text", "text": "a"}, {"type": "text", "text": "b"}]),
+            HumanMessage(content="hi"),
+        ]
+    )
+    assert [msg["content"] for msg in payload["messages"]] == ["ab", "hi"]
+
+
+def test_flatten_content_guards() -> None:
+    model_mod._flatten_content(None)  # not a list -> early return, no error
+    items = ["raw", 123]  # non-dict members are skipped, list left untouched
+    model_mod._flatten_content(items)
+    assert items == ["raw", 123]
+
+
+def test_hoist_tool_call_ids_moves_id_out_of_function_only_when_missing() -> None:
+    # One chunk exercising every branch: each malformed variant is skipped, and only a
+    # tool call carrying a function-nested id gets hoisted. Hold references to the inner
+    # dicts so the in-place mutation is asserted with a clean type.
+    noid_fn: dict[str, object] = {"name": "b"}
+    hoist_fn: dict[str, object] = {"id": "HOIST", "name": "c", "arguments": ""}
+    noid_call: dict[str, object] = {"index": 1, "function": noid_fn}
+    hoist_call: dict[str, object] = {"index": 2, "function": hoist_fn}
+    tool_calls: list[object] = [
+        None,  # tool_call not a dict -> skipped
+        {"index": 0, "function": 7},  # function not a dict -> skipped
+        noid_call,  # function has no id -> nothing to hoist
+        hoist_call,  # the real gateway shape -> id hoisted out of function
+    ]
+    chunk: dict[str, object] = {
+        "choices": [
+            None,  # choice not a dict -> skipped
+            {"delta": None},  # delta not a dict -> skipped
+            {"delta": {"content": "hi"}},  # no tool_calls -> skipped
+            {"delta": {"tool_calls": 99}},  # tool_calls not a list -> skipped
+            {"delta": {"tool_calls": tool_calls}},
+        ]
+    }
+    model_mod._hoist_tool_call_ids(chunk)
+    assert "id" not in noid_call  # no id invented for a call that never had one
+    assert noid_fn == {"name": "b"}  # left untouched
+    assert hoist_call["id"] == "HOIST"  # hoisted to the top level where langchain reads it
+    assert "id" not in hoist_fn  # and removed from function so it isn't duplicated
+
+
+def test_hoist_tool_call_ids_guards() -> None:
+    model_mod._hoist_tool_call_ids(None)  # not a dict -> early return, no error
+    model_mod._hoist_tool_call_ids({"choices": 99})  # choices not a list -> early return
+
+
+def test_is_blank_tool_call() -> None:
+    assert model_mod._is_blank_tool_call({"function": {"id": "", "name": "", "arguments": ""}})
+    assert model_mod._is_blank_tool_call({"function": {}})  # all fields absent
+    assert not model_mod._is_blank_tool_call({"function": {"name": "x"}})  # has a name
+    assert not model_mod._is_blank_tool_call({"function": {"id": "i"}})  # has an id
+    assert not model_mod._is_blank_tool_call({"function": {"arguments": "a"}})  # has arguments
+    assert not model_mod._is_blank_tool_call({"function": 7})  # function not a dict
+    assert not model_mod._is_blank_tool_call(None)  # not a dict
+
+
+def test_hoist_tool_call_ids_drops_spurious_blank_delta() -> None:
+    # The gateway prefixes every streamed turn with an empty tool-call delta; it must be
+    # dropped (else a pure-text turn dispatches a tool call with name="").
+    real_fn: dict[str, object] = {"id": "toolu_X", "name": "get_weather", "arguments": ""}
+    real_call: dict[str, object] = {"index": 0, "function": real_fn}
+    delta: dict[str, object] = {
+        "tool_calls": [
+            {"index": 0, "function": {"id": "", "name": "", "arguments": ""}},  # spurious blank
+            real_call,
+        ]
+    }
+    chunk: dict[str, object] = {"choices": [{"delta": delta}]}
+    model_mod._hoist_tool_call_ids(chunk)
+    assert delta["tool_calls"] == [real_call]  # blank dropped, real call kept
+    assert real_call["id"] == "toolu_X"  # and its id hoisted out of function
+    assert "id" not in real_fn
+
+
+def test_convert_chunk_drops_spurious_blank_tool_call() -> None:
+    from langchain_core.messages import AIMessageChunk
+    from langchain_openai import ChatOpenAI
+
+    m = model_mod.build_model("sk-test", model="claude-sonnet-4-6")
+    assert isinstance(m, ChatOpenAI)  # narrow to the subclass that overrides the converter
+    # A pure-text turn's leading delta carries only the gateway's blank tool call — the
+    # converted chunk must surface no tool call (else deepagents dispatches a nameless tool).
+    chunk = {
+        "choices": [
+            {
+                "index": 0,
+                "delta": {
+                    "role": "assistant",
+                    "tool_calls": [
+                        {"index": 0, "function": {"id": "", "name": "", "arguments": ""}}
+                    ],
+                },
+                "finish_reason": None,
+            }
+        ]
+    }
+    gen = m._convert_chunk_to_generation_chunk(chunk, AIMessageChunk, None)
+    assert gen is not None
+    msg = gen.message
+    assert isinstance(msg, AIMessageChunk)
+    assert msg.tool_call_chunks == []  # the phantom blank tool call is gone
+
+
+def test_is_empty_arguments() -> None:
+    assert model_mod._is_empty_arguments("")  # empty string
+    assert model_mod._is_empty_arguments("   ")  # whitespace only
+    assert model_mod._is_empty_arguments("{}")  # empty object
+    assert model_mod._is_empty_arguments("{ }")  # empty object with whitespace
+    assert not model_mod._is_empty_arguments('{"path": "/"}')  # real arguments
+    assert not model_mod._is_empty_arguments("[]")  # non-dict JSON is not "empty args"
+    assert not model_mod._is_empty_arguments("{bad json")  # unparseable -> leave alone
+    assert not model_mod._is_empty_arguments(None)  # non-string -> leave alone
+    assert not model_mod._is_empty_arguments({"already": "parsed"})  # non-string -> leave alone
+
+
+def test_ensure_tool_call_arguments_fills_only_empty_calls() -> None:
+    empty_fn: dict[str, object] = {"name": "ls", "arguments": "{}"}
+    blank_fn: dict[str, object] = {"name": "ls", "arguments": "  "}
+    full_fn: dict[str, object] = {"name": "ls", "arguments": '{"path": "/"}'}
+    tool_calls: list[object] = [
+        None,  # tool_call not a dict -> skipped
+        {"id": "0", "function": 7},  # function not a dict -> skipped
+        {"id": "1", "function": empty_fn},  # empty object -> filled
+        {"id": "2", "function": blank_fn},  # whitespace -> filled
+        {"id": "3", "function": full_fn},  # real args -> untouched
+    ]
+    messages: list[object] = [
+        None,  # message not a dict -> skipped
+        {"role": "user", "content": "hi"},  # no tool_calls -> skipped
+        {"tool_calls": 99},  # tool_calls not a list -> skipped
+        {"tool_calls": tool_calls},
+    ]
+    model_mod._ensure_tool_call_arguments(messages)
+    assert empty_fn["arguments"] == model_mod._PLACEHOLDER_ARGUMENTS
+    assert blank_fn["arguments"] == model_mod._PLACEHOLDER_ARGUMENTS
+    assert full_fn["arguments"] == '{"path": "/"}'  # left untouched
+
+
+def test_ensure_tool_call_arguments_guards() -> None:
+    model_mod._ensure_tool_call_arguments(None)  # not a list -> early return, no error
+    model_mod._ensure_tool_call_arguments([{"tool_calls": 99}])  # tool_calls not a list
+
+
+def test_get_request_payload_fills_empty_tool_call_arguments() -> None:
+    from langchain_core.messages import AIMessage, HumanMessage
+    from langchain_openai import ChatOpenAI
+
+    m = model_mod.build_model("sk-test", model="claude-sonnet-4-6")
+    assert isinstance(m, ChatOpenAI)  # narrow to the subclass that overrides the payload hook
+    # An assistant tool call with no arguments serializes to arguments="{}", which the gateway
+    # rejects (missing tool_use.input); the payload must carry the placeholder instead.
+    ai = AIMessage(content="", tool_calls=[{"name": "ls", "args": {}, "id": "t1"}])
+    payload = m._get_request_payload([HumanMessage(content="hi"), ai])
+    calls = payload["messages"][1]["tool_calls"]
+    assert calls[0]["function"]["arguments"] == model_mod._PLACEHOLDER_ARGUMENTS
+
+
+def test_convert_chunk_hoists_streamed_tool_call_id() -> None:
+    from langchain_core.messages import AIMessageChunk
+    from langchain_openai import ChatOpenAI
+
+    m = model_mod.build_model("sk-test", model="claude-sonnet-4-6")
+    assert isinstance(m, ChatOpenAI)  # narrow to the subclass that overrides the converter
+    # The gateway streams the tool-call id nested inside `function`; the override must hoist
+    # it so langchain's converted chunk carries the id (else the reply ToolMessage gets None).
+    chunk = {
+        "choices": [
+            {
+                "index": 0,
+                "delta": {
+                    "role": "assistant",
+                    "tool_calls": [
+                        {"index": 0, "function": {"id": "toolu_X", "name": "get_weather"}}
+                    ],
+                },
+                "finish_reason": None,
+            }
+        ]
+    }
+    gen = m._convert_chunk_to_generation_chunk(chunk, AIMessageChunk, None)
+    assert gen is not None
+    msg = gen.message
+    assert isinstance(msg, AIMessageChunk)
+    assert msg.tool_call_chunks[0]["id"] == "toolu_X"

From 8a84c96b310e68e15c04571d643e0ff7e8dabb78 Mon Sep 17 00:00:00 2001
From: Alex Kroman <alex@assemblyai.com>
Date: Thu, 18 Jun 2026 15:36:08 -0700
Subject: [PATCH 6/6] Fix two flaky Windows TUI tests (teardown races)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- code_agent/tui.py: guard on_worker_state_changed on is_running — a turn worker
  can finish after the app tears down (quit / test run_test exit), and driving
  _finish_turn then queries an unmounted DOM (NoMatches on "#spinner"). Skip it
  when the app isn't running. Fixes test_code_tui_voice flakiness on Windows.
- test_live_tui.py: the reply-text assertion waited only for the AssistantMessage
  widget to mount, but agent_transcript sets the text via a separate
  call_from_thread hop — so the wait raced the text on a slow runner. Wait for the
  text itself.
- pyrightconfig.tests.json: ignore test_code_tui_voice.py alongside the other
  Textual-boundary test files (it now duck-types a Worker.StateChanged event).

Co-Authored-By: Claude Opus 4.8 (1M context) <noreply@anthropic.com>
---
 aai_cli/code_agent/tui.py    |  4 +++-
 pyrightconfig.tests.json     |  1 +
 tests/test_code_tui_voice.py | 22 ++++++++++++++++++++++
 tests/test_live_tui.py       | 11 ++++++++++-
 4 files changed, 36 insertions(+), 2 deletions(-)

diff --git a/aai_cli/code_agent/tui.py b/aai_cli/code_agent/tui.py
index f5bb44a..5d5119d 100644
--- a/aai_cli/code_agent/tui.py
+++ b/aai_cli/code_agent/tui.py
@@ -481,7 +481,9 @@ def _stop_spinner(self) -> None:
         self.query_one("#spinner", Static).display = False
 
     def on_worker_state_changed(self, event: Worker.StateChanged) -> None:
-        if event.worker.is_finished:
+        # Guard on is_running: a worker finishing *after* the app tears down (quit / test exit)
+        # would drive _finish_turn against an unmounted DOM — NoMatches on "#spinner", a flake.
+        if event.worker.is_finished and self.is_running:
             self._finish_turn()
 
     def _finish_turn(self) -> None:
diff --git a/pyrightconfig.tests.json b/pyrightconfig.tests.json
index d49b21b..93980ab 100644
--- a/pyrightconfig.tests.json
+++ b/pyrightconfig.tests.json
@@ -5,6 +5,7 @@
     "tests/test_code_model.py",
     "tests/test_code_command.py",
     "tests/test_code_tui.py",
+    "tests/test_code_tui_voice.py",
     "tests/test_agent_cascade_brain.py"
   ],
   "pythonVersion": "3.12",
diff --git a/tests/test_code_tui_voice.py b/tests/test_code_tui_voice.py
index 615fddc..2ec6448 100644
--- a/tests/test_code_tui_voice.py
+++ b/tests/test_code_tui_voice.py
@@ -9,6 +9,7 @@
 from __future__ import annotations
 
 import asyncio
+from types import SimpleNamespace
 
 import pytest
 from langchain_core.messages import AIMessage, HumanMessage
@@ -118,6 +119,27 @@ async def go() -> None:
     _run(go())
 
 
+def test_finished_worker_is_ignored_once_the_app_stops_running():  # untyped: duck-typed event
+    # A turn worker can finish *after* the app starts tearing down; driving _finish_turn then
+    # queries an unmounted DOM (NoMatches on #spinner — a Windows CI flake). on_worker_state_changed
+    # must skip it when the app isn't running, and handle it when it is.
+    app = CodeAgentApp(agent=FakeAgent([]))
+    calls: list[bool] = []
+    app._finish_turn = lambda: calls.append(True)  # spy
+    finished = SimpleNamespace(worker=SimpleNamespace(is_finished=True))
+
+    assert app.is_running is False  # never mounted -> torn-down-equivalent
+    app.on_worker_state_changed(finished)  # duck-typed event stands in for Worker.StateChanged
+    assert calls == []  # guarded out: no _finish_turn against a dead DOM
+
+    async def go() -> None:
+        async with app.run_test(size=(100, 30)):
+            app.on_worker_state_changed(finished)
+
+    _run(go())
+    assert calls == [True]  # running -> the finished turn is handled
+
+
 def test_capture_voice_turn_is_a_noop_once_typed() -> None:
     async def go() -> None:
         voice = FakeVoice(transcripts=["ignored"])
diff --git a/tests/test_live_tui.py b/tests/test_live_tui.py
index ba92a06..732f9e4 100644
--- a/tests/test_live_tui.py
+++ b/tests/test_live_tui.py
@@ -232,7 +232,16 @@ def run_conversation(renderer) -> None:
 
         app = _app(run_conversation=run_conversation, on_stop=done.set)
         async with app.run_test(size=(100, 30)) as pilot:
-            assert await _wait_until(pilot, lambda: bool(app.query(AssistantMessage)))
+            # Wait for the reply *text* to land, not just the widget to mount: agent_transcript
+            # sets the text via a separate call_from_thread hop, so a widget-only wait races it
+            # (empty text on a slow runner — a Windows CI flake).
+            assert await _wait_until(
+                pilot,
+                lambda: (
+                    bool(app.query(AssistantMessage))
+                    and app.query_one(AssistantMessage).text == "Done. "
+                ),
+            )
             assert "» turn it up" in str(app.query_one(UserMessage).render())
             assert app.query_one(AssistantMessage).text == "Done. "
         assert done.is_set()  # leaving the run_test context unmounted -> on_stop released it