AssemblyAI · alexkroman · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026 · Jun 18, 2026
diff --git a/aai_cli/agent_cascade/config.py b/aai_cli/agent_cascade/config.py
@@ -13,7 +13,10 @@
 from aai_cli.agent_cascade.voices import DEFAULT_VOICE
 from aai_cli.core import llm
 
-DEFAULT_MODEL = llm.DEFAULT_MODEL
+# `assembly live` defaults to a capable gateway model (override with --model). It is kept
+# as a literal rather than llm.DEFAULT_MODEL so the live agent's default is independent of
+# the one-shot `assembly llm` default.
+DEFAULT_MODEL = "gpt-5.1"
 DEFAULT_MAX_TOKENS = llm.DEFAULT_MAX_TOKENS
 # The realtime model the cascade transcribes with (same as the agent-cascade template).
 DEFAULT_SPEECH_MODEL = "u3-rt-pro"

diff --git a/aai_cli/code_agent/model.py b/aai_cli/code_agent/model.py
@@ -8,11 +8,17 @@
 
 from __future__ import annotations
 
+import json
 from collections.abc import Mapping
 from typing import TYPE_CHECKING
 
 from aai_cli.core import environments
 
+# The gateway omits Anthropic's required ``tool_use.input`` when an OpenAI tool call's
+# ``arguments`` is empty (``""`` / ``"{}"``); substitute a minimal non-empty object so the
+# field is emitted. See :func:`_ensure_tool_call_arguments`.
+_PLACEHOLDER_ARGUMENTS = '{"_": ""}'
+
 if TYPE_CHECKING:
     from langchain_core.language_models.chat_models import BaseChatModel
     from langchain_core.outputs import ChatGenerationChunk
@@ -40,18 +46,23 @@ def _flatten_content(messages: object) -> None:
 
 
 def _hoist_tool_call_ids(chunk: object) -> None:
-    """Move each streamed tool-call ``id`` from inside ``function`` up to the tool-call top level.
-
-    The AssemblyAI LLM Gateway's *streaming* ``/v1/chat/completions`` nests the tool-call
-    ``id`` under ``function`` — ``{"function": {"id": …, "name": …}}`` — instead of at the
-    tool-call's top level, which is where the OpenAI streaming spec (and
-    ``langchain_openai``, via ``id=rtc.get("id")``) reads it. Left alone, every streamed
-    tool call parses with a name and arguments but ``id=None``, so the reply ``ToolMessage``
-    fails Pydantic validation (``tool_call_id`` must be a string) and the whole turn errors
-    out. We move the id back up before langchain converts the chunk; the id rides only the
-    first delta of a call, so later argument-only deltas (no ``function.id``) are left
-    untouched. (The non-streaming endpoint already places the id correctly, so only the
-    streaming path needs this.)
+    """Normalize a streamed chunk's tool-call deltas: drop blank ones, hoist nested ids.
+
+    Two AssemblyAI LLM Gateway streaming quirks, both fixed in place before langchain
+    converts the chunk:
+
+    1. **Spurious blank deltas.** Every streamed turn (when tools are available) starts with
+       an empty tool-call delta — ``{"function": {"id": "", "name": "", "arguments": ""}}``.
+       On a pure-text turn no real call follows, so langchain is left with a tool call whose
+       ``name`` is ``""``; deepagents then dispatches it and the turn dies with
+       ``Error:  is not a valid tool``. We drop any delta with no name, id, or arguments
+       (which also harmlessly drops the gateway's empty argument-continuation deltas).
+    2. **Misplaced id.** The id is nested under ``function`` instead of at the tool-call top
+       level where the OpenAI spec and ``langchain_openai`` (``id=rtc.get("id")``) read it,
+       so without help every call parses with ``id=None`` and its reply ``ToolMessage`` fails
+       validation. We move it back up; the id rides only a call's first delta.
+
+    (The non-streaming endpoint has neither quirk, so only the streaming path needs this.)
     """
     if not isinstance(chunk, dict):
         return
@@ -62,11 +73,26 @@ def _hoist_tool_call_ids(chunk: object) -> None:
 
 
 def _hoist_in_choice(choice: object) -> None:
-    """Hoist tool-call ids within one streamed choice's delta (helper for ``_hoist_tool_call_ids``)."""
-    delta = choice.get("delta") if isinstance(choice, dict) else None
-    tool_calls = delta.get("tool_calls") if isinstance(delta, dict) else None
+    """Drop blank tool-call deltas, then hoist ids, within one streamed choice's delta."""
+    if not isinstance(choice, dict):
+        return
+    delta = choice.get("delta")
+    if not isinstance(delta, dict):
+        return
+    tool_calls = delta.get("tool_calls")
     if isinstance(tool_calls, list):
-        _hoist_call_list(tool_calls)
+        delta["tool_calls"] = [tc for tc in tool_calls if not _is_blank_tool_call(tc)]
+        _hoist_call_list(delta["tool_calls"])
+
+
+def _is_blank_tool_call(tool_call: object) -> bool:
+    """True for the gateway's spurious empty tool-call delta (no name, id, or arguments)."""
+    if not isinstance(tool_call, dict):
+        return False
+    function = tool_call.get("function")
+    if not isinstance(function, dict):
+        return False
+    return not function.get("name") and not function.get("id") and not function.get("arguments")
 
 
 def _hoist_call_list(tool_calls: list[object]) -> None:
@@ -86,6 +112,52 @@ def _hoist_call_list(tool_calls: list[object]) -> None:
             tool_call["id"] = function.pop("id")
 
 
+def _ensure_tool_call_arguments(messages: object) -> None:
+    """Give every empty tool-call ``arguments`` a non-empty placeholder object, in place.
+
+    The AssemblyAI LLM Gateway maps each OpenAI tool call's ``arguments`` (a JSON string)
+    onto Anthropic's ``tool_use.input`` object, but drops ``input`` entirely when the
+    arguments are empty (``""`` or ``"{}"``). Anthropic *requires* ``input`` to be present,
+    so replaying any argument-less tool call is rejected (400, surfaced as a 500 while
+    streaming) — and because the failing call sits in the conversation history, every later
+    turn fails too, wedging the session. We swap in a minimal non-empty object so the gateway
+    emits a valid ``input``. This only rewrites the request we send: the tool already ran
+    locally with its real (empty) arguments, and the gateway accepts the placeholder even for
+    tools that declare ``additionalProperties: false``. (Drop this once the gateway maps empty
+    arguments to ``input: {}`` itself.)
+    """
+    if not isinstance(messages, list):
+        return
+    for message in messages:
+        tool_calls = message.get("tool_calls") if isinstance(message, dict) else None
+        if isinstance(tool_calls, list):
+            _fill_empty_arguments(tool_calls)
+
+
+def _fill_empty_arguments(tool_calls: list[object]) -> None:
+    """Replace each empty ``function.arguments`` with the placeholder (helper for the above)."""
+    for tool_call in tool_calls:
+        if not isinstance(tool_call, dict):
+            continue
+        function = tool_call.get("function")
+        if isinstance(function, dict) and _is_empty_arguments(function.get("arguments")):
+            function["arguments"] = _PLACEHOLDER_ARGUMENTS
+
+
+def _is_empty_arguments(arguments: object) -> bool:
+    """True when ``arguments`` is an OpenAI args string carrying no fields (``""``/``"{}"``)."""
+    if not isinstance(arguments, str):
+        return False
+    stripped = arguments.strip()
+    if not stripped:
+        return True
+    try:
+        parsed = json.loads(stripped)
+    except ValueError:
+        return False
+    return isinstance(parsed, dict) and not parsed
+
+
 def build_model(
     api_key: str,
     *,
@@ -114,18 +186,21 @@ def build_model(
     class _GatewayChatOpenAI(ChatOpenAI):
         """ChatOpenAI that adapts the gateway's OpenAI-incompatible quirks for langchain.
 
-        Two fix-ups, each working around a gateway response/request bug the upstream client
-        doesn't expect: flatten list-content messages the gateway 500s on (request side, see
-        :func:`_flatten_content`), and hoist each streamed tool-call ``id`` back to the
-        tool-call top level where langchain reads it (response side, see
-        :func:`_hoist_tool_call_ids`).
+        Fix-ups for gateway request/response bugs the upstream client doesn't expect. Request
+        side: flatten list-content messages the gateway 500s on and give empty tool-call
+        arguments a placeholder the gateway can map to ``tool_use.input`` (see
+        :func:`_flatten_content` / :func:`_ensure_tool_call_arguments`). Response side: drop
+        blank streamed tool-call deltas and hoist each ``id`` back to the tool-call top level
+        where langchain reads it (see :func:`_hoist_tool_call_ids`).
         """
 
         def _get_request_payload(
             self, input_: object, *, stop: list[str] | None = None, **kwargs: object
         ) -> dict:
             payload = super()._get_request_payload(input_, stop=stop, **kwargs)
-            _flatten_content(payload.get("messages"))
+            messages = payload.get("messages")
+            _flatten_content(messages)
+            _ensure_tool_call_arguments(messages)
             return payload
 
         def _convert_chunk_to_generation_chunk(

diff --git a/aai_cli/code_agent/prompt.py b/aai_cli/code_agent/prompt.py
@@ -4,7 +4,7 @@
 
 # A capable gateway model by default; override with `--model`. The gateway is the
 # source of truth for what's accepted, so this is only a sensible default.
-DEFAULT_MODEL = "claude-sonnet-4-6"
+DEFAULT_MODEL = "gpt-5.1"
 # Generous ceiling so long edits/explanations aren't clipped; the gateway only bills
 # tokens actually generated, so a high cap costs nothing on short replies.
 DEFAULT_MAX_TOKENS = 8192

diff --git a/aai_cli/code_agent/skills.py b/aai_cli/code_agent/skills.py
@@ -1,11 +1,16 @@
 """Import installed agent skills (notably the `assemblyai` skill) into the agent.
 
-`assembly setup` installs the `assemblyai` skill under the coding-agent config root
-(`~/.claude/skills/assemblyai/`, honoring `CLAUDE_CONFIG_DIR`). deepagents can surface
-skills to the model via progressive disclosure, but its `SkillsMiddleware` reads them
-through a backend — and our main file backend is confined to the working directory.
-So we give skills their *own* `FilesystemBackend` rooted at the skills directory and
-inject a standalone `SkillsMiddleware`, independent of the cwd-scoped file tools.
+`assembly setup` installs skills under the coding-agent config root
+(`~/.claude/skills/<skill>/SKILL.md`, honoring `CLAUDE_CONFIG_DIR`). deepagents can
+surface skills to the model via progressive disclosure, but its `SkillsMiddleware` reads
+them through a backend — and our main file backend is confined to the working directory.
+So we give skills their *own* `FilesystemBackend` rooted at the skills directory.
+
+deepagents' stock skills prompt tells the model to open each `SKILL.md` with `read_file`,
+but that tool is bound to the cwd-scoped backend and so can't reach a skill living under
+`~/.claude/skills` (the model just gets ``File '/aai-cli/SKILL.md' not found``). We close
+that gap with a dedicated read-only `read_skill` tool bound to the skills directory, and a
+prompt that points the model at it instead of `read_file`.
 """
 
 from __future__ import annotations
@@ -17,11 +22,34 @@
 
 if TYPE_CHECKING:
     from langchain.agents.middleware import AgentMiddleware
+    from langchain_core.tools import BaseTool
 
 # Mirrors aai_cli.app.coding_agent.skills_root without importing the app layer (a
 # feature slice stays below it): the agent config root, overridable for tests/agents.
 _CLAUDE_CONFIG_DIR = "CLAUDE_CONFIG_DIR"
 
+READ_SKILL_TOOL_NAME = "read_skill"
+
+# Skills prompt fragment. Must keep the three slots deepagents substitutes at runtime
+# (`{skills_locations}`, `{skills_load_warnings}`, `{skills_list}`); the constructor
+# raises if any is missing. The one behavioral change from deepagents' stock prompt is
+# steering the model to `read_skill` — skills live outside the cwd sandbox, so the
+# ordinary `read_file` tool can't open them.
+_SKILLS_PROMPT = """## Skills
+
+You have a library of skills — specialized instructions and workflows for specific tasks.
+
+{skills_locations}{skills_load_warnings}
+**Available skills:**
+
+{skills_list}
+
+**How to use a skill (progressive disclosure):** you see each skill's name, description, and
+path above, but read its full instructions only when a skill matches the task. Read it with
+the `read_skill` tool, passing the path shown above — e.g. `read_skill("/assemblyai/SKILL.md")`
+— then follow what it says. Do **not** use `read_file` for these paths: skills live outside the
+working directory, so only `read_skill` can reach them."""
+
 
 def skills_root() -> Path:
     """Directory holding installed skills (one subdir per skill, each with SKILL.md)."""
@@ -35,12 +63,41 @@ def _has_skills(root: Path) -> bool:
     return root.is_dir() and any(child.joinpath("SKILL.md").is_file() for child in root.iterdir())
 
 
-def build_skills_middleware(root: Path | None = None) -> AgentMiddleware | None:
-    """A ``SkillsMiddleware`` over the installed skills, or ``None`` if none are present.
+def _read_skill_file(root: Path, path: str) -> str:
+    """Read ``path`` (as surfaced in the skills list) from under ``root``, guarding traversal.
+
+    ``path`` is the backend-virtual path shown in the prompt (e.g. ``/assemblyai/SKILL.md``),
+    so it is resolved relative to ``root``. A path that escapes ``root`` (``..`` segments) or
+    names a missing file returns an error string the model can recover from rather than raising.
+    """
+    target = (root / path.lstrip("/")).resolve()
+    if not target.is_relative_to(root.resolve()):
+        return f"Error: '{path}' is outside the skills directory."
+    if not target.is_file():
+        return f"Error: skill file '{path}' not found."
+    return target.read_text(encoding="utf-8")
+
+
+def build_skill_reader(root: Path) -> BaseTool:
+    """Wrap :func:`_read_skill_file` as the ``read_skill`` tool, bound to ``root``."""
+    from langchain_core.tools import tool
+
+    @tool(READ_SKILL_TOOL_NAME)
+    def read_skill(path: str) -> str:
+        """Read a skill's file (e.g. its SKILL.md) by the path shown in the skills list.
+        Use this — not read_file — for any path under the skills library."""
+        return _read_skill_file(root, path)
+
+    return read_skill
+
+
+def build_skills(root: Path | None = None) -> tuple[AgentMiddleware, BaseTool] | None:
+    """The skills ``(middleware, read_skill tool)`` pair, or ``None`` if no skills are present.
 
-    Returns ``None`` (rather than an empty middleware) so the caller simply omits it
-    from the stack when the user has run no `assembly setup` — the agent then starts
-    with no skills section instead of an empty one.
+    Returns ``None`` (rather than an empty middleware) so the caller simply omits both from
+    the stack when the user has not run `assembly setup` — the agent then starts with no
+    skills section and no `read_skill` tool instead of empty ones. The tool is paired with
+    the middleware because the prompt the middleware injects directs the model to it.
     """
     root = root if root is not None else skills_root()
     if not _has_skills(root):
@@ -50,4 +107,5 @@ def build_skills_middleware(root: Path | None = None) -> AgentMiddleware | None:
     from deepagents.middleware.skills import SkillsMiddleware
 
     backend = FilesystemBackend(root_dir=str(root), virtual_mode=True)
-    return SkillsMiddleware(backend=backend, sources=["/"])
+    middleware = SkillsMiddleware(backend=backend, sources=["/"], system_prompt=_SKILLS_PROMPT)
+    return middleware, build_skill_reader(root)
diff --git a/aai_cli/code_agent/tui.py b/aai_cli/code_agent/tui.py
@@ -481,7 +481,9 @@ def _stop_spinner(self) -> None:
         self.query_one("#spinner", Static).display = False
 
     def on_worker_state_changed(self, event: Worker.StateChanged) -> None:
-        if event.worker.is_finished:
+        # Guard on is_running: a worker finishing *after* the app tears down (quit / test exit)
+        # would drive _finish_turn against an unmounted DOM — NoMatches on "#spinner", a flake.
+        if event.worker.is_finished and self.is_running:
             self._finish_turn()
 
     def _finish_turn(self) -> None:

diff --git a/aai_cli/code_agent/voice.py b/aai_cli/code_agent/voice.py
@@ -130,8 +130,10 @@ def __enter__(self) -> Player:
     def __exit__(self, exc_type: object, *exc: object) -> object:
         """Drain on a clean exit, abort otherwise; never suppress."""
 
-    def feed(self, pcm: bytes, sample_rate: int) -> None:
-        """Play one PCM chunk, opening the output device on the first call."""
+    def feed(
+        self, pcm: bytes, sample_rate: int, *, cancelled: Callable[[], bool] | None = None
+    ) -> None:
+        """Play one PCM chunk, polling ``cancelled`` between writes to stop mid-chunk."""
 
 
 def _stt_params(sample_rate: int) -> StreamingParameters:
@@ -219,7 +221,11 @@ def speak(self, text: str) -> None:
                 def feed(pcm: bytes, sample_rate: int) -> None:
                     if self._cancel.is_set():
                         _abort_readback()
-                    player.feed(pcm, sample_rate)
+                    # Poll cancel *during* playback too: a chunk can be seconds of audio, and
+                    # in the TUI the only cancel signal is this flag set from another thread.
+                    player.feed(pcm, sample_rate, cancelled=self._cancel.is_set)
+                    if self._cancel.is_set():
+                        _abort_readback()
 
                 self.synth_fn(self.api_key, config, on_audio=feed)
         except _ReadbackInterrupted:

diff --git a/aai_cli/commands/code/_exec.py b/aai_cli/commands/code/_exec.py
@@ -31,7 +31,7 @@
 from aai_cli.code_agent.prompt import DEFAULT_MODEL
 from aai_cli.code_agent.render import RichRenderer, make_approver
 from aai_cli.code_agent.session import CodeSession, EventSink, run_repl
-from aai_cli.code_agent.skills import build_skills_middleware
+from aai_cli.code_agent.skills import build_skills
 from aai_cli.code_agent.store import build_checkpointer
 from aai_cli.code_agent.voice import (
     AUDIO_ERROR_TYPES,
@@ -82,24 +82,30 @@ def _assemble_tools(api_key: str, opts: CodeOptions, bridge: AskBridge) -> list[
 
 
 def _assemble_middlewares(opts: CodeOptions) -> list[AgentMiddleware]:
-    """Skills + long-term memory middleware, in load order."""
+    """Long-term memory middleware, if enabled (skills are wired in :func:`_build_agent`,
+    since the skills middleware pairs with a tool)."""
     middlewares: list[AgentMiddleware] = []
-    if opts.skills:
-        skills = build_skills_middleware()
-        if skills is not None:
-            middlewares.append(skills)
     if opts.memory:
         middlewares.append(build_memory_middleware())
     return middlewares
 
 
 def _build_agent(api_key: str, opts: CodeOptions, bridge: AskBridge) -> CompiledAgent:
     """Wire the gateway model + tools + middlewares + checkpointer into the agent."""
+    tools = _assemble_tools(api_key, opts, bridge)
+    middlewares = _assemble_middlewares(opts)
+    # Skills add both a middleware (the skills prompt section) and the `read_skill` tool the
+    # prompt directs the model to; load the middleware ahead of memory to match prior order.
+    skills = build_skills() if opts.skills else None
+    if skills is not None:
+        middleware, reader = skills
+        middlewares.insert(0, middleware)
+        tools.append(reader)
     return build_agent(
         model=build_model(api_key, model=opts.model),
         root_dir=opts.root_dir.resolve(),
-        tools=_assemble_tools(api_key, opts, bridge),
-        middlewares=_assemble_middlewares(opts),
+        tools=tools,
+        middlewares=middlewares,
         checkpointer=build_checkpointer(persist=opts.persist),
         auto_approve=opts.auto,
     )