From 2922b4618df0137b2ee65112b746eb5291710d4b Mon Sep 17 00:00:00 2001 From: Alex Kroman Date: Thu, 18 Jun 2026 13:34:12 -0700 Subject: [PATCH 1/6] assembly code: read skills via a dedicated tool + work around gateway empty tool-args MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two `assembly code` fixes uncovered while building a voice agent: 1. read_skill tool. The skills middleware loads skills from its own backend rooted at ~/.claude/skills, but deepagents' stock prompt tells the model to open each SKILL.md with `read_file` — which is bound to the cwd sandbox and can't reach them, so the model got `File '/aai-cli/SKILL.md' not found`. Add a read-only `read_skill` tool bound to the skills directory (with a traversal guard) and a prompt that points the model at it. build_skills() now returns the (middleware, tool) pair, wired together in _build_agent. 2. Empty tool-call arguments. The LLM Gateway maps OpenAI `arguments` onto Anthropic `tool_use.input` but drops `input` entirely when arguments are empty (""/"{}"), which Anthropic rejects (400, surfaced as 500 when streaming) — and because the failing call sits in history, every later turn fails too, wedging the session. _ensure_tool_call_arguments substitutes a minimal non-empty placeholder in the outgoing payload so the gateway emits a valid input. Request-only; the tool already ran locally with its real args. (Reported upstream for a server-side fix; this keeps the CLI resilient.) Co-Authored-By: Claude Opus 4.8 (1M context) --- aai_cli/code_agent/model.py | 67 ++++++++++++++++++++++++--- aai_cli/code_agent/skills.py | 82 +++++++++++++++++++++++++++++----- aai_cli/commands/code/_exec.py | 22 +++++---- tests/test_code_agent.py | 77 +++++++++++++++++++++++++++++-- tests/test_code_command.py | 31 +++++++++---- 5 files changed, 242 insertions(+), 37 deletions(-) diff --git a/aai_cli/code_agent/model.py b/aai_cli/code_agent/model.py index 7f87f6b..9539b4c 100644 --- a/aai_cli/code_agent/model.py +++ b/aai_cli/code_agent/model.py @@ -8,11 +8,17 @@ from __future__ import annotations +import json from collections.abc import Mapping from typing import TYPE_CHECKING from aai_cli.core import environments +# The gateway omits Anthropic's required ``tool_use.input`` when an OpenAI tool call's +# ``arguments`` is empty (``""`` / ``"{}"``); substitute a minimal non-empty object so the +# field is emitted. See :func:`_ensure_tool_call_arguments`. +_PLACEHOLDER_ARGUMENTS = '{"_": ""}' + if TYPE_CHECKING: from langchain_core.language_models.chat_models import BaseChatModel from langchain_core.outputs import ChatGenerationChunk @@ -86,6 +92,52 @@ def _hoist_call_list(tool_calls: list[object]) -> None: tool_call["id"] = function.pop("id") +def _ensure_tool_call_arguments(messages: object) -> None: + """Give every empty tool-call ``arguments`` a non-empty placeholder object, in place. + + The AssemblyAI LLM Gateway maps each OpenAI tool call's ``arguments`` (a JSON string) + onto Anthropic's ``tool_use.input`` object, but drops ``input`` entirely when the + arguments are empty (``""`` or ``"{}"``). Anthropic *requires* ``input`` to be present, + so replaying any argument-less tool call is rejected (400, surfaced as a 500 while + streaming) — and because the failing call sits in the conversation history, every later + turn fails too, wedging the session. We swap in a minimal non-empty object so the gateway + emits a valid ``input``. This only rewrites the request we send: the tool already ran + locally with its real (empty) arguments, and the gateway accepts the placeholder even for + tools that declare ``additionalProperties: false``. (Drop this once the gateway maps empty + arguments to ``input: {}`` itself.) + """ + if not isinstance(messages, list): + return + for message in messages: + tool_calls = message.get("tool_calls") if isinstance(message, dict) else None + if isinstance(tool_calls, list): + _fill_empty_arguments(tool_calls) + + +def _fill_empty_arguments(tool_calls: list[object]) -> None: + """Replace each empty ``function.arguments`` with the placeholder (helper for the above).""" + for tool_call in tool_calls: + if not isinstance(tool_call, dict): + continue + function = tool_call.get("function") + if isinstance(function, dict) and _is_empty_arguments(function.get("arguments")): + function["arguments"] = _PLACEHOLDER_ARGUMENTS + + +def _is_empty_arguments(arguments: object) -> bool: + """True when ``arguments`` is an OpenAI args string carrying no fields (``""``/``"{}"``).""" + if not isinstance(arguments, str): + return False + stripped = arguments.strip() + if not stripped: + return True + try: + parsed = json.loads(stripped) + except ValueError: + return False + return isinstance(parsed, dict) and not parsed + + def build_model( api_key: str, *, @@ -114,18 +166,21 @@ def build_model( class _GatewayChatOpenAI(ChatOpenAI): """ChatOpenAI that adapts the gateway's OpenAI-incompatible quirks for langchain. - Two fix-ups, each working around a gateway response/request bug the upstream client - doesn't expect: flatten list-content messages the gateway 500s on (request side, see - :func:`_flatten_content`), and hoist each streamed tool-call ``id`` back to the - tool-call top level where langchain reads it (response side, see - :func:`_hoist_tool_call_ids`). + Three fix-ups, each working around a gateway request/response bug the upstream client + doesn't expect: flatten list-content messages the gateway 500s on and give empty + tool-call arguments a placeholder the gateway can map to ``tool_use.input`` (request + side, see :func:`_flatten_content` / :func:`_ensure_tool_call_arguments`), and hoist + each streamed tool-call ``id`` back to the tool-call top level where langchain reads it + (response side, see :func:`_hoist_tool_call_ids`). """ def _get_request_payload( self, input_: object, *, stop: list[str] | None = None, **kwargs: object ) -> dict: payload = super()._get_request_payload(input_, stop=stop, **kwargs) - _flatten_content(payload.get("messages")) + messages = payload.get("messages") + _flatten_content(messages) + _ensure_tool_call_arguments(messages) return payload def _convert_chunk_to_generation_chunk( diff --git a/aai_cli/code_agent/skills.py b/aai_cli/code_agent/skills.py index eff6668..a953364 100644 --- a/aai_cli/code_agent/skills.py +++ b/aai_cli/code_agent/skills.py @@ -1,11 +1,16 @@ """Import installed agent skills (notably the `assemblyai` skill) into the agent. -`assembly setup` installs the `assemblyai` skill under the coding-agent config root -(`~/.claude/skills/assemblyai/`, honoring `CLAUDE_CONFIG_DIR`). deepagents can surface -skills to the model via progressive disclosure, but its `SkillsMiddleware` reads them -through a backend — and our main file backend is confined to the working directory. -So we give skills their *own* `FilesystemBackend` rooted at the skills directory and -inject a standalone `SkillsMiddleware`, independent of the cwd-scoped file tools. +`assembly setup` installs skills under the coding-agent config root +(`~/.claude/skills//SKILL.md`, honoring `CLAUDE_CONFIG_DIR`). deepagents can +surface skills to the model via progressive disclosure, but its `SkillsMiddleware` reads +them through a backend — and our main file backend is confined to the working directory. +So we give skills their *own* `FilesystemBackend` rooted at the skills directory. + +deepagents' stock skills prompt tells the model to open each `SKILL.md` with `read_file`, +but that tool is bound to the cwd-scoped backend and so can't reach a skill living under +`~/.claude/skills` (the model just gets ``File '/aai-cli/SKILL.md' not found``). We close +that gap with a dedicated read-only `read_skill` tool bound to the skills directory, and a +prompt that points the model at it instead of `read_file`. """ from __future__ import annotations @@ -17,11 +22,34 @@ if TYPE_CHECKING: from langchain.agents.middleware import AgentMiddleware + from langchain_core.tools import BaseTool # Mirrors aai_cli.app.coding_agent.skills_root without importing the app layer (a # feature slice stays below it): the agent config root, overridable for tests/agents. _CLAUDE_CONFIG_DIR = "CLAUDE_CONFIG_DIR" +READ_SKILL_TOOL_NAME = "read_skill" + +# Skills prompt fragment. Must keep the three slots deepagents substitutes at runtime +# (`{skills_locations}`, `{skills_load_warnings}`, `{skills_list}`); the constructor +# raises if any is missing. The one behavioral change from deepagents' stock prompt is +# steering the model to `read_skill` — skills live outside the cwd sandbox, so the +# ordinary `read_file` tool can't open them. +_SKILLS_PROMPT = """## Skills + +You have a library of skills — specialized instructions and workflows for specific tasks. + +{skills_locations}{skills_load_warnings} +**Available skills:** + +{skills_list} + +**How to use a skill (progressive disclosure):** you see each skill's name, description, and +path above, but read its full instructions only when a skill matches the task. Read it with +the `read_skill` tool, passing the path shown above — e.g. `read_skill("/assemblyai/SKILL.md")` +— then follow what it says. Do **not** use `read_file` for these paths: skills live outside the +working directory, so only `read_skill` can reach them.""" + def skills_root() -> Path: """Directory holding installed skills (one subdir per skill, each with SKILL.md).""" @@ -35,12 +63,41 @@ def _has_skills(root: Path) -> bool: return root.is_dir() and any(child.joinpath("SKILL.md").is_file() for child in root.iterdir()) -def build_skills_middleware(root: Path | None = None) -> AgentMiddleware | None: - """A ``SkillsMiddleware`` over the installed skills, or ``None`` if none are present. +def _read_skill_file(root: Path, path: str) -> str: + """Read ``path`` (as surfaced in the skills list) from under ``root``, guarding traversal. + + ``path`` is the backend-virtual path shown in the prompt (e.g. ``/assemblyai/SKILL.md``), + so it is resolved relative to ``root``. A path that escapes ``root`` (``..`` segments) or + names a missing file returns an error string the model can recover from rather than raising. + """ + target = (root / path.lstrip("/")).resolve() + if not target.is_relative_to(root.resolve()): + return f"Error: '{path}' is outside the skills directory." + if not target.is_file(): + return f"Error: skill file '{path}' not found." + return target.read_text(encoding="utf-8") + + +def build_skill_reader(root: Path) -> BaseTool: + """Wrap :func:`_read_skill_file` as the ``read_skill`` tool, bound to ``root``.""" + from langchain_core.tools import tool + + @tool(READ_SKILL_TOOL_NAME) + def read_skill(path: str) -> str: + """Read a skill's file (e.g. its SKILL.md) by the path shown in the skills list. + Use this — not read_file — for any path under the skills library.""" + return _read_skill_file(root, path) + + return read_skill + + +def build_skills(root: Path | None = None) -> tuple[AgentMiddleware, BaseTool] | None: + """The skills ``(middleware, read_skill tool)`` pair, or ``None`` if no skills are present. - Returns ``None`` (rather than an empty middleware) so the caller simply omits it - from the stack when the user has run no `assembly setup` — the agent then starts - with no skills section instead of an empty one. + Returns ``None`` (rather than an empty middleware) so the caller simply omits both from + the stack when the user has run no `assembly setup` — the agent then starts with no skills + section and no `read_skill` tool instead of empty ones. The tool is paired with the + middleware because the prompt the middleware injects directs the model to it. """ root = root if root is not None else skills_root() if not _has_skills(root): @@ -50,4 +107,5 @@ def build_skills_middleware(root: Path | None = None) -> AgentMiddleware | None: from deepagents.middleware.skills import SkillsMiddleware backend = FilesystemBackend(root_dir=str(root), virtual_mode=True) - return SkillsMiddleware(backend=backend, sources=["/"]) + middleware = SkillsMiddleware(backend=backend, sources=["/"], system_prompt=_SKILLS_PROMPT) + return middleware, build_skill_reader(root) diff --git a/aai_cli/commands/code/_exec.py b/aai_cli/commands/code/_exec.py index 2985629..b86369e 100644 --- a/aai_cli/commands/code/_exec.py +++ b/aai_cli/commands/code/_exec.py @@ -31,7 +31,7 @@ from aai_cli.code_agent.prompt import DEFAULT_MODEL from aai_cli.code_agent.render import RichRenderer, make_approver from aai_cli.code_agent.session import CodeSession, EventSink, run_repl -from aai_cli.code_agent.skills import build_skills_middleware +from aai_cli.code_agent.skills import build_skills from aai_cli.code_agent.store import build_checkpointer from aai_cli.code_agent.voice import ( AUDIO_ERROR_TYPES, @@ -82,12 +82,9 @@ def _assemble_tools(api_key: str, opts: CodeOptions, bridge: AskBridge) -> list[ def _assemble_middlewares(opts: CodeOptions) -> list[AgentMiddleware]: - """Skills + long-term memory middleware, in load order.""" + """The long-term memory middleware (skills are wired in :func:`_build_agent`, since the + skills middleware pairs with a tool).""" middlewares: list[AgentMiddleware] = [] - if opts.skills: - skills = build_skills_middleware() - if skills is not None: - middlewares.append(skills) if opts.memory: middlewares.append(build_memory_middleware()) return middlewares @@ -95,11 +92,20 @@ def _assemble_middlewares(opts: CodeOptions) -> list[AgentMiddleware]: def _build_agent(api_key: str, opts: CodeOptions, bridge: AskBridge) -> CompiledAgent: """Wire the gateway model + tools + middlewares + checkpointer into the agent.""" + tools = _assemble_tools(api_key, opts, bridge) + middlewares = _assemble_middlewares(opts) + # Skills add both a middleware (the skills prompt section) and the `read_skill` tool the + # prompt directs the model to; load the middleware ahead of memory to match prior order. + skills = build_skills() if opts.skills else None + if skills is not None: + middleware, reader = skills + middlewares.insert(0, middleware) + tools.append(reader) return build_agent( model=build_model(api_key, model=opts.model), root_dir=opts.root_dir.resolve(), - tools=_assemble_tools(api_key, opts, bridge), - middlewares=_assemble_middlewares(opts), + tools=tools, + middlewares=middlewares, checkpointer=build_checkpointer(persist=opts.persist), auto_approve=opts.auto, ) diff --git a/tests/test_code_agent.py b/tests/test_code_agent.py index 28a9561..fffb90b 100644 --- a/tests/test_code_agent.py +++ b/tests/test_code_agent.py @@ -215,13 +215,32 @@ def boom(url): assert docs_mcp.load_docs_tools("https://example.invalid") == [] -def test_skills_middleware_present_and_absent(tmp_path: Path) -> None: - assert skills.build_skills_middleware(tmp_path) is None # empty dir -> no skills +def test_build_skills_present_and_absent(tmp_path: Path) -> None: + assert skills.build_skills(tmp_path) is None # empty dir -> no skills, no tool skill_dir = tmp_path / "assemblyai" skill_dir.mkdir() (skill_dir / "SKILL.md").write_text("---\nname: assemblyai\ndescription: x\n---\nbody") - assert skills.build_skills_middleware(tmp_path) is not None + bundle = skills.build_skills(tmp_path) + assert bundle is not None # constructing the middleware also validates the custom prompt + _middleware, reader = bundle + assert reader.name == skills.READ_SKILL_TOOL_NAME + + +def test_read_skill_tool_reads_under_root_and_blocks_escape(tmp_path: Path) -> None: + skill_dir = tmp_path / "assemblyai" + skill_dir.mkdir() + (skill_dir / "SKILL.md").write_text("the skill body") + (tmp_path.parent / "secret.md").write_text("top secret") + + reader = skills.build_skill_reader(tmp_path) + # The path is the prompt's backend-virtual form (leading slash, relative to root). + assert reader.invoke({"path": "/assemblyai/SKILL.md"}) == "the skill body" + # A traversal out of the skills dir is refused (not the neighbouring file's contents). + escaped = reader.invoke({"path": "/../secret.md"}) + assert "outside the skills directory" in escaped and "top secret" not in escaped + # A missing skill file reports an error rather than raising. + assert "not found" in reader.invoke({"path": "/assemblyai/MISSING.md"}) def test_web_search_tool_gated_on_api_key(monkeypatch: pytest.MonkeyPatch) -> None: @@ -317,6 +336,58 @@ def test_hoist_tool_call_ids_guards() -> None: model_mod._hoist_tool_call_ids({"choices": 99}) # choices not a list -> early return +def test_is_empty_arguments() -> None: + assert model_mod._is_empty_arguments("") # empty string + assert model_mod._is_empty_arguments(" ") # whitespace only + assert model_mod._is_empty_arguments("{}") # empty object + assert model_mod._is_empty_arguments("{ }") # empty object with whitespace + assert not model_mod._is_empty_arguments('{"path": "/"}') # real arguments + assert not model_mod._is_empty_arguments("[]") # non-dict JSON is not "empty args" + assert not model_mod._is_empty_arguments("{bad json") # unparseable -> leave alone + assert not model_mod._is_empty_arguments(None) # non-string -> leave alone + assert not model_mod._is_empty_arguments({"already": "parsed"}) # non-string -> leave alone + + +def test_ensure_tool_call_arguments_fills_only_empty_calls() -> None: + empty_fn: dict[str, object] = {"name": "ls", "arguments": "{}"} + blank_fn: dict[str, object] = {"name": "ls", "arguments": " "} + full_fn: dict[str, object] = {"name": "ls", "arguments": '{"path": "/"}'} + tool_calls: list[object] = [ + None, # tool_call not a dict -> skipped + {"id": "0", "function": 7}, # function not a dict -> skipped + {"id": "1", "function": empty_fn}, # empty object -> filled + {"id": "2", "function": blank_fn}, # whitespace -> filled + {"id": "3", "function": full_fn}, # real args -> untouched + ] + messages: list[object] = [ + None, # message not a dict -> skipped + {"role": "user", "content": "hi"}, # no tool_calls -> skipped + {"tool_calls": 99}, # tool_calls not a list -> skipped + {"tool_calls": tool_calls}, + ] + model_mod._ensure_tool_call_arguments(messages) + assert empty_fn["arguments"] == model_mod._PLACEHOLDER_ARGUMENTS + assert blank_fn["arguments"] == model_mod._PLACEHOLDER_ARGUMENTS + assert full_fn["arguments"] == '{"path": "/"}' # left untouched + + +def test_ensure_tool_call_arguments_guards() -> None: + model_mod._ensure_tool_call_arguments(None) # not a list -> early return, no error + model_mod._ensure_tool_call_arguments([{"tool_calls": 99}]) # tool_calls not a list + + +def test_get_request_payload_fills_empty_tool_call_arguments() -> None: + from langchain_core.messages import AIMessage, HumanMessage + + m = model_mod.build_model("sk-test", model="claude-sonnet-4-6") + # An assistant tool call with no arguments serializes to arguments="{}", which the gateway + # rejects (missing tool_use.input); the payload must carry the placeholder instead. + ai = AIMessage(content="", tool_calls=[{"name": "ls", "args": {}, "id": "t1"}]) + payload = m._get_request_payload([HumanMessage(content="hi"), ai]) + calls = payload["messages"][1]["tool_calls"] + assert calls[0]["function"]["arguments"] == model_mod._PLACEHOLDER_ARGUMENTS + + def test_convert_chunk_hoists_streamed_tool_call_id() -> None: from langchain_core.messages import AIMessageChunk from langchain_openai import ChatOpenAI diff --git a/tests/test_code_command.py b/tests/test_code_command.py index 044fc87..ab2c605 100644 --- a/tests/test_code_command.py +++ b/tests/test_code_command.py @@ -167,13 +167,12 @@ def test_assemble_tools_includes_cli_fetch_ask_and_optional_extras(monkeypatch): assert [t.name for t in tools] == ["assembly", "fetch_url", "ask_user"] -def test_assemble_middlewares_skills_and_memory(monkeypatch): - monkeypatch.setattr(_exec, "build_skills_middleware", lambda: "SKILLS") +def test_assemble_middlewares_memory_only(monkeypatch): + # Skills are wired in _build_agent now (they pair a middleware with a tool); this + # assembler only handles the optional memory middleware. monkeypatch.setattr(_exec, "build_memory_middleware", lambda: "MEM") - assert _exec._assemble_middlewares(_opts(skills=True, memory=True)) == ["SKILLS", "MEM"] - - monkeypatch.setattr(_exec, "build_skills_middleware", lambda: None) - assert _exec._assemble_middlewares(_opts(skills=True, memory=False)) == [] + assert _exec._assemble_middlewares(_opts(memory=True)) == ["MEM"] + assert _exec._assemble_middlewares(_opts(memory=False)) == [] def test_build_agent_wires_model_tools_and_checkpointer(monkeypatch): @@ -181,16 +180,32 @@ def test_build_agent_wires_model_tools_and_checkpointer(monkeypatch): monkeypatch.setattr(_exec, "build_model", lambda key, *, model: f"model:{model}") monkeypatch.setattr(_exec, "_assemble_tools", lambda key, opts, bridge: ["t"]) monkeypatch.setattr(_exec, "_assemble_middlewares", lambda opts: ["m"]) + # --no-skills: build_skills must not be consulted, so the sentinel never lands. + monkeypatch.setattr(_exec, "build_skills", lambda: ("X_MW", "X_TOOL")) monkeypatch.setattr(_exec, "build_checkpointer", lambda *, persist: f"ckpt:{persist}") monkeypatch.setattr(_exec, "build_agent", lambda **kw: seen.update(kw) or "AGENT") - agent = _exec._build_agent("k", _opts(model="gpt-5", persist=False), AskBridge()) + agent = _exec._build_agent("k", _opts(model="gpt-5", persist=False, skills=False), AskBridge()) assert agent == "AGENT" assert seen["model"] == "model:gpt-5" - assert seen["tools"] == ["t"] and seen["middlewares"] == ["m"] + assert seen["tools"] == ["t"] and seen["middlewares"] == ["m"] # no skills sentinel added assert seen["checkpointer"] == "ckpt:False" +def test_build_agent_inserts_skills_middleware_and_read_tool(monkeypatch): + seen = {} + monkeypatch.setattr(_exec, "build_model", lambda key, *, model: "model") + monkeypatch.setattr(_exec, "_assemble_tools", lambda key, opts, bridge: ["base"]) + monkeypatch.setattr(_exec, "_assemble_middlewares", lambda opts: ["mem"]) + monkeypatch.setattr(_exec, "build_skills", lambda: ("skills_mw", "read_skill_tool")) + monkeypatch.setattr(_exec, "build_checkpointer", lambda *, persist: "ckpt") + monkeypatch.setattr(_exec, "build_agent", lambda **kw: seen.update(kw) or "AGENT") + + _exec._build_agent("k", _opts(skills=True), AskBridge()) + assert seen["middlewares"] == ["skills_mw", "mem"] # skills loaded ahead of memory + assert seen["tools"] == ["base", "read_skill_tool"] # read_skill tool appended + + def test_web_note_only_without_key(monkeypatch): monkeypatch.delenv("FIRECRAWL_API_KEY", raising=False) assert _exec._web_note(_opts(web=True)) is not None From 2b53537d08dfb09aded4b18c685bb640388111d9 Mon Sep 17 00:00:00 2001 From: Alex Kroman Date: Thu, 18 Jun 2026 14:03:02 -0700 Subject: [PATCH 2/6] assembly code: drop the gateway's spurious blank streamed tool-call delta MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Every streaming turn (when tools are available) begins with an empty tool-call delta — {"function": {"id": "", "name": "", "arguments": ""}}. On a pure-text turn (e.g. the agent asking clarifying questions) no real tool call follows, so langchain is left with a tool call whose name is "", deepagents dispatches it, and the turn dies with `Error: is not a valid tool`. Extend the streaming normalizer to drop any tool-call delta with no name, id, or arguments before langchain converts the chunk (this also harmlessly drops the gateway's empty argument-continuation deltas). A real text+tool turn still yields exactly one correct tool call; a pure-text turn yields none. Reported upstream for a server-side fix; this keeps the CLI resilient meanwhile. Co-Authored-By: Claude Opus 4.8 (1M context) --- aai_cli/code_agent/model.py | 44 +++++++++++++++++++--------- tests/test_code_agent.py | 57 +++++++++++++++++++++++++++++++++++++ 2 files changed, 87 insertions(+), 14 deletions(-) diff --git a/aai_cli/code_agent/model.py b/aai_cli/code_agent/model.py index 9539b4c..27f9980 100644 --- a/aai_cli/code_agent/model.py +++ b/aai_cli/code_agent/model.py @@ -46,18 +46,23 @@ def _flatten_content(messages: object) -> None: def _hoist_tool_call_ids(chunk: object) -> None: - """Move each streamed tool-call ``id`` from inside ``function`` up to the tool-call top level. - - The AssemblyAI LLM Gateway's *streaming* ``/v1/chat/completions`` nests the tool-call - ``id`` under ``function`` — ``{"function": {"id": …, "name": …}}`` — instead of at the - tool-call's top level, which is where the OpenAI streaming spec (and - ``langchain_openai``, via ``id=rtc.get("id")``) reads it. Left alone, every streamed - tool call parses with a name and arguments but ``id=None``, so the reply ``ToolMessage`` - fails Pydantic validation (``tool_call_id`` must be a string) and the whole turn errors - out. We move the id back up before langchain converts the chunk; the id rides only the - first delta of a call, so later argument-only deltas (no ``function.id``) are left - untouched. (The non-streaming endpoint already places the id correctly, so only the - streaming path needs this.) + """Normalize a streamed chunk's tool-call deltas: drop blank ones, hoist nested ids. + + Two AssemblyAI LLM Gateway streaming quirks, both fixed in place before langchain + converts the chunk: + + 1. **Spurious blank deltas.** Every streamed turn (when tools are available) starts with + an empty tool-call delta — ``{"function": {"id": "", "name": "", "arguments": ""}}``. + On a pure-text turn no real call follows, so langchain is left with a tool call whose + ``name`` is ``""``; deepagents then dispatches it and the turn dies with + ``Error: is not a valid tool``. We drop any delta with no name, id, or arguments + (which also harmlessly drops the gateway's empty argument-continuation deltas). + 2. **Misplaced id.** The id is nested under ``function`` instead of at the tool-call top + level where the OpenAI spec and ``langchain_openai`` (``id=rtc.get("id")``) read it, + so without help every call parses with ``id=None`` and its reply ``ToolMessage`` fails + validation. We move it back up; the id rides only a call's first delta. + + (The non-streaming endpoint has neither quirk, so only the streaming path needs this.) """ if not isinstance(chunk, dict): return @@ -68,11 +73,22 @@ def _hoist_tool_call_ids(chunk: object) -> None: def _hoist_in_choice(choice: object) -> None: - """Hoist tool-call ids within one streamed choice's delta (helper for ``_hoist_tool_call_ids``).""" + """Drop blank tool-call deltas, then hoist ids, within one streamed choice's delta.""" delta = choice.get("delta") if isinstance(choice, dict) else None tool_calls = delta.get("tool_calls") if isinstance(delta, dict) else None if isinstance(tool_calls, list): - _hoist_call_list(tool_calls) + delta["tool_calls"] = [tc for tc in tool_calls if not _is_blank_tool_call(tc)] + _hoist_call_list(delta["tool_calls"]) + + +def _is_blank_tool_call(tool_call: object) -> bool: + """True for the gateway's spurious empty tool-call delta (no name, id, or arguments).""" + if not isinstance(tool_call, dict): + return False + function = tool_call.get("function") + if not isinstance(function, dict): + return False + return not function.get("name") and not function.get("id") and not function.get("arguments") def _hoist_call_list(tool_calls: list[object]) -> None: diff --git a/tests/test_code_agent.py b/tests/test_code_agent.py index fffb90b..c40f3d9 100644 --- a/tests/test_code_agent.py +++ b/tests/test_code_agent.py @@ -336,6 +336,63 @@ def test_hoist_tool_call_ids_guards() -> None: model_mod._hoist_tool_call_ids({"choices": 99}) # choices not a list -> early return +def test_is_blank_tool_call() -> None: + assert model_mod._is_blank_tool_call({"function": {"id": "", "name": "", "arguments": ""}}) + assert model_mod._is_blank_tool_call({"function": {}}) # all fields absent + assert not model_mod._is_blank_tool_call({"function": {"name": "x"}}) # has a name + assert not model_mod._is_blank_tool_call({"function": {"id": "i"}}) # has an id + assert not model_mod._is_blank_tool_call({"function": {"arguments": "a"}}) # has arguments + assert not model_mod._is_blank_tool_call({"function": 7}) # function not a dict + assert not model_mod._is_blank_tool_call(None) # not a dict + + +def test_hoist_tool_call_ids_drops_spurious_blank_delta() -> None: + # The gateway prefixes every streamed turn with an empty tool-call delta; it must be + # dropped (else a pure-text turn dispatches a tool call with name=""). + real_fn: dict[str, object] = {"id": "toolu_X", "name": "get_weather", "arguments": ""} + real_call: dict[str, object] = {"index": 0, "function": real_fn} + delta: dict[str, object] = { + "tool_calls": [ + {"index": 0, "function": {"id": "", "name": "", "arguments": ""}}, # spurious blank + real_call, + ] + } + chunk: dict[str, object] = {"choices": [{"delta": delta}]} + model_mod._hoist_tool_call_ids(chunk) + assert delta["tool_calls"] == [real_call] # blank dropped, real call kept + assert real_call["id"] == "toolu_X" # and its id hoisted out of function + assert "id" not in real_fn + + +def test_convert_chunk_drops_spurious_blank_tool_call() -> None: + from langchain_core.messages import AIMessageChunk + from langchain_openai import ChatOpenAI + + m = model_mod.build_model("sk-test", model="claude-sonnet-4-6") + assert isinstance(m, ChatOpenAI) # narrow to the subclass that overrides the converter + # A pure-text turn's leading delta carries only the gateway's blank tool call — the + # converted chunk must surface no tool call (else deepagents dispatches a nameless tool). + chunk = { + "choices": [ + { + "index": 0, + "delta": { + "role": "assistant", + "tool_calls": [ + {"index": 0, "function": {"id": "", "name": "", "arguments": ""}} + ], + }, + "finish_reason": None, + } + ] + } + gen = m._convert_chunk_to_generation_chunk(chunk, AIMessageChunk, None) + assert gen is not None + msg = gen.message + assert isinstance(msg, AIMessageChunk) + assert msg.tool_call_chunks == [] # the phantom blank tool call is gone + + def test_is_empty_arguments() -> None: assert model_mod._is_empty_arguments("") # empty string assert model_mod._is_empty_arguments(" ") # whitespace only From b54eab8f6ca07f5ebe32c577674d20dd4821898f Mon Sep 17 00:00:00 2001 From: Alex Kroman Date: Thu, 18 Jun 2026 14:28:47 -0700 Subject: [PATCH 3/6] assembly code: make voice readback interruptible on the daemon thread MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The coding-agent TUI plays each spoken reply on a daemon thread. Its two cancel channels both failed there: PcmPlayer chunks writes "so a Ctrl-C lands between them", but that relies on KeyboardInterrupt reaching the *playing* thread — true for the foreground `assembly speak` CLI, not the TUI, where Ctrl-C is handled by Textual on the UI thread. The only cross-thread signal, the `_cancel` event, was checked solely between synthesizer chunks (the feed wrapper), never during sounddevice's blocking playback. So readback was effectively uninterruptible: Ctrl-C did nothing, and the daemon thread stayed blocked in speak() instead of advancing to listen for the next turn. Poll the cancel flag inside PcmPlayer's piece-write loop (abort the device and drop the rest of the chunk when set), and have voice.speak hand the player a live poll of `_cancel`. Cancellation is now honored within ~one 4 KiB piece (~85 ms) regardless of which thread the interrupt arrives on. The poll is optional, so `assembly speak` (foreground, KeyboardInterrupt-driven) is unchanged. Co-Authored-By: Claude Opus 4.8 (1M context) --- aai_cli/code_agent/voice.py | 12 +++++++++--- aai_cli/tts/audio.py | 22 ++++++++++++++++++---- tests/test_code_voice.py | 30 +++++++++++++++++++++++++++++- tests/test_tts_audio.py | 23 +++++++++++++++++++++++ 4 files changed, 79 insertions(+), 8 deletions(-) diff --git a/aai_cli/code_agent/voice.py b/aai_cli/code_agent/voice.py index 4f913cd..ffed3ef 100644 --- a/aai_cli/code_agent/voice.py +++ b/aai_cli/code_agent/voice.py @@ -130,8 +130,10 @@ def __enter__(self) -> Player: def __exit__(self, exc_type: object, *exc: object) -> object: """Drain on a clean exit, abort otherwise; never suppress.""" - def feed(self, pcm: bytes, sample_rate: int) -> None: - """Play one PCM chunk, opening the output device on the first call.""" + def feed( + self, pcm: bytes, sample_rate: int, *, cancelled: Callable[[], bool] | None = None + ) -> None: + """Play one PCM chunk, polling ``cancelled`` between writes to stop mid-chunk.""" def _stt_params(sample_rate: int) -> StreamingParameters: @@ -219,7 +221,11 @@ def speak(self, text: str) -> None: def feed(pcm: bytes, sample_rate: int) -> None: if self._cancel.is_set(): _abort_readback() - player.feed(pcm, sample_rate) + # Poll cancel *during* playback too: a chunk can be seconds of audio, and + # in the TUI the only cancel signal is this flag set from another thread. + player.feed(pcm, sample_rate, cancelled=self._cancel.is_set) + if self._cancel.is_set(): + _abort_readback() self.synth_fn(self.api_key, config, on_audio=feed) except _ReadbackInterrupted: diff --git a/aai_cli/tts/audio.py b/aai_cli/tts/audio.py index e4aec43..77dedea 100644 --- a/aai_cli/tts/audio.py +++ b/aai_cli/tts/audio.py @@ -88,11 +88,20 @@ def __init__(self, *, stream_factory: Callable[[int], _OutputStream] | None = No def __enter__(self) -> PcmPlayer: return self - def feed(self, pcm: bytes, sample_rate: int) -> None: - """Play one PCM chunk, opening the device on the first chunk.""" + def feed( + self, pcm: bytes, sample_rate: int, *, cancelled: Callable[[], bool] | None = None + ) -> None: + """Play one PCM chunk, opening the device on the first chunk. + + ``cancelled`` is polled between the small piece-writes; when it returns True the + stream is aborted (buffered frames discarded) and the rest of the chunk is dropped. + This is the only cancellation path that works when playback runs off the main thread + — the coding-agent TUI plays on a daemon thread, so a Ctrl-C ``KeyboardInterrupt`` + never reaches it; only a flag set from another thread can stop it. + """ if self._stream is None: self._stream = self._open(sample_rate) - self._write(self._stream, pcm) + self._write(self._stream, pcm, cancelled) def _open(self, sample_rate: int) -> _OutputStream: try: @@ -105,11 +114,16 @@ def _open(self, sample_rate: int) -> _OutputStream: return stream @staticmethod - def _write(stream: _OutputStream, pcm: bytes) -> None: + def _write( + stream: _OutputStream, pcm: bytes, cancelled: Callable[[], bool] | None = None + ) -> None: # KeyboardInterrupt (a BaseException) passes through this Exception handler # to __exit__, which aborts the device; only real device errors are wrapped. try: for offset in range(0, len(pcm), _PLAYBACK_CHUNK_BYTES): + if cancelled is not None and cancelled(): + stream.abort() # discard buffered frames and stop now, mid-chunk + return stream.write(pcm[offset : offset + _PLAYBACK_CHUNK_BYTES]) except Exception as exc: raise _playback_error(exc) from exc diff --git a/tests/test_code_voice.py b/tests/test_code_voice.py index 6ffa17d..6233036 100644 --- a/tests/test_code_voice.py +++ b/tests/test_code_voice.py @@ -119,7 +119,7 @@ def __exit__(self, exc_type, *exc): self.exit_exc_type = exc_type # records the abort path (an exception on the way out) return False - def feed(self, pcm, sample_rate): + def feed(self, pcm, sample_rate, *, cancelled=None): self.fed.append((pcm, sample_rate)) @@ -163,6 +163,34 @@ def fake_synth(api_key, config, *, on_audio): assert player.exit_exc_type is not None # player saw the exception -> aborted, not drained +def test_speak_hands_player_a_live_cancel_poll_for_midchunk_stop(): + # In the TUI the readback plays on a daemon thread, so the only way to stop a chunk + # mid-playback is a flag set from another thread. speak() must hand the player a live + # poll of that flag (not just check it between synth chunks). + seen = {} + holder = {} + + class PollPlayer(FakePlayer): + def feed(self, pcm, sample_rate, *, cancelled=None): + seen["poll"] = cancelled + seen["before"] = cancelled() if cancelled else None + holder["session"].cancel() # another thread interrupts mid-playback + seen["after"] = cancelled() if cancelled else None + super().feed(pcm, sample_rate) + + def fake_synth(api_key, config, *, on_audio): + on_audio(b"chunk", 24000) + + session = VoiceSession( + api_key="k", readback=True, synth_fn=fake_synth, player_factory=PollPlayer + ) + holder["session"] = session + session.speak("hello there") # returns cleanly — the post-chunk cancel is swallowed + assert callable(seen["poll"]) + assert seen["before"] is False # not cancelled when the chunk starts playing + assert seen["after"] is True # the poll reflects a cancel raised mid-playback + + def test_speak_clears_a_stale_cancel_before_playing(): # A cancel() left set from a prior interrupt must not abort the next readback before it # starts — speak() clears the flag on entry, so the chunk plays normally. diff --git a/tests/test_tts_audio.py b/tests/test_tts_audio.py index 93c6095..10045fc 100644 --- a/tests/test_tts_audio.py +++ b/tests/test_tts_audio.py @@ -77,6 +77,29 @@ def test_play_pcm_writes_audio_in_bounded_chunks(): assert b"".join(stream.writes) == pcm +def test_feed_aborts_midchunk_when_cancelled_flag_flips(): + # The daemon-thread readback can't see a Ctrl-C KeyboardInterrupt, so a cancel flag set + # from another thread must stop playback between piece-writes: abort the device and drop + # the rest of the chunk rather than playing it out. + stream = FakeStream() + pcm = bytes(256) * 40 # > 2 pieces, so there are several poll points + with audio.PcmPlayer(stream_factory=lambda rate: stream) as player: + # Report "cancelled" once the first piece has been written. + player.feed(pcm, 24000, cancelled=lambda: bool(stream.writes)) + assert len(stream.writes) == 1 # stopped after the first piece instead of draining all + assert "abort" in stream.events # discarded the buffered frames immediately + + +def test_feed_without_cancel_poll_plays_every_piece(): + # The cancel poll is optional (assembly speak passes none): all pieces still play. + stream = FakeStream() + pcm = bytes(256) * 40 + with audio.PcmPlayer(stream_factory=lambda rate: stream) as player: + player.feed(pcm, 24000) + assert b"".join(stream.writes) == pcm # nothing dropped + assert "abort" not in stream.events + + def test_play_pcm_aborts_and_propagates_on_ctrl_c(): # Ctrl-C mid-playback must stop the device immediately (abort, not just stop) # and re-raise so the cancel reaches the CLI; the stream is still closed. From c5790d49bda82e8a3557f8e80a65a898758451e4 Mon Sep 17 00:00:00 2001 From: Alex Kroman Date: Thu, 18 Jun 2026 14:34:29 -0700 Subject: [PATCH 4/6] Default assembly code and assembly live to gpt-5.1 Switch the default LLM Gateway model for `assembly code` (code_agent/prompt.py) and `assembly live` (agent_cascade/config.py) to gpt-5.1. The live default is now a literal rather than llm.DEFAULT_MODEL, so it's independent of the one-shot `assembly llm` default (still claude-haiku). Both override with --model. Verified gpt-5.1 is accepted by the gateway. Updated the cascade config test and regenerated the `--help` snapshot (both commands show [default: gpt-5.1]). Co-Authored-By: Claude Opus 4.8 (1M context) --- aai_cli/agent_cascade/config.py | 5 ++++- aai_cli/code_agent/prompt.py | 2 +- tests/__snapshots__/test_snapshots_help_run.ambr | 5 ++--- tests/test_agent_cascade_config.py | 9 +++++++-- 4 files changed, 14 insertions(+), 7 deletions(-) diff --git a/aai_cli/agent_cascade/config.py b/aai_cli/agent_cascade/config.py index 7589b95..efa5b43 100644 --- a/aai_cli/agent_cascade/config.py +++ b/aai_cli/agent_cascade/config.py @@ -13,7 +13,10 @@ from aai_cli.agent_cascade.voices import DEFAULT_VOICE from aai_cli.core import llm -DEFAULT_MODEL = llm.DEFAULT_MODEL +# `assembly live` defaults to a capable gateway model (override with --model); kept a +# literal rather than llm.DEFAULT_MODEL so the live agent's default is independent of the +# one-shot `assembly llm` default. +DEFAULT_MODEL = "gpt-5.1" DEFAULT_MAX_TOKENS = llm.DEFAULT_MAX_TOKENS # The realtime model the cascade transcribes with (same as the agent-cascade template). DEFAULT_SPEECH_MODEL = "u3-rt-pro" diff --git a/aai_cli/code_agent/prompt.py b/aai_cli/code_agent/prompt.py index fb6c957..4e2a7ef 100644 --- a/aai_cli/code_agent/prompt.py +++ b/aai_cli/code_agent/prompt.py @@ -4,7 +4,7 @@ # A capable gateway model by default; override with `--model`. The gateway is the # source of truth for what's accepted, so this is only a sensible default. -DEFAULT_MODEL = "claude-sonnet-4-6" +DEFAULT_MODEL = "gpt-5.1" # Generous ceiling so long edits/explanations aren't clipped; the gateway only bills # tokens actually generated, so a high cap costs nothing on short replies. DEFAULT_MAX_TOKENS = 8192 diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr index 65db349..c2334d9 100644 --- a/tests/__snapshots__/test_snapshots_help_run.ambr +++ b/tests/__snapshots__/test_snapshots_help_run.ambr @@ -287,7 +287,7 @@ ╰──────────────────────────────────────────────────────────────────────────────╯ ╭─ Options ────────────────────────────────────────────────────────────────────╮ │ --model TEXT LLM Gateway model │ - │ [default: claude-sonnet-4-6] │ + │ [default: gpt-5.1] │ │ --dir -C DIRECTORY Working directory the agent's file │ │ and shell tools operate in │ │ [default: .] │ @@ -667,8 +667,7 @@ ╭─ Language model ─────────────────────────────────────────────────────────────╮ │ --model TEXT LLM Gateway model that powers the │ │ agent's replies │ - │ [default: │ - │ claude-haiku-4-5-20251001] │ + │ [default: gpt-5.1] │ │ --max-tokens INTEGER RANGE [x>=1] Max tokens per reply │ │ [default: 8192] │ │ --llm-config TEXT Set any LLM Gateway request field │ diff --git a/tests/test_agent_cascade_config.py b/tests/test_agent_cascade_config.py index e366187..7514ed5 100644 --- a/tests/test_agent_cascade_config.py +++ b/tests/test_agent_cascade_config.py @@ -6,7 +6,12 @@ import pytest -from aai_cli.agent_cascade.config import DEFAULT_GREETING, DEFAULT_MAX_HISTORY, CascadeConfig +from aai_cli.agent_cascade.config import ( + DEFAULT_GREETING, + DEFAULT_MAX_HISTORY, + DEFAULT_MODEL, + CascadeConfig, +) from aai_cli.agent_cascade.voices import DEFAULT_VOICE from aai_cli.core import llm @@ -14,7 +19,7 @@ def test_default_config_values(): config = CascadeConfig() assert config.voice == DEFAULT_VOICE - assert config.model == llm.DEFAULT_MODEL + assert config.model == DEFAULT_MODEL == "gpt-5.1" # `assembly live` defaults to gpt-5.1 assert config.greeting == DEFAULT_GREETING # The sliding-window default keeps the last 40 messages of context. assert config.max_history == 40 From 2b410e620c54295f127f86377791de364853b222 Mon Sep 17 00:00:00 2001 From: Alex Kroman Date: Thu, 18 Jun 2026 14:53:42 -0700 Subject: [PATCH 5/6] Split model tests into test_code_model.py; narrow types for the gate Make the branch gate-clean now that the firecrawl WIP no longer blocks it: - Move the code_agent/model.py unit tests out of test_code_agent.py (which had grown past the 500-line file-length gate) into a new test_code_model.py, and add it to pyrightconfig.tests.json's ignore list alongside the other langchain-boundary test files. - Narrow `delta` to dict in _hoist_in_choice via early returns so mypy accepts the in-place `delta["tool_calls"]` assignment, and isinstance-narrow the chat model in the payload/convert-chunk tests. Co-Authored-By: Claude Opus 4.8 (1M context) --- aai_cli/code_agent/model.py | 8 +- pyrightconfig.tests.json | 1 + tests/test_code_agent.py | 204 ---------------------------------- tests/test_code_model.py | 216 ++++++++++++++++++++++++++++++++++++ 4 files changed, 223 insertions(+), 206 deletions(-) create mode 100644 tests/test_code_model.py diff --git a/aai_cli/code_agent/model.py b/aai_cli/code_agent/model.py index 27f9980..4e1c556 100644 --- a/aai_cli/code_agent/model.py +++ b/aai_cli/code_agent/model.py @@ -74,8 +74,12 @@ def _hoist_tool_call_ids(chunk: object) -> None: def _hoist_in_choice(choice: object) -> None: """Drop blank tool-call deltas, then hoist ids, within one streamed choice's delta.""" - delta = choice.get("delta") if isinstance(choice, dict) else None - tool_calls = delta.get("tool_calls") if isinstance(delta, dict) else None + if not isinstance(choice, dict): + return + delta = choice.get("delta") + if not isinstance(delta, dict): + return + tool_calls = delta.get("tool_calls") if isinstance(tool_calls, list): delta["tool_calls"] = [tc for tc in tool_calls if not _is_blank_tool_call(tc)] _hoist_call_list(delta["tool_calls"]) diff --git a/pyrightconfig.tests.json b/pyrightconfig.tests.json index f9dbdf0..d49b21b 100644 --- a/pyrightconfig.tests.json +++ b/pyrightconfig.tests.json @@ -2,6 +2,7 @@ "include": ["tests"], "ignore": [ "tests/test_code_agent.py", + "tests/test_code_model.py", "tests/test_code_command.py", "tests/test_code_tui.py", "tests/test_agent_cascade_brain.py" diff --git a/tests/test_code_agent.py b/tests/test_code_agent.py index c40f3d9..a1452bf 100644 --- a/tests/test_code_agent.py +++ b/tests/test_code_agent.py @@ -25,12 +25,10 @@ skills, store, ) -from aai_cli.code_agent import model as model_mod from aai_cli.code_agent.agent import MUTATING_TOOLS, build_agent from aai_cli.code_agent.events import AssistantText, ErrorText, ToolCall, ToolResult from aai_cli.code_agent.render import RichRenderer, make_approver from aai_cli.code_agent.session import QUIT_COMMANDS, CodeSession, run_repl -from aai_cli.core import environments class FakeChatModel(BaseChatModel): @@ -272,208 +270,6 @@ def test_rich_renderer_smoke(capsys: pytest.CaptureFixture[str]) -> None: # --- slice-unit edge cases (cover the lazy bodies + error/guard branches) ----- -def test_build_model_targets_the_gateway(): # untyped: probes ChatOpenAI subclass attrs - m = model_mod.build_model("sk-test", model="claude-sonnet-4-6") - assert m.model_name == "claude-sonnet-4-6" - assert m.openai_api_base == environments.active().llm_gateway_base - assert m.use_responses_api is False - - -def test_build_model_flattens_list_content_for_gateway(): # untyped: probes the payload dict - from langchain_core.messages import HumanMessage, SystemMessage - - m = model_mod.build_model("sk-test", model="claude-sonnet-4-6") - # deepagents hands the model multi-block content arrays; the gateway 500s on those, - # so the model must flatten each message's content to a plain string before sending. - payload = m._get_request_payload( - [ - SystemMessage(content=[{"type": "text", "text": "a"}, {"type": "text", "text": "b"}]), - HumanMessage(content="hi"), - ] - ) - assert [msg["content"] for msg in payload["messages"]] == ["ab", "hi"] - - -def test_flatten_content_guards() -> None: - model_mod._flatten_content(None) # not a list -> early return, no error - items = ["raw", 123] # non-dict members are skipped, list left untouched - model_mod._flatten_content(items) - assert items == ["raw", 123] - - -def test_hoist_tool_call_ids_moves_id_out_of_function_only_when_missing() -> None: - # One chunk exercising every branch: each malformed variant is skipped, and only a - # tool call carrying a function-nested id gets hoisted. Hold references to the inner - # dicts so the in-place mutation is asserted with a clean type. - noid_fn: dict[str, object] = {"name": "b"} - hoist_fn: dict[str, object] = {"id": "HOIST", "name": "c", "arguments": ""} - noid_call: dict[str, object] = {"index": 1, "function": noid_fn} - hoist_call: dict[str, object] = {"index": 2, "function": hoist_fn} - tool_calls: list[object] = [ - None, # tool_call not a dict -> skipped - {"index": 0, "function": 7}, # function not a dict -> skipped - noid_call, # function has no id -> nothing to hoist - hoist_call, # the real gateway shape -> id hoisted out of function - ] - chunk: dict[str, object] = { - "choices": [ - None, # choice not a dict -> skipped - {"delta": None}, # delta not a dict -> skipped - {"delta": {"content": "hi"}}, # no tool_calls -> skipped - {"delta": {"tool_calls": 99}}, # tool_calls not a list -> skipped - {"delta": {"tool_calls": tool_calls}}, - ] - } - model_mod._hoist_tool_call_ids(chunk) - assert "id" not in noid_call # no id invented for a call that never had one - assert noid_fn == {"name": "b"} # left untouched - assert hoist_call["id"] == "HOIST" # hoisted to the top level where langchain reads it - assert "id" not in hoist_fn # and removed from function so it isn't duplicated - - -def test_hoist_tool_call_ids_guards() -> None: - model_mod._hoist_tool_call_ids(None) # not a dict -> early return, no error - model_mod._hoist_tool_call_ids({"choices": 99}) # choices not a list -> early return - - -def test_is_blank_tool_call() -> None: - assert model_mod._is_blank_tool_call({"function": {"id": "", "name": "", "arguments": ""}}) - assert model_mod._is_blank_tool_call({"function": {}}) # all fields absent - assert not model_mod._is_blank_tool_call({"function": {"name": "x"}}) # has a name - assert not model_mod._is_blank_tool_call({"function": {"id": "i"}}) # has an id - assert not model_mod._is_blank_tool_call({"function": {"arguments": "a"}}) # has arguments - assert not model_mod._is_blank_tool_call({"function": 7}) # function not a dict - assert not model_mod._is_blank_tool_call(None) # not a dict - - -def test_hoist_tool_call_ids_drops_spurious_blank_delta() -> None: - # The gateway prefixes every streamed turn with an empty tool-call delta; it must be - # dropped (else a pure-text turn dispatches a tool call with name=""). - real_fn: dict[str, object] = {"id": "toolu_X", "name": "get_weather", "arguments": ""} - real_call: dict[str, object] = {"index": 0, "function": real_fn} - delta: dict[str, object] = { - "tool_calls": [ - {"index": 0, "function": {"id": "", "name": "", "arguments": ""}}, # spurious blank - real_call, - ] - } - chunk: dict[str, object] = {"choices": [{"delta": delta}]} - model_mod._hoist_tool_call_ids(chunk) - assert delta["tool_calls"] == [real_call] # blank dropped, real call kept - assert real_call["id"] == "toolu_X" # and its id hoisted out of function - assert "id" not in real_fn - - -def test_convert_chunk_drops_spurious_blank_tool_call() -> None: - from langchain_core.messages import AIMessageChunk - from langchain_openai import ChatOpenAI - - m = model_mod.build_model("sk-test", model="claude-sonnet-4-6") - assert isinstance(m, ChatOpenAI) # narrow to the subclass that overrides the converter - # A pure-text turn's leading delta carries only the gateway's blank tool call — the - # converted chunk must surface no tool call (else deepagents dispatches a nameless tool). - chunk = { - "choices": [ - { - "index": 0, - "delta": { - "role": "assistant", - "tool_calls": [ - {"index": 0, "function": {"id": "", "name": "", "arguments": ""}} - ], - }, - "finish_reason": None, - } - ] - } - gen = m._convert_chunk_to_generation_chunk(chunk, AIMessageChunk, None) - assert gen is not None - msg = gen.message - assert isinstance(msg, AIMessageChunk) - assert msg.tool_call_chunks == [] # the phantom blank tool call is gone - - -def test_is_empty_arguments() -> None: - assert model_mod._is_empty_arguments("") # empty string - assert model_mod._is_empty_arguments(" ") # whitespace only - assert model_mod._is_empty_arguments("{}") # empty object - assert model_mod._is_empty_arguments("{ }") # empty object with whitespace - assert not model_mod._is_empty_arguments('{"path": "/"}') # real arguments - assert not model_mod._is_empty_arguments("[]") # non-dict JSON is not "empty args" - assert not model_mod._is_empty_arguments("{bad json") # unparseable -> leave alone - assert not model_mod._is_empty_arguments(None) # non-string -> leave alone - assert not model_mod._is_empty_arguments({"already": "parsed"}) # non-string -> leave alone - - -def test_ensure_tool_call_arguments_fills_only_empty_calls() -> None: - empty_fn: dict[str, object] = {"name": "ls", "arguments": "{}"} - blank_fn: dict[str, object] = {"name": "ls", "arguments": " "} - full_fn: dict[str, object] = {"name": "ls", "arguments": '{"path": "/"}'} - tool_calls: list[object] = [ - None, # tool_call not a dict -> skipped - {"id": "0", "function": 7}, # function not a dict -> skipped - {"id": "1", "function": empty_fn}, # empty object -> filled - {"id": "2", "function": blank_fn}, # whitespace -> filled - {"id": "3", "function": full_fn}, # real args -> untouched - ] - messages: list[object] = [ - None, # message not a dict -> skipped - {"role": "user", "content": "hi"}, # no tool_calls -> skipped - {"tool_calls": 99}, # tool_calls not a list -> skipped - {"tool_calls": tool_calls}, - ] - model_mod._ensure_tool_call_arguments(messages) - assert empty_fn["arguments"] == model_mod._PLACEHOLDER_ARGUMENTS - assert blank_fn["arguments"] == model_mod._PLACEHOLDER_ARGUMENTS - assert full_fn["arguments"] == '{"path": "/"}' # left untouched - - -def test_ensure_tool_call_arguments_guards() -> None: - model_mod._ensure_tool_call_arguments(None) # not a list -> early return, no error - model_mod._ensure_tool_call_arguments([{"tool_calls": 99}]) # tool_calls not a list - - -def test_get_request_payload_fills_empty_tool_call_arguments() -> None: - from langchain_core.messages import AIMessage, HumanMessage - - m = model_mod.build_model("sk-test", model="claude-sonnet-4-6") - # An assistant tool call with no arguments serializes to arguments="{}", which the gateway - # rejects (missing tool_use.input); the payload must carry the placeholder instead. - ai = AIMessage(content="", tool_calls=[{"name": "ls", "args": {}, "id": "t1"}]) - payload = m._get_request_payload([HumanMessage(content="hi"), ai]) - calls = payload["messages"][1]["tool_calls"] - assert calls[0]["function"]["arguments"] == model_mod._PLACEHOLDER_ARGUMENTS - - -def test_convert_chunk_hoists_streamed_tool_call_id() -> None: - from langchain_core.messages import AIMessageChunk - from langchain_openai import ChatOpenAI - - m = model_mod.build_model("sk-test", model="claude-sonnet-4-6") - assert isinstance(m, ChatOpenAI) # narrow to the subclass that overrides the converter - # The gateway streams the tool-call id nested inside `function`; the override must hoist - # it so langchain's converted chunk carries the id (else the reply ToolMessage gets None). - chunk = { - "choices": [ - { - "index": 0, - "delta": { - "role": "assistant", - "tool_calls": [ - {"index": 0, "function": {"id": "toolu_X", "name": "get_weather"}} - ], - }, - "finish_reason": None, - } - ] - } - gen = m._convert_chunk_to_generation_chunk(chunk, AIMessageChunk, None) - assert gen is not None - msg = gen.message - assert isinstance(msg, AIMessageChunk) - assert msg.tool_call_chunks[0]["id"] == "toolu_X" - - def test_fetch_url_fetches_and_truncates(monkeypatch: pytest.MonkeyPatch) -> None: import httpx diff --git a/tests/test_code_model.py b/tests/test_code_model.py new file mode 100644 index 0000000..047f9bf --- /dev/null +++ b/tests/test_code_model.py @@ -0,0 +1,216 @@ +"""Unit tests for the `assembly code` gateway model wiring (code_agent/model.py). + +Split out of test_code_agent.py to stay under the 500-line file gate. These cover the +``_GatewayChatOpenAI`` subclass and its helpers that paper over the LLM Gateway's +OpenAI-incompatible quirks: content flattening, streamed tool-call id hoisting, dropping +the gateway's spurious blank tool-call deltas, and filling empty tool-call arguments. +""" + +from __future__ import annotations + +from aai_cli.code_agent import model as model_mod +from aai_cli.core import environments + + +def test_build_model_targets_the_gateway(): # untyped: probes ChatOpenAI subclass attrs + m = model_mod.build_model("sk-test", model="claude-sonnet-4-6") + assert m.model_name == "claude-sonnet-4-6" + assert m.openai_api_base == environments.active().llm_gateway_base + assert m.use_responses_api is False + + +def test_build_model_flattens_list_content_for_gateway(): # untyped: probes the payload dict + from langchain_core.messages import HumanMessage, SystemMessage + + m = model_mod.build_model("sk-test", model="claude-sonnet-4-6") + # deepagents hands the model multi-block content arrays; the gateway 500s on those, + # so the model must flatten each message's content to a plain string before sending. + payload = m._get_request_payload( + [ + SystemMessage(content=[{"type": "text", "text": "a"}, {"type": "text", "text": "b"}]), + HumanMessage(content="hi"), + ] + ) + assert [msg["content"] for msg in payload["messages"]] == ["ab", "hi"] + + +def test_flatten_content_guards() -> None: + model_mod._flatten_content(None) # not a list -> early return, no error + items = ["raw", 123] # non-dict members are skipped, list left untouched + model_mod._flatten_content(items) + assert items == ["raw", 123] + + +def test_hoist_tool_call_ids_moves_id_out_of_function_only_when_missing() -> None: + # One chunk exercising every branch: each malformed variant is skipped, and only a + # tool call carrying a function-nested id gets hoisted. Hold references to the inner + # dicts so the in-place mutation is asserted with a clean type. + noid_fn: dict[str, object] = {"name": "b"} + hoist_fn: dict[str, object] = {"id": "HOIST", "name": "c", "arguments": ""} + noid_call: dict[str, object] = {"index": 1, "function": noid_fn} + hoist_call: dict[str, object] = {"index": 2, "function": hoist_fn} + tool_calls: list[object] = [ + None, # tool_call not a dict -> skipped + {"index": 0, "function": 7}, # function not a dict -> skipped + noid_call, # function has no id -> nothing to hoist + hoist_call, # the real gateway shape -> id hoisted out of function + ] + chunk: dict[str, object] = { + "choices": [ + None, # choice not a dict -> skipped + {"delta": None}, # delta not a dict -> skipped + {"delta": {"content": "hi"}}, # no tool_calls -> skipped + {"delta": {"tool_calls": 99}}, # tool_calls not a list -> skipped + {"delta": {"tool_calls": tool_calls}}, + ] + } + model_mod._hoist_tool_call_ids(chunk) + assert "id" not in noid_call # no id invented for a call that never had one + assert noid_fn == {"name": "b"} # left untouched + assert hoist_call["id"] == "HOIST" # hoisted to the top level where langchain reads it + assert "id" not in hoist_fn # and removed from function so it isn't duplicated + + +def test_hoist_tool_call_ids_guards() -> None: + model_mod._hoist_tool_call_ids(None) # not a dict -> early return, no error + model_mod._hoist_tool_call_ids({"choices": 99}) # choices not a list -> early return + + +def test_is_blank_tool_call() -> None: + assert model_mod._is_blank_tool_call({"function": {"id": "", "name": "", "arguments": ""}}) + assert model_mod._is_blank_tool_call({"function": {}}) # all fields absent + assert not model_mod._is_blank_tool_call({"function": {"name": "x"}}) # has a name + assert not model_mod._is_blank_tool_call({"function": {"id": "i"}}) # has an id + assert not model_mod._is_blank_tool_call({"function": {"arguments": "a"}}) # has arguments + assert not model_mod._is_blank_tool_call({"function": 7}) # function not a dict + assert not model_mod._is_blank_tool_call(None) # not a dict + + +def test_hoist_tool_call_ids_drops_spurious_blank_delta() -> None: + # The gateway prefixes every streamed turn with an empty tool-call delta; it must be + # dropped (else a pure-text turn dispatches a tool call with name=""). + real_fn: dict[str, object] = {"id": "toolu_X", "name": "get_weather", "arguments": ""} + real_call: dict[str, object] = {"index": 0, "function": real_fn} + delta: dict[str, object] = { + "tool_calls": [ + {"index": 0, "function": {"id": "", "name": "", "arguments": ""}}, # spurious blank + real_call, + ] + } + chunk: dict[str, object] = {"choices": [{"delta": delta}]} + model_mod._hoist_tool_call_ids(chunk) + assert delta["tool_calls"] == [real_call] # blank dropped, real call kept + assert real_call["id"] == "toolu_X" # and its id hoisted out of function + assert "id" not in real_fn + + +def test_convert_chunk_drops_spurious_blank_tool_call() -> None: + from langchain_core.messages import AIMessageChunk + from langchain_openai import ChatOpenAI + + m = model_mod.build_model("sk-test", model="claude-sonnet-4-6") + assert isinstance(m, ChatOpenAI) # narrow to the subclass that overrides the converter + # A pure-text turn's leading delta carries only the gateway's blank tool call — the + # converted chunk must surface no tool call (else deepagents dispatches a nameless tool). + chunk = { + "choices": [ + { + "index": 0, + "delta": { + "role": "assistant", + "tool_calls": [ + {"index": 0, "function": {"id": "", "name": "", "arguments": ""}} + ], + }, + "finish_reason": None, + } + ] + } + gen = m._convert_chunk_to_generation_chunk(chunk, AIMessageChunk, None) + assert gen is not None + msg = gen.message + assert isinstance(msg, AIMessageChunk) + assert msg.tool_call_chunks == [] # the phantom blank tool call is gone + + +def test_is_empty_arguments() -> None: + assert model_mod._is_empty_arguments("") # empty string + assert model_mod._is_empty_arguments(" ") # whitespace only + assert model_mod._is_empty_arguments("{}") # empty object + assert model_mod._is_empty_arguments("{ }") # empty object with whitespace + assert not model_mod._is_empty_arguments('{"path": "/"}') # real arguments + assert not model_mod._is_empty_arguments("[]") # non-dict JSON is not "empty args" + assert not model_mod._is_empty_arguments("{bad json") # unparseable -> leave alone + assert not model_mod._is_empty_arguments(None) # non-string -> leave alone + assert not model_mod._is_empty_arguments({"already": "parsed"}) # non-string -> leave alone + + +def test_ensure_tool_call_arguments_fills_only_empty_calls() -> None: + empty_fn: dict[str, object] = {"name": "ls", "arguments": "{}"} + blank_fn: dict[str, object] = {"name": "ls", "arguments": " "} + full_fn: dict[str, object] = {"name": "ls", "arguments": '{"path": "/"}'} + tool_calls: list[object] = [ + None, # tool_call not a dict -> skipped + {"id": "0", "function": 7}, # function not a dict -> skipped + {"id": "1", "function": empty_fn}, # empty object -> filled + {"id": "2", "function": blank_fn}, # whitespace -> filled + {"id": "3", "function": full_fn}, # real args -> untouched + ] + messages: list[object] = [ + None, # message not a dict -> skipped + {"role": "user", "content": "hi"}, # no tool_calls -> skipped + {"tool_calls": 99}, # tool_calls not a list -> skipped + {"tool_calls": tool_calls}, + ] + model_mod._ensure_tool_call_arguments(messages) + assert empty_fn["arguments"] == model_mod._PLACEHOLDER_ARGUMENTS + assert blank_fn["arguments"] == model_mod._PLACEHOLDER_ARGUMENTS + assert full_fn["arguments"] == '{"path": "/"}' # left untouched + + +def test_ensure_tool_call_arguments_guards() -> None: + model_mod._ensure_tool_call_arguments(None) # not a list -> early return, no error + model_mod._ensure_tool_call_arguments([{"tool_calls": 99}]) # tool_calls not a list + + +def test_get_request_payload_fills_empty_tool_call_arguments() -> None: + from langchain_core.messages import AIMessage, HumanMessage + from langchain_openai import ChatOpenAI + + m = model_mod.build_model("sk-test", model="claude-sonnet-4-6") + assert isinstance(m, ChatOpenAI) # narrow to the subclass that overrides the payload hook + # An assistant tool call with no arguments serializes to arguments="{}", which the gateway + # rejects (missing tool_use.input); the payload must carry the placeholder instead. + ai = AIMessage(content="", tool_calls=[{"name": "ls", "args": {}, "id": "t1"}]) + payload = m._get_request_payload([HumanMessage(content="hi"), ai]) + calls = payload["messages"][1]["tool_calls"] + assert calls[0]["function"]["arguments"] == model_mod._PLACEHOLDER_ARGUMENTS + + +def test_convert_chunk_hoists_streamed_tool_call_id() -> None: + from langchain_core.messages import AIMessageChunk + from langchain_openai import ChatOpenAI + + m = model_mod.build_model("sk-test", model="claude-sonnet-4-6") + assert isinstance(m, ChatOpenAI) # narrow to the subclass that overrides the converter + # The gateway streams the tool-call id nested inside `function`; the override must hoist + # it so langchain's converted chunk carries the id (else the reply ToolMessage gets None). + chunk = { + "choices": [ + { + "index": 0, + "delta": { + "role": "assistant", + "tool_calls": [ + {"index": 0, "function": {"id": "toolu_X", "name": "get_weather"}} + ], + }, + "finish_reason": None, + } + ] + } + gen = m._convert_chunk_to_generation_chunk(chunk, AIMessageChunk, None) + assert gen is not None + msg = gen.message + assert isinstance(msg, AIMessageChunk) + assert msg.tool_call_chunks[0]["id"] == "toolu_X" From 8a84c96b310e68e15c04571d643e0ff7e8dabb78 Mon Sep 17 00:00:00 2001 From: Alex Kroman Date: Thu, 18 Jun 2026 15:36:08 -0700 Subject: [PATCH 6/6] Fix two flaky Windows TUI tests (teardown races) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - code_agent/tui.py: guard on_worker_state_changed on is_running — a turn worker can finish after the app tears down (quit / test run_test exit), and driving _finish_turn then queries an unmounted DOM (NoMatches on "#spinner"). Skip it when the app isn't running. Fixes test_code_tui_voice flakiness on Windows. - test_live_tui.py: the reply-text assertion waited only for the AssistantMessage widget to mount, but agent_transcript sets the text via a separate call_from_thread hop — so the wait raced the text on a slow runner. Wait for the text itself. - pyrightconfig.tests.json: ignore test_code_tui_voice.py alongside the other Textual-boundary test files (it now duck-types a Worker.StateChanged event). Co-Authored-By: Claude Opus 4.8 (1M context) --- aai_cli/code_agent/tui.py | 4 +++- pyrightconfig.tests.json | 1 + tests/test_code_tui_voice.py | 22 ++++++++++++++++++++++ tests/test_live_tui.py | 11 ++++++++++- 4 files changed, 36 insertions(+), 2 deletions(-) diff --git a/aai_cli/code_agent/tui.py b/aai_cli/code_agent/tui.py index f5bb44a..5d5119d 100644 --- a/aai_cli/code_agent/tui.py +++ b/aai_cli/code_agent/tui.py @@ -481,7 +481,9 @@ def _stop_spinner(self) -> None: self.query_one("#spinner", Static).display = False def on_worker_state_changed(self, event: Worker.StateChanged) -> None: - if event.worker.is_finished: + # Guard on is_running: a worker finishing *after* the app tears down (quit / test exit) + # would drive _finish_turn against an unmounted DOM — NoMatches on "#spinner", a flake. + if event.worker.is_finished and self.is_running: self._finish_turn() def _finish_turn(self) -> None: diff --git a/pyrightconfig.tests.json b/pyrightconfig.tests.json index d49b21b..93980ab 100644 --- a/pyrightconfig.tests.json +++ b/pyrightconfig.tests.json @@ -5,6 +5,7 @@ "tests/test_code_model.py", "tests/test_code_command.py", "tests/test_code_tui.py", + "tests/test_code_tui_voice.py", "tests/test_agent_cascade_brain.py" ], "pythonVersion": "3.12", diff --git a/tests/test_code_tui_voice.py b/tests/test_code_tui_voice.py index 615fddc..2ec6448 100644 --- a/tests/test_code_tui_voice.py +++ b/tests/test_code_tui_voice.py @@ -9,6 +9,7 @@ from __future__ import annotations import asyncio +from types import SimpleNamespace import pytest from langchain_core.messages import AIMessage, HumanMessage @@ -118,6 +119,27 @@ async def go() -> None: _run(go()) +def test_finished_worker_is_ignored_once_the_app_stops_running(): # untyped: duck-typed event + # A turn worker can finish *after* the app starts tearing down; driving _finish_turn then + # queries an unmounted DOM (NoMatches on #spinner — a Windows CI flake). on_worker_state_changed + # must skip it when the app isn't running, and handle it when it is. + app = CodeAgentApp(agent=FakeAgent([])) + calls: list[bool] = [] + app._finish_turn = lambda: calls.append(True) # spy + finished = SimpleNamespace(worker=SimpleNamespace(is_finished=True)) + + assert app.is_running is False # never mounted -> torn-down-equivalent + app.on_worker_state_changed(finished) # duck-typed event stands in for Worker.StateChanged + assert calls == [] # guarded out: no _finish_turn against a dead DOM + + async def go() -> None: + async with app.run_test(size=(100, 30)): + app.on_worker_state_changed(finished) + + _run(go()) + assert calls == [True] # running -> the finished turn is handled + + def test_capture_voice_turn_is_a_noop_once_typed() -> None: async def go() -> None: voice = FakeVoice(transcripts=["ignored"]) diff --git a/tests/test_live_tui.py b/tests/test_live_tui.py index ba92a06..732f9e4 100644 --- a/tests/test_live_tui.py +++ b/tests/test_live_tui.py @@ -232,7 +232,16 @@ def run_conversation(renderer) -> None: app = _app(run_conversation=run_conversation, on_stop=done.set) async with app.run_test(size=(100, 30)) as pilot: - assert await _wait_until(pilot, lambda: bool(app.query(AssistantMessage))) + # Wait for the reply *text* to land, not just the widget to mount: agent_transcript + # sets the text via a separate call_from_thread hop, so a widget-only wait races it + # (empty text on a slow runner — a Windows CI flake). + assert await _wait_until( + pilot, + lambda: ( + bool(app.query(AssistantMessage)) + and app.query_one(AssistantMessage).text == "Done. " + ), + ) assert "» turn it up" in str(app.query_one(UserMessage).render()) assert app.query_one(AssistantMessage).text == "Done. " assert done.is_set() # leaving the run_test context unmounted -> on_stop released it