From 222c030fd7b3ebb46c51a2a71a63c1adcab931b1 Mon Sep 17 00:00:00 2001 From: Alex Kroman Date: Wed, 17 Jun 2026 21:19:38 -0700 Subject: [PATCH 1/3] Fix `assembly code` TUI: CLI-style approval, voice-mode banner, mic robustness, unique sessions MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Approval prompt: replace the Textual Button row with a plain y/a/n keyboard-hint line, so it reads like a CLI prompt rather than chrome. - Voice mode banner: defer the first mic open until after the splash paints (call_after_refresh) — opening PortAudio inline on mount raced Textual's initial render and left the banner blank until a resize/focus repaint. - Mic open: redirect PortAudio's C-level stderr noise (which corrupted the TUI screen) via a safe-by-construction stdio.suppress_native_stderr; and on a mono open failure, reopen at the device's real channel count and downmix to mono, with a clear permission error when the device exposes 0 channels. - Sessions: give each `assembly code` run a unique thread id instead of reusing a fixed "default" thread (which silently resumed prior chats); `--session NAME` still resumes a named one. Gates verified pre-commit: ruff, pyright, mypy, full pytest suite, 100% patch coverage, mutation gate. Full check.sh not run at user request. 
Co-Authored-By: Claude Opus 4.8 (1M context) --- aai_cli/code_agent/store.py | 15 +++ aai_cli/code_agent/tui.py | 32 +++---- aai_cli/commands/code/__init__.py | 11 ++- aai_cli/core/microphone.py | 96 ++++++++++++++++--- aai_cli/core/stdio.py | 52 +++++++++- .../test_snapshots_help_run.ambr | 5 +- tests/test_code_agent.py | 7 ++ tests/test_code_command.py | 15 ++- tests/test_code_tui.py | 29 ++---- tests/test_microphone.py | 79 +++++++++++++++ tests/test_stdio.py | 57 +++++++++++ 11 files changed, 342 insertions(+), 56 deletions(-) diff --git a/aai_cli/code_agent/store.py b/aai_cli/code_agent/store.py index 7c0b2975..01b218da 100644 --- a/aai_cli/code_agent/store.py +++ b/aai_cli/code_agent/store.py @@ -8,6 +8,7 @@ from __future__ import annotations +import uuid from pathlib import Path from typing import TYPE_CHECKING @@ -18,6 +19,20 @@ _APP = "assemblyai" +# Length of a generated session id — short enough to read off the splash and retype as +# ``--session <id>`` to resume, with ample uniqueness for one user's sessions. +_SESSION_ID_LEN = 12 + + +def new_session_id() -> str: + """A fresh, unique session id so each run starts a clean conversation by default. + + `assembly code` no longer reuses a fixed ``"default"`` thread (which silently resumed the + previous conversation); each run gets its own id unless ``--session NAME`` names one to + resume. Shown on the splash as ``Thread: <id>`` so it can be resumed later. 
+ """ + return uuid.uuid4().hex[:_SESSION_ID_LEN] + def sessions_db_path() -> Path: """Path to the SQLite file holding persisted coding sessions (dir created).""" diff --git a/aai_cli/code_agent/tui.py b/aai_cli/code_agent/tui.py index cb699cf1..264d64c9 100644 --- a/aai_cli/code_agent/tui.py +++ b/aai_cli/code_agent/tui.py @@ -19,7 +19,7 @@ from textual.app import App, ComposeResult from textual.containers import Horizontal, Vertical from textual.screen import ModalScreen -from textual.widgets import Button, Input, Label, RichLog, Static +from textual.widgets import Input, Label, RichLog, Static from textual.worker import Worker from aai_cli.code_agent import banner @@ -60,11 +60,6 @@ def _spinner_text(elapsed_s: int, frame: str) -> str: return f"{frame} Working… ({elapsed_s}s)" -def _approval_decision(button_id: str | None) -> str: - """Map a pressed approval button's id to a decision, defaulting to reject if unset.""" - return button_id or "reject" - - def _abbrev_home(path: Path) -> str: """Render ``path`` with the home directory collapsed to ``~``.""" try: @@ -97,8 +92,10 @@ def _status_text(cwd: Path, *, auto_approve: bool) -> str: class ApprovalScreen(ModalScreen[str]): """A compact, bottom-docked prompt to approve/auto-approve/reject one tool call. - The transparent screen background leaves the transcript visible above (no full-screen - takeover); the decision is one of ``"approve"``, ``"auto"``, or ``"reject"``. + Keyboard-only — a plain one-line ``y / a / n`` hint instead of clickable buttons, so it + reads like a CLI prompt rather than a chrome-heavy dialog. The transparent screen + background leaves the transcript visible above (no full-screen takeover); the decision is + one of ``"approve"``, ``"auto"``, or ``"reject"``. 
""" DEFAULT_CSS = """ @@ -108,8 +105,6 @@ class ApprovalScreen(ModalScreen[str]): border: round #f59e0b; background: #000000; padding: 0 1; margin: 0 1 1 1; } ApprovalScreen #approvalbox Label { height: auto; } - ApprovalScreen #approvalbox Horizontal { height: auto; } - ApprovalScreen #approvalbox Button { margin: 0 1 0 0; } """ BINDINGS: ClassVar = [ ("y", "approve", "Approve"), @@ -128,13 +123,10 @@ def compose(self) -> ComposeResult: f"Run tool [b]{escape(self._tool_name)}[/b]? " f"[dim]{escape(_format_args(self._args))}[/dim]" ) - with Horizontal(): - yield Button("Approve (y)", id="approve", variant="success") - yield Button("Auto-approve (a)", id="auto", variant="primary") - yield Button("Reject (n)", id="reject", variant="error") - - def on_button_pressed(self, event: Button.Pressed) -> None: - self.dismiss(_approval_decision(event.button.id)) + yield Label( + f"[b #22c55e]y[/] approve [b {banner.BRAND_HEX}]a[/] auto-approve " + "[b #f04438]n[/] reject" + ) def action_approve(self) -> None: self.dismiss("approve") @@ -270,7 +262,11 @@ def on_mount(self) -> None: if self._initial: self._submit(self._initial) else: - self._begin_listening() # in voice mode, capture the first spoken turn + # Defer the first mic open until *after* the splash has painted. Opening PortAudio + # is a GIL-holding C call; run inline on mount it races Textual's initial render and + # the banner never flushes — it stays blank until a resize/focus forces a full + # repaint. call_after_refresh runs once the screen is on-screen, so the splash wins. 
+ self.call_after_refresh(self._begin_listening) # in voice mode, capture first turn # --- event rendering (always called on the UI thread) --------------------- diff --git a/aai_cli/commands/code/__init__.py b/aai_cli/commands/code/__init__.py index b37052d0..9d71d404 100644 --- a/aai_cli/commands/code/__init__.py +++ b/aai_cli/commands/code/__init__.py @@ -6,6 +6,7 @@ from aai_cli import command_registry, help_panels from aai_cli.app.context import run_with_options +from aai_cli.code_agent import store from aai_cli.code_agent.prompt import DEFAULT_MODEL from aai_cli.commands.code import _exec as code_exec from aai_cli.core import llm as gateway @@ -62,8 +63,10 @@ def code( memory: bool = typer.Option( True, "--memory/--no-memory", help="Load and persist the agent's long-term memory" ), - session: str = typer.Option( - "default", "--session", help="Conversation session name (reuse to resume it)" + session: str | None = typer.Option( + None, + "--session", + help="Resume a named session. Default: a new unique session each run", ), persist: bool = typer.Option( True, "--persist/--fresh", help="Persist the session to disk (--fresh: ephemeral)" @@ -98,7 +101,9 @@ def code( skills=skills, web=web, memory=memory, - session=session, + # No --session given -> a fresh unique id, so each run starts a clean conversation + # instead of silently resuming the previous one. + session=session if session is not None else store.new_session_id(), persist=persist, tui=tui, voice=voice, diff --git a/aai_cli/core/microphone.py b/aai_cli/core/microphone.py index 755858f1..e75576d4 100644 --- a/aai_cli/core/microphone.py +++ b/aai_cli/core/microphone.py @@ -6,6 +6,7 @@ from types import ModuleType from typing import Any, Protocol, cast +from aai_cli.core import stdio from aai_cli.core.errors import CLIError with warnings.catch_warnings(): @@ -17,6 +18,8 @@ # Used when the device's native rate can't be determined (e.g. headless CI). 
_FALLBACK_RATE = 48000 +# Channel count for the multichannel-input fallback: capture stereo, then downmix to mono. +_STEREO_CHANNELS = 2 class _RawInputStream(Protocol): @@ -82,7 +85,11 @@ def default_rate(kind: str, device: int | None = None) -> int: """ sd = _sounddevice() try: - raw_rate = sd.query_devices(device, kind).get("default_samplerate", _FALLBACK_RATE) + # query_devices triggers PortAudio's lazy init, which prints device-probe noise to + # the C-level stderr; suppress it so a TUI mic-open can't corrupt the rendered screen. + with stdio.suppress_native_stderr(): + devices = sd.query_devices(device, kind) + raw_rate = devices.get("default_samplerate", _FALLBACK_RATE) if not isinstance(raw_rate, str | int | float): return _FALLBACK_RATE rate = int(float(raw_rate)) @@ -104,35 +111,100 @@ def resample_pcm16(chunk: bytes, state: Any, *, src_rate: int, dst_rate: int) -> class _SoundDeviceMic: """Iterator of PCM16 byte chunks from a sounddevice raw input stream. - Yields ~100 ms blocks; closeable so MicrophoneSource can tear it down. + Yields ~100 ms blocks; closeable so MicrophoneSource can tear it down. When opened with + ``channels=2`` (the multichannel-input fallback below), each interleaved stereo block is + downmixed to mono so downstream — resampling and the STT stream — always sees one channel. """ - def __init__(self, stream: _RawInputStream, blocksize: int) -> None: + def __init__(self, stream: _RawInputStream, blocksize: int, *, channels: int = 1) -> None: self._stream = stream self._blocksize = blocksize + self._channels = channels def __iter__(self) -> Iterator[bytes]: return self def __next__(self) -> bytes: data, _overflowed = self._stream.read(self._blocksize) - return bytes(data) + pcm = bytes(data) + if self._channels == _STEREO_CHANNELS: + # Average L/R into a single channel (width=2 → int16). 
+ pcm = audioop.tomono(pcm, 2, 0.5, 0.5) + return pcm def close(self) -> None: self._stream.stop() self._stream.close() +def _open_input_stream( + sd: _SoundDeviceModule, *, sample_rate: int, device: int | None, channels: int, blocksize: int +) -> _RawInputStream: + """Open and start a PCM16 input stream at ``channels`` channels. + + Wrapped in ``suppress_native_stderr`` because opening/starting is PortAudio's stderr-noisy + moment — kept off the terminal so a TUI mic-open can't corrupt the rendered screen. + """ + with stdio.suppress_native_stderr(): + stream = sd.RawInputStream( + samplerate=sample_rate, + device=device, + channels=channels, + dtype="int16", + blocksize=blocksize, + ) + stream.start() + return stream + + +def _max_input_channels(sd: _SoundDeviceModule, device: int | None) -> int: + """The device's advertised input-channel count (0 when it exposes no input).""" + with stdio.suppress_native_stderr(): + info = sd.query_devices(device, "input") + raw = info.get("max_input_channels", 0) + return raw if isinstance(raw, int) else 0 + + def _default_mic_stream(*, sample_rate: int, device: int | None) -> Iterator[bytes]: - """A sounddevice-backed PCM16 mic stream (imported lazily to keep startup fast).""" - sd = _sounddevice() + """A sounddevice-backed PCM16 mono mic stream (imported lazily to keep startup fast). + Tries a mono open first. PortAudio rejects ``channels=1`` (``-9998``) when the device + exposes no usable mono input: either it has zero input channels (no mic permission, or the + default input isn't a microphone) — which no channel count can fix, so we raise an + actionable error — or it's a multichannel-only input, which we reopen at stereo and + downmix. Devices that already do mono never reach the fallback. 
+ """ + sd = _sounddevice() blocksize = max(1, sample_rate // 10) # ~100 ms per read - stream = sd.RawInputStream( - samplerate=sample_rate, device=device, channels=1, dtype="int16", blocksize=blocksize - ) - stream.start() - return _SoundDeviceMic(stream, blocksize) + try: + return _SoundDeviceMic( + _open_input_stream( + sd, sample_rate=sample_rate, device=device, channels=1, blocksize=blocksize + ), + blocksize, + ) + except Exception: + max_in = _max_input_channels(sd, device) + if max_in < 1: + raise CLIError( + "The default microphone reports no input channels.", + error_type="mic_error", + exit_code=1, + suggestion=( + "Grant microphone access to your terminal in System Settings > Privacy & " + "Security > Microphone, or pick another input with --device." + ), + ) from None + if max_in < _STEREO_CHANNELS: + raise # a 1-channel device should accept mono; surface the real PortAudio error + stream = _open_input_stream( + sd, + sample_rate=sample_rate, + device=device, + channels=_STEREO_CHANNELS, + blocksize=blocksize, + ) + return _SoundDeviceMic(stream, blocksize, channels=_STEREO_CHANNELS) class MicrophoneSource: @@ -172,6 +244,8 @@ def __iter__(self) -> Iterator[bytes]: stream: Any = self._factory(sample_rate=self._capture_rate, device=self.device) except ImportError as exc: raise audio_missing_error() from exc + except CLIError: + raise # the factory already raised an actionable error; don't bury it in a re-wrap except Exception as exc: # "device None" reads like a bug; name the default mic in plain words. target = ( diff --git a/aai_cli/core/stdio.py b/aai_cli/core/stdio.py index 05db2ef3..f6905644 100644 --- a/aai_cli/core/stdio.py +++ b/aai_cli/core/stdio.py @@ -3,7 +3,57 @@ import contextlib import os import sys -from collections.abc import Iterator +from collections.abc import Generator, Iterator + +# The OS-level stderr descriptor. 
Used as a literal rather than ``sys.stderr.fileno()`` +# because inside a Textual app ``sys.stderr`` is swapped for a redirector whose ``fileno()`` +# returns an unusable fd — duping that raised EBADF and broke the very mic open we were +# trying to quiet. fd 2 is the real inherited stderr, which Textual never touches. +_STDERR_FD = 2 + + +@contextlib.contextmanager +def suppress_native_stderr() -> Generator[None]: + """Send OS-level stderr to /dev/null for the block, then restore it. + + Catches diagnostics that C extensions write straight to fd 2 via ``fprintf`` — + PortAudio/CoreAudio/ALSA print device-probe noise there on the first audio call, + *below* Python's logging, so silencing loggers can't reach it. Inside a full-screen + TUI (which draws to stdout and never repaints stderr) those raw writes scribble over + the rendered screen; the mic-open path wraps its PortAudio calls in this so they land + in the void instead. + + Safe by construction: if the descriptor can't be duplicated/redirected for any reason, + the block runs with stderr untouched rather than raising — suppression is cosmetic and + must never break the operation it wraps. Exceptions from the body propagate normally + (only the fd is redirected, not raised errors). + """ + saved_fd: int | None = None + devnull_fd: int | None = None + try: + saved_fd = os.dup(_STDERR_FD) + devnull_fd = os.open(os.devnull, os.O_WRONLY) + os.dup2(devnull_fd, _STDERR_FD) + except OSError: + # Couldn't redirect — abandon suppression, never break the caller. 
+ _close_quietly(saved_fd) + _close_quietly(devnull_fd) + yield + return + try: + yield + finally: + with contextlib.suppress(OSError): + os.dup2(saved_fd, _STDERR_FD) # restore the real stderr + _close_quietly(saved_fd) + _close_quietly(devnull_fd) + + +def _close_quietly(fd: int | None) -> None: + """Close ``fd`` if it was opened, ignoring an already-closed/invalid descriptor.""" + if fd is not None: + with contextlib.suppress(OSError): + os.close(fd) def silence_stdout() -> None: diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr index 2879f6f9..a36aa130 100644 --- a/tests/__snapshots__/test_snapshots_help_run.ambr +++ b/tests/__snapshots__/test_snapshots_help_run.ambr @@ -416,9 +416,8 @@ │ --memory --no-memory Load and persist the agent's │ │ long-term memory │ │ [default: memory] │ - │ --session TEXT Conversation session name (reuse to │ - │ resume it) │ - │ [default: default] │ + │ --session TEXT Resume a named session. 
Default: a │ + │ new unique session each run │ │ --persist --fresh Persist the session to disk │ │ (--fresh: ephemeral) │ │ [default: persist] │ diff --git a/tests/test_code_agent.py b/tests/test_code_agent.py index 739285bf..76af37af 100644 --- a/tests/test_code_agent.py +++ b/tests/test_code_agent.py @@ -166,6 +166,13 @@ def test_checkpointer_in_memory_vs_sqlite(tmp_path, monkeypatch): # untyped: to saver.conn.close() +def test_new_session_id_is_unique_and_short() -> None: + a = store.new_session_id() + b = store.new_session_id() + assert a != b # each run gets its own thread id (no silent resume of a shared default) + assert len(a) == 12 and a.isalnum() # short hex, readable off the splash to resume later + + def test_cli_tool_invokes_runner_with_args() -> None: captured: list[list[str]] = [] diff --git a/tests/test_code_command.py b/tests/test_code_command.py index a4384db9..b548cd6e 100644 --- a/tests/test_code_command.py +++ b/tests/test_code_command.py @@ -41,7 +41,20 @@ def test_command_parses_flags_into_options(monkeypatch): opts = captured["o"] assert opts.prompt == "build a thing" assert opts.auto is True and opts.web is False - assert opts.session == "s1" and opts.persist is False + assert opts.session == "s1" and opts.persist is False # an explicit --session is honored + + +def test_command_defaults_to_a_fresh_unique_session_each_run(monkeypatch): + # No --session: each invocation gets its own id (so a run never silently resumes the + # previous conversation), and two runs differ. 
+ seen = [] + monkeypatch.setattr( + _exec, "run_code", lambda opts, state, *, json_mode: seen.append(opts.session) + ) + assert runner.invoke(app, ["code"]).exit_code == 0 + assert runner.invoke(app, ["code"]).exit_code == 0 + assert seen[0] != "default" # not the old shared, auto-resumed thread + assert seen[0] and seen[1] and seen[0] != seen[1] # a distinct id per run def test_run_code_dispatches_to_tui_with_voice_by_default_when_tty(monkeypatch): diff --git a/tests/test_code_tui.py b/tests/test_code_tui.py index df36ed0d..8abeee08 100644 --- a/tests/test_code_tui.py +++ b/tests/test_code_tui.py @@ -14,7 +14,7 @@ import pytest from langchain_core.messages import AIMessage, HumanMessage -from textual.widgets import Input, RichLog, Static +from textual.widgets import Input, Label, RichLog, Static from aai_cli.code_agent import tui from aai_cli.code_agent.events import AssistantText, ErrorText, ToolCall, ToolResult @@ -51,14 +51,6 @@ def test_format_args_and_abbrev_home() -> None: assert tui._abbrev_home(outside) == str(outside) -def test_approval_decision_defaults_to_reject() -> None: - assert tui._approval_decision("approve") == "approve" - assert tui._approval_decision("auto") == "auto" - # A button with no id (Textual allows None) is treated as a rejection, not approval. - assert tui._approval_decision(None) == "reject" - assert tui._approval_decision("") == "reject" - - def test_git_branch_and_status(tmp_path: Path) -> None: assert tui._git_branch(tmp_path) is None # no .git (tmp_path / ".git").mkdir() @@ -206,23 +198,22 @@ async def go() -> None: _run(go()) -def test_approval_button_press_dismisses() -> None: - # Covers ApprovalScreen.on_button_pressed (the click path; key paths are covered - # by the approve/reject modal tests above). The bracketed name/args also guard the - # compose() escape() — without it, Label markup parsing would raise on mount. 
- results: list[str | None] = [] - +def test_approval_prompt_renders_keyboard_hint() -> None: + # The prompt is a plain y/a/n keyboard hint, not clickable buttons — assert each + # option's copy renders so dropping one is caught. The bracketed name/args also guard + # the compose() escape(): without it, Label markup parsing would raise on mount. async def go() -> None: app = CodeAgentApp(agent=FakeAgent([])) async with app.run_test(size=(100, 30)) as pilot: await pilot.pause() - app.push_screen(ApprovalScreen("exec[", {"cmd": "[ls"}), results.append) - await pilot.pause() - await pilot.click("#reject") + app.push_screen(ApprovalScreen("exec[", {"cmd": "[ls"})) await pilot.pause() + rendered = " ".join(str(label.render()) for label in app.screen.query(Label)) + assert "approve" in rendered + assert "auto-approve" in rendered + assert "reject" in rendered _run(go()) - assert results == ["reject"] def test_approval_box_is_compact_and_bottom_docked() -> None: diff --git a/tests/test_microphone.py b/tests/test_microphone.py index d215e187..207f6b4a 100644 --- a/tests/test_microphone.py +++ b/tests/test_microphone.py @@ -286,3 +286,82 @@ def test_default_mic_stream_missing_sounddevice_raises_mic_missing(monkeypatch): _default_mic_stream(sample_rate=16000, device=None) assert exc.value.error_type == "mic_missing" assert exc.value.exit_code == 2 + + +class _FakeStereoStream(_FakeRawStream): + """A 2-channel input stream: one interleaved stereo frame (L=256, R=768).""" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + # int16 LE: L=256 (b"\x00\x01"), R=768 (b"\x00\x03"), interleaved one frame. + self._chunks = [(b"\x00\x01\x00\x03", False)] + + +def test_sounddevice_mic_downmixes_stereo_to_mono(): + # channels=2 averages L/R per frame: (256 + 768) / 2 == 512 (b"\x00\x02"). 
+ mic = _SoundDeviceMic(_FakeStereoStream(), blocksize=1, channels=2) + assert next(iter(mic)) == b"\x00\x02" + + +def _fake_sd_rejecting_mono(max_input_channels: int, opened: list[int]) -> Any: + """A sounddevice whose mono open fails with -9998; query reports ``max_input_channels``.""" + + def raw_input_stream(**kwargs): + opened.append(kwargs["channels"]) + if kwargs["channels"] == 1: + raise OSError("Error opening RawInputStream: Invalid number of channels [-9998]") + return _FakeStereoStream(**kwargs) + + fake_sd: Any = types.ModuleType("sounddevice") + fake_sd.RawInputStream = raw_input_stream + fake_sd.query_devices = lambda device, kind: {"max_input_channels": max_input_channels} + return fake_sd + + +def test_default_mic_stream_falls_back_to_stereo_downmix(monkeypatch): + # A multichannel-only input (mono rejected, but >=2 channels available) is reopened at + # stereo and downmixed to mono — so voice works on devices that won't open as mono. + opened: list[int] = [] + monkeypatch.setitem(sys.modules, "sounddevice", _fake_sd_rejecting_mono(2, opened)) + stream = _default_mic_stream(sample_rate=16000, device=None) + assert opened == [1, 2] # tried mono, then reopened stereo + assert next(iter(stream)) == b"\x00\x02" # yields downmixed mono + + +def test_default_mic_stream_zero_input_channels_raises_permission_error(monkeypatch): + # 0 input channels can't be salvaged (no mic permission / wrong default device): raise an + # actionable error pointing at the macOS Microphone privacy setting, not the cryptic code. 
+ opened: list[int] = [] + monkeypatch.setitem(sys.modules, "sounddevice", _fake_sd_rejecting_mono(0, opened)) + with pytest.raises(CLIError) as exc: + _default_mic_stream(sample_rate=16000, device=None) + assert opened == [1] # only the mono attempt; no pointless stereo retry + assert exc.value.error_type == "mic_error" + assert "no input channels" in exc.value.message.lower() + assert exc.value.suggestion is not None + assert "Microphone" in exc.value.suggestion + + +def test_default_mic_stream_single_channel_failure_reraises_original(monkeypatch): + # A genuine 1-channel device should accept mono; if it still failed, the channel fallback + # can't help, so surface the real PortAudio error rather than masking it. + opened: list[int] = [] + monkeypatch.setitem(sys.modules, "sounddevice", _fake_sd_rejecting_mono(1, opened)) + with pytest.raises(OSError, match="Invalid number of channels"): + _default_mic_stream(sample_rate=16000, device=None) + assert opened == [1] # no stereo retry on a 1-channel device + + +def test_microphone_source_passes_through_factory_clierror(): + # An actionable CLIError from the factory (e.g. the zero-channel case) must propagate + # intact, not get re-wrapped into the generic "Could not open" message. + err = CLIError("no input channels", error_type="mic_error", exit_code=1, suggestion="grant it") + + def boom(**_kwargs): + raise err + + mic = MicrophoneSource(capture_rate=16000, stream_factory=boom) + with pytest.raises(CLIError) as exc: + list(mic) + assert exc.value is err # passed through unchanged + assert exc.value.suggestion == "grant it" diff --git a/tests/test_stdio.py b/tests/test_stdio.py index 9b0e2fc0..8a9a58f7 100644 --- a/tests/test_stdio.py +++ b/tests/test_stdio.py @@ -1,4 +1,5 @@ import io +import os from aai_cli.core import stdio @@ -121,3 +122,59 @@ def boom(*_a, **_k): # Raising inside the suppressed block must not propagate. 
monkeypatch.setattr("os.open", boom) stdio.silence_stdout() + + +def test_suppress_native_stderr_redirects_during_block_then_restores(monkeypatch): + # The fd dance: dup the real stderr (fd 2 itself — never sys.stderr.fileno(), which is + # an unusable redirector inside a TUI), point it at /dev/null for the body, then restore + # and close both temporaries. The body must run *while* redirected (between the dup2s). + events: list[object] = [] + monkeypatch.setattr("os.dup", lambda fd: events.append(("dup", fd)) or 50) + monkeypatch.setattr("os.open", lambda path, flags: events.append(("open", path)) or 99) + monkeypatch.setattr("os.dup2", lambda src, dst: events.append(("dup2", src, dst))) + monkeypatch.setattr("os.close", lambda fd: events.append(("close", fd))) + + with stdio.suppress_native_stderr(): + events.append("body") + + assert events == [ + ("dup", 2), # save the real stderr fd (literal 2) + ("open", os.devnull), # open /dev/null + ("dup2", 99, 2), # point stderr at /dev/null + "body", # the block runs while stderr is redirected + ("dup2", 50, 2), # restore the saved fd + ("close", 50), + ("close", 99), + ] + + +def test_suppress_native_stderr_runs_body_when_redirect_fails(monkeypatch): + # Safe by construction: if the fd can't be duplicated, the block still runs (suppression + # is cosmetic and must never break the wrapped mic open) and stderr is never redirected. + def boom(_fd: int) -> int: + raise OSError("cannot dup") + + redirected: list[tuple[int, int]] = [] + monkeypatch.setattr("os.dup", boom) + monkeypatch.setattr("os.dup2", lambda src, dst: redirected.append((src, dst))) + ran: list[bool] = [] + + with stdio.suppress_native_stderr(): + ran.append(True) + + assert ran == [True] # body ran despite the dup failure + assert redirected == [] # never redirected -> nothing left to restore + + +def test_suppress_native_stderr_swallows_close_failure(monkeypatch): + # A teardown close hitting an already-closed/invalid fd must not escape the block. 
+ def boom(_fd: int) -> None: + raise OSError("already closed") + + monkeypatch.setattr("os.dup", lambda _fd: 50) + monkeypatch.setattr("os.open", lambda _path, _flags: 99) + monkeypatch.setattr("os.dup2", lambda _src, _dst: None) + monkeypatch.setattr("os.close", boom) + + with stdio.suppress_native_stderr(): + pass # exits cleanly even though both teardown closes raise From af0d3a8844914ea762961435e3a257721dfb28fe Mon Sep 17 00:00:00 2001 From: Alex Kroman <12372+alexkroman@users.noreply.github.com> Date: Thu, 18 Jun 2026 09:20:59 -0700 Subject: [PATCH 2/3] Refactor TUI into modular components; rename agent-cascade to live (#240) ## Summary This PR refactors the coding-agent TUI into smaller, more maintainable modules while keeping the main `CodeAgentApp` class intact. It also renames the `agent-cascade` command to `live` for clarity. The changes improve code organization without altering user-facing behavior. ## Key Changes **TUI Refactoring:** - **Split `tui.py`** into focused modules to stay under the 500-line file-length gate: - `modals.py`: `ApprovalScreen` and `AskScreen` modal dialogs with voice support - `messages.py`: Transcript widget classes (`UserMessage`, `AssistantMessage`, `ToolOutput`, etc.) - `voice_ui.py`: Voice capture/readback mechanics (`_VoiceIO` protocol, `_VoiceLegs` mixin) - `tui_status.py`: Pure text helpers for status line and spinner (`_spinner_text`, `_status_text`, etc.) 
- `summarize.py`: Tool activity summaries shared by TUI and Rich fallback - `risk.py`: Risk heuristics for tool approval prompts - **Extracted helper modules:** - `agent_cascade/brain.py`: Deepagents graph builder for the live cascade (system prompt, tool guidance, completer) - Moved `approval_from_speech` mapping to `modals.py` for voice-answerable approval **Command Rename:** - Renamed `agent-cascade` command to `live` throughout (command registration, help text, tests, docs, templates) **Test Reorganization:** - Split large test files to stay under the gate: - `test_code_modals.py`: Modal screen tests with voice doubles - `test_code_messages.py`: Transcript widget rendering tests - `test_code_tui_voice.py`: Voice toggle and readback tests (expanded) - `test_code_tui_status.py`: Pure status/spinner text helpers - `test_code_summarize.py`: Tool summarization tests - `test_code_session_stream.py`: Streaming and cancellation tests - `test_code_risk.py`: Risk heuristic tests - `test_agent_cascade_brain.py`: Deepagents graph builder tests **Installer Improvements:** - Enhanced `install.sh` with dev mode support (`--install-method git` / `--dev`) - Added usage help and environment variable overrides - Supports both release (published) and editable (development) installs **Events & Session:** - Added `AssistantDelta` event for per-token streaming (frozen, hashable) - Updated `CodeSession` to handle dual-mode streaming (`values` + `messages`) ## Implementation Details - **Voice modals** speak prompts and listen for spoken replies off the UI thread (daemon threads), marshaling back via `call_from_thread` - **Risk warnings** are pure functions (no Textual imports) so they unit-test cleanly - **Summarizers** clip long tool args/output to keep the transcript scannable (mirroring deepagents-code's collapsible rows) - **Deepagents brain** builds system prompts that advertise only available tools, preventing the agent from narrating actions it can't take - All refactored 
modules maintain the same public APIs; `CodeAgentApp` remains the single entry point ## Testing - New test files cover the extracted modules with real Textual app headless tests and pure function unit tests - Snapshot tests updated for the `live` command rename - CI workflow enhanced with end-to-end install.sh validation https://claude.ai/code/session_01Ad72JciKrsz4TKG7ZY9GR6 --------- Co-authored-by: Claude --- .github/workflows/ci.yml | 37 ++ README.md | 37 +- REFERENCE.md | 2 +- aai_cli/agent_cascade/brain.py | 189 ++++++++ aai_cli/agent_cascade/engine.py | 15 +- aai_cli/code_agent/events.py | 29 +- aai_cli/code_agent/messages.py | 110 +++++ aai_cli/code_agent/modals.py | 202 ++++++++ aai_cli/code_agent/model.py | 17 +- aai_cli/code_agent/render.py | 19 +- aai_cli/code_agent/risk.py | 68 +++ aai_cli/code_agent/session.py | 39 +- aai_cli/code_agent/summarize.py | 96 ++++ aai_cli/code_agent/tui.py | 438 +++++++++--------- aai_cli/code_agent/tui_status.py | 51 ++ aai_cli/code_agent/voice_ui.py | 107 +++++ aai_cli/code_agent/web_search.py | 4 + aai_cli/code_gen/agent_cascade.py | 4 +- aai_cli/commands/agent/__init__.py | 2 +- aai_cli/commands/agent_cascade/__init__.py | 34 +- aai_cli/commands/agent_cascade/_exec.py | 4 +- aai_cli/core/microphone.py | 40 ++ install.sh | 188 +++++++- pyproject.toml | 7 +- pyrightconfig.tests.json | 3 +- scripts/generated_code_compile_gate.py | 4 +- .../test_snapshots_help_root.ambr | 83 ++-- .../test_snapshots_help_run.ambr | 233 +++++----- tests/test_agent_cascade_brain.py | 235 ++++++++++ tests/test_agent_cascade_command.py | 51 +- tests/test_agent_cascade_show_code.py | 24 +- tests/test_code_agent.py | 43 -- tests/test_code_messages.py | 149 ++++++ tests/test_code_modals.py | 236 ++++++++++ tests/test_code_risk.py | 46 ++ tests/test_code_session_stream.py | 157 +++++++ tests/test_code_summarize.py | 93 ++++ tests/test_code_tui.py | 115 +++-- tests/test_code_tui_status.py | 49 ++ tests/test_code_tui_voice.py | 166 ++++++- 
tests/test_microphone.py | 101 +++- tests/test_sandbox_access.py | 6 +- tests/test_smoke.py | 2 +- 43 files changed, 2930 insertions(+), 605 deletions(-) create mode 100644 aai_cli/agent_cascade/brain.py create mode 100644 aai_cli/code_agent/messages.py create mode 100644 aai_cli/code_agent/modals.py create mode 100644 aai_cli/code_agent/risk.py create mode 100644 aai_cli/code_agent/summarize.py create mode 100644 aai_cli/code_agent/tui_status.py create mode 100644 aai_cli/code_agent/voice_ui.py create mode 100644 tests/test_agent_cascade_brain.py create mode 100644 tests/test_code_messages.py create mode 100644 tests/test_code_modals.py create mode 100644 tests/test_code_risk.py create mode 100644 tests/test_code_session_stream.py create mode 100644 tests/test_code_summarize.py create mode 100644 tests/test_code_tui_status.py diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index a181d476..7a38cff6 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -316,3 +316,40 @@ jobs: python -m pip install -e . pip-audit # Append `--ignore-vuln ` to accept an unfixable transitive advisory. python -m pip_audit + + # End-to-end check that install.sh actually installs a working `assembly`. Runs + # the script in dev mode (--install-method git) so it installs *this* checkout + # editable via uv — exercising both the installer and the PR's own code — then + # smoke-tests the resulting CLI. Catches install.sh regressions (arg parsing, + # the uv/pipx selection, the editable path) that shellcheck alone can't. 
+ install-script: + name: install script smoke + runs-on: ubuntu-latest + timeout-minutes: 15 + steps: + - uses: actions/checkout@df4cb1c069e1874edd31b4311f1884172cec0e10 # v6.0.3 + with: + persist-credentials: false # no job pushes; don't leave the token in .git/config + fetch-depth: 0 # hatch-vcs derives the version from git history for the editable build + # Provide uv so install.sh takes its preferred (uv) path rather than + # bootstrapping it over the network. + - uses: astral-sh/setup-uv@fac544c07dec837d0ccb6301d7b5580bf5edae39 # v8.2.0 + with: + enable-cache: true + cache-dependency-glob: uv.lock + + # PortAudio + ffmpeg so `assembly --help` (which imports the full command + # tree) loads cleanly; also lets install.sh's dep check find them present. + - name: System deps (PortAudio + ffmpeg) + run: sudo apt-get update && sudo apt-get install -y libportaudio2 ffmpeg + + - name: Run install.sh (editable, from this checkout) + run: ./install.sh --install-method git + + - name: Smoke-test the installed CLI + run: | + # uv tool installs land in ~/.local/bin; put it on PATH for this step. + export PATH="$HOME/.local/bin:$PATH" + assembly --version + help_out="$(assembly --help)" + echo "$help_out" | grep -q transcribe diff --git a/README.md b/README.md index ed70dd87..674a9818 100644 --- a/README.md +++ b/README.md @@ -17,10 +17,10 @@ Learn more about the platform in the [AssemblyAI docs](https://www.assemblyai.co Install on macOS or Linux with one command: ```sh -curl -LsSf https://raw.githubusercontent.com/AssemblyAI/cli/main/install.sh | sh +curl -LsSf https://raw.githubusercontent.com/AssemblyAI/cli/main/install.sh | bash ``` -This installs [uv](https://docs.astral.sh/uv/) if needed, then installs `assembly` as a uv tool. +This installs `assembly` with [uv](https://docs.astral.sh/uv/) (or pipx), bootstrapping uv if needed. Sign in (stores your API key in the OS keyring) and run your first transcription: @@ -36,7 +36,7 @@ That's it. 
Run `assembly onboard` for a guided tour, or see [Installation](#-ins - **🎯 One command for everything**: transcription, real-time streaming, voice agents, LLM prompts, and WER benchmarking — no SDK boilerplate. - **🔌 Built for pipelines**: data goes to stdout, errors to stderr, `--json` gives stable machine-readable output, and `-` reads audio from stdin. - **🔐 Secure by default**: your API key lives in the OS keyring, never in a dotfile — and run commands have no `--api-key` flag, so keys can't leak into `ps` or shell history. -- **🛠️ From demo to deployed app**: `assembly init` scaffolds a runnable FastAPI starter, `assembly dev` / `share` / `deploy` run, tunnel, and ship it, and `--show-code` prints the equivalent Python SDK script for any run command (`transcribe` / `stream` / `agent` / `agent-cascade`). +- **🛠️ From demo to deployed app**: `assembly init` scaffolds a runnable FastAPI starter, `assembly dev` / `share` / `deploy` run, tunnel, and ship it, and `--show-code` prints the equivalent Python SDK script for any run command (`transcribe` / `stream` / `agent` / `live`). - **🤖 Agent-ready**: `assembly setup install` wires your coding agent up with the AssemblyAI docs MCP server and skills. - **📖 Open source**: MIT licensed. @@ -48,7 +48,7 @@ That's it. 
Run `assembly onboard` for a guided tour, or see [Installation](#-ins | `assembly stream` | Real-time transcription from your microphone, a file, or a URL — on macOS it can capture system audio too | | `assembly dictate` | Signal-driven dictation: records immediately, send SIGTERM for instant text — scriptable from hotkey tools like Hammerspoon (Sync STT API, up to 120 s per utterance) | | `assembly agent` | Full-duplex spoken conversation with a voice agent, right in your terminal | -| `assembly agent-cascade` | Same live conversation, but wired client-side from Streaming STT + the LLM Gateway + streaming TTS, like the `agent-cascade` starter (sandbox-only) | +| `assembly live` | Talk live to a tool-using voice agent, wired client-side from Streaming STT + a deepagents brain on the LLM Gateway + streaming TTS — it can web-search, fetch URLs, and read the docs mid-conversation, like the `agent-cascade` starter (sandbox-only) | | `assembly speak` | Synthesize text to speech over the streaming-TTS WebSocket (sandbox-only) | | `assembly llm` | Prompt the LLM Gateway over a transcript, files, stdin, or a live stream | | `assembly code` | Terminal coding agent (deepagents SDK) backed only by the LLM Gateway — reads/writes/edits files, runs shell, searches the docs MCP, and can invoke the `assembly` CLI itself; mutating actions ask for approval. Defaults to voice in a terminal (speak your request, replies read back via streaming TTS in the sandbox); pass `--no-voice` for the keyboard TUI | @@ -63,7 +63,7 @@ That's it. Run `assembly onboard` for a guided tour, or see [Installation](#-ins | `assembly transcripts` / `sessions` | Browse and fetch past transcripts and streaming sessions | | `assembly keys` / `balance` / `usage` / `limits` / `audit` | Account self-service via browser login | -Add `--show-code` to `transcribe` / `stream` / `agent` / `agent-cascade` to print the equivalent Python SDK script instead of running — the built-in path from CLI experiment to SDK code. 
+Add `--show-code` to `transcribe` / `stream` / `agent` / `live` to print the equivalent Python SDK script instead of running — the built-in path from CLI experiment to SDK code. ## ✨ Things you can do with it @@ -194,7 +194,7 @@ assembly transcripts list --json --limit 5 \ assembly agent --voice ivy --system-prompt "you're a helpful interviewer" ``` -**Graduate to the SDK** — `--show-code` prints the equivalent Python script for any `transcribe`/`stream`/`agent`/`agent-cascade` run instead of executing it: +**Graduate to the SDK** — `--show-code` prints the equivalent Python script for any `transcribe`/`stream`/`agent`/`live` run instead of executing it: ```sh assembly agent --system-prompt "you're a story generator" --show-code > story.py @@ -231,12 +231,26 @@ Requires Python 3.12+ (Homebrew brings its own; for pipx/uv see the `--python` h ### Install script (recommended — macOS / Linux) ```sh -curl -LsSf https://raw.githubusercontent.com/AssemblyAI/cli/main/install.sh | sh +curl -LsSf https://raw.githubusercontent.com/AssemblyAI/cli/main/install.sh | bash ``` -The [`install.sh`](install.sh) script bootstraps [uv](https://docs.astral.sh/uv/) if it -isn't already present, then runs `uv tool install` to put `assembly` on your `PATH`. Re-run -it any time to update to the latest version. +The [`install.sh`](install.sh) script installs `assembly` with whichever tool installer you +already have — [uv](https://docs.astral.sh/uv/) if present, otherwise [pipx](https://pipx.pypa.io) — +and bootstraps uv only when neither is found. It then installs the optional live-audio system +dependencies via [Homebrew](https://brew.sh) when `brew` is available, or prints the right +install command for your platform otherwise. Re-run it any time to update to the latest version. + +For a **development install** — an editable checkout so local source edits take effect without +reinstalling (`uv tool install -e .`) — pass `--install-method git` (or `--dev`). 
It reuses the +checkout you run it from, or clones the repo to `~/.local/share/assembly-cli` (override with +`--dir`): + +```sh +# from a clone you already have +./install.sh --dev +# or fetch + editable-install in one shot +curl -LsSf https://raw.githubusercontent.com/AssemblyAI/cli/main/install.sh | bash -s -- --install-method git +``` ### Homebrew (macOS / Linux) @@ -266,7 +280,8 @@ Only the live-audio commands need anything extra: `stream`, `dictate`, and `agen microphone capture and [`ffmpeg`](https://ffmpeg.org) on `PATH` to stream non-WAV audio; `assembly share` uses [`cloudflared`](https://github.com/cloudflare/cloudflared) for its public tunnel. Plain `transcribe` uploads your file directly and needs none of them. The -[`install.sh`](install.sh) script checks for these and prints the right install command when any are missing. +[`install.sh`](install.sh) script checks for these and installs them via Homebrew when `brew` is +available, otherwise printing the right install command for your platform. 
- Debian/Ubuntu: `sudo apt-get install libportaudio2 ffmpeg` - Fedora: `sudo dnf install portaudio ffmpeg` diff --git a/REFERENCE.md b/REFERENCE.md index bf9f3d8e..09216ef4 100644 --- a/REFERENCE.md +++ b/REFERENCE.md @@ -94,7 +94,7 @@ each carrying a `"type"` field to dispatch on: | ------- | ----------- | | `assembly stream --json` | `begin`, `turn`, `termination` (with `--from-stdin`, a `source` event precedes each file's events) | | `assembly agent --json` | `session.ready`, `transcript.user.delta`, `transcript.user`, `reply.started`, `transcript.agent`, `reply.done` | -| `assembly agent-cascade --json` | `session.ready`, `transcript.user.delta`, `transcript.user`, `reply.started`, `transcript.agent`, `reply.done` | +| `assembly live --json` | `session.ready`, `transcript.user.delta`, `transcript.user`, `reply.started`, `transcript.agent`, `reply.done` | | `assembly dictate --json` | `utterance` | | `assembly llm --follow --json` | `answer` | | `assembly transcribe --json` | `result` (one per source), then `reduce` if `--llm-reduce` is set | diff --git a/aai_cli/agent_cascade/brain.py b/aai_cli/agent_cascade/brain.py new file mode 100644 index 00000000..966e3e68 --- /dev/null +++ b/aai_cli/agent_cascade/brain.py @@ -0,0 +1,189 @@ +"""Deepagents-powered reply brain for the live voice cascade. + +`assembly live` answers each spoken turn with a deepagents graph instead of a single +LLM completion, so the agent can transparently reach for tools — web search, URL +fetch, the AssemblyAI docs — mid-conversation, mimicking a live multimodal assistant +(the "talk to Gemini Live" experience). The graph is built once per session +(:func:`build_graph`) and invoked statelessly per turn with the running history the +cascade already keeps (:func:`build_completer`); tools are read-only and auto-approved, +because a spoken turn can't pause for a keyboard confirmation, and the system prompt +keeps every reply short and speakable. 
+ +The graph is the only network seam: :func:`build_completer` accepts an injected graph, +so the per-turn orchestration is unit-tested against a fake with no sockets — the same +seam the rest of the cascade uses for its STT/LLM/TTS legs. +""" + +from __future__ import annotations + +from collections.abc import Callable, Sequence +from typing import TYPE_CHECKING + +from aai_cli.agent_cascade.config import CascadeConfig +from aai_cli.code_agent.agent import CompiledAgent +from aai_cli.code_agent.fetch_tool import FETCH_TOOL_NAME +from aai_cli.code_agent.web_search import WEB_SEARCH_TOOL_NAME + +if TYPE_CHECKING: + from langchain_core.tools import BaseTool + from openai.types.chat import ChatCompletionMessageParam + +# Closes every guidance variant: the reply is spoken, so it must stay short and plain. +_SPOKEN_TAIL = ( + "Your reply is read aloud, so keep it short and spoken — no markdown, lists, code, or raw URLs." +) + +# When the session has *no* tools wired (e.g. no web search and the docs host is +# unreachable), the model must answer from its own knowledge — and crucially must not +# promise an action it can't take. Without this, telling it "you can search the web" while +# no search tool is bound makes it narrate "I'll search for that…" and then stop, so the +# answer never comes (the tool it announced was never actually available to call). +_NO_TOOLS_GUIDANCE = ( + "You have no external tools available, so answer from your own knowledge. Never say " + "you will search the web, look something up, or fetch a page — you can't do any of " + "that, so don't promise it; if a question needs information you don't have, say so " + f"briefly instead. {_SPOKEN_TAIL}" +) + + +def _join_clause(parts: list[str]) -> str: + """Join capability phrases into a readable clause: ``a``, ``a and b``, ``a, b, and c``.""" + *initial, last = parts + if not initial: + return last + # Oxford comma only once there are three-or-more items (two or more lead the last). 
+ joiner = ", and " if initial[1:] else " and " + return f"{', '.join(initial)}{joiner}{last}" + + +def _tool_capabilities(tools: Sequence[BaseTool]) -> list[str]: + """The spoken-capability phrases backed by an actually-present tool. + + Derived from the resolved tool names so the prompt never advertises a capability the + agent can't perform: web search is present only with a ``TAVILY_API_KEY``, and the docs + tools are best-effort (absent when the docs host is unreachable). + """ + names = {tool.name for tool in tools} + capabilities: list[str] = [] + if WEB_SEARCH_TOOL_NAME in names: + capabilities.append("search the web for current or unfamiliar facts") + if FETCH_TOOL_NAME in names: + capabilities.append("fetch a specific URL") + if names - {WEB_SEARCH_TOOL_NAME, FETCH_TOOL_NAME}: + capabilities.append("look up the AssemblyAI documentation") + return capabilities + + +def build_system_prompt(persona: str, *, tools: Sequence[BaseTool]) -> str: + """The live agent's system prompt: the user's persona plus tool guidance. + + The guidance is tailored to ``tools`` so the model is only told about capabilities it + actually has — advertising a missing tool (web search without a ``TAVILY_API_KEY``) made + the agent announce an action it then couldn't take, leaving the turn hanging with no + answer. With no tools at all the model is told to answer from its own knowledge. + """ + capabilities = _tool_capabilities(tools) + if not capabilities: + return f"{persona}\n\n{_NO_TOOLS_GUIDANCE}" + guidance = ( + f"You can use tools to help answer: {_join_clause(capabilities)}. Reach for a " + "tool when a question needs fresh or external information; answer directly and " + "instantly when you already know. Only offer to do what these tools allow — don't " + f"say you'll search the web or look something up unless it's listed here. 
{_SPOKEN_TAIL}" + ) + return f"{persona}\n\n{guidance}" + + +def build_live_tools() -> list[BaseTool]: + """The live agent's read-only toolset: URL fetch, web search (if keyed), and docs. + + All three are reused from the coding agent's tool modules. Unlike there they are + *not* approval-gated — a spoken turn can't wait for a keyboard confirmation, so the + live agent only gets read-only tools and runs them automatically. Web search is + present only when ``TAVILY_API_KEY`` is set; the docs MCP is best-effort (an empty + list when the host is unreachable), so neither blocks a session. + """ + from aai_cli.code_agent.docs_mcp import load_docs_tools + from aai_cli.code_agent.fetch_tool import build_fetch_tool + from aai_cli.code_agent.web_search import build_web_search_tool + + tools: list[BaseTool] = [build_fetch_tool()] + search = build_web_search_tool() + if search is not None: + tools.append(search) + tools.extend(load_docs_tools()) + return tools + + +def build_graph( + api_key: str, config: CascadeConfig, *, tools: Sequence[BaseTool] | None = None +) -> CompiledAgent: + """Compile the deepagents graph for one live session over the gateway model. + + Reuses the coding agent's gateway-bound ``ChatOpenAI`` (so the live agent can only + ever reach AssemblyAI), threading the cascade's ``--max-tokens``/``--llm-config`` + through it. ``tools`` defaults to :func:`build_live_tools`; tests pass an explicit + (possibly empty) list to skip the network-touching docs probe. 
+ """ + from deepagents import create_deep_agent + + from aai_cli.code_agent.model import build_model + + model = build_model( + api_key, model=config.model, max_tokens=config.max_tokens, extra=config.llm_extra + ) + resolved = build_live_tools() if tools is None else list(tools) + return create_deep_agent( + model=model, + tools=resolved, + system_prompt=build_system_prompt(config.system_prompt, tools=resolved), + ) + + +def build_completer( + api_key: str, config: CascadeConfig, *, graph: CompiledAgent | None = None +) -> Callable[[list[ChatCompletionMessageParam]], str]: + """A ``complete_reply`` for the cascade engine backed by the deepagents graph. + + The cascade prepends its own ``system`` message to the history each turn; the graph + already owns the system prompt, so we drop it before invoking. The graph runs the + full tool loop and we return its final spoken text. ``graph`` is injected in tests + so the per-turn wiring runs against a fake with no network. + """ + resolved = build_graph(api_key, config) if graph is None else graph + + def complete_reply(messages: list[ChatCompletionMessageParam]) -> str: + conversation = [message for message in messages if message.get("role") != "system"] + return _reply_text(resolved.invoke({"messages": conversation})) + + return complete_reply + + +def _reply_text(result: dict[str, object]) -> str: + """The agent's final spoken reply: the last assistant message that carries text. + + A tool-using turn ends in an ``AIMessage`` whose ``content`` is the spoken answer, + but earlier ``AIMessage``\\s in the same turn (the tool-call requests) have empty + text — so we scan from the end for the last one with non-empty content. 
+ """ + messages = result.get("messages") + if not isinstance(messages, list): + return "" + for message in reversed(messages): + if type(message).__name__ != "AIMessage": + continue + text = _content_text(getattr(message, "content", "")).strip() + if text: + return text + return "" + + +def _content_text(content: object) -> str: + """Coerce a message's content (a string, or a list of content blocks) to plain text.""" + if isinstance(content, str): + return content + if isinstance(content, list): + return "".join( + block.get("text", "") if isinstance(block, dict) else str(block) for block in content + ) + return str(content) diff --git a/aai_cli/agent_cascade/engine.py b/aai_cli/agent_cascade/engine.py index 9c400657..af52f15a 100644 --- a/aai_cli/agent_cascade/engine.py +++ b/aai_cli/agent_cascade/engine.py @@ -18,9 +18,10 @@ from dataclasses import dataclass, field from typing import TYPE_CHECKING, Protocol +from aai_cli.agent_cascade import brain from aai_cli.agent_cascade.config import CascadeConfig from aai_cli.agent_cascade.text import split_sentences, trim_history -from aai_cli.core import client, llm +from aai_cli.core import client from aai_cli.core.errors import CLIError from aai_cli.tts import session as tts_session from aai_cli.tts.session import SpeakConfig @@ -121,15 +122,9 @@ def real( def run_stt(on_turn: Callable[[object], None]) -> None: client.stream_audio(api_key, audio, params=stt_params, on_turn=on_turn) - def complete_reply(messages: list[ChatCompletionMessageParam]) -> str: - response = llm.complete( - api_key, - model=config.model, - messages=messages, - max_tokens=config.max_tokens, - extra=dict(config.llm_extra) or None, - ) - return llm.content_of(response) + # The LLM leg is a deepagents graph (web search / URL fetch / docs tools), not a + # single completion, so a spoken turn can transparently use tools. 
+ complete_reply = brain.build_completer(api_key, config) def synthesize(text: str) -> bytes: spec = SpeakConfig( diff --git a/aai_cli/code_agent/events.py b/aai_cli/code_agent/events.py index 4ee9136f..ed480bbd 100644 --- a/aai_cli/code_agent/events.py +++ b/aai_cli/code_agent/events.py @@ -21,6 +21,19 @@ class AssistantText: text: str +@dataclass(frozen=True) +class AssistantDelta: + """One streamed token of the in-progress reply, shown live then superseded by AssistantText. + + Emitted from langgraph's per-token ``messages`` stream so the front-end can render the + reply as it's generated; the authoritative full text still arrives as an AssistantText + when the step lands, so a consumer that ignores deltas (the headless renderer) loses + nothing. + """ + + text: str + + @dataclass(frozen=True) class ToolCall: """The agent's request to run a tool (announced when not gated by approval).""" @@ -44,7 +57,21 @@ class ErrorText: text: str -Event = AssistantText | ToolCall | ToolResult | ErrorText +Event = AssistantText | AssistantDelta | ToolCall | ToolResult | ErrorText + + +def assistant_delta(payload: object) -> AssistantDelta | None: + """Extract a streaming assistant-text token from a ``messages``-mode stream payload. + + langgraph's ``messages`` mode yields ``(message_chunk, metadata)``; we surface only the + AI message's text tokens (tool-call requests and tool results carry no prose, and other + message kinds aren't the assistant talking), so the live region streams just the reply. 
+ """ + chunk = payload[0] if isinstance(payload, tuple) and payload else payload + if type(chunk).__name__ not in ("AIMessage", "AIMessageChunk"): + return None + text = _text_of(getattr(chunk, "content", "")) + return AssistantDelta(text) if text else None def _text_of(content: object) -> str: diff --git a/aai_cli/code_agent/messages.py b/aai_cli/code_agent/messages.py new file mode 100644 index 00000000..8bb1ad2d --- /dev/null +++ b/aai_cli/code_agent/messages.py @@ -0,0 +1,110 @@ +"""Mounted transcript widgets for the coding-agent TUI. + +The transcript is a ``VerticalScroll`` of these widgets rather than an append-only ``RichLog``, +which buys two things deepagents-code has: the assistant reply updates *in place* as it streams +(no separate live region), and a tool's output is a collapsible row — a clipped preview that +expands to the full output on Ctrl+O or a click. + +Dynamic content (model/tool/user strings) is wrapped in ``rich.text.Text`` so it's shown +literally — Text doesn't parse console markup, so a stray ``[`` can't raise or inject styling. 
+""" + +from __future__ import annotations + +from collections.abc import Mapping + +from rich.markdown import Markdown +from rich.text import Text +from textual.widgets import Static + +from aai_cli.code_agent.summarize import summarize_call, summarize_result + +_DIM = "#8a8f98" # muted gray for tool lines / notes +_ERROR = "#f04438" + + +class Note(Static): + """A dim one-line transcript aside (``cancelling…``, ``copied…``, ``voice off…``).""" + + def __init__(self, text: str) -> None: + super().__init__(Text(text, style=_DIM)) + + +class UserMessage(Static): + """The echoed user prompt, with a top margin so each turn is visually separated.""" + + DEFAULT_CSS = "UserMessage { margin-top: 1; }" + + def __init__(self, text: str) -> None: + super().__init__(Text(f"» {text}", style="bold #38bdf8")) + + +class AssistantMessage(Static): + """The assistant's reply: streams plain text token-by-token, then renders as Markdown.""" + + def __init__(self) -> None: + super().__init__() + self._tokens: list[str] = [] # accumulate tokens, not str +=, to avoid quadratic growth + + @property + def text(self) -> str: + """The reply text streamed so far (used to finalize a cancelled generation).""" + return "".join(self._tokens) + + def stream(self, delta: str) -> None: + """Append a streamed token and repaint as plain text (cheap; no per-token markdown).""" + self._tokens.append(delta) + self.update(Text(self.text)) + + def finalize(self, text: str) -> None: + """Replace the streamed text with the authoritative reply, rendered as Markdown.""" + self._tokens = [text] + self.update(Markdown(text)) + + +class ToolCallLine(Static): + """A compact tool-call line, e.g. 
``→ write_file(app.py)``.""" + + def __init__(self, name: str, args: Mapping[str, object]) -> None: + super().__init__(Text(f"→ {summarize_call(name, args)}", style=_DIM)) + + +class ErrorMessage(Static): + """A failed turn, shown instead of crashing the UI.""" + + def __init__(self, text: str) -> None: + super().__init__(Text(f"✗ {text}", style=_ERROR)) + + +class ToolOutput(Static): + """A tool's output: a clipped preview that expands to the full content (Ctrl+O / click).""" + + def __init__(self, name: str, content: str) -> None: + super().__init__() + self._name = name + self._full = content.strip() + self._preview = summarize_result(content) + self._expandable = self._preview != self._full # nothing to expand when it fits already + self._expanded = False + + def on_mount(self) -> None: + self._repaint() + + def on_click(self) -> None: + self.toggle() + + def toggle(self) -> None: + """Flip between the clipped preview and the full output (no-op when it all fits).""" + if not self._expandable: + return + self._expanded = not self._expanded + self._repaint() + + def _repaint(self) -> None: + body = self._full if self._expanded else self._preview + line = Text(f" {self._name}: ", style=_DIM) + line.append(body, style=_DIM) + if self._expandable: + hint = " (Ctrl+O to collapse)" if self._expanded else " (Ctrl+O to expand)" + line.append(hint, style=f"{_DIM} italic") + self.update(line) diff --git a/aai_cli/code_agent/modals.py b/aai_cli/code_agent/modals.py new file mode 100644 index 00000000..25c54a7c --- /dev/null +++ b/aai_cli/code_agent/modals.py @@ -0,0 +1,202 @@ +"""Bottom-docked modal screens for the coding-agent TUI: tool approval and agent questions. + +Split out of `tui.py` to keep each module under the file-length gate. Both are transparent +``ModalScreen``s docked at the bottom, so the transcript stays visible above them (see the +``ModalScreen { background: transparent }`` rule in :class:`~aai_cli.code_agent.tui.CodeAgentApp`). 
+ +In voice mode each modal is also **spoken and voice-answerable**: when constructed with a +``voice`` IO it speaks the prompt and listens for a spoken reply (approve / auto / reject, or a +free-text answer), off the UI thread. The keyboard path always stays available as a fallback. +""" + +from __future__ import annotations + +import re +import threading +from typing import TYPE_CHECKING, ClassVar + +from rich.markup import escape +from textual.app import ComposeResult +from textual.containers import Vertical +from textual.screen import ModalScreen +from textual.widgets import Input, Label + +from aai_cli.code_agent import banner, risk +from aai_cli.code_agent.summarize import describe_args, full_args +from aai_cli.core import errors + +if TYPE_CHECKING: + from collections.abc import Callable, Mapping + + from aai_cli.code_agent.voice_ui import _VoiceIO + + +def _spawn(target: Callable[[], None]) -> None: + """Run ``target`` on a daemon thread — the voice legs block, so they stay off the UI thread.""" + threading.Thread(target=target, daemon=True).start() # pragma: no mutate + + +# Spoken-answer vocabulary. "auto" wins first (it implies approval); an unclear answer falls +# back to "reject" — the same safe default as the keyboard, so a tool never runs on a guess. 
+_REJECT_WORDS = frozenset({"no", "reject", "deny", "stop", "cancel", "nope", "nah"}) +_APPROVE_WORDS = frozenset({"yes", "approve", "yeah", "yep", "yup", "sure", "ok", "okay"}) + + +def approval_from_speech(text: str) -> str: + """Map a spoken reply to ``"approve"`` / ``"auto"`` / ``"reject"`` (unclear → reject).""" + lowered = text.lower() + words = set(re.findall(r"[a-z]+", lowered)) + if "auto" in lowered or "always" in lowered: + return "auto" + if words & _REJECT_WORDS or "don't" in lowered or "do not" in lowered: + return "reject" + if words & _APPROVE_WORDS or "go ahead" in lowered or "do it" in lowered: + return "approve" + return "reject" + + +class ApprovalScreen(ModalScreen[str]): + """A compact, bottom-docked prompt to approve/auto-approve/reject one tool call. + + Keyboard ``y / a / n`` (and ``e`` to expand the args); in voice mode it also speaks the + prompt and accepts a spoken approve/auto/reject. The transparent background leaves the + transcript visible, and a risky call (``rm -rf``, an internal fetch) carries a warning. 
+ """ + + DEFAULT_CSS = """ + ApprovalScreen { align: center bottom; background: transparent; } + ApprovalScreen #approvalbox { + dock: bottom; width: 1fr; height: auto; + border: round #f59e0b; background: #000000; padding: 0 1; margin: 0 1 1 1; + } + ApprovalScreen #approvalbox Label { height: auto; } + """ + BINDINGS: ClassVar = [ + ("y", "approve", "Approve"), + ("a", "auto", "Auto-approve"), + ("n", "reject", "Reject"), + ("e", "expand", "Expand"), + ] + + def __init__( + self, name: str, args: Mapping[str, object], *, voice: _VoiceIO | None = None + ) -> None: + super().__init__() + self._tool_name = name # not _name: that shadows Textual Widget's str|None attr + self._args = args + self._expanded = False # toggled by `e`; collapsed (one-line) by default + self._voice = voice # when set, the prompt is spoken and a spoken answer is accepted + self._answered = False # guards against a voice answer and a keypress both dismissing + + def compose(self) -> ComposeResult: + with Vertical(id="approvalbox"): + warning = risk.risk_warning(self._tool_name, self._args) + if warning: + yield Label(f"[b #f04438]⚠ {escape(warning)}[/]", id="approvalwarn") + yield Label(self._detail_markup(), id="approvaldetail") + yield Label( + f"[b #22c55e]y[/] approve [b {banner.BRAND_HEX}]a[/] auto-approve " + "[b #f04438]n[/] reject [b]e[/] expand" + ) + + def on_mount(self) -> None: + if (voice := self._voice) is not None: # drive the decision by voice, off the UI thread + _spawn(lambda: self._drive_by_voice(voice)) + + def _drive_by_voice(self, voice: _VoiceIO) -> None: + """Speak the prompt and accept a spoken approve/auto/reject (keyboard still works).""" + try: + voice.speak(self._spoken_prompt()) + transcript = voice.listen() + except errors.CLIError: + return # mic/STT failed: leave the keyboard hint as the way to answer + if transcript: # silence (None) must not auto-reject a tool — wait for speech or a key + self.app.call_from_thread(self._decide, 
approval_from_speech(transcript)) + + def _spoken_prompt(self) -> str: + """The read-aloud version of the prompt: the tool, its arg, any warning, the options.""" + parts = [f"Run {self._tool_name}."] + detail = describe_args(self._args) + if detail: + parts.append(f"{detail}.") + warning = risk.risk_warning(self._tool_name, self._args) + if warning: + parts.append(f"Warning: {warning}") + parts.append("Say approve, auto-approve, or reject.") + return " ".join(parts) + + def _decide(self, decision: str) -> None: + """Dismiss once, whether the answer came by spoken reply or keypress.""" + if self._answered: + return + self._answered = True + self.dismiss(decision) + + def _detail_markup(self) -> str: + """The 'Run tool X?' line — the compact arg, or the full args when expanded.""" + args = full_args(self._args) if self._expanded else describe_args(self._args) + return f"Run tool [b]{escape(self._tool_name)}[/b]? [dim]{escape(args)}[/dim]" + + def action_expand(self) -> None: + """Toggle between the compact identifying arg and the full args (``e``).""" + self._expanded = not self._expanded + self.query_one("#approvaldetail", Label).update(self._detail_markup()) + + def action_approve(self) -> None: + self._decide("approve") + + def action_auto(self) -> None: + self._decide("auto") + + def action_reject(self) -> None: + self._decide("reject") + + +class AskScreen(ModalScreen[str]): + """A bottom-docked prompt that relays a question from the agent and returns the answer. + + In voice mode it speaks the question and takes a spoken answer; otherwise the user types. 
+ """ + + DEFAULT_CSS = """ + AskScreen { align: center bottom; background: transparent; } + AskScreen #askbox { + dock: bottom; width: 1fr; height: auto; + border: round #3a3f55; background: #000000; padding: 0 1; margin: 0 1 1 1; + } + """ + + def __init__(self, question: str, *, voice: _VoiceIO | None = None) -> None: + super().__init__() + self._question = question + self._voice = voice + self._answered = False + + def compose(self) -> ComposeResult: + with Vertical(id="askbox"): + yield Label(f"[b]The agent asks:[/b] {escape(self._question)}") + yield Input(id="answer", placeholder="Type your answer and press Enter…") + + def on_mount(self) -> None: + if (voice := self._voice) is not None: + _spawn(lambda: self._drive_by_voice(voice)) + + def _drive_by_voice(self, voice: _VoiceIO) -> None: + """Speak the question and submit a spoken answer (typing still works).""" + try: + voice.speak(f"The agent asks: {self._question}") + transcript = voice.listen() + except errors.CLIError: + return + if transcript: + self.app.call_from_thread(self._answer, transcript) + + def _answer(self, text: str) -> None: + """Dismiss once with the answer, whether spoken or typed.""" + if self._answered: + return + self._answered = True + self.dismiss(text) + + def on_input_submitted(self, event: Input.Submitted) -> None: + self._answer(event.value) diff --git a/aai_cli/code_agent/model.py b/aai_cli/code_agent/model.py index bdb6a4a2..716af2fc 100644 --- a/aai_cli/code_agent/model.py +++ b/aai_cli/code_agent/model.py @@ -8,6 +8,7 @@ from __future__ import annotations +from collections.abc import Mapping from typing import TYPE_CHECKING from aai_cli.core import environments @@ -37,7 +38,13 @@ def _flatten_content(messages: object) -> None: ) -def build_model(api_key: str, *, model: str) -> BaseChatModel: +def build_model( + api_key: str, + *, + model: str, + max_tokens: int | None = None, + extra: Mapping[str, object] | None = None, +) -> BaseChatModel: """A ChatOpenAI bound to the 
active environment's LLM Gateway. ``use_responses_api=False`` keeps it on the chat-completions endpoint the gateway @@ -45,6 +52,12 @@ def build_model(api_key: str, *, model: str) -> BaseChatModel: Responses API that langchain would otherwise prefer for ``openai:`` models. The subclass also flattens content-parts arrays the gateway rejects (see :func:`_flatten_content`). + + ``max_tokens`` caps the per-reply length (the live voice agent passes a small cap to + keep spoken replies short and fast); ``extra`` passes any additional gateway request + fields through as ``extra_body`` (so they reach the request body verbatim, like + `aai_cli.core.llm`'s ``extra``). Both default to off so the coding agent's call is + unchanged. """ from langchain_openai import ChatOpenAI from pydantic import SecretStr @@ -64,4 +77,6 @@ def _get_request_payload( base_url=environments.active().llm_gateway_base, api_key=SecretStr(api_key), use_responses_api=False, + max_tokens=max_tokens, + extra_body=dict(extra) if extra else None, ) diff --git a/aai_cli/code_agent/render.py b/aai_cli/code_agent/render.py index 499cd4e9..e0e7d639 100644 --- a/aai_cli/code_agent/render.py +++ b/aai_cli/code_agent/render.py @@ -9,20 +9,14 @@ from collections.abc import Callable +from rich.markdown import Markdown from rich.markup import escape from aai_cli.code_agent.events import AssistantText, ErrorText, Event, ToolCall, ToolResult from aai_cli.code_agent.session import Approver +from aai_cli.code_agent.summarize import summarize_call, summarize_result from aai_cli.ui import output -# Tool output can be long; clip it for the inline transcript. 
-_RESULT_PREVIEW = 2000 - - -def _format_args(args: dict[str, object]) -> str: - """A compact one-line view of a tool call's arguments.""" - return ", ".join(f"{key}={value!r}" for key, value in args.items()) - class RichRenderer: """An :data:`~aai_cli.code_agent.session.EventSink` that prints to the Rich console.""" @@ -31,13 +25,16 @@ def __call__(self, event: Event) -> None: # escape() dynamic content so a model/tool string with "[" can't inject Rich # markup or raise MarkupError (matches the inline-escape convention in output.py). if isinstance(event, AssistantText): - output.console.print(escape(event.text)) + # Render as Markdown so fenced code blocks are syntax-highlighted (and lists/ + # headings format) instead of showing raw ``` markers — Markdown parses its own + # syntax, not console markup, so no escape()/injection concern. + output.console.print(Markdown(event.text)) elif isinstance(event, ToolCall): output.console.print( - f"[aai.muted]→ {escape(event.name)}({escape(_format_args(event.args))})[/aai.muted]" + f"[aai.muted]→ {escape(summarize_call(event.name, event.args))}[/aai.muted]" ) elif isinstance(event, ToolResult): - preview = escape(event.content.strip()[:_RESULT_PREVIEW]) + preview = escape(summarize_result(event.content)) output.console.print(f"[aai.muted] {escape(event.name)}: {preview}[/aai.muted]") elif isinstance(event, ErrorText): output.error_console.print(output.fail(escape(event.text))) diff --git a/aai_cli/code_agent/risk.py b/aai_cli/code_agent/risk.py new file mode 100644 index 00000000..6c7b7e8e --- /dev/null +++ b/aai_cli/code_agent/risk.py @@ -0,0 +1,68 @@ +"""Heuristic risk flags for tool calls, surfaced on the approval prompt. + +The approval modal already shows *what* a tool will do; for the genuinely dangerous calls it +also shows *why to look twice* — a one-line warning, the way deepagents-code badges suspicious +shell commands and URLs. 
Purely advisory (the real SSRF guard lives in ``fetch_tool``); this +only nudges the human reviewing a manual approval. Pure functions so they unit-test cleanly. +""" + +from __future__ import annotations + +import re +from collections.abc import Mapping + +from aai_cli.code_agent.fetch_tool import FETCH_TOOL_NAME + +# Shell fragments that can destroy data, escalate privileges, or pipe a remote script straight +# into a shell — the classic "are you sure?" cases. Word-ish boundaries avoid matching inside +# innocuous longer tokens (e.g. ``farm`` should not trip ``rm``). +_DANGEROUS_SHELL = ( + (re.compile(r"\brm\s+(-\w*\s+)*-\w*[rf]", re.I), "deletes files recursively/forcibly"), + (re.compile(r"\bsudo\b", re.I), "runs with elevated privileges"), + (re.compile(r"\bmkfs\b|\bdd\s+if=", re.I), "can overwrite a disk or filesystem"), + (re.compile(r":\s*\(\)\s*\{.*\|.*&\s*\}\s*;"), "looks like a fork bomb"), + ( + re.compile(r"\b(curl|wget)\b[^|]*\|\s*(sudo\s+)?(ba)?sh\b", re.I), + "pipes a download into a shell", + ), + (re.compile(r">\s*/dev/(sd|disk|nvme)", re.I), "writes directly to a block device"), +) +# URL hosts that mean a fetch is reaching a local/internal target rather than the public web. +_LOCAL_HOST = re.compile( + r"^(localhost|127\.|0\.0\.0\.0|10\.|192\.168\.|169\.254\.|172\.(1[6-9]|2\d|3[01])\.|\[?::1\]?)", + re.I, +) + + +def _shell_warning(command: str) -> str | None: + for pattern, reason in _DANGEROUS_SHELL: + if pattern.search(command): + return f"This command {reason}." + return None + + +def _url_warning(url: str) -> str | None: + stripped = url.strip() + if stripped.lower().startswith("file:"): + return "This URL reads a local file (file://)." + host = re.sub(r"^[a-z]+://", "", stripped, flags=re.I) + if _LOCAL_HOST.match(host): + return "This URL targets a local/internal address."
+ return None + + +def risk_warning(name: str, args: Mapping[str, object]) -> str | None: + """A one-line caution for a risky tool call, or ``None`` when nothing stands out. + + Flags destructive/privileged shell commands (``execute``) and fetches aimed at local or + ``file://`` targets; everything else returns ``None``. + """ + if name == "execute": + command = args.get("command") + if isinstance(command, str): + return _shell_warning(command) + elif name == FETCH_TOOL_NAME: + url = args.get("url") + if isinstance(url, str): + return _url_warning(url) + return None diff --git a/aai_cli/code_agent/session.py b/aai_cli/code_agent/session.py index 64b6c904..b3ce738f 100644 --- a/aai_cli/code_agent/session.py +++ b/aai_cli/code_agent/session.py @@ -18,6 +18,7 @@ from aai_cli.code_agent.events import ( ErrorText, Event, + assistant_delta, interrupt_request, message_events, new_messages, @@ -43,9 +44,18 @@ class _SupportsStream(Protocol): """ def stream( - self, graph_input: object, config: Mapping[str, object] | None, *, stream_mode: str - ) -> Iterator[dict[str, object]]: - """Yield the running state (incl. the growing ``messages``) after each super-step.""" + self, + graph_input: object, + config: Mapping[str, object] | None, + *, + stream_mode: list[str], + ) -> Iterator[tuple[str, object]]: + """Yield ``(mode, payload)`` pairs — ``"values"`` state snapshots and ``"messages"`` deltas. + + With a *list* ``stream_mode`` langgraph tags each yield with its mode, so the caller + can render off the per-super-step ``"values"`` state while still seeing the frequent + per-token ``"messages"`` deltas (used only as a fine-grained cancellation checkpoint). + """ @dataclass @@ -97,17 +107,28 @@ def send(self, text: str) -> None: def _run(self, graph_input: object, config: dict[str, object]) -> dict[str, object]: """Drive one graph segment, emitting events as each step completes; return the end state. 
- Streaming (``stream_mode="values"``) renders intermediate tool calls/results live and - lets :meth:`request_cancel` break the loop between steps. A double that only implements - ``invoke`` (the TUI/REPL test fakes) emits once at the end instead. + We render the finished messages from the per-super-step ``"values"`` snapshots, and + stream the ``"messages"`` (per-token) deltas alongside them for two reasons: a live + front-end shows the reply as it's generated (emitted as ``AssistantDelta``), and the + frequent deltas give :meth:`request_cancel` a checkpoint *within* a long step — a + single model generation is one super-step, so a values-only loop couldn't break until + the whole reply landed. A double that only implements ``invoke`` (the TUI/REPL test + fakes) emits once at the end instead. """ if isinstance(self.agent, _SupportsStream): last: dict[str, object] = {} - for chunk in self.agent.stream(graph_input, config, stream_mode="values"): + for mode, payload in self.agent.stream( + graph_input, config, stream_mode=["values", "messages"] + ): if self._cancel.is_set(): break - self._emit_new(chunk) - last = chunk + if mode == "values" and isinstance(payload, dict): + self._emit_new(payload) + last = payload + elif mode == "messages": + delta = assistant_delta(payload) + if delta is not None: + self.sink(delta) return last result = self.agent.invoke(graph_input, config) self._emit_new(result) diff --git a/aai_cli/code_agent/summarize.py b/aai_cli/code_agent/summarize.py new file mode 100644 index 00000000..ecb4a0c7 --- /dev/null +++ b/aai_cli/code_agent/summarize.py @@ -0,0 +1,96 @@ +"""Compact one-line summaries of tool activity, shared by both front-ends. + +A coding agent's tool args and output are routinely whole files or long command output. +Dumping them verbatim into the transcript buries the conversation — and, because args go +through ``repr``, renders literal ``\\n`` escapes. 
Both the Textual TUI (`tui.py`) and the +Rich fallback (`render.py`) route tool calls/results through these helpers so the +transcript stays scannable, mirroring how deepagents-code's collapsible tool rows show +just the identifying arg (a filename / command) and a short output preview with a +"+N more lines" tail rather than the full payload. +""" + +from __future__ import annotations + +from collections.abc import Mapping + +# Output preview budget (deepagents-code previews tool output at 4 lines / 300 chars behind +# an expand toggle; our append-only log has no expander, so we clip and tag the remainder). +_PREVIEW_LINES = 4 +_PREVIEW_CHARS = 300 +# Per-arg and arg-count caps so one giant value (a file's contents) can't flood the line. +_MAX_ARG_VALUE = 60 +_MAX_ARGS = 3 +# Per-value cap for the *expanded* approval view: values shown whole (newlines kept) but bounded +# so a multi-megabyte file can't make the modal unbounded. +_EXPANDED_VALUE = 1000 +# Args that identify a call on their own — show only this and elide bulky siblings (content). +_IDENTITY_ARGS = ("file_path", "path", "filename", "command", "url", "query", "pattern") + + +def _one_line(value: object, *, limit: int) -> str: + """Collapse ``value`` to a single clipped line (newlines → spaces, ellipsis if long).""" + text = " ".join(str(value).split()) + return text if len(text) <= limit else text[: limit - 1] + "…" + + +def describe_args(args: Mapping[str, object]) -> str: + """The compact arg view shared by the transcript line and the approval prompt. + + Prefers a single identifying arg (a path/command/URL) so a ``write_file`` reads as + ``app.py`` instead of inlining the file being written; otherwise shows up to a few + short ``key=value`` args, each clipped, with a trailing ``…`` when more were elided. 
+ """ + for key in _IDENTITY_ARGS: + if key in args: + return _one_line(args[key], limit=_MAX_ARG_VALUE) + shown = list(args.items())[:_MAX_ARGS] + body = ", ".join(f"{key}={_one_line(value, limit=_MAX_ARG_VALUE)}" for key, value in shown) + if len(args) > _MAX_ARGS: + body = f"{body}, …" if body else "…" + return body + + +def summarize_call(name: str, args: Mapping[str, object]) -> str: + """A compact ``name(key arg)`` view of a tool call for the transcript.""" + return f"{name}({describe_args(args)})" + + +def full_args(args: Mapping[str, object]) -> str: + """The full ``key=value`` arg view shown when the approval prompt is expanded (``e``). + + Values are shown whole (newlines preserved) but each is capped at ``_EXPANDED_VALUE`` so a + huge file can't make the modal unbounded; :func:`describe_args` is the collapsed view. + """ + lines = [] + for key, value in args.items(): + text = str(value) + if len(text) > _EXPANDED_VALUE: + text = ( + f"{text[:_EXPANDED_VALUE].rstrip()} … (+{len(text) - _EXPANDED_VALUE} more chars)" + ) + lines.append(f"{key}={text}") + return "\n".join(lines) + + +def summarize_result(content: str) -> str: + """A short preview of tool output: the first few lines, clipped, with a hidden-count tail. + + Returns at most ``_PREVIEW_LINES`` lines and ``_PREVIEW_CHARS`` characters; when the + output was longer, appends ``… (+N more lines)`` (or ``… (+N more chars)`` when a single + long line was clipped) so the elision is visible rather than silent. 
+ """ + text = content.strip() + if not text: + return "" + lines = text.splitlines() + preview_lines = lines[:_PREVIEW_LINES] + preview = "\n".join(preview_lines) + hidden_lines = len(lines) - len(preview_lines) + if len(preview) > _PREVIEW_CHARS: + kept = preview[:_PREVIEW_CHARS].rstrip() + hidden_chars = len(preview) - len(kept) + tail = f"+{hidden_lines} more lines" if hidden_lines else f"+{hidden_chars} more chars" + return f"{kept} … ({tail})" + if hidden_lines > 0: + return f"{preview} … (+{hidden_lines} more lines)" + return preview diff --git a/aai_cli/code_agent/tui.py b/aai_cli/code_agent/tui.py index 264d64c9..cc5a4010 100644 --- a/aai_cli/code_agent/tui.py +++ b/aai_cli/code_agent/tui.py @@ -13,169 +13,77 @@ import threading import time from pathlib import Path -from typing import TYPE_CHECKING, ClassVar, Protocol +from typing import TYPE_CHECKING, ClassVar from rich.markup import escape -from textual.app import App, ComposeResult -from textual.containers import Horizontal, Vertical +from textual.app import ComposeResult +from textual.containers import Horizontal, VerticalScroll from textual.screen import ModalScreen -from textual.widgets import Input, Label, RichLog, Static +from textual.widgets import Input, Static from textual.worker import Worker from aai_cli.code_agent import banner from aai_cli.code_agent.agent import CompiledAgent from aai_cli.code_agent.ask_tool import AskBridge -from aai_cli.code_agent.events import AssistantText, ErrorText, Event, ToolCall, ToolResult +from aai_cli.code_agent.events import ( + AssistantDelta, + AssistantText, + ErrorText, + Event, + ToolCall, + ToolResult, +) +from aai_cli.code_agent.messages import ( + AssistantMessage, + ErrorMessage, + Note, + ToolCallLine, + ToolOutput, + UserMessage, +) +from aai_cli.code_agent.modals import ApprovalScreen, AskScreen from aai_cli.code_agent.session import CodeSession -from aai_cli.code_agent.voice import spoken_summary -from aai_cli.core import errors +from 
aai_cli.code_agent.tui_status import _spinner_text, _status_text +from aai_cli.code_agent.voice_ui import _VoiceIO, _VoiceLegs if TYPE_CHECKING: - from collections.abc import Callable, Mapping - from textual.timer import Timer # Glyphs cycled by the working indicator's animation (purely cosmetic). _SPIN_FRAMES = "✶✷✸✹✺" # pragma: no mutate # Seconds the Ctrl-C "press again to quit" hint stays armed (deepagents-code uses 3s too). _QUIT_HINT_SECONDS = 3 # pragma: no mutate +# Animated meter for the voice bar — a 3-cell block-char pulse (BMP, single-width, no emoji). +_VOICE_FRAMES = ("▁▃▅", "▃▅▇", "▅▇▆", "▆▇▅", "▇▅▃", "▅▃▁") # pragma: no mutate +# The three voice phases the bar distinguishes, each (label, accent color). +_VOICE_PHASES: dict[str, tuple[str, str]] = { + "listening": ("Listening — speak your request", banner.BRAND_HEX), + "thinking": ("Thinking…", "#f59e0b"), + "speaking": ("Speaking…", "#22c55e"), +} -class _VoiceIO(Protocol): - """The speak-to-it / read-back slice the TUI drives; :class:`VoiceSession` satisfies it.""" - - def listen(self) -> str | None: - """Capture one spoken turn and return its transcript (``None`` on no speech).""" - - def speak(self, text: str) -> None: - """Read ``text`` back aloud (a no-op when readback is unavailable).""" - - -def _format_args(args: Mapping[str, object]) -> str: - return ", ".join(f"{key}={value!r}" for key, value in args.items()) - - -def _spinner_text(elapsed_s: int, frame: str) -> str: - """The working-indicator line: a spinner glyph and the elapsed seconds.""" - return f"{frame} Working… ({elapsed_s}s)" - - -def _abbrev_home(path: Path) -> str: - """Render ``path`` with the home directory collapsed to ``~``.""" - try: - return f"~/{path.relative_to(Path.home())}" - except ValueError: - return str(path) - - -def _git_branch(start: Path) -> str | None: - """The current git branch for ``start`` (walking up to the repo root), or None.""" - for directory in (start, *start.parents): - head = directory / ".git" / 
"HEAD" - if head.is_file(): - ref = head.read_text(encoding="utf-8").strip() - return ref.removeprefix("ref: refs/heads/") if ref.startswith("ref: ") else ref[:8] - return None - - -def _status_text(cwd: Path, *, auto_approve: bool) -> str: - """The bottom status line: a mode badge, the working directory, and the git branch.""" - mode = "auto" if auto_approve else "manual" - badge = f"[black on #f59e0b] {mode} [/]" - parts = [badge, f"[dim]{_abbrev_home(cwd)}[/dim]"] - branch = _git_branch(cwd) - if branch: - parts.append(f"[dim]↗ {branch}[/dim]") - return " ".join(parts) - - -class ApprovalScreen(ModalScreen[str]): - """A compact, bottom-docked prompt to approve/auto-approve/reject one tool call. - - Keyboard-only — a plain one-line ``y / a / n`` hint instead of clickable buttons, so it - reads like a CLI prompt rather than a chrome-heavy dialog. The transparent screen - background leaves the transcript visible above (no full-screen takeover); the decision is - one of ``"approve"``, ``"auto"``, or ``"reject"``. - """ - - DEFAULT_CSS = """ - ApprovalScreen { align: center bottom; background: transparent; } - ApprovalScreen #approvalbox { - dock: bottom; width: 1fr; height: auto; - border: round #f59e0b; background: #000000; padding: 0 1; margin: 0 1 1 1; - } - ApprovalScreen #approvalbox Label { height: auto; } - """ - BINDINGS: ClassVar = [ - ("y", "approve", "Approve"), - ("a", "auto", "Auto-approve"), - ("n", "reject", "Reject"), - ] - - def __init__(self, name: str, args: Mapping[str, object]) -> None: - super().__init__() - self._tool_name = name # not _name: that shadows Textual Widget's str|None attr - self._args = args - - def compose(self) -> ComposeResult: - with Vertical(id="approvalbox"): - yield Label( - f"Run tool [b]{escape(self._tool_name)}[/b]? 
" - f"[dim]{escape(_format_args(self._args))}[/dim]" - ) - yield Label( - f"[b #22c55e]y[/] approve [b {banner.BRAND_HEX}]a[/] auto-approve " - "[b #f04438]n[/] reject" - ) - - def action_approve(self) -> None: - self.dismiss("approve") - - def action_auto(self) -> None: - self.dismiss("auto") - - def action_reject(self) -> None: - self.dismiss("reject") - - -class AskScreen(ModalScreen[str]): - """A bottom-docked prompt that relays a question from the agent and returns the answer.""" - - DEFAULT_CSS = """ - AskScreen { align: center bottom; background: transparent; } - AskScreen #askbox { - dock: bottom; width: 1fr; height: auto; - border: round #3a3f55; background: #000000; padding: 0 1; margin: 0 1 1 1; - } - """ - - def __init__(self, question: str) -> None: - super().__init__() - self._question = question - - def compose(self) -> ComposeResult: - with Vertical(id="askbox"): - yield Label(f"[b]The agent asks:[/b] {escape(self._question)}") - yield Input(id="answer", placeholder="Type your answer and press Enter…") - - def on_input_submitted(self, event: Input.Submitted) -> None: - self.dismiss(event.value) - - -class CodeAgentApp(App[None]): +class CodeAgentApp(_VoiceLegs): """The coding-agent TUI: conversation transcript + prompt + approval/ask modals.""" # Flat pure-black canvas — no panel fills/gray, just the bordered prompt and a status # line, matching the deepagents-code look (wordmark in the AssemblyAI brand blue). CSS = f""" Screen {{ background: #000000; }} - #log {{ - height: 1fr; border: none; background: #000000; padding: 1 2; - scrollbar-size-vertical: 0; - }} + /* The approval/ask modals must stay see-through so the transcript shows above their + docked prompt. Their own DEFAULT_CSS sets `background: transparent`, but app CSS beats + a widget's DEFAULT_CSS — without this rule the `Screen` canvas above paints the modal + opaque black (it matches every Screen subclass) and blanks the transcript behind it. 
*/ + ModalScreen {{ background: transparent; }} + /* The transcript is a scroll container of mounted message widgets (not a RichLog), so the + reply streams in place and tool output can expand/collapse. */ + #log {{ height: 1fr; border: none; background: #000000; padding: 1 2; }} #promptbar {{ dock: bottom; height: 3; background: #000000; border: round #3a3f55; margin: 1 1; }} #promptmark {{ width: 3; color: {banner.BRAND_HEX}; content-align: center middle; }} #prompt {{ border: none; background: #000000; padding: 0; }} + /* Shown in place of the prompt while voice capture is on (Ctrl-V brings the prompt back). */ + #voicebar {{ dock: bottom; height: 3; background: #000000; border: round {banner.BRAND_HEX}; + margin: 1 1; content-align: center middle; display: none; }} /* In normal flow below the 1fr log, so it sits just above the docked prompt bar. */ #spinner {{ height: 1; background: #000000; padding: 0 2; color: {banner.BRAND_HEX}; display: none; }} @@ -191,6 +99,8 @@ class CodeAgentApp(App[None]): ("ctrl+c", "quit_or_interrupt", "Interrupt / Quit"), ("ctrl+q", "quit", "Quit"), ("ctrl+y", "copy_last", "Copy last reply"), + ("ctrl+v", "toggle_voice", "Toggle voice"), + ("ctrl+o", "toggle_output", "Expand/collapse output"), ] def __init__( @@ -212,6 +122,12 @@ def __init__( self._initial = initial self._voice = voice # when set, spoken turns drive the prompt and replies are read back self._voice_typed = False # flips once the mic is ruled out; then input is typed only + self._voice_paused = False # user-toggled off via Ctrl-V (distinct from a mic failure) + self._voice_phase = "listening" # listening / thinking / speaking, shown in the voice bar + self._voice_frames = itertools.cycle(_VOICE_FRAMES) + self._voice_timer: Timer | None = None # animates the voice-bar meter while it's shown + self._streaming_msg: AssistantMessage | None = None # the reply widget tokens stream into + self._last_tool_output: ToolOutput | None = None # the row Ctrl+O expands/collapses 
self._session_name = thread_id # not _thread_id: that shadows Textual App's int self._cwd = cwd if cwd is not None else Path.cwd() self._web_note = web_note @@ -231,34 +147,55 @@ def __init__( def compose(self) -> ComposeResult: # No Header/Footer chrome — the splash is the title and the bottom status line # the only footer, so the screen stays a flat dark canvas. - yield RichLog(id="log", wrap=True, markup=True) + yield VerticalScroll(id="log") # Docked before the prompt bar, so the working indicator sits just above the input. yield Static("", id="spinner") with Horizontal(id="promptbar"): yield Static(">", id="promptmark") yield Input(id="prompt", placeholder="Ask the agent to build something…") - yield Static(_status_text(self._cwd, auto_approve=self._auto_approve), id="status") - - def _write_splash(self, log: RichLog) -> None: - for row in banner.wordmark(): - log.write(f"[bold {banner.BRAND_HEX}]{row}[/]") - log.write(f"[dim]{banner.version()}[/dim]") - log.write("") - log.write(f"[dim]Thread: {self._session_name}[/dim]") - log.write("") - log.write(f"[{banner.BRAND_HEX}]{banner.READY_LINE}[/]") - log.write(f"[dim]{banner.TIP_LINE}[/dim]") + yield Static("", id="voicebar") # filled by _render_voicebar when voice mode is shown + yield Static( + _status_text( + self._cwd, auto_approve=self._auto_approve, voice_state=self._voice_state() + ), + id="status", + ) + + def _write_splash(self) -> None: + # The whole splash is fixed copy except the session name, so this markup is safe to + # parse (only the session name — a --session value — is escaped). 
+ rows = [f"[bold {banner.BRAND_HEX}]{row}[/]" for row in banner.wordmark()] + rows += [ + f"[dim]{banner.version()}[/dim]", + "", + f"[dim]Thread: {escape(self._session_name)}[/dim]", + "", + f"[{banner.BRAND_HEX}]{banner.READY_LINE}[/]", + f"[dim]{banner.TIP_LINE}[/dim]", + ] + self._mount("\n".join(rows)) + + def _mount(self, widget: Static | str) -> None: + """Append a transcript widget (or a markup string) and scroll it into view.""" + log = self.query_one("#log", VerticalScroll) + log.mount(Static(widget) if isinstance(widget, str) else widget) + log.scroll_end(animate=False) # pragma: no mutate — cosmetic; animate flag is unassertable + + def _note(self, text: str) -> None: + """Append a dim transcript aside (cancelling / copied / voice-off).""" + self._mount(Note(text)) def on_mount(self) -> None: # Route the agent's ask_user tool through a modal (the bridge is shared with # the tool built before this app existed). self._ask_bridge.handler = self._ask - self._write_splash(self.query_one("#log", RichLog)) + self._write_splash() if self._web_note: self.notify(self._web_note, title="Web search disabled", severity="warning", timeout=10) # Put the cursor in the prompt so the user can type immediately (RichLog would # otherwise hold focus and swallow keystrokes). self.query_one("#prompt", Input).focus() + self._sync_input_mode() # in voice mode, swap the prompt for the listening affordance if self._initial: self._submit(self._initial) else: @@ -275,18 +212,34 @@ def _emit_event(self, event: Event) -> None: self.call_from_thread(self._write_event, event) def _write_event(self, event: Event) -> None: - log = self.query_one("#log", RichLog) - # Escape dynamic content: a model/tool string containing "[" would otherwise be - # parsed as Rich markup and raise MarkupError (crashing the turn), or inject styling. 
- if isinstance(event, AssistantText): - self._last_reply = event.text - log.write(escape(event.text)) + if isinstance(event, AssistantDelta): + # Stream the token into the live reply widget (mounting one on the first token), + # updated in place until the authoritative AssistantText finalizes it below. + if self._streaming_msg is None: + self._streaming_msg = AssistantMessage() + self._mount(self._streaming_msg) + self._streaming_msg.stream(event.text) + self.query_one("#log", VerticalScroll).scroll_end(animate=False) # pragma: no mutate + elif isinstance(event, AssistantText): + self._last_reply = event.text # keep the raw text for clipboard copy + self._finalize_reply(event.text) elif isinstance(event, ToolCall): - log.write(f"[dim]→ {escape(event.name)}({escape(_format_args(event.args))})[/dim]") + self._mount(ToolCallLine(event.name, event.args)) elif isinstance(event, ToolResult): - log.write(f"[dim] {escape(event.name)}: {escape(event.content.strip()[:2000])}[/dim]") + self._last_tool_output = ToolOutput(event.name, event.content) + self._mount(self._last_tool_output) elif isinstance(event, ErrorText): - log.write(f"[#F04438]✗ {escape(event.text)}[/#F04438]") + self._mount(ErrorMessage(event.text)) + + def _finalize_reply(self, text: str) -> None: + """Commit the reply: finalize the streamed widget in place, or mount a fresh one.""" + if self._streaming_msg is not None: + self._streaming_msg.finalize(text) + self._streaming_msg = None + else: + msg = AssistantMessage() + self._mount(msg) + msg.finalize(text) def action_copy_last(self) -> None: """Copy the most recent assistant reply to the system clipboard.""" @@ -294,7 +247,12 @@ def action_copy_last(self) -> None: if self._last_reply: pyperclip.copy(self._last_reply) - self.query_one("#log", RichLog).write("[dim](copied last reply to clipboard)[/dim]") + self._note("(copied last reply to clipboard)") + + def action_toggle_output(self) -> None: + """Ctrl-O: expand/collapse the most recent tool output (a 
no-op if there's none).""" + if self._last_tool_output is not None: + self._last_tool_output.toggle() # --- approval / ask (called on the worker thread) ------------------------- @@ -320,7 +278,8 @@ def _approve(self, name: str, args: dict[str, object]) -> bool: """ if self._auto_approve: return True - decision = self._modal_result(ApprovalScreen(name, args), default="reject") + screen = ApprovalScreen(name, args, voice=self._modal_voice()) + decision = self._modal_result(screen, default="reject") if decision == "auto": self._enable_auto_approve() return True @@ -333,14 +292,80 @@ def _enable_auto_approve(self) -> None: self.call_from_thread(self._refresh_status) def _refresh_status(self) -> None: - """Re-render the bottom status line (e.g. after the mode flips to auto).""" + """Re-render the bottom status line (e.g. after the mode flips to auto or voice toggles).""" self.query_one("#status", Static).update( - _status_text(self._cwd, auto_approve=self._auto_approve) + _status_text( + self._cwd, auto_approve=self._auto_approve, voice_state=self._voice_state() + ) ) + def _voice_state(self) -> str | None: + """``"on"``/``"off"`` for the status badge, or ``None`` when voice isn't wired up.""" + if self._voice is None: + return None + return "on" if self._voice_active() else "off" + + def action_toggle_voice(self) -> None: + """Ctrl-V: turn spoken input/readback on or off for the session. + + A no-op notice when no voice front-end exists (e.g. a piped/typed run). Re-enabling + kicks off listening again unless a turn is mid-flight (the post-turn followup will). + """ + if self._voice is None: + self.notify("Voice isn't available in this session", severity="warning") + return + self._voice_paused = not self._voice_paused + self._refresh_status() + self._sync_input_mode() # show/hide the text box vs. 
the listening affordance + if self._voice_paused: + self.notify("Voice off — type your request") + elif not self._turn_running(): + self.notify("Voice on — listening") + self._begin_listening() + + def _sync_input_mode(self) -> None: + """Swap the text prompt for the 'listening' affordance while voice capture is active. + + The Input stays mounted either way (it still holds the spoken transcript and the + turn-running ``disabled`` flag); only the bars' visibility flips. The prompt regains + focus whenever it's the visible input. + """ + listening = self._voice_active() + self.query_one("#promptbar", Horizontal).display = not listening + self.query_one("#voicebar", Static).display = listening + if listening: + self._render_voicebar() + if self._voice_timer is None: # animate the meter only while the bar is shown + self._voice_timer = self.set_interval(0.3, self._tick_voice) # pragma: no mutate + else: + if self._voice_timer is not None: + self._voice_timer.stop() + self._voice_timer = None + self.query_one("#prompt", Input).focus() + + def _set_voice_phase(self, phase: str) -> None: + """Switch the voice bar between listening / thinking / speaking and repaint it.""" + self._voice_phase = phase + self._render_voicebar() + + def _render_voicebar(self) -> None: + """Paint the voice bar for the current phase: an animated meter, label, and accent.""" + label, color = _VOICE_PHASES[self._voice_phase] + meter = next(self._voice_frames) + hint = " [dim](Ctrl-V to type)[/dim]" if self._voice_phase == "listening" else "" + self.query_one("#voicebar", Static).update(f"[{color}]{meter}[/] {escape(label)}{hint}") + + def _tick_voice(self) -> None: + """Advance the voice-bar meter one frame (the animation timer's callback).""" + self._render_voicebar() + def _ask(self, question: str) -> str: """Block the worker on a modal input screen and return the user's answer.""" - return self._modal_result(AskScreen(question), default="") + return self._modal_result(AskScreen(question, 
voice=self._modal_voice()), default="") + + def _modal_voice(self) -> _VoiceIO | None: + """The voice IO to drive a modal by speech, or ``None`` when voice isn't active.""" + return self._voice if self._voice_active() else None # --- interrupt / quit ----------------------------------------------------- # Mirrors deepagents-code: Escape interrupts a running turn; Ctrl-C interrupts a running @@ -361,7 +386,7 @@ def _cancel_turn(self) -> bool: if not self._turn_running(): return False self._session.request_cancel() - self.query_one("#log", RichLog).write("[dim](cancelling…)[/dim]") + self._note("cancelling…") return True def action_interrupt(self) -> None: @@ -396,9 +421,9 @@ def on_input_submitted(self, event: Input.Submitted) -> None: self._submit(text) def _submit(self, text: str) -> None: - log = self.query_one("#log", RichLog) - log.write(f"[b cyan]» {escape(text)}[/b cyan]") + self._mount(UserMessage(text)) self.query_one("#prompt", Input).disabled = True + self._set_voice_phase("thinking") # voice bar reflects the turn (no-op when bar hidden) self._start_spinner() self._run_turn(text) @@ -410,8 +435,14 @@ def _run_turn(self, text: str) -> Worker[None]: # --- working indicator (spinner + elapsed) -------------------------------- def _start_spinner(self) -> None: - """Show the working indicator and animate it while the turn runs.""" + """Show the working indicator and animate it while the turn runs. + + Skipped in voice mode — the voice bar already shows a "Thinking…" state, so a second + spinner would just be redundant chrome. 
+ """ self._turn_started = time.monotonic() + if self._voice_active(): + return self.query_one("#spinner", Static).display = True self._tick() self._spin_timer = self.set_interval(0.25, self._tick) # pragma: no mutate @@ -430,65 +461,16 @@ def _stop_spinner(self) -> None: def on_worker_state_changed(self, event: Worker.StateChanged) -> None: if event.worker.is_finished: - self._stop_spinner() - prompt = self.query_one("#prompt", Input) - prompt.disabled = False - prompt.focus() - self._voice_followup() # read a spoken summary back, then listen for the next turn - - # --- voice (speak-to-it / read-summary-back; the legs run off the UI thread) ---- - - def _voice_active(self) -> bool: - """Voice capture is on: a session exists and the mic hasn't been ruled out yet.""" - return self._voice is not None and not self._voice_typed - - def _spawn(self, target: Callable[[], None]) -> None: - """Run ``target`` on a daemon thread — voice legs block, so they stay off the UI thread.""" - threading.Thread(target=target, daemon=True).start() # pragma: no mutate - - def _begin_listening(self) -> None: - """Capture the next spoken turn on a background thread (no-op when voice is off).""" - if not self._voice_active(): - return - self._spawn(self._capture_voice_turn) - - def _voice_followup(self) -> None: - """After a turn finishes: read back a spoken summary, then listen for the next turn.""" - voice = self._voice - if voice is None: - return - self._spawn(lambda: self._speak_then_listen(voice)) - - def _speak_then_listen(self, voice: _VoiceIO) -> None: - """Read a summary of the last reply aloud (no code), then capture the next spoken turn.""" - voice.speak(spoken_summary(self._last_reply)) - self._capture_voice_turn() - - def _capture_voice_turn(self) -> None: - """Listen for one spoken turn; enter it into the prompt, or degrade to typing.""" - voice = self._voice - if voice is None or self._voice_typed: - return - try: - transcript = voice.listen() - except errors.CLIError as 
exc: - # A capture failure (no mic, STT error) drops voice for the rest of the session - # rather than wedging it — the user just types instead. - self._voice_typed = True - self.call_from_thread(self._notice_voice_off, exc.message) - return - if transcript: - self.call_from_thread(self._enter_and_submit, transcript) - - def _notice_voice_off(self, detail: str) -> None: - """Tell the user voice input stopped and that input is now typed (UI thread).""" - self.query_one("#log", RichLog).write( - f"[dim](voice input off: {escape(detail)}; type your request instead)[/dim]" - ) - - def _enter_and_submit(self, text: str) -> None: - """Show the spoken text in the prompt, then submit it as a turn (UI thread).""" - prompt = self.query_one("#prompt", Input) - prompt.value = text - self._submit(text) - prompt.value = "" + self._finish_turn() + + def _finish_turn(self) -> None: + """Wind down a completed turn: stop the spinner, re-enable input, resume voice.""" + self._stop_spinner() + if self._streaming_msg is not None: # a cancelled generation: keep what streamed in + self._finalize_reply(self._streaming_msg.text) + self.query_one("#prompt", Input).disabled = False + self._sync_input_mode() # focus the prompt (text mode) or show the listening bar + self._voice_followup() # read a spoken summary back, then listen for the next turn + + # The off-thread voice legs (_voice_active, _begin_listening, _capture_voice_turn, …) are + # inherited from _VoiceLegs; the render/toggle side stays above. diff --git a/aai_cli/code_agent/tui_status.py b/aai_cli/code_agent/tui_status.py new file mode 100644 index 00000000..5e385b55 --- /dev/null +++ b/aai_cli/code_agent/tui_status.py @@ -0,0 +1,51 @@ +"""Pure text helpers for the coding-agent TUI's status line and working indicator. + +Split out of `tui.py` (to keep it under the file-length gate) and free of any Textual +imports, so they unit-test as plain functions. 
+""" + +from __future__ import annotations + +from pathlib import Path + + +def _spinner_text(elapsed_s: int, frame: str) -> str: + """The working-indicator line: a spinner glyph and the elapsed seconds.""" + return f"{frame} Working… ({elapsed_s}s)" + + +def _abbrev_home(path: Path) -> str: + """Render ``path`` with the home directory collapsed to ``~``.""" + try: + return f"~/{path.relative_to(Path.home())}" + except ValueError: + return str(path) + + +def _git_branch(start: Path) -> str | None: + """The current git branch for ``start`` (walking up to the repo root), or None.""" + for directory in (start, *start.parents): + head = directory / ".git" / "HEAD" + if head.is_file(): + ref = head.read_text(encoding="utf-8").strip() + return ref.removeprefix("ref: refs/heads/") if ref.startswith("ref: ") else ref[:8] + return None + + +def _status_text(cwd: Path, *, auto_approve: bool, voice_state: str | None = None) -> str: + """The bottom status line: a mode badge, the working directory, git branch, and voice state. + + ``voice_state`` is ``"on"``/``"off"`` when the session has a voice front-end (so the + Ctrl-V toggle shows its effect), or ``None`` when voice isn't wired up at all. + """ + mode = "auto" if auto_approve else "manual" + badge = f"[black on #f59e0b] {mode} [/]" + parts = [badge, f"[dim]{_abbrev_home(cwd)}[/dim]"] + branch = _git_branch(cwd) + if branch: + parts.append(f"[dim]↗ {branch}[/dim]") + if voice_state is not None: + # A filled/hollow dot (BMP glyphs, like the rest of the UI — no double-width emoji). + glyph, color = ("●", "#22c55e") if voice_state == "on" else ("○", "#6b7280") + parts.append(f"[{color}]{glyph} voice {voice_state}[/]") + return " ".join(parts) diff --git a/aai_cli/code_agent/voice_ui.py b/aai_cli/code_agent/voice_ui.py new file mode 100644 index 00000000..cdac1e29 --- /dev/null +++ b/aai_cli/code_agent/voice_ui.py @@ -0,0 +1,107 @@ +"""The voice front-end legs for the coding-agent TUI, split out to keep `tui.py` small. 
+ +These are the speak-to-it / read-back mechanics that run *off* the UI thread (mic capture and +TTS readback block), marshaling back via ``call_from_thread``. They live in a mixin that +:class:`~aai_cli.code_agent.tui.CodeAgentApp` inherits, so the app stays one ``App`` with the +voice methods folded in. The render/toggle side (the voice bar, Ctrl-V) stays in `tui.py`. +""" + +from __future__ import annotations + +import threading +from typing import TYPE_CHECKING, Protocol + +from textual.app import App +from textual.widgets import Input + +from aai_cli.code_agent.voice import spoken_summary +from aai_cli.core import errors + +if TYPE_CHECKING: + from collections.abc import Callable + + +class _VoiceIO(Protocol): + """The speak-to-it / read-back slice the TUI drives; :class:`VoiceSession` satisfies it.""" + + def listen(self) -> str | None: + """Capture one spoken turn and return its transcript (``None`` on no speech).""" + + def speak(self, text: str) -> None: + """Read ``text`` back aloud (a no-op when readback is unavailable).""" + + +class _VoiceLegs(App[None]): + """Mixin holding the off-thread voice capture/readback legs for ``CodeAgentApp``. + + Extends ``App`` so the inherited ``query_one``/``call_from_thread`` are typed; the voice + state and the few app methods it leans on (``_set_voice_phase``/``_sync_input_mode``/ + ``_submit``) are provided by the concrete app and declared here for the type checker. + """ + + if TYPE_CHECKING: # provided by CodeAgentApp (state set in __init__, methods defined there) + _voice: _VoiceIO | None + _voice_typed: bool + _voice_paused: bool + _last_reply: str + + def _set_voice_phase(self, phase: str) -> None: ... + def _sync_input_mode(self) -> None: ... + def _submit(self, text: str) -> None: ... + def _note(self, text: str) -> None: ... 
+ + def _voice_active(self) -> bool: + """Voice capture is on: a session exists, the mic isn't ruled out, and it isn't paused.""" + return self._voice is not None and not self._voice_typed and not self._voice_paused + + def _spawn(self, target: Callable[[], None]) -> None: + """Run ``target`` on a daemon thread — voice legs block, so they stay off the UI thread.""" + threading.Thread(target=target, daemon=True).start() # pragma: no mutate + + def _begin_listening(self) -> None: + """Capture the next spoken turn on a background thread (no-op when voice is off).""" + if not self._voice_active(): + return + self._spawn(self._capture_voice_turn) + + def _voice_followup(self) -> None: + """After a turn finishes: read back a spoken summary, then listen for the next turn.""" + voice = self._voice + if voice is None or self._voice_paused: # paused via Ctrl-V: no readback, no listen + return + self._spawn(lambda: self._speak_then_listen(voice)) + + def _speak_then_listen(self, voice: _VoiceIO) -> None: + """Read a summary of the last reply aloud (no code), then capture the next spoken turn.""" + self.call_from_thread(self._set_voice_phase, "speaking") + voice.speak(spoken_summary(self._last_reply)) + self._capture_voice_turn() + + def _capture_voice_turn(self) -> None: + """Listen for one spoken turn; enter it into the prompt, or degrade to typing.""" + voice = self._voice + if voice is None or self._voice_typed or self._voice_paused: + return + self.call_from_thread(self._set_voice_phase, "listening") + try: + transcript = voice.listen() + except errors.CLIError as exc: + # A capture failure (no mic, STT error) drops voice for the rest of the session + # rather than wedging it — the user just types instead. 
+ self._voice_typed = True + self.call_from_thread(self._notice_voice_off, exc.message) + return + if transcript: + self.call_from_thread(self._enter_and_submit, transcript) + + def _notice_voice_off(self, detail: str) -> None: + """Tell the user voice input stopped and that input is now typed (UI thread).""" + self._note(f"voice input off: {detail}; type your request instead") + self._sync_input_mode() # mic ruled out -> bring the text box back + + def _enter_and_submit(self, text: str) -> None: + """Show the spoken text in the prompt, then submit it as a turn (UI thread).""" + prompt = self.query_one("#prompt", Input) + prompt.value = text + self._submit(text) + prompt.value = "" diff --git a/aai_cli/code_agent/web_search.py b/aai_cli/code_agent/web_search.py index d06af999..71ed2bff 100644 --- a/aai_cli/code_agent/web_search.py +++ b/aai_cli/code_agent/web_search.py @@ -19,6 +19,10 @@ # agent a tool that will fail on first use for lack of a key. TAVILY_API_KEY_ENV = "TAVILY_API_KEY" +# The name ``TavilySearch`` registers itself under. Callers (e.g. the live agent's prompt +# builder) detect web-search availability by this name, so a test pins it against the tool. +WEB_SEARCH_TOOL_NAME = "tavily_search" + # A small result cap keeps search responses inside the model's context budget. _DEFAULT_MAX_RESULTS = 5 diff --git a/aai_cli/code_gen/agent_cascade.py b/aai_cli/code_gen/agent_cascade.py index 0a861911..5f5306f0 100644 --- a/aai_cli/code_gen/agent_cascade.py +++ b/aai_cli/code_gen/agent_cascade.py @@ -16,9 +16,11 @@ # which is never formatted — so no brace has to be doubled. _HEADER = """\ # Live voice cascade: Streaming STT -> LLM Gateway -> streaming TTS, wired client-side. 
-# This is what `assembly --sandbox agent-cascade` runs: it transcribes your speech, +# The basic cascade behind `assembly --sandbox live`: it transcribes your speech, # sends each finalized turn to the LLM Gateway, and speaks the reply through streaming # TTS — the same three primitives the agent-cascade init template wires server-side. +# (The `live` command adds a tool-using agent on the LLM leg; this snippet is the +# plain single-completion version to build from.) # Requires audio + websockets: pip install sounddevice websockets openai # Tip: use headphones — the mic stays open while the agent speaks, so on speakers it # would hear itself and loop. diff --git a/aai_cli/commands/agent/__init__.py b/aai_cli/commands/agent/__init__.py index f535b54c..b20dfc2a 100644 --- a/aai_cli/commands/agent/__init__.py +++ b/aai_cli/commands/agent/__init__.py @@ -84,7 +84,7 @@ def agent( help="Print the equivalent Python SDK code and exit (does not start a session)", ), ) -> None: - """Hold a live two-way voice conversation with a voice agent + """Hold a live two-way voice conversation with the Voice Agent API Use headphones: the mic stays open while the agent speaks, so on speakers it would hear itself and loop. 
Pass an audio file/URL (or diff --git a/aai_cli/commands/agent_cascade/__init__.py b/aai_cli/commands/agent_cascade/__init__.py index 3e99f146..b17e85e8 100644 --- a/aai_cli/commands/agent_cascade/__init__.py +++ b/aai_cli/commands/agent_cascade/__init__.py @@ -31,7 +31,7 @@ SPEC = command_registry.CommandModuleSpec( panel=help_panels.TRANSCRIPTION, order=45, # pragma: no mutate -- sparse rank; a +-1 shift is order-equivalent - commands=("agent-cascade",), + commands=("live",), ) @@ -43,28 +43,28 @@ def _emit_voice_list(_state: AppState, json_mode: bool) -> None: @app.command( - name="agent-cascade", + name="live", rich_help_panel=help_panels.TRANSCRIPTION, epilog=examples_epilog( [ - ("Start a live cascade conversation", "assembly --sandbox agent-cascade"), + ("Start a live voice conversation", "assembly --sandbox live"), ( "Pick a voice and opening line", - 'assembly --sandbox agent-cascade --voice michael --greeting "Hi there"', + 'assembly --sandbox live --voice michael --greeting "Hi there"', ), ( "Give the agent a persona", - 'assembly --sandbox agent-cascade --system-prompt "You are a terse pirate."', + 'assembly --sandbox live --system-prompt "You are a terse pirate."', ), - ("See available voices", "assembly --sandbox agent-cascade --list-voices"), + ("See available voices", "assembly --sandbox live --list-voices"), ( "Print equivalent Python instead of running", - "assembly --sandbox agent-cascade --show-code", + "assembly --sandbox live --show-code", ), ] ), ) -def agent_cascade( +def live( ctx: typer.Context, source: str | None = typer.Argument( None, help="Audio file path or URL to speak to the agent. Omit to use the microphone." 
@@ -169,14 +169,15 @@ def agent_cascade( help="Print the equivalent Python SDK code and exit (does not start a session)", ), ) -> None: - """\\[sandbox] Hold a live voice conversation through a self-wired cascade + """\\[sandbox] Talk live to a tool-using voice agent - Like 'assembly agent', but instead of AssemblyAI's Voice Agent endpoint this - wires the three primitives together itself — Streaming STT, the LLM Gateway, - and streaming TTS — exactly like the 'agent-cascade' init template does - server-side. Because it uses streaming TTS it only runs in the sandbox: run - it as 'assembly --sandbox agent-cascade' (--sandbox goes before the - subcommand). + A real-time spoken conversation, wired client-side from three primitives — + Streaming STT, a deepagents brain on the LLM Gateway, and streaming TTS. Unlike + 'assembly agent' (the Voice Agent API), the brain here is an agent that can use + tools mid-conversation — web search, URL fetch, and the AssemblyAI docs — so it + answers like a live multimodal assistant. Because it uses streaming TTS it only + runs in the sandbox: run it as 'assembly --sandbox live' (--sandbox goes before + the subcommand). Use headphones: the mic stays open while the agent speaks, so on speakers it would hear itself and loop. Pass an audio file/URL (or --sample) to speak a @@ -185,6 +186,9 @@ def agent_cascade( This only runs a conversation in the terminal — it writes no code. To build an agent-cascade app, run 'assembly init agent-cascade' instead. + + Web search needs a TAVILY_API_KEY in the environment; without it the agent + keeps its URL-fetch and docs tools. 
""" if list_voices: diff --git a/aai_cli/commands/agent_cascade/_exec.py b/aai_cli/commands/agent_cascade/_exec.py index 0b97e230..af466c56 100644 --- a/aai_cli/commands/agent_cascade/_exec.py +++ b/aai_cli/commands/agent_cascade/_exec.py @@ -169,9 +169,9 @@ def _print_show_code(opts: AgentCascadeOptions, system_prompt_text: str) -> None def run_agent_cascade(opts: AgentCascadeOptions, state: AppState, *, json_mode: bool) -> None: """Execute one `assembly agent-cascade` cascade from already-parsed flags.""" text_mode, json_mode = resolve_output_modes(opts.output_field, json_mode=json_mode) - validate_voice(opts.voice, voices.VOICE_NAMES, command="agent-cascade") + validate_voice(opts.voice, voices.VOICE_NAMES, command="live") # Streaming TTS has no production host, so the whole cascade is sandbox-only. - tts_session.require_available("agent-cascade") + tts_session.require_available("live") system_prompt_text = _resolve_system_prompt(opts.system_prompt, opts.system_prompt_file) if opts.show_code: diff --git a/aai_cli/core/microphone.py b/aai_cli/core/microphone.py index e75576d4..4ec65dda 100644 --- a/aai_cli/core/microphone.py +++ b/aai_cli/core/microphone.py @@ -1,5 +1,8 @@ from __future__ import annotations +import atexit +import contextlib +import signal import warnings from abc import abstractmethod from collections.abc import Callable, Iterable, Iterator, Mapping @@ -57,6 +60,42 @@ def audio_missing_error() -> CLIError: ) +# Process-global once-latch. The default is only observable on the very first install +# in a fresh process; the suite mutates this flag across tests, so the load-time value +# can't be asserted in isolation — the check/set in _install_… are what the tests pin. +_shutdown_interrupt_guard_installed = False # pragma: no mutate + + +def _ignore_interrupt_during_shutdown() -> None: + """Drop SIGINT for the remainder of interpreter shutdown. + + sounddevice registers its own atexit handler that calls ``Pa_Terminate`` to tear + down PortAudio. 
A second Ctrl-C while that runs raises ``KeyboardInterrupt`` + *inside* the atexit callback, which Python reports as a noisy "Exception ignored in + atexit callback" traceback — even though the first Ctrl-C already stopped the + session cleanly. There is nothing left to cancel once we're exiting, so ignore the + late interrupt. + """ + # signal.signal only works on the main thread; atexit runs there, but a ValueError + # is still possible in odd embeddings, so guard it rather than crash the teardown. + with contextlib.suppress(ValueError): + signal.signal(signal.SIGINT, signal.SIG_IGN) + + +def _install_shutdown_interrupt_guard() -> None: + """Register ``_ignore_interrupt_during_shutdown`` with atexit exactly once. + + Registered *after* sounddevice imports so atexit's LIFO order runs our guard + before sounddevice's PortAudio teardown, neutralizing a second Ctrl-C that would + otherwise raise inside that atexit callback. + """ + global _shutdown_interrupt_guard_installed + if _shutdown_interrupt_guard_installed: + return + atexit.register(_ignore_interrupt_during_shutdown) + _shutdown_interrupt_guard_installed = True + + def import_sounddevice() -> ModuleType: """Import sounddevice lazily, mapping an ImportError to ``audio_missing_error``. @@ -68,6 +107,7 @@ def import_sounddevice() -> ModuleType: import sounddevice except ImportError as exc: raise audio_missing_error() from exc + _install_shutdown_interrupt_guard() module: ModuleType = sounddevice return module diff --git a/install.sh b/install.sh index c884e247..b87e8641 100755 --- a/install.sh +++ b/install.sh @@ -3,11 +3,93 @@ set -e # Exit on any error # Canonical installer for the AssemblyAI CLI (`assembly`). -# Installs the app as a uv tool, bootstrapping uv first if it is missing. +# +# Default: installs the latest published code as an isolated tool with uv (or +# pipx), bootstrapping uv when neither is present. 
+# Dev mode (--install-method git / --dev): clones the repo (or reuses the +# checkout you run this from) and installs it editable (`uv tool install -e .`), +# so local source edits take effect without reinstalling. +# Either way it then installs the optional system deps via Homebrew if available. +# +# Usage: +# curl -LsSf https://raw.githubusercontent.com/AssemblyAI/cli/main/install.sh | bash +# ./install.sh --dev # editable, from a clone +# curl -LsSf .../install.sh | bash -s -- --install-method git -PACKAGE="git+https://github.com/AssemblyAI/cli.git" +REPO_URL="https://github.com/AssemblyAI/cli.git" +PACKAGE="git+${REPO_URL}" PYTHON_VERSION="3.13" +# Install method: "release" (default, publish-style) or "git" (editable clone). +# Overridable by env or the flags parsed below. +INSTALL_METHOD="${AAI_INSTALL_METHOD:-release}" +GIT_DIR="${AAI_GIT_DIR:-$HOME/.local/share/assembly-cli}" +# Passed to the installer as `-e` only in dev mode (empty array otherwise). +EDITABLE=() + +usage() { + cat <<'EOF' +Install the AssemblyAI CLI (assembly). + +Usage: install.sh [options] + +Options: + --install-method release (default): install the latest + published code. git: clone the repo and + install it editable (development mode). + --dev, -e, --editable, --git Shortcut for --install-method git. + --release Shortcut for --install-method release. + --dir Clone directory for dev mode + (default: ~/.local/share/assembly-cli). + -h, --help Show this help. + +Environment: + AAI_INSTALL_METHOD=release|git Same as --install-method. + AAI_GIT_DIR= Same as --dir. 
+EOF +} + +while [ $# -gt 0 ]; do + case "$1" in + --install-method | --method) + [ $# -ge 2 ] || { + echo "Missing value for $1" >&2 + exit 2 + } + INSTALL_METHOD="$2" + shift + ;; + --dev | -e | --editable | --git) INSTALL_METHOD="git" ;; + --release | --published) INSTALL_METHOD="release" ;; + --dir | --git-dir) + [ $# -ge 2 ] || { + echo "Missing value for $1" >&2 + exit 2 + } + GIT_DIR="$2" + shift + ;; + -h | --help) + usage + exit 0 + ;; + *) + echo "Unknown option: $1" >&2 + usage >&2 + exit 2 + ;; + esac + shift +done + +case "$INSTALL_METHOD" in +release | git) ;; +*) + echo "Invalid --install-method: $INSTALL_METHOD (use 'release' or 'git')" >&2 + exit 2 + ;; +esac + # Best-effort check for the PortAudio shared library (no `command` to probe, so # look via pkg-config, the dynamic linker cache, then well-known lib paths). has_portaudio() { @@ -33,17 +115,41 @@ has_portaudio() { return 1 } -# Homebrew also pulls in ffmpeg, portaudio, and cloudflared. The uv install does -# not, so detect any that are missing and print how to install them — without -# touching the system or invoking sudo on the user's behalf. -advise_system_deps() { - local missing=() - command -v ffmpeg >/dev/null 2>&1 || missing+=("ffmpeg") - has_portaudio || missing+=("portaudio") - command -v cloudflared >/dev/null 2>&1 || missing+=("cloudflared") +# Populate MISSING_DEPS with the optional system deps not already on the system. +MISSING_DEPS=() +detect_missing_deps() { + MISSING_DEPS=() + command -v ffmpeg >/dev/null 2>&1 || MISSING_DEPS+=("ffmpeg") + has_portaudio || MISSING_DEPS+=("portaudio") + command -v cloudflared >/dev/null 2>&1 || MISSING_DEPS+=("cloudflared") +} + +# Homebrew also pulls in ffmpeg, portaudio, and cloudflared. The uv/pipx installs +# do not, so detect any that are missing. 
If Homebrew is available we install the +# ones it actually carries (brew needs no sudo); for anything left we print how to +# install it — without touching the system or invoking sudo on the user's behalf. +install_system_deps() { + detect_missing_deps + [ ${#MISSING_DEPS[@]} -eq 0 ] && return 0 - [ ${#missing[@]} -eq 0 ] && return 0 + if command -v brew >/dev/null 2>&1; then + # Only ask Homebrew for formulae it actually has, so an unavailable one + # can't fail the whole batch; `brew info` exits non-zero for unknown names. + local brew_pkgs=() dep + for dep in "${MISSING_DEPS[@]}"; do + brew info --formula "$dep" >/dev/null 2>&1 && brew_pkgs+=("$dep") + done + if [ ${#brew_pkgs[@]} -gt 0 ]; then + echo "" + echo "Installing optional system dependencies with Homebrew: ${brew_pkgs[*]}" + brew install "${brew_pkgs[@]}" || true + fi + # Re-detect so we only advise on whatever brew couldn't provide. + detect_missing_deps + [ ${#MISSING_DEPS[@]} -eq 0 ] && return 0 + fi + local missing=("${MISSING_DEPS[@]}") echo "" echo "Optional system dependencies are missing: ${missing[*]}" echo "(core 'assembly transcribe' works without them)" @@ -90,25 +196,71 @@ advise_system_deps() { esac } -if ! command -v uv &>/dev/null; then - echo "uv is not installed. Installing..." +# Resolve the source for a development (editable) install: reuse the checkout we +# are run from if it is the CLI repo, otherwise clone/update GIT_DIR. Sets PACKAGE +# to the local path and EDITABLE so the installer passes `-e`. +prepare_git_source() { + if [ -f pyproject.toml ] && grep -q '^name = "aai-cli"' pyproject.toml 2>/dev/null; then + PACKAGE="$(pwd)" + echo "Development install from current checkout: $PACKAGE" + else + if ! 
command -v git >/dev/null 2>&1; then + echo "Development install needs git to clone $REPO_URL" >&2 + exit 1 + fi + if [ -d "$GIT_DIR/.git" ]; then + echo "Updating existing clone at $GIT_DIR" + git -C "$GIT_DIR" pull --ff-only + else + echo "Cloning $REPO_URL to $GIT_DIR" + mkdir -p "$(dirname "$GIT_DIR")" + git clone "$REPO_URL" "$GIT_DIR" + fi + PACKAGE="$GIT_DIR" + echo "Development install from $PACKAGE" + fi + EDITABLE=(-e) +} + +# Install `assembly` as an isolated tool. Prefer uv (it manages an isolated +# Python for us), then fall back to an existing pipx, and only bootstrap uv if +# neither is already present. EDITABLE is empty for a release install and `-e` +# for a dev install. +install_with_uv() { + # "$1" is the uv executable to invoke. + "$1" tool install -U "${EDITABLE[@]}" "$PACKAGE" --python "$PYTHON_VERSION" +} + +[ "$INSTALL_METHOD" = "git" ] && prepare_git_source + +if command -v uv >/dev/null 2>&1; then + # `uv self update` errors out when uv was installed via an external package + # manager (Homebrew, apt, …) — it can't replace a binary it doesn't own. That + # is not fatal to us: a managed uv is already kept current by its manager, so + # swallow the failure and proceed straight to installing the CLI. + uv self update 2>/dev/null || true + install_with_uv uv +elif command -v pipx >/dev/null 2>&1; then + # --force makes a re-run upgrade in place: the git source's version may not + # change between commits, so a plain `pipx install` would refuse as "already + # installed" and never pick up new code. + pipx install --force "${EDITABLE[@]}" "$PACKAGE" +else + echo "Neither uv nor pipx found. Installing uv..." curl -LsSf https://astral.sh/uv/install.sh | sh echo "uv installation complete!" 
echo "" if [ -x "$HOME/.local/bin/uv" ]; then - "$HOME/.local/bin/uv" tool install -U "$PACKAGE" --python "$PYTHON_VERSION" + install_with_uv "$HOME/.local/bin/uv" else echo "Please restart your shell and run this script again" echo "" exit 0 fi -else - uv self update - uv tool install -U "$PACKAGE" --python "$PYTHON_VERSION" fi -advise_system_deps || true +install_system_deps || true echo "" echo "For help and support, see the AssemblyAI CLI repository" diff --git a/pyproject.toml b/pyproject.toml index a7eb9e4e..e2da9d8a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -258,9 +258,10 @@ module = [ "aai_cli.code_agent.store", "aai_cli.code_agent.model", "aai_cli.commands.code._exec", + "aai_cli.agent_cascade.brain", ] disallow_any_generics = false -disable_error_code = ["return-value", "arg-type", "type-arg"] +disable_error_code = ["return-value", "arg-type", "type-arg", "call-arg"] [tool.pyright] # Second type checker alongside mypy: pyright catches a different class of @@ -279,7 +280,7 @@ exclude = ["**/node_modules", "**/__pycache__", "**/.*"] # Unknown*/invariance diagnostics our precise signatures can't satisfy. mypy still # type-checks these modules (with the targeted overrides above) as the safety net, so # we suppress pyright diagnostics here rather than littering per-line `# pyright: ignore`. -ignore = ["aai_cli/code_agent", "aai_cli/commands/code"] +ignore = ["aai_cli/code_agent", "aai_cli/commands/code", "aai_cli/agent_cascade/brain.py"] pythonVersion = "3.12" typeCheckingMode = "strict" # Third-party deps (assemblyai, sounddevice) ship no type stubs. @@ -419,6 +420,8 @@ max-statements = 40 "aai_cli/core/environments.py" = ["PLW0603"] # Verbosity is process-global startup state by design (mirrors environments.py). "aai_cli/core/debuglog.py" = ["PLW0603"] +# The "shutdown SIGINT guard installed" latch is process-global once-only state. +"aai_cli/core/microphone.py" = ["PLW0603"] # BaseHTTPRequestHandler.log_message requires a parameter named `format`. 
"aai_cli/auth/loopback.py" = ["A002"] # Template constants include URL path names such as TOKEN_PATH, not credentials. diff --git a/pyrightconfig.tests.json b/pyrightconfig.tests.json index 1ea7be4a..f9dbdf0e 100644 --- a/pyrightconfig.tests.json +++ b/pyrightconfig.tests.json @@ -3,7 +3,8 @@ "ignore": [ "tests/test_code_agent.py", "tests/test_code_command.py", - "tests/test_code_tui.py" + "tests/test_code_tui.py", + "tests/test_agent_cascade_brain.py" ], "pythonVersion": "3.12", "typeCheckingMode": "standard", diff --git a/scripts/generated_code_compile_gate.py b/scripts/generated_code_compile_gate.py index 8d258efe..bd71efdf 100644 --- a/scripts/generated_code_compile_gate.py +++ b/scripts/generated_code_compile_gate.py @@ -118,10 +118,10 @@ def main() -> int: ), ( # Sandbox-only: streaming TTS has no prod host, so --sandbox makes the URLs valid. - "agent-cascade-basic", + "live-basic", ( "--sandbox", - "agent-cascade", + "live", "--voice", "jane", "--greeting", diff --git a/tests/__snapshots__/test_snapshots_help_root.ambr b/tests/__snapshots__/test_snapshots_help_root.ambr index 82cc9dc9..2bb0f987 100644 --- a/tests/__snapshots__/test_snapshots_help_root.ambr +++ b/tests/__snapshots__/test_snapshots_help_root.ambr @@ -32,60 +32,59 @@ │ exit. 
│ ╰──────────────────────────────────────────────────────────────────────────────╯ ╭─ Quick Start ────────────────────────────────────────────────────────────────╮ - │ onboard Guided setup: sign in and run your first transcription │ + │ onboard Guided setup: sign in and run your first transcription │ ╰──────────────────────────────────────────────────────────────────────────────╯ ╭─ Coding Agent ───────────────────────────────────────────────────────────────╮ - │ code Run a terminal coding agent backed by the AssemblyAI LLM │ - │ Gateway │ + │ code Run a terminal coding agent backed by the AssemblyAI LLM │ + │ Gateway │ ╰──────────────────────────────────────────────────────────────────────────────╯ ╭─ Build an App ───────────────────────────────────────────────────────────────╮ - │ init Scaffold a new app from a template and launch it │ - │ dev Run the dev server for the app in the current directory │ - │ share Expose the local app on a public URL via a cloudflared tunnel │ - │ deploy Deploy the current project to Vercel, Railway, or Fly.io │ + │ init Scaffold a new app from a template and launch it │ + │ dev Run the dev server for the app in the current directory │ + │ share Expose the local app on a public URL via a cloudflared tunnel │ + │ deploy Deploy the current project to Vercel, Railway, or Fly.io │ ╰──────────────────────────────────────────────────────────────────────────────╯ ╭─ Run AssemblyAI ─────────────────────────────────────────────────────────────╮ - │ transcribe Transcribe a file, URL, or YouTube/podcast link — or a whole │ - │ batch │ - │ stream Transcribe live audio in real time from a mic, file, URL, or │ - │ pipe │ - │ dictate Signal-driven dictation: record the mic, get the transcript │ - │ back │ - │ agent Hold a live two-way voice conversation with a voice agent │ - │ agent-cascade [sandbox] Hold a live voice conversation through a self-wired │ - │ cascade │ - │ speak [sandbox] Synthesize speech from text with AssemblyAI │ - │ streaming 
TTS │ - │ llm Send a prompt to AssemblyAI's LLM Gateway and print the reply │ - │ clip Cut clips from media by speaker, text match, LLM pick, or │ - │ time range │ - │ dub [sandbox] Dub a video or audio file into another language │ - │ caption Burn always-visible captions into a video │ - │ eval Transcribe one or more datasets and score WER against their │ - │ reference texts │ - │ webhooks Receive webhook deliveries on a public dev URL │ + │ transcribe Transcribe a file, URL, or YouTube/podcast link — or a whole │ + │ batch │ + │ stream Transcribe live audio in real time from a mic, file, URL, or │ + │ pipe │ + │ dictate Signal-driven dictation: record the mic, get the transcript │ + │ back │ + │ agent Hold a live two-way voice conversation with the Voice Agent API │ + │ live [sandbox] Talk live to a tool-using voice agent │ + │ speak [sandbox] Synthesize speech from text with AssemblyAI streaming │ + │ TTS │ + │ llm Send a prompt to AssemblyAI's LLM Gateway and print the reply │ + │ clip Cut clips from media by speaker, text match, LLM pick, or time │ + │ range │ + │ dub [sandbox] Dub a video or audio file into another language │ + │ caption Burn always-visible captions into a video │ + │ eval Transcribe one or more datasets and score WER against their │ + │ reference texts │ + │ webhooks Receive webhook deliveries on a public dev URL │ ╰──────────────────────────────────────────────────────────────────────────────╯ ╭─ Setup & Tools ──────────────────────────────────────────────────────────────╮ - │ doctor Check that your environment is ready for AssemblyAI │ - │ setup Set up your coding agent for AssemblyAI (docs MCP + skills) │ - │ config Inspect and edit persisted CLI settings (profiles, env, │ - │ telemetry) │ - │ update Update the CLI to the latest release (brew/pipx/uv) │ - │ telemetry Anonymous usage telemetry: status, enable, disable │ + │ doctor Check that your environment is ready for AssemblyAI │ + │ setup Set up your coding agent for AssemblyAI (docs 
MCP + skills) │ + │ config Inspect and edit persisted CLI settings (profiles, env, │ + │ telemetry) │ + │ update Update the CLI to the latest release (brew/pipx/uv) │ + │ telemetry Anonymous usage telemetry: status, enable, disable │ ╰──────────────────────────────────────────────────────────────────────────────╯ ╭─ History ────────────────────────────────────────────────────────────────────╮ - │ transcripts Browse and fetch past transcripts │ - │ sessions Browse your past streaming (real-time) sessions │ + │ transcripts Browse and fetch past transcripts │ + │ sessions Browse your past streaming (real-time) sessions │ ╰──────────────────────────────────────────────────────────────────────────────╯ ╭─ Account ────────────────────────────────────────────────────────────────────╮ - │ login Authenticate via your browser and store a CLI API key │ - │ logout Clear stored credentials for the active profile │ - │ whoami Show the active profile and whether its key works │ - │ balance Show your remaining account balance │ - │ usage Show usage over a date range (default: last 30 days) │ - │ limits Show your account's rate limits per service │ - │ keys List, create, and rename your AssemblyAI API keys │ - │ audit List recent audit-log entries for your account │ + │ login Authenticate via your browser and store a CLI API key │ + │ logout Clear stored credentials for the active profile │ + │ whoami Show the active profile and whether its key works │ + │ balance Show your remaining account balance │ + │ usage Show usage over a date range (default: last 30 days) │ + │ limits Show your account's rate limits per service │ + │ keys List, create, and rename your AssemblyAI API keys │ + │ audit List recent audit-log entries for your account │ ╰──────────────────────────────────────────────────────────────────────────────╯ Examples diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr index a36aa130..23ab4f37 100644 --- 
a/tests/__snapshots__/test_snapshots_help_run.ambr +++ b/tests/__snapshots__/test_snapshots_help_run.ambr @@ -1,121 +1,10 @@ # serializer version: 1 -# name: test_command_help_matches_snapshot[agent-cascade] - ''' - - Usage: assembly agent-cascade [OPTIONS] [SOURCE] - - [sandbox] Hold a live voice conversation through a self-wired cascade - - Like 'assembly agent', but instead of AssemblyAI's Voice Agent endpoint this - wires the three primitives together itself — Streaming STT, the LLM Gateway, - and streaming TTS — exactly like the 'agent-cascade' init template does - server-side. Because it uses streaming TTS it only runs in the sandbox: run - it as 'assembly --sandbox agent-cascade' (--sandbox goes before the - subcommand). - - Use headphones: the mic stays open while the agent speaks, so on speakers it - would hear itself and loop. Pass an audio file/URL (or --sample) to speak a - recorded clip instead of the microphone; the session then ends after the - agent's reply. - - This only runs a conversation in the terminal — it writes no code. To build - an agent-cascade app, run 'assembly init agent-cascade' instead. - - ╭─ Arguments ──────────────────────────────────────────────────────────────────╮ - │ source [SOURCE] Audio file path or URL to speak to the agent. Omit │ - │ to use the microphone. │ - ╰──────────────────────────────────────────────────────────────────────────────╯ - ╭─ Options ────────────────────────────────────────────────────────────────────╮ - │ --sample Speak the hosted wildfires.mp3 │ - │ sample to the agent │ - │ --system-prompt TEXT System prompt (the agent's │ - │ persona) │ - │ [default: You are a friendly, │ - │ concise voice assistant. Keep │ - │ replies short and conversational. │ - │ Your reply is read aloud by a │ - │ text-to-speech engine, so write │ - │ plain spoken prose — no markdown, │ - │ emoji, bullet lists, or code.] 
│ - │ --system-prompt-file FILE Read the system prompt from a │ - │ file (overrides --system-prompt) │ - │ --greeting TEXT Spoken greeting │ - │ [default: Hi! I'm your AssemblyAI │ - │ voice agent. What can I help you │ - │ with?] │ - │ --device INTEGER Microphone device index │ - │ --list-voices Print known voices and exit │ - │ --json -j Emit newline-delimited JSON │ - │ events │ - │ --output -o [text|json] Output mode: text (you:/agent: │ - │ lines as plain stdout, │ - │ pipe-friendly) or json │ - │ --show-code Print the equivalent Python SDK │ - │ code and exit (does not start a │ - │ session) │ - │ --help Show this message and exit. │ - ╰──────────────────────────────────────────────────────────────────────────────╯ - ╭─ Text-to-speech ─────────────────────────────────────────────────────────────╮ - │ --voice TEXT TTS voice. See --list-voices. [default: jane] │ - │ --language TEXT TTS language (defaults to the voice's language) │ - │ --tts-config TEXT Set any extra streaming-TTS query field as │ - │ KEY=VALUE (repeatable) │ - ╰──────────────────────────────────────────────────────────────────────────────╯ - ╭─ Language model ─────────────────────────────────────────────────────────────╮ - │ --model TEXT LLM Gateway model that powers the │ - │ agent's replies │ - │ [default: │ - │ claude-haiku-4-5-20251001] │ - │ --max-tokens INTEGER RANGE [x>=1] Max tokens per reply │ - │ [default: 8192] │ - │ --llm-config TEXT Set any LLM Gateway request field │ - │ as KEY=VALUE (repeatable) │ - ╰──────────────────────────────────────────────────────────────────────────────╯ - ╭─ Speech-to-text ─────────────────────────────────────────────────────────────╮ - │ --speech-model TEXT Streaming speech │ - │ model │ - │ [default: │ - │ u3-rt-pro] │ - │ --format-turns --no-format-turns Format │ - │ (punctuate) │ - │ finalized turns │ - │ before replying │ - │ [default: │ - │ format-turns] │ - │ --turn-detection [aggressive|bala Turn-detection │ - │ nced|conservativ sensitivity │ - │ 
e] preset │ - │ --stt-config TEXT Set any │ - │ StreamingParame… │ - │ field as │ - │ KEY=VALUE │ - │ (repeatable) │ - │ --stt-config-file FILE JSON file of │ - │ streaming fields │ - ╰──────────────────────────────────────────────────────────────────────────────╯ - - Examples - Start a live cascade conversation - $ assembly --sandbox agent-cascade - Pick a voice and opening line - $ assembly --sandbox agent-cascade --voice michael --greeting "Hi there" - Give the agent a persona - $ assembly --sandbox agent-cascade --system-prompt "You are a terse pirate." - See available voices - $ assembly --sandbox agent-cascade --list-voices - Print equivalent Python instead of running - $ assembly --sandbox agent-cascade --show-code - - - - ''' -# --- # name: test_command_help_matches_snapshot[agent] ''' Usage: assembly agent [OPTIONS] [SOURCE] - Hold a live two-way voice conversation with a voice agent + Hold a live two-way voice conversation with the Voice Agent API Use headphones: the mic stays open while the agent speaks, so on speakers it would hear itself and loop. Pass an audio file/URL (or @@ -698,6 +587,126 @@ + ''' +# --- +# name: test_command_help_matches_snapshot[live] + ''' + + Usage: assembly live [OPTIONS] [SOURCE] + + [sandbox] Talk live to a tool-using voice agent + + A real-time spoken conversation, wired client-side from three primitives — + Streaming STT, a deepagents brain on the LLM Gateway, and streaming TTS. + Unlike + 'assembly agent' (the Voice Agent API), the brain here is an agent that can + use + tools mid-conversation — web search, URL fetch, and the AssemblyAI docs — so + it + answers like a live multimodal assistant. Because it uses streaming TTS it + only + runs in the sandbox: run it as 'assembly --sandbox live' (--sandbox goes + before + the subcommand). + + Use headphones: the mic stays open while the agent speaks, so on speakers it + would hear itself and loop. 
Pass an audio file/URL (or --sample) to speak a + recorded clip instead of the microphone; the session then ends after the + agent's reply. + + This only runs a conversation in the terminal — it writes no code. To build + an agent-cascade app, run 'assembly init agent-cascade' instead. + + Web search needs a TAVILY_API_KEY in the environment; without it the agent + keeps its URL-fetch and docs tools. + + ╭─ Arguments ──────────────────────────────────────────────────────────────────╮ + │ source [SOURCE] Audio file path or URL to speak to the agent. Omit │ + │ to use the microphone. │ + ╰──────────────────────────────────────────────────────────────────────────────╯ + ╭─ Options ────────────────────────────────────────────────────────────────────╮ + │ --sample Speak the hosted wildfires.mp3 │ + │ sample to the agent │ + │ --system-prompt TEXT System prompt (the agent's │ + │ persona) │ + │ [default: You are a friendly, │ + │ concise voice assistant. Keep │ + │ replies short and conversational. │ + │ Your reply is read aloud by a │ + │ text-to-speech engine, so write │ + │ plain spoken prose — no markdown, │ + │ emoji, bullet lists, or code.] │ + │ --system-prompt-file FILE Read the system prompt from a │ + │ file (overrides --system-prompt) │ + │ --greeting TEXT Spoken greeting │ + │ [default: Hi! I'm your AssemblyAI │ + │ voice agent. What can I help you │ + │ with?] │ + │ --device INTEGER Microphone device index │ + │ --list-voices Print known voices and exit │ + │ --json -j Emit newline-delimited JSON │ + │ events │ + │ --output -o [text|json] Output mode: text (you:/agent: │ + │ lines as plain stdout, │ + │ pipe-friendly) or json │ + │ --show-code Print the equivalent Python SDK │ + │ code and exit (does not start a │ + │ session) │ + │ --help Show this message and exit. 
│ + ╰──────────────────────────────────────────────────────────────────────────────╯ + ╭─ Text-to-speech ─────────────────────────────────────────────────────────────╮ + │ --voice TEXT TTS voice. See --list-voices. [default: jane] │ + │ --language TEXT TTS language (defaults to the voice's language) │ + │ --tts-config TEXT Set any extra streaming-TTS query field as │ + │ KEY=VALUE (repeatable) │ + ╰──────────────────────────────────────────────────────────────────────────────╯ + ╭─ Language model ─────────────────────────────────────────────────────────────╮ + │ --model TEXT LLM Gateway model that powers the │ + │ agent's replies │ + │ [default: │ + │ claude-haiku-4-5-20251001] │ + │ --max-tokens INTEGER RANGE [x>=1] Max tokens per reply │ + │ [default: 8192] │ + │ --llm-config TEXT Set any LLM Gateway request field │ + │ as KEY=VALUE (repeatable) │ + ╰──────────────────────────────────────────────────────────────────────────────╯ + ╭─ Speech-to-text ─────────────────────────────────────────────────────────────╮ + │ --speech-model TEXT Streaming speech │ + │ model │ + │ [default: │ + │ u3-rt-pro] │ + │ --format-turns --no-format-turns Format │ + │ (punctuate) │ + │ finalized turns │ + │ before replying │ + │ [default: │ + │ format-turns] │ + │ --turn-detection [aggressive|bala Turn-detection │ + │ nced|conservativ sensitivity │ + │ e] preset │ + │ --stt-config TEXT Set any │ + │ StreamingParame… │ + │ field as │ + │ KEY=VALUE │ + │ (repeatable) │ + │ --stt-config-file FILE JSON file of │ + │ streaming fields │ + ╰──────────────────────────────────────────────────────────────────────────────╯ + + Examples + Start a live voice conversation + $ assembly --sandbox live + Pick a voice and opening line + $ assembly --sandbox live --voice michael --greeting "Hi there" + Give the agent a persona + $ assembly --sandbox live --system-prompt "You are a terse pirate." 
+ See available voices + $ assembly --sandbox live --list-voices + Print equivalent Python instead of running + $ assembly --sandbox live --show-code + + + ''' # --- # name: test_command_help_matches_snapshot[llm] diff --git a/tests/test_agent_cascade_brain.py b/tests/test_agent_cascade_brain.py new file mode 100644 index 00000000..9f0509ac --- /dev/null +++ b/tests/test_agent_cascade_brain.py @@ -0,0 +1,235 @@ +"""Tests for the deepagents reply brain behind `assembly live`. + +The brain's only network seam is the compiled graph, so `build_completer` is driven +against the *real* deepagents graph wired to a fake chat model (pytest-socket stays +armed) — no sockets. `build_live_tools` and `build_model`'s new knobs are unit-tested +directly. +""" + +from __future__ import annotations + +from langchain_core.language_models.chat_models import BaseChatModel +from langchain_core.messages import AIMessage +from langchain_core.outputs import ChatGeneration, ChatResult + +from aai_cli.agent_cascade import brain +from aai_cli.agent_cascade.config import CascadeConfig +from aai_cli.code_agent import model as model_mod + + +class FakeChatModel(BaseChatModel): + """A chat model that replays a scripted list of AIMessages (mirrors the code agent's).""" + + responses: list[AIMessage] + index: int = 0 + + @property + def _llm_type(self) -> str: + return "fake-live-model" + + def bind_tools(self, tools, **kwargs): + del tools, kwargs + return self + + def _generate(self, messages, stop=None, run_manager=None, **kwargs): + del messages, stop, run_manager, kwargs + message = self.responses[self.index] + self.index += 1 + return ChatResult(generations=[ChatGeneration(message=message)]) + + +def _graph(model: BaseChatModel): + from deepagents import create_deep_agent + + return create_deep_agent(model=model, tools=[], system_prompt="be a friendly live agent") + + +# --- build_system_prompt ----------------------------------------------------- + + +class _NamedTool: + """A stand-in tool 
exposing just the ``.name`` the prompt builder inspects.""" + + def __init__(self, name: str): + self.name = name + + +def test_system_prompt_appends_tool_guidance_for_present_tools(): + prompt = brain.build_system_prompt( + "You are a pirate.", + tools=[_NamedTool("tavily_search"), _NamedTool("fetch_url"), _NamedTool("docs_search")], + ) + # The persona is preserved, and the guidance advertises each capability that a present + # tool backs (the plain cascade persona never mentions tools). + assert prompt.startswith("You are a pirate.") + assert "search the web" in prompt + assert "fetch a specific URL" in prompt + assert "AssemblyAI documentation" in prompt + + +def test_system_prompt_omits_web_search_when_no_search_tool(): + # With no TAVILY_API_KEY the search tool is absent — the guidance must NOT promise web + # search, since announcing a missing tool makes the agent narrate "I'll search…" and + # then stall with no answer. The capabilities it *does* have still appear. + prompt = brain.build_system_prompt( + "persona", tools=[_NamedTool("fetch_url"), _NamedTool("docs_search")] + ) + assert "search the web for current or unfamiliar facts" not in prompt + assert "fetch a specific URL" in prompt + assert "AssemblyAI documentation" in prompt + + +def test_system_prompt_tells_model_not_to_promise_tools_when_none(): + # No tools at all: the model must answer from its own knowledge and explicitly not + # promise to search or look anything up (the bug that left replies never coming back). + prompt = brain.build_system_prompt("persona", tools=[]) + assert "search the web for current or unfamiliar facts" not in prompt + assert "your own knowledge" in prompt + assert "Never say" in prompt + + +def test_join_clause_grammar(): + # One/two/three capability phrases each render with natural conjunctions. 
+ assert brain._join_clause(["a"]) == "a" + assert brain._join_clause(["a", "b"]) == "a and b" + assert brain._join_clause(["a", "b", "c"]) == "a, b, and c" + + +def test_web_search_tool_name_matches_built_tool(monkeypatch): + # The prompt builder detects search by WEB_SEARCH_TOOL_NAME, so pin it against the real + # tool's registered name — if langchain_tavily renames it, detection would silently break. + from aai_cli.code_agent import web_search + + monkeypatch.setenv(web_search.TAVILY_API_KEY_ENV, "tvly-x") + assert web_search.build_web_search_tool().name == web_search.WEB_SEARCH_TOOL_NAME + + +# --- build_completer (driving the real graph with a fake model) -------------- + + +def test_completer_returns_final_spoken_text(): + graph = _graph(FakeChatModel(responses=[AIMessage(content="Hello there.")])) + completer = brain.build_completer("k", CascadeConfig(), graph=graph) + reply = completer([{"role": "system", "content": "x"}, {"role": "user", "content": "hi"}]) + assert reply == "Hello there." + + +def test_completer_strips_system_message_before_invoking(): + # The cascade prepends its own system message each turn, but the graph already owns + # the system prompt — so the completer must drop it before invoking, leaving only the + # conversation. We capture what the graph received to prove the system line is gone. 
+ captured = {} + + class _CapturingGraph: + def invoke(self, value): + captured["messages"] = value["messages"] + return {"messages": [AIMessage(content="ok")]} + + completer = brain.build_completer("k", CascadeConfig(), graph=_CapturingGraph()) + completer([{"role": "system", "content": "persona"}, {"role": "user", "content": "hi"}]) + roles = [m["role"] for m in captured["messages"]] + assert roles == ["user"] + + +# --- _reply_text / _content_text --------------------------------------------- + + +def test_reply_text_skips_empty_ai_messages_and_takes_last_text(): + # Scanning from the end, a trailing empty AIMessage (a tool-call request with no + # spoken text) is skipped so the reply falls back to the prior AIMessage's text, + # rather than coming back blank. + result = { + "messages": [ + AIMessage(content="The answer is 42."), + AIMessage(content=""), + ] + } + assert brain._reply_text(result) == "The answer is 42." + + +def test_reply_text_joins_list_content_blocks(): + result = {"messages": [AIMessage(content=[{"type": "text", "text": "Hello "}, "world"])]} + assert brain._reply_text(result) == "Hello world" + + +def test_reply_text_skips_non_assistant_messages(): + from langchain_core.messages import ToolMessage + + # Scanning from the end, a trailing non-assistant message (e.g. a tool result) is + # skipped — the spoken reply is the AIMessage before it. + result = { + "messages": [ + AIMessage(content="hello there"), + ToolMessage(content="tool output", tool_call_id="c1"), + ] + } + assert brain._reply_text(result) == "hello there" + + +def test_content_text_coerces_unexpected_content(): + # A content that is neither a string nor a list of blocks (defensive fallback). 
+ assert brain._content_text(123) == "123" + + +def test_reply_text_is_empty_without_an_assistant_message(): + assert brain._reply_text({"messages": []}) == "" + assert brain._reply_text({}) == "" + + +# --- build_live_tools -------------------------------------------------------- + + +def test_build_live_tools_includes_search_when_keyed(monkeypatch): + search = object() + monkeypatch.setattr("aai_cli.code_agent.fetch_tool.build_fetch_tool", lambda: "fetch") + monkeypatch.setattr("aai_cli.code_agent.web_search.build_web_search_tool", lambda: search) + monkeypatch.setattr("aai_cli.code_agent.docs_mcp.load_docs_tools", lambda: ["docs"]) + tools = brain.build_live_tools() + # Fetch + the keyed search + the docs tools, in that order. + assert tools == ["fetch", search, "docs"] + + +def test_build_live_tools_omits_search_when_unkeyed(monkeypatch): + monkeypatch.setattr("aai_cli.code_agent.fetch_tool.build_fetch_tool", lambda: "fetch") + monkeypatch.setattr("aai_cli.code_agent.web_search.build_web_search_tool", lambda: None) + monkeypatch.setattr("aai_cli.code_agent.docs_mcp.load_docs_tools", list) + tools = brain.build_live_tools() + # No TAVILY_API_KEY -> no search tool, just the fetch tool. + assert tools == ["fetch"] + + +# --- build_graph (model construction + compile, with the docs probe skipped) - + + +def test_build_graph_uses_gateway_model_and_runs_offline(monkeypatch): + captured = {} + + def fake_build_model(api_key, *, model, max_tokens, extra): + captured["model"] = model + captured["max_tokens"] = max_tokens + captured["extra"] = dict(extra) + return FakeChatModel(responses=[AIMessage(content="hi from the agent")]) + + monkeypatch.setattr(model_mod, "build_model", fake_build_model) + cfg = CascadeConfig(model="claude-x", max_tokens=128, llm_extra={"temperature": 0.2}) + graph = brain.build_graph("k", cfg, tools=[]) + # The cascade's model + knobs are threaded into the gateway model build. 
+ assert captured == {"model": "claude-x", "max_tokens": 128, "extra": {"temperature": 0.2}} + # The compiled graph is a real deepagents graph that answers offline via the fake model. + completer = brain.build_completer("k", cfg, graph=graph) + assert completer([{"role": "user", "content": "hi"}]) == "hi from the agent" + + +# --- build_model new knobs --------------------------------------------------- + + +def test_build_model_threads_max_tokens_and_extra(): + model = model_mod.build_model("k", model="claude-x", max_tokens=222, extra={"top_k": 5}) + assert model.max_tokens == 222 + assert model.extra_body == {"top_k": 5} + + +def test_build_model_defaults_have_no_extra(): + model = model_mod.build_model("k", model="claude-x") + assert model.max_tokens is None + assert model.extra_body is None diff --git a/tests/test_agent_cascade_command.py b/tests/test_agent_cascade_command.py index 513dc1cc..93d25a4e 100644 --- a/tests/test_agent_cascade_command.py +++ b/tests/test_agent_cascade_command.py @@ -1,4 +1,4 @@ -"""Command + wiring tests for `assembly agent-cascade`. +"""Command + wiring tests for `assembly live`. Covers the argv -> options seam, the validation guards, _open_audio source selection, and CascadeDeps.real's three live legs (all driven against fakes). 
@@ -60,14 +60,14 @@ def _opts(**overrides) -> AgentCascadeOptions: def test_list_voices_human_lists_catalog(): - result = runner.invoke(app, ["agent-cascade", "--list-voices"]) + result = runner.invoke(app, ["live", "--list-voices"]) assert result.exit_code == 0 assert "jane" in result.output assert "English:" in result.output def test_list_voices_json_emits_array(): - result = runner.invoke(app, ["agent-cascade", "--list-voices", "--json"]) + result = runner.invoke(app, ["live", "--list-voices", "--json"]) assert result.exit_code == 0 assert result.output.lstrip().startswith("[") assert '"jane"' in result.output @@ -92,14 +92,14 @@ def test_missing_system_prompt_file_is_rejected_by_typer(): # so the sandbox guard (the other exit-2 path) never runs. Asserting the guard's # message is absent kills the exists=True mutant without depending on the Rich error # text, which CI renders with ANSI + width ellipsis. - result = runner.invoke(app, ["agent-cascade", "--system-prompt-file", "/no/such/file"]) + result = runner.invoke(app, ["live", "--system-prompt-file", "/no/such/file"]) assert result.exit_code == 2 assert "sandbox" not in result.output.lower() def test_production_env_is_rejected_with_sandbox_hint(): # Default env is production, which has no streaming-TTS host. - result = runner.invoke(app, ["agent-cascade", "--voice", "jane"]) + result = runner.invoke(app, ["live", "--voice", "jane"]) assert result.exit_code == 2 assert "only available in the sandbox" in result.output @@ -126,7 +126,7 @@ def fake_run(opts, state, *, json_mode): captured["opts"] = opts monkeypatch.setattr(_exec, "run_agent_cascade", fake_run) - result = runner.invoke(app, ["agent-cascade", *argv]) + result = runner.invoke(app, ["live", *argv]) assert result.exit_code == 0 assert captured["opts"].format_turns is expected @@ -137,7 +137,7 @@ def test_stt_config_file_must_exist(): # terminal so the "does not exist" message isn't wrapped by the 80-col error box. 
result = runner.invoke( app, - ["agent-cascade", "--stt-config-file", "/no/such/file.json"], + ["live", "--stt-config-file", "/no/such/file.json"], env={"COLUMNS": "300"}, ) assert result.exit_code == 2 @@ -418,36 +418,23 @@ def fake_stream_audio(api_key, source, *, params, on_turn): assert captured["params"] is params -def test_deps_real_complete_reply_threads_model_tokens_and_extra(monkeypatch): +def test_deps_real_complete_reply_is_built_by_the_deepagents_brain(monkeypatch): + # The LLM leg is now a deepagents graph: .real delegates to brain.build_completer, + # passing the api key + config, and uses whatever completer it returns. We assert the + # exact wiring so the brain swap (not a plain llm.complete) can't silently regress. captured = {} - def fake_complete(api_key, **kwargs): - captured.update(kwargs) - return "raw-response" + def fake_build_completer(api_key, config): + captured["api_key"] = api_key + captured["config"] = config + return lambda messages: f"reply to {messages[-1]['content']}" - monkeypatch.setattr(engine.llm, "complete", fake_complete) - monkeypatch.setattr(engine.llm, "content_of", lambda response: response.upper()) + monkeypatch.setattr(engine.brain, "build_completer", fake_build_completer) cfg = CascadeConfig(model="m", max_tokens=222, llm_extra={"temperature": 0.5}) deps = CascadeDeps.real("k", cfg, audio=[], stt_params=_stt_params()) - assert deps.complete_reply([{"role": "user", "content": "hi"}]) == "RAW-RESPONSE" - assert captured["model"] == "m" - assert captured["max_tokens"] == 222 - assert captured["extra"] == {"temperature": 0.5} - - -def test_deps_real_complete_reply_sends_no_extra_when_unset(monkeypatch): - captured = {} - - def fake_complete(api_key, **kwargs): - captured.update(kwargs) - return "x" - - monkeypatch.setattr(engine.llm, "complete", fake_complete) - monkeypatch.setattr(engine.llm, "content_of", lambda response: response) - deps = CascadeDeps.real("k", CascadeConfig(), audio=[], stt_params=_stt_params()) - 
deps.complete_reply([{"role": "user", "content": "hi"}]) - # Empty overrides collapse to None, not an empty dict, so the gateway sees no extra body. - assert captured["extra"] is None + assert deps.complete_reply([{"role": "user", "content": "hi"}]) == "reply to hi" + assert captured["api_key"] == "k" + assert captured["config"] is cfg def test_deps_real_synthesize_threads_voice_language_and_extra(monkeypatch): diff --git a/tests/test_agent_cascade_show_code.py b/tests/test_agent_cascade_show_code.py index d05b5874..97bbe0ff 100644 --- a/tests/test_agent_cascade_show_code.py +++ b/tests/test_agent_cascade_show_code.py @@ -1,4 +1,4 @@ -"""`assembly agent-cascade --show-code` tests. +"""`assembly live --show-code` tests. Split from test_agent_cascade_command.py (which holds the run-path wiring) so the print-only path's many invocations live in their own file. The cascade is @@ -33,7 +33,7 @@ def _boom(**kwargs): ) result = runner.invoke( app, - ["--sandbox", "agent-cascade", "--voice", "jane", "--greeting", "Hi there", "--show-code"], + ["--sandbox", "live", "--voice", "jane", "--greeting", "Hi there", "--show-code"], ) assert result.exit_code == 0 # Targets the sandbox the key was minted for — all three legs. 
@@ -54,25 +54,23 @@ def fake_run(opts, state, *, json_mode): captured["opts"] = opts monkeypatch.setattr(_exec, "run_agent_cascade", fake_run) - assert runner.invoke(app, ["agent-cascade"]).exit_code == 0 + assert runner.invoke(app, ["live"]).exit_code == 0 assert captured["opts"].show_code is False - assert runner.invoke(app, ["agent-cascade", "--show-code"]).exit_code == 0 + assert runner.invoke(app, ["live", "--show-code"]).exit_code == 0 assert captured["opts"].show_code is True def test_show_code_injects_speech_model(monkeypatch): monkeypatch.setattr(_exec.engine, "run_cascade", lambda **kw: None) - result = runner.invoke( - app, ["--sandbox", "agent-cascade", "--speech-model", "u3-rt-pro", "--show-code"] - ) + result = runner.invoke(app, ["--sandbox", "live", "--speech-model", "u3-rt-pro", "--show-code"]) assert result.exit_code == 0 assert "speech_model=u3-rt-pro" in result.stdout def test_show_code_reflects_no_format_turns(monkeypatch): monkeypatch.setattr(_exec.engine, "run_cascade", lambda **kw: None) - formatted = runner.invoke(app, ["--sandbox", "agent-cascade", "--show-code"]) - bare = runner.invoke(app, ["--sandbox", "agent-cascade", "--no-format-turns", "--show-code"]) + formatted = runner.invoke(app, ["--sandbox", "live", "--show-code"]) + bare = runner.invoke(app, ["--sandbox", "live", "--no-format-turns", "--show-code"]) # With formatting on the cue waits for the punctuated turn; off, a bare end-of-turn fires. 
assert "turn_is_formatted" in formatted.stdout assert "turn_is_formatted" not in bare.stdout @@ -83,7 +81,7 @@ def test_show_code_threads_model_and_max_tokens(monkeypatch): monkeypatch.setattr(_exec.engine, "run_cascade", lambda **kw: None) result = runner.invoke( app, - ["--sandbox", "agent-cascade", "--model", "claude-x", "--max-tokens", "321", "--show-code"], + ["--sandbox", "live", "--model", "claude-x", "--max-tokens", "321", "--show-code"], ) assert result.exit_code == 0 assert "claude-x" in result.stdout @@ -95,7 +93,7 @@ def test_show_code_file_source_warns_on_stderr(monkeypatch): monkeypatch.setattr( _exec.engine, "run_cascade", lambda **kw: (_ for _ in ()).throw(AssertionError("no run")) ) - result = runner.invoke(app, ["--sandbox", "agent-cascade", "clip.wav", "--show-code"]) + result = runner.invoke(app, ["--sandbox", "live", "clip.wav", "--show-code"]) assert result.exit_code == 0 assert "uses the microphone" in result.stderr assert "uses the microphone" not in result.stdout # stdout stays a clean script @@ -104,13 +102,13 @@ def test_show_code_file_source_warns_on_stderr(monkeypatch): def test_show_code_mic_emits_no_warning(monkeypatch): monkeypatch.setattr(_exec.engine, "run_cascade", lambda **kw: None) - result = runner.invoke(app, ["--sandbox", "agent-cascade", "--show-code"]) + result = runner.invoke(app, ["--sandbox", "live", "--show-code"]) assert result.exit_code == 0 assert "uses the microphone" not in result.stderr # mic script matches the run, nothing to warn def test_show_code_in_production_is_rejected_with_sandbox_hint(): # --show-code still honors the sandbox-only guard, so the generated URLs are valid. 
- result = runner.invoke(app, ["agent-cascade", "--show-code"]) + result = runner.invoke(app, ["live", "--show-code"]) assert result.exit_code == 2 assert "only available in the sandbox" in result.output diff --git a/tests/test_code_agent.py b/tests/test_code_agent.py index 76af37af..0e5d17c5 100644 --- a/tests/test_code_agent.py +++ b/tests/test_code_agent.py @@ -344,49 +344,6 @@ def invoke(self, *a, **k): assert any(isinstance(e, ErrorText) and "gateway 500" in e.text for e in seen) -class StreamingAgent: - """A double exercising the streaming path: yields scripted state snapshots.""" - - def __init__(self, chunks: list[dict[str, object]]) -> None: - self._chunks = chunks - - def stream(self, graph_input, config=None, *, stream_mode="values"): - del graph_input, config, stream_mode - yield from self._chunks - - def invoke(self, *a, **k): # the streaming branch is taken, so invoke is never used - raise AssertionError("a streaming agent must not be invoked") - - -def test_send_streams_each_step_and_cancel_stops_the_loop() -> None: - from langchain_core.messages import HumanMessage - - # Three successive graph states (messages grow by one each step); a stream_mode="values" - # graph yields exactly these snapshots, so the session must emit incrementally. 
- chunks: list[dict[str, object]] = [ - {"messages": [HumanMessage("go")]}, - {"messages": [HumanMessage("go"), AIMessage("first")]}, - {"messages": [HumanMessage("go"), AIMessage("first"), AIMessage("second")]}, - ] - seen: list[object] = [] - session = CodeSession( - agent=StreamingAgent(chunks), sink=seen.append, approver=lambda n, a: True - ) - - def sink(event: object) -> None: - seen.append(event) - if isinstance(event, AssistantText) and event.text == "first": - session.request_cancel() # cancel mid-stream, before the "second" chunk is consumed - - session.sink = sink - session.send("go") - - texts = [e.text for e in seen if isinstance(e, AssistantText)] - # "first" streamed out as its step landed; the cancel then broke the loop, so the later - # "second" step was never emitted — proving both incremental rendering and cancellation. - assert texts == ["first"] - - def test_session_propagates_keyboard_interrupt() -> None: class Stop: def invoke(self, *a, **k): diff --git a/tests/test_code_messages.py b/tests/test_code_messages.py new file mode 100644 index 00000000..9a1168d4 --- /dev/null +++ b/tests/test_code_messages.py @@ -0,0 +1,149 @@ +"""Tests for the mounted-widget transcript of the `assembly code` TUI. + +Drives the real Textual app (headless) and asserts on the mounted message widgets: the reply +streams into one AssistantMessage in place and renders as Markdown, and a long tool result is +a collapsible ToolOutput (Ctrl-O / click). Split from test_code_tui.py to stay under the +file-length gate. 
+""" + +from __future__ import annotations + +import asyncio + +from aai_cli.code_agent.events import AssistantDelta, AssistantText, ToolResult +from aai_cli.code_agent.messages import AssistantMessage, ToolOutput +from aai_cli.code_agent.tui import CodeAgentApp + + +class FakeAgent: + """Replays scripted invoke() results so a turn can complete without a model.""" + + def __init__(self, results: list[dict[str, object]]) -> None: + self._results = results + self.calls = 0 + + def invoke(self, *args, **kwargs): + result = self._results[self.calls] + self.calls += 1 + return result + + +def _run(coro) -> None: + asyncio.run(coro) + + +def test_assistant_reply_renders_as_markdown_widget() -> None: + # The reply mounts an AssistantMessage rendered as Markdown — the fence markers are + # consumed and the code shows; the raw text is kept for clipboard copy. + async def go() -> None: + app = CodeAgentApp(agent=FakeAgent([])) + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + reply = "Here you go:\n\n```python\nprint('hi')\n```" + app._write_event(AssistantText(reply)) + await pilot.pause() + msg = app.query_one(AssistantMessage) + text = "\n".join(msg.render_line(y).text for y in range(msg.size.height)) + assert "```" not in text # markdown consumed the fence markers + assert "print('hi')" in text # the code itself renders + assert app._last_reply == reply # raw markdown kept for clipboard copy + + _run(go()) + + +def test_assistant_deltas_stream_in_place_then_finalize() -> None: + # Tokens stream into a single AssistantMessage in place (no separate region); the final + # AssistantText finalizes that same widget rather than mounting a second one. 
+ async def go() -> None: + app = CodeAgentApp(agent=FakeAgent([])) + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + app._write_event(AssistantDelta("Hello, ")) + app._write_event(AssistantDelta("world!")) + await pilot.pause() + assert len(app.query(AssistantMessage)) == 1 # one widget, updated in place + assert app.query_one(AssistantMessage).text == "Hello, world!" + streaming = app._streaming_msg # local: asserting on the attr would poison the + assert streaming is not None # later `is None` check (mypy can't see the reset) + app._write_event(AssistantText("Hello, world!")) + await pilot.pause() + assert app._streaming_msg is None # finalized + assert app._last_reply == "Hello, world!" + assert len(app.query(AssistantMessage)) == 1 # finalized in place, not a 2nd widget + + _run(go()) + + +def test_finish_turn_finalizes_a_dangling_streamed_reply() -> None: + # A turn cancelled mid-generation leaves a streamed-but-unfinalized reply; finishing the + # turn commits what streamed in (so it isn't lost) and clears the streaming reference. + async def go() -> None: + app = CodeAgentApp(agent=FakeAgent([])) + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + app._write_event(AssistantDelta("partial repl")) + await pilot.pause() + streaming = app._streaming_msg # local so the later `is None` check stays reachable + assert streaming is not None + app._finish_turn() + assert app._streaming_msg is None # finalized, not left dangling + assert app.query_one(AssistantMessage).text == "partial repl" # kept what streamed + + _run(go()) + + +def test_short_tool_output_is_not_expandable() -> None: + # Output that already fits has no expand affordance and Ctrl-O is a no-op on it. 
+ async def go() -> None: + app = CodeAgentApp(agent=FakeAgent([])) + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + app._write_event(ToolResult(name="execute", content="ok")) + await pilot.pause() + out = app.query_one(ToolOutput) + before = str(out.render()) + assert "Ctrl+O" not in before # nothing to expand -> no hint + out.toggle() + assert str(out.render()) == before # toggle is a no-op when it all fits + + _run(go()) + + +def test_tool_output_toggles_on_click_and_ctrl_o_is_safe_with_no_output() -> None: + async def go() -> None: + app = CodeAgentApp(agent=FakeAgent([])) + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + app.action_toggle_output() # no tool output yet -> safe no-op + app._write_event( + ToolResult(name="execute", content="\n".join(f"x{i}" for i in range(20))) + ) + await pilot.pause() + out = app.query_one(ToolOutput) + assert "x19" not in str(out.render()) + out.on_click() # clicking expands + assert "x19" in str(out.render()) + + _run(go()) + + +def test_tool_output_expands_and_collapses_on_ctrl_o() -> None: + # A long tool result mounts a collapsed ToolOutput (preview + "more lines"); Ctrl-O + # expands it to the full content and toggles back. 
+ async def go() -> None: + app = CodeAgentApp(agent=FakeAgent([])) + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + app._write_event( + ToolResult(name="execute", content="\n".join(f"ln{i}" for i in range(20))) + ) + await pilot.pause() + out = app.query_one(ToolOutput) + collapsed = str(out.render()) + assert "ln0" in collapsed and "more lines" in collapsed and "ln19" not in collapsed + app.action_toggle_output() # Ctrl-O expands the most recent output + assert "ln19" in str(out.render()) # full content now shown + app.action_toggle_output() # toggles back to the preview + assert "ln19" not in str(out.render()) + + _run(go()) diff --git a/tests/test_code_modals.py b/tests/test_code_modals.py new file mode 100644 index 00000000..5ad276b8 --- /dev/null +++ b/tests/test_code_modals.py @@ -0,0 +1,236 @@ +"""Tests for the spoken/voice-answerable approval and ask modals. + +The pure ``approval_from_speech`` mapping is unit-tested directly; the screen wiring (speak the +prompt, listen, dismiss with the mapped decision) is driven through the real app headless with +a scripted voice double — no mic, speaker, or socket. 
+""" + +from __future__ import annotations + +import asyncio + +import pytest +from textual.widgets import Input + +from aai_cli.code_agent.modals import ApprovalScreen, AskScreen, approval_from_speech +from aai_cli.code_agent.tui import CodeAgentApp +from aai_cli.core.errors import CLIError + + +class FakeAgent: + def invoke(self, *a, **k): + return {} + + +class FakeVoice: + """Scripted voice IO: speak() records, listen() replays one transcript (or raises).""" + + def __init__(self, transcript: str | None = None, *, error: CLIError | None = None) -> None: + self._transcript = transcript + self._error = error + self.spoken: list[str] = [] + + def speak(self, text: str) -> None: + self.spoken.append(text) + + def listen(self) -> str | None: + if self._error is not None: + raise self._error + return self._transcript + + +def _run(coro) -> None: + asyncio.run(coro) + + +@pytest.mark.parametrize( + ("said", "decision"), + [ + ("yes please", "approve"), + ("approve that", "approve"), + ("go ahead", "approve"), + ("auto approve", "auto"), + ("always do this", "auto"), + ("no", "reject"), + ("reject it", "reject"), + ("don't", "reject"), + ("yes but no", "reject"), # reject wins over approve when both are heard (safer) + ("uhh what", "reject"), # unclear -> safe default + ], +) +def test_approval_from_speech(said: str, decision: str) -> None: + assert approval_from_speech(said) == decision + + +async def _push_and_wait(app, pilot, screen) -> object: + box: dict[str, object] = {} + app.push_screen(screen, lambda result: box.update(value=result)) + for _ in range(300): + await pilot.pause(0.01) + if "value" in box: + break + return box.get("value", "__pending__") + + +def test_spoken_approval_speaks_prompt_and_maps_answer() -> None: + async def go() -> None: + app = CodeAgentApp(agent=FakeAgent()) + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + voice = FakeVoice(transcript="yes go for it") + result = await _push_and_wait( + app, pilot, 
ApprovalScreen("execute", {"command": "rm -rf build"}, voice=voice) + ) + assert result == "approve" # spoken "yes" mapped to approve + prompt = voice.spoken[0] + assert "Run execute" in prompt and "rm -rf build" in prompt + assert "Warning:" in prompt # the risky command is read aloud + assert "approve, auto-approve, or reject" in prompt + + _run(go()) + + +def test_spoken_approval_rejects_on_no() -> None: + async def go() -> None: + app = CodeAgentApp(agent=FakeAgent()) + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + result = await _push_and_wait( + app, pilot, ApprovalScreen("write_file", {"file_path": "x"}, voice=FakeVoice("no")) + ) + assert result == "reject" + + _run(go()) + + +def test_spoken_ask_speaks_question_and_returns_transcript() -> None: + async def go() -> None: + app = CodeAgentApp(agent=FakeAgent()) + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + voice = FakeVoice(transcript="use port 8080") + result = await _push_and_wait(app, pilot, AskScreen("Which port?", voice=voice)) + assert result == "use port 8080" # spoken answer returned verbatim + assert "The agent asks: Which port?" in voice.spoken[0] + + _run(go()) + + +def test_silence_does_not_auto_reject() -> None: + # No speech (listen -> None) must not auto-decide — the modal waits for speech or a keypress + # rather than rejecting a tool on a pause. + async def go() -> None: + app = CodeAgentApp(agent=FakeAgent()) + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + box: dict[str, object] = {} + app.push_screen( + ApprovalScreen("execute", {"command": "ls"}, voice=FakeVoice(None)), + lambda result: box.update(value=result), + ) + for _ in range(50): + await pilot.pause(0.01) + assert "value" not in box # silence -> not dismissed + + _run(go()) + + +def test_voice_failure_falls_back_to_keyboard() -> None: + # If the mic/STT fails, the modal isn't auto-dismissed — the user can still press a key. 
+ async def go() -> None: + app = CodeAgentApp(agent=FakeAgent()) + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + voice = FakeVoice(error=CLIError("no mic", error_type="mic_missing", exit_code=2)) + box: dict[str, object] = {} + app.push_screen( + ApprovalScreen("execute", {"command": "ls"}, voice=voice), + lambda result: box.update(value=result), + ) + for _ in range(50): + await pilot.pause(0.01) + assert "value" not in box # voice failed -> not auto-dismissed + await pilot.press("n") # keyboard still works + await pilot.pause() + assert box.get("value") == "reject" + + _run(go()) + + +def test_ask_voice_failure_falls_back_to_typing() -> None: + # An ask modal whose voice fails isn't dismissed; the user types the answer instead. + async def go() -> None: + app = CodeAgentApp(agent=FakeAgent()) + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + voice = FakeVoice(error=CLIError("no mic", error_type="mic_missing", exit_code=2)) + box: dict[str, object] = {} + app.push_screen(AskScreen("Which port?", voice=voice), lambda r: box.update(value=r)) + for _ in range(50): + await pilot.pause(0.01) + assert "value" not in box # voice failed -> not auto-dismissed + app.screen.query_one("#answer", Input).value = "8080" + await pilot.press("enter") + await pilot.pause() + assert box.get("value") == "8080" + + _run(go()) + + +def test_spoken_prompt_omits_detail_when_no_args() -> None: + # A tool with no identifying arg reads as just "Run . Say approve…" (no detail clause). + async def go() -> None: + app = CodeAgentApp(agent=FakeAgent()) + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + voice = FakeVoice(transcript="yes") + result = await _push_and_wait(app, pilot, ApprovalScreen("noop", {}, voice=voice)) + assert result == "approve" + assert "Run noop. 
Say approve" in voice.spoken[0] # straight to the options + + _run(go()) + + +def test_ask_silence_does_not_dismiss() -> None: + # No spoken answer (listen -> None) leaves the ask modal up for typing. + async def go() -> None: + app = CodeAgentApp(agent=FakeAgent()) + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + box: dict[str, object] = {} + app.push_screen(AskScreen("Q?", voice=FakeVoice(None)), lambda r: box.update(value=r)) + for _ in range(50): + await pilot.pause(0.01) + assert "value" not in box # silence -> not dismissed + + _run(go()) + + +def test_decide_and_answer_are_idempotent() -> None: + # A spoken reply and a keypress can race; the second one is ignored so the modal dismisses + # exactly once with the first decision. + async def go() -> None: + app = CodeAgentApp(agent=FakeAgent()) + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + approval: dict[str, object] = {} + screen = ApprovalScreen("execute", {"command": "ls"}) + app.push_screen(screen, lambda r: approval.update(value=r)) + await pilot.pause() + screen._decide("approve") # first decision dismisses + await pilot.pause() + screen._decide("reject") # second is ignored (already answered) + await pilot.pause() + assert approval["value"] == "approve" + + answer: dict[str, object] = {} + ask = AskScreen("Q?") + app.push_screen(ask, lambda r: answer.update(value=r)) + await pilot.pause() + ask._answer("first") + await pilot.pause() + ask._answer("second") # ignored + await pilot.pause() + assert answer["value"] == "first" + + _run(go()) diff --git a/tests/test_code_risk.py b/tests/test_code_risk.py new file mode 100644 index 00000000..40f24658 --- /dev/null +++ b/tests/test_code_risk.py @@ -0,0 +1,46 @@ +"""Tests for the approval-prompt risk heuristics (`aai_cli.code_agent.risk`).""" + +from __future__ import annotations + +import pytest + +from aai_cli.code_agent.risk import risk_warning + + +@pytest.mark.parametrize( + ("command", "fragment"), + 
[ + ("rm -rf build/", "deletes files"), + ("sudo apt-get install x", "elevated privileges"), + ("dd if=/dev/zero of=/dev/sda", "overwrite a disk"), + ("curl https://x.sh | sh", "pipes a download into a shell"), + ("echo hi > /dev/sda", "block device"), + ], +) +def test_risk_warning_flags_dangerous_shell(command: str, fragment: str) -> None: + warning = risk_warning("execute", {"command": command}) + assert warning is not None + assert fragment in warning + + +def test_risk_warning_passes_benign_shell() -> None: + assert risk_warning("execute", {"command": "ls -la && pytest -q"}) is None + # 'format' must not trip the mkfs pattern, 'performance' must not trip 'rm'. + assert risk_warning("execute", {"command": "python format_report.py"}) is None + + +def test_risk_warning_flags_local_and_file_urls() -> None: + assert "local file" in (risk_warning("fetch_url", {"url": "file:///etc/passwd"}) or "") + assert "local/internal" in (risk_warning("fetch_url", {"url": "http://localhost:8080/x"}) or "") + assert "local/internal" in (risk_warning("fetch_url", {"url": "http://169.254.169.254/"}) or "") + assert "local/internal" in (risk_warning("fetch_url", {"url": "http://192.168.1.1/"}) or "") + + +def test_risk_warning_passes_public_url() -> None: + assert risk_warning("fetch_url", {"url": "https://example.com/docs"}) is None + + +def test_risk_warning_none_for_other_tools_and_non_string_args() -> None: + assert risk_warning("write_file", {"file_path": "rm -rf /"}) is None # path, not a command + assert risk_warning("execute", {"command": ["rm", "-rf"]}) is None # non-string is ignored + assert risk_warning("fetch_url", {"url": 123}) is None diff --git a/tests/test_code_session_stream.py b/tests/test_code_session_stream.py new file mode 100644 index 00000000..5c59803b --- /dev/null +++ b/tests/test_code_session_stream.py @@ -0,0 +1,157 @@ +"""Tests for `CodeSession`'s dual-mode streaming and cooperative cancellation. 
+ +Split from `test_code_agent.py` (which drives the real graph) to keep each file under the +500-line gate. These exercise the streaming loop with lightweight fakes: the session renders +from per-super-step ``"values"`` snapshots and checks the cancel flag on the frequent +per-token ``"messages"`` deltas, so a long generation can be interrupted promptly. +""" + +from __future__ import annotations + +from langchain_core.messages import AIMessage, HumanMessage, ToolMessage + +from aai_cli.code_agent.events import AssistantDelta, AssistantText, assistant_delta +from aai_cli.code_agent.session import CodeSession + + +class StreamingAgent: + """A double exercising the dual-mode streaming path. + + Mirrors langgraph's ``stream_mode=["values", "messages"]`` contract: each scripted state + snapshot is yielded tagged as ``("values", snapshot)``, optionally preceded by + ``("messages", delta)`` per-token deltas (the fine-grained cancellation checkpoints). + """ + + def __init__( + self, chunks: list[dict[str, object]], *, token_deltas: tuple[str, ...] = () + ) -> None: + self._chunks = chunks + self._token_deltas = token_deltas + + def stream(self, graph_input, config=None, *, stream_mode=("values", "messages")): + del graph_input, config, stream_mode + for delta in self._token_deltas: + yield ("messages", delta) + for chunk in self._chunks: + yield ("values", chunk) + + def invoke(self, *a, **k): # the streaming branch is taken, so invoke is never used + raise AssertionError("a streaming agent must not be invoked") + + +def test_assistant_delta_is_frozen_hashable() -> None: + # frozen=True makes it immutable+hashable; a non-frozen eq dataclass sets __hash__=None, + # so hash() would raise — this keeps the event safe to dedupe/compare and pins `frozen`. + assert hash(AssistantDelta("x")) == hash(AssistantDelta("x")) + + +def test_assistant_delta_extracts_only_ai_text() -> None: + # messages-mode yields (message, metadata); only AI text becomes a delta. 
+ assert assistant_delta((AIMessage("tok"), {"node": "agent"})) == AssistantDelta("tok") + assert assistant_delta(AIMessage("bare")) == AssistantDelta("bare") # untupled is fine too + assert assistant_delta((AIMessage(""), {})) is None # empty content (e.g. a tool-call turn) + assert assistant_delta((ToolMessage("result", tool_call_id="1"), {})) is None # not assistant + assert assistant_delta(()) is None # defensive: empty payload + + +def test_send_emits_assistant_deltas_from_messages_stream() -> None: + # The per-token messages chunks are surfaced as AssistantDelta (live preview), and the + # values snapshot still yields the authoritative AssistantText. + seen: list[object] = [] + + class TokenAgent: + def stream(self, graph_input, config=None, *, stream_mode=("values", "messages")): + del graph_input, config, stream_mode + yield ("messages", (AIMessage("Hello, "), {})) + yield ("messages", (AIMessage("world"), {})) + yield ("values", {"messages": [AIMessage("Hello, world")]}) + + def invoke(self, *a, **k): + raise AssertionError("a streaming agent must not be invoked") + + session = CodeSession(agent=TokenAgent(), sink=seen.append, approver=lambda n, a: True) + session.send("go") + + deltas = [e.text for e in seen if isinstance(e, AssistantDelta)] + finals = [e.text for e in seen if isinstance(e, AssistantText)] + assert deltas == ["Hello, ", "world"] # streamed tokens + assert finals == ["Hello, world"] # authoritative full reply from the values snapshot + + +def test_send_streams_each_step_and_cancel_stops_the_loop() -> None: + # Three successive graph states (messages grow by one each step); a stream_mode="values" + # graph yields exactly these snapshots, so the session must emit incrementally. 
+ chunks: list[dict[str, object]] = [ + {"messages": [HumanMessage("go")]}, + {"messages": [HumanMessage("go"), AIMessage("first")]}, + {"messages": [HumanMessage("go"), AIMessage("first"), AIMessage("second")]}, + ] + seen: list[object] = [] + session = CodeSession( + agent=StreamingAgent(chunks), sink=seen.append, approver=lambda n, a: True + ) + + def sink(event: object) -> None: + seen.append(event) + if isinstance(event, AssistantText) and event.text == "first": + session.request_cancel() # cancel mid-stream, before the "second" chunk is consumed + + session.sink = sink + session.send("go") + + texts = [e.text for e in seen if isinstance(e, AssistantText)] + # "first" streamed out as its step landed; the cancel then broke the loop, so the later + # "second" step was never emitted — proving both incremental rendering and cancellation. + assert texts == ["first"] + + +def test_cancel_within_a_step_breaks_on_a_token_delta() -> None: + # A single model generation is one super-step, so a values-only loop can't break until the + # whole reply lands. Streaming the per-token "messages" deltas alongside gives a frequent + # cancel checkpoint: a Ctrl-C mid-generation breaks before the reply ("late") is ever + # rendered. Modeled by an agent that requests cancel between two token deltas. 
+ seen: list[object] = [] + + class TokenStreamAgent: + session: CodeSession + + def stream(self, graph_input, config=None, *, stream_mode=("values", "messages")): + del graph_input, config, stream_mode + yield ("messages", "par") # first token arrives — loop sees no cancel yet + self.session.request_cancel() # user hits Ctrl-C mid-generation + yield ("messages", "tial") # next token: the loop's top-of-iteration check breaks + yield ("values", {"messages": [AIMessage("late")]}) # must never be rendered + + def invoke(self, *a, **k): + raise AssertionError("a streaming agent must not be invoked") + + agent = TokenStreamAgent() + session = CodeSession(agent=agent, sink=seen.append, approver=lambda n, a: True) + agent.session = session + session.send("go") + + texts = [e.text for e in seen if isinstance(e, AssistantText)] + assert texts == [] # the post-cancel "late" reply was dropped, not rendered + + +def test_only_values_chunks_are_rendered_not_messages_deltas() -> None: + # The dual-mode stream tags each yield by mode; only "values" snapshots are rendered (the + # "messages" deltas exist purely as cancel checkpoints). A messages delta that happens to + # be a dict must NOT be emitted — guards the `mode == "values" and ...` guard against an + # `and`->`or` slip that would render it. 
+ seen: list[object] = [] + + class DualModeAgent: + def stream(self, graph_input, config=None, *, stream_mode=("values", "messages")): + del graph_input, config, stream_mode + yield ("messages", {"messages": [AIMessage("ghost")]}) # dict, but messages-mode + yield ("values", {"messages": [AIMessage("real")]}) + + def invoke(self, *a, **k): + raise AssertionError("a streaming agent must not be invoked") + + session = CodeSession(agent=DualModeAgent(), sink=seen.append, approver=lambda n, a: True) + session.send("go") + + texts = [e.text for e in seen if isinstance(e, AssistantText)] + assert texts == ["real"] # the messages-mode dict ("ghost") was not rendered diff --git a/tests/test_code_summarize.py b/tests/test_code_summarize.py new file mode 100644 index 00000000..ebf0eb24 --- /dev/null +++ b/tests/test_code_summarize.py @@ -0,0 +1,93 @@ +"""Tests for the shared tool-activity summarizers (`aai_cli.code_agent.summarize`). + +These keep the coding-agent transcript scannable: a tool call shows its identifying arg +(not the whole file being written), and tool output is previewed with a hidden-line tail. +""" + +from __future__ import annotations + +from aai_cli.code_agent.summarize import ( + describe_args, + full_args, + summarize_call, + summarize_result, +) + + +def test_describe_args_prefers_identity_arg_and_elides_bulk() -> None: + # write_file's content is the bulk we must NOT inline — only the path identifies the call. + body = "\n".join(f"line {i}" for i in range(50)) + assert describe_args({"file_path": "app.py", "content": body}) == "app.py" + # A shell command is the identity arg for execute. 
+ assert describe_args({"command": "pip install flask"}) == "pip install flask" + + +def test_describe_args_clips_long_identity_value() -> None: + out = describe_args({"command": "echo " + "x" * 200}) + assert out.endswith("…") + assert len(out) == 60 # exact: clipped to the per-arg budget, ellipsis included + + +def test_describe_args_without_identity_shows_capped_key_values() -> None: + out = describe_args({"a": 1, "b": 2, "c": 3, "d": 4}) + # Only the first few args render, then an ellipsis marks the elided remainder. + assert out.startswith("a=1, b=2, c=3") + assert out.endswith(", …") + assert "d=4" not in out + + +def test_describe_args_collapses_newlines_in_values() -> None: + # A newline-bearing value must not break the one-line transcript entry. + assert "\n" not in describe_args({"x": "a\nb\nc"}) + + +def test_summarize_call_wraps_args_in_tool_name() -> None: + assert ( + summarize_call("write_file", {"file_path": "app.py", "content": "x"}) + == "write_file(app.py)" + ) + + +def test_summarize_result_previews_and_counts_hidden_lines() -> None: + out = summarize_result("\n".join(f"line {i}" for i in range(20))) + assert "line 0" in out and "line 3" in out + assert "line 4" not in out # only the first few lines are kept + assert "+16 more lines" in out # the rest are counted, not dropped silently + + +def test_summarize_result_shows_short_output_in_full() -> None: + assert summarize_result("done\n") == "done" # no tail when nothing is hidden + assert summarize_result(" ") == "" # whitespace-only collapses to empty + + +def test_full_args_shows_every_arg_whole_with_newlines() -> None: + # The expanded view keeps content (and its newlines) that describe_args elides. 
+ out = full_args({"file_path": "app.py", "content": "a\nb\nc"}) + assert "file_path=app.py" in out + assert "content=a\nb\nc" in out # full value, newlines preserved + + +def test_full_args_caps_a_huge_value_with_char_count() -> None: + out = full_args({"content": "z" * 1500}) # over the 1000-char expanded budget + assert "+500 more chars" in out # exact: 1500 minus the 1000 budget + assert out.startswith("content=" + "z" * 1000) + + +def test_full_args_shows_a_value_at_the_budget_whole() -> None: + # Boundary: exactly the budget is shown whole (guards the cap's `>` against a `>=` slip). + out = full_args({"content": "z" * 1000}) + assert "more chars" not in out + assert out == "content=" + "z" * 1000 + + +def test_summarize_result_counts_a_single_hidden_line() -> None: + # Boundary: exactly one line over the preview budget still gets a tail (guards the + # `hidden_lines > 0` threshold against a `> 1` slip that would silently drop it). + out = summarize_result("\n".join(f"line {i}" for i in range(5))) # 4 shown, 1 hidden + assert out.endswith("(+1 more lines)") + + +def test_summarize_result_clips_one_huge_line_with_char_count() -> None: + out = summarize_result("z" * 500) # a single line longer than the char budget + assert "+200 more chars" in out # exact: 500 minus the 300-char budget = 200 hidden + assert out.startswith("z" * 300) diff --git a/tests/test_code_tui.py b/tests/test_code_tui.py index 8abeee08..b536dec1 100644 --- a/tests/test_code_tui.py +++ b/tests/test_code_tui.py @@ -10,15 +10,15 @@ import asyncio import threading import time -from pathlib import Path import pytest from langchain_core.messages import AIMessage, HumanMessage -from textual.widgets import Input, Label, RichLog, Static +from textual.containers import VerticalScroll +from textual.widgets import Input, Label, Static -from aai_cli.code_agent import tui from aai_cli.code_agent.events import AssistantText, ErrorText, ToolCall, ToolResult -from aai_cli.code_agent.tui import 
ApprovalScreen, AskScreen, CodeAgentApp +from aai_cli.code_agent.modals import ApprovalScreen, AskScreen +from aai_cli.code_agent.tui import CodeAgentApp class FakeAgent: @@ -39,31 +39,6 @@ def __init__(self, value: dict[str, object]) -> None: self.value = value -# --- pure helpers ------------------------------------------------------------- - - -def test_format_args_and_abbrev_home() -> None: - assert tui._format_args({"a": 1, "b": "x"}) == "a=1, b='x'" - assert tui._abbrev_home(Path.home() / "proj") == "~/proj" - # A path outside home renders as-is; compare to the platform-native string so this - # holds on Windows (where str(Path(...)) uses backslashes) as well as POSIX. - outside = Path("/etc/hosts") - assert tui._abbrev_home(outside) == str(outside) - - -def test_git_branch_and_status(tmp_path: Path) -> None: - assert tui._git_branch(tmp_path) is None # no .git - (tmp_path / ".git").mkdir() - (tmp_path / ".git" / "HEAD").write_text("ref: refs/heads/feature-x\n") - assert tui._git_branch(tmp_path) == "feature-x" - (tmp_path / ".git" / "HEAD").write_text("a1b2c3d4e5f6\n") # detached - assert tui._git_branch(tmp_path) == "a1b2c3d4" - - status = tui._status_text(tmp_path, auto_approve=True) - assert "auto" in status and "a1b2c3d4" in status - assert "manual" in tui._status_text(tmp_path, auto_approve=False) - - # --- pilot tests -------------------------------------------------------------- @@ -76,8 +51,9 @@ async def go() -> None: app = CodeAgentApp(agent=FakeAgent([]), web_note="no key", thread_id="t1") async with app.run_test(size=(100, 30)) as pilot: await pilot.pause() - log = app.query_one("#log", RichLog) - assert len(log.lines) > 6 # wordmark + tagline + log = app.query_one("#log", VerticalScroll) + assert len(log.children) >= 1 # the splash is mounted into the transcript + assert "Ready to code" in str(log.children[0].render()) # splash intro shown assert app.focused is app.query_one("#prompt", Input) _run(go()) @@ -216,6 +192,50 @@ async def go() -> 
None: _run(go()) +def test_approval_expands_args_on_e() -> None: + # Collapsed, the prompt shows only the identifying arg (the filename); pressing `e` + # expands it to the full args, revealing the file content that was elided. + async def go() -> None: + app = CodeAgentApp(agent=FakeAgent([])) + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + app.push_screen( + ApprovalScreen("write_file", {"file_path": "x.py", "content": "SECRET"}) + ) + await pilot.pause() + detail = app.screen.query_one("#approvaldetail", Label) + assert "SECRET" not in str(detail.render()) # collapsed: content elided + await pilot.press("e") + await pilot.pause() + assert "SECRET" in str(detail.render()) # expanded: full args shown + await pilot.press("e") # toggles back + await pilot.pause() + assert "SECRET" not in str(detail.render()) + + _run(go()) + + +def test_approval_shows_risk_warning_for_dangerous_command() -> None: + # A destructive shell command carries a one-line warning above the prompt; a benign one + # mounts no warning label at all. + async def go() -> None: + app = CodeAgentApp(agent=FakeAgent([])) + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + app.push_screen(ApprovalScreen("execute", {"command": "rm -rf build/"})) + await pilot.pause() + warn = app.screen.query("#approvalwarn") + assert warn # warning present + assert "deletes files" in str(warn.first().render()) + app.pop_screen() + await pilot.pause() + app.push_screen(ApprovalScreen("execute", {"command": "ls -la"})) + await pilot.pause() + assert not app.screen.query("#approvalwarn") # benign: no warning mounted + + _run(go()) + + def test_approval_box_is_compact_and_bottom_docked() -> None: # Regression guard: the approval prompt must not take over the whole screen — it # docks a short box at the bottom so the transcript stays visible above it. 
@@ -233,6 +253,28 @@ async def go() -> None: _run(go()) +def test_modals_are_transparent_so_transcript_stays_visible() -> None: + # Regression guard: the app's `Screen { background: #000000 }` canvas rule matches every + # Screen subclass, and app CSS beats a widget's DEFAULT_CSS — so without the explicit + # `ModalScreen { background: transparent }` app rule, the modal paints opaque black and + # blanks the transcript behind it. Assert each modal resolves to a see-through background + # (alpha 0); an opaque modal (alpha 1.0) — the bug — fails here. + async def go() -> None: + app = CodeAgentApp(agent=FakeAgent([])) + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + app.push_screen(ApprovalScreen("write_file", {"file_path": "x.py"})) + await pilot.pause() + assert app.screen.styles.background.a == 0 # approval modal is see-through + app.pop_screen() + await pilot.pause() + app.push_screen(AskScreen("which port?")) + await pilot.pause() + assert app.screen.styles.background.a == 0 # ask modal is see-through + + _run(go()) + + def test_approval_auto_approve_flips_mode_and_skips_later_prompts() -> None: # Picking "Auto-approve (a)" approves this call, flips the badge manual→auto, and # makes every later _approve return True without ever pushing a modal. @@ -335,11 +377,6 @@ async def go() -> None: _run(go()) -def test_spinner_text_formats_frame_and_elapsed() -> None: - assert tui._spinner_text(46, "✶") == "✶ Working… (46s)" - assert tui._spinner_text(0, "✷") == "✷ Working… (0s)" - - def test_spinner_starts_ticks_and_stops(monkeypatch: pytest.MonkeyPatch) -> None: async def go() -> None: app = CodeAgentApp(agent=FakeAgent([])) @@ -352,6 +389,12 @@ async def go() -> None: await pilot.pause() assert app.query_one("#spinner", Static).display is True # _tick wires the elapsed seconds off the start time; pin "now" to assert it. 
+ # Stop the live interval first so only this deterministic tick writes the + # readout — otherwise a real-time auto-tick can race the assert on a loaded + # runner, which flaked CI with "(6s)" vs "(7s)". update()->render() is + # synchronous, so no pilot.pause() is needed (and pausing here deadlocks). + assert app._spin_timer is not None + app._spin_timer.stop() monkeypatch.setattr(time, "monotonic", lambda: app._turn_started + 7.0) app._tick() assert "Working… (7s)" in str(app.query_one("#spinner", Static).render()) diff --git a/tests/test_code_tui_status.py b/tests/test_code_tui_status.py new file mode 100644 index 00000000..f261a517 --- /dev/null +++ b/tests/test_code_tui_status.py @@ -0,0 +1,49 @@ +"""Tests for the coding-agent TUI's pure status/text helpers (`tui_status`). + +Split from test_code_tui.py (which drives the Textual app) to keep each file under the +500-line gate; these need no pilot, just the plain functions. +""" + +from __future__ import annotations + +from pathlib import Path + +from aai_cli.code_agent import tui_status + + +def test_spinner_text_formats_frame_and_elapsed() -> None: + assert tui_status._spinner_text(46, "✶") == "✶ Working… (46s)" + assert tui_status._spinner_text(0, "✷") == "✷ Working… (0s)" + + +def test_abbrev_home() -> None: + assert tui_status._abbrev_home(Path.home() / "proj") == "~/proj" + # A path outside home renders as-is; compare to the platform-native string so this + # holds on Windows (where str(Path(...)) uses backslashes) as well as POSIX. 
+ outside = Path("/etc/hosts") + assert tui_status._abbrev_home(outside) == str(outside) + + +def test_git_branch_and_status(tmp_path: Path) -> None: + assert tui_status._git_branch(tmp_path) is None # no .git + (tmp_path / ".git").mkdir() + (tmp_path / ".git" / "HEAD").write_text("ref: refs/heads/feature-x\n") + assert tui_status._git_branch(tmp_path) == "feature-x" + (tmp_path / ".git" / "HEAD").write_text("a1b2c3d4e5f6\n") # detached + assert tui_status._git_branch(tmp_path) == "a1b2c3d4" + + status = tui_status._status_text(tmp_path, auto_approve=True) + assert "auto" in status and "a1b2c3d4" in status + assert "manual" in tui_status._status_text(tmp_path, auto_approve=False) + + +def test_status_text_renders_voice_badge(tmp_path: Path) -> None: + # No voice front-end -> no voice badge (the dot glyphs are absent); on/off render the + # state so the Ctrl-V toggle shows. (Asserts on the dots, not the word — the tmp_path name + # itself can contain "voice".) + none = tui_status._status_text(tmp_path, auto_approve=False) + assert "●" not in none and "○" not in none + on = tui_status._status_text(tmp_path, auto_approve=False, voice_state="on") + off = tui_status._status_text(tmp_path, auto_approve=False, voice_state="off") + assert "voice on" in on and "●" in on # filled dot when on + assert "voice off" in off and "○" in off # hollow dot when off diff --git a/tests/test_code_tui_voice.py b/tests/test_code_tui_voice.py index 8adbaea6..88072984 100644 --- a/tests/test_code_tui_voice.py +++ b/tests/test_code_tui_voice.py @@ -12,7 +12,7 @@ import pytest from langchain_core.messages import AIMessage, HumanMessage -from textual.widgets import Input +from textual.widgets import Input, Static from aai_cli.code_agent.tui import CodeAgentApp from aai_cli.core.errors import CLIError @@ -163,3 +163,167 @@ async def go() -> None: assert app._voice is None _run(go()) + + +def test_toggle_voice_pauses_and_resumes_capture() -> None: + # Ctrl-V flips voice off (no capture, no 
readback) and back on; the state badge tracks it. + async def go() -> None: + app = CodeAgentApp(agent=FakeAgent([]), voice=FakeVoice()) + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + # Assert via the methods, not the `_voice_paused` attribute: mypy narrows the + # attribute and can't see action_toggle_voice() flip it back, flagging the second + # check unreachable. The method calls reflect the same state without that trap. + assert app._voice_active() + assert app._voice_state() == "on" + app.action_toggle_voice() # pause + assert not app._voice_active() + assert app._voice_state() == "off" + app.action_toggle_voice() # resume + assert app._voice_active() + assert app._voice_state() == "on" + + _run(go()) + + +def test_paused_voice_skips_followup_readback() -> None: + # While paused, the post-turn followup neither speaks a summary nor listens. + async def go() -> None: + voice = FakeVoice(transcripts=["ignored"]) + app = CodeAgentApp(agent=FakeAgent([]), voice=voice) + app._voice_paused = True # set before mount so on_mount never auto-listens + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + app._last_reply = "a reply" + app._voice_followup() + await pilot.pause() + assert voice.spoken == [] # paused: no readback + assert voice.listens == 0 # paused: no capture + + _run(go()) + + +def test_voice_mode_swaps_text_input_for_listening_affordance() -> None: + # While voice capture is on, the text prompt is hidden and a "listening" bar shows; + # toggling voice off (Ctrl-V) brings the text box back. (Re-query each check so mypy + # doesn't narrow a stored display bool across the toggles.) 
+ async def go() -> None: + app = CodeAgentApp(agent=FakeAgent([]), voice=FakeVoice()) + app._voice_paused = True # start paused so on_mount doesn't race a capture thread + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + assert app.query_one("#promptbar").display is True # paused -> text box visible + assert app.query_one("#voicebar").display is False + app.action_toggle_voice() # voice on + await pilot.pause() + assert app.query_one("#promptbar").display is False # text box hidden + assert app.query_one("#voicebar").display is True # listening affordance shown + app.action_toggle_voice() # voice off + await pilot.pause() + assert app.query_one("#promptbar").display is True # text box back + assert app.query_one("#voicebar").display is False + + _run(go()) + + +def test_voice_capture_failure_restores_the_text_input() -> None: + # When the mic is ruled out mid-session, the listening bar is replaced by the text box. + async def go() -> None: + voice = FakeVoice(error=CLIError("no mic", error_type="mic_missing", exit_code=2)) + app = CodeAgentApp(agent=FakeAgent([]), voice=voice) + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + assert await _wait_until(pilot, lambda: app._voice_typed) + await pilot.pause() + assert app.query_one("#promptbar").display is True # text box restored on failure + assert app.query_one("#voicebar").display is False + + _run(go()) + + +def test_voice_bar_distinguishes_phases() -> None: + # The bar shows a distinct label per phase; only the listening phase carries the type hint. 
+ async def go() -> None: + app = CodeAgentApp(agent=FakeAgent([]), voice=FakeVoice()) + app._voice_paused = True # quiet the auto-listen; drive phases directly + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + app._set_voice_phase("listening") + bar = str(app.query_one("#voicebar", Static).render()) + assert "Listening" in bar and "Ctrl-V to type" in bar + app._set_voice_phase("thinking") + bar = str(app.query_one("#voicebar", Static).render()) + assert "Thinking" in bar and "Ctrl-V to type" not in bar # hint is listening-only + app._set_voice_phase("speaking") + assert "Speaking" in str(app.query_one("#voicebar", Static).render()) + + _run(go()) + + +def test_spinner_suppressed_in_voice_mode() -> None: + # In voice mode the bar carries the "thinking" state, so the separate spinner stays hidden; + # pausing voice brings the spinner back. + async def go() -> None: + app = CodeAgentApp(agent=FakeAgent([]), voice=FakeVoice()) + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + app._start_spinner() + assert app.query_one("#spinner", Static).display is False # voice active -> no spinner + app._voice_paused = True + app._start_spinner() + assert app.query_one("#spinner", Static).display is True # paused -> spinner shows + + _run(go()) + + +def test_voice_bar_animation_timer_runs_and_advances() -> None: + # The meter animation timer runs only while the bar is shown, and a tick changes the frame. + async def go() -> None: + app = CodeAgentApp(agent=FakeAgent([]), voice=FakeVoice()) + app._voice_paused = True + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + # Read into fresh locals each time: asserting `is None`/`is not None` on the same + # attribute across the opaque toggle would make mypy flag the later check unreachable. 
+ paused_timer = app._voice_timer + assert paused_timer is None # paused -> no animation + app.action_toggle_voice() # voice on -> bar shown, timer running + await pilot.pause() + running_timer = app._voice_timer + assert running_timer is not None + before = str(app.query_one("#voicebar", Static).render()) + app._tick_voice() + assert str(app.query_one("#voicebar", Static).render()) != before # meter advanced + app.action_toggle_voice() # voice off -> timer stopped + await pilot.pause() + stopped_timer = app._voice_timer + assert stopped_timer is None + + _run(go()) + + +def test_submit_sets_thinking_phase() -> None: + async def go() -> None: + agent = FakeAgent([{"messages": [HumanMessage("go"), AIMessage("done")]}]) + app = CodeAgentApp(agent=agent, voice=FakeVoice()) + app._voice_paused = True # keep the post-turn followup from flipping the phase + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + app._submit("go") + assert app._voice_phase == "thinking" # set synchronously when the turn starts + await app.workers.wait_for_complete() + + _run(go()) + + +def test_toggle_voice_without_session_notifies_and_stays_off() -> None: + # With no voice front-end the toggle is a no-op (notice only) and never marks a pause. 
+ async def go() -> None: + app = CodeAgentApp(agent=FakeAgent([])) # no voice + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + app.action_toggle_voice() + assert app._voice_paused is False # nothing to pause + assert app._voice_state() is None # no badge without a session + + _run(go()) diff --git a/tests/test_microphone.py b/tests/test_microphone.py index 207f6b4a..ffb2468e 100644 --- a/tests/test_microphone.py +++ b/tests/test_microphone.py @@ -1,5 +1,7 @@ +import signal import sys import types +from collections.abc import Callable, Mapping from typing import Any import pytest @@ -11,7 +13,12 @@ MicrophoneSource, _default_mic_stream, _device_default_rate, + _ignore_interrupt_during_shutdown, + _install_shutdown_interrupt_guard, + _max_input_channels, + _RawInputStream, _SoundDeviceMic, + import_sounddevice, resample_pcm16, ) @@ -37,6 +44,29 @@ def close(self): self.closed = True +class _FakeSoundDevice(types.ModuleType): + """A typed `_SoundDeviceModule` double: scripted device info + a RawInputStream factory. + + Subclasses `ModuleType` so it can be slotted into `sys.modules` via `monkeypatch.setitem`, + and conforms to the protocol so it needs no escape hatches at the call sites that pass it + to the real `_max_input_channels` / `_default_mic_stream` code under test. 
+ """ + + def __init__( + self, + info: Mapping[str, object], + raw_input_stream: Callable[..., _RawInputStream] = _FakeRawStream, + ) -> None: + super().__init__("sounddevice") + self._info = info + self.RawInputStream = raw_input_stream + + def query_devices( + self, device: int | None = None, kind: str | None = None + ) -> Mapping[str, object]: + return self._info + + def test_audio_missing_error_has_reinstall_suggestion(): from aai_cli.core.microphone import audio_missing_error @@ -303,19 +333,24 @@ def test_sounddevice_mic_downmixes_stereo_to_mono(): assert next(iter(mic)) == b"\x00\x02" -def _fake_sd_rejecting_mono(max_input_channels: int, opened: list[int]) -> Any: +def _fake_sd_rejecting_mono(max_input_channels: int, opened: list[int]) -> _FakeSoundDevice: """A sounddevice whose mono open fails with -9998; query reports ``max_input_channels``.""" - def raw_input_stream(**kwargs): - opened.append(kwargs["channels"]) - if kwargs["channels"] == 1: + def raw_input_stream(*, channels: int, **kwargs: object) -> _RawInputStream: + opened.append(channels) + if channels == 1: raise OSError("Error opening RawInputStream: Invalid number of channels [-9998]") - return _FakeStereoStream(**kwargs) + return _FakeStereoStream(channels=channels, **kwargs) + + return _FakeSoundDevice({"max_input_channels": max_input_channels}, raw_input_stream) - fake_sd: Any = types.ModuleType("sounddevice") - fake_sd.RawInputStream = raw_input_stream - fake_sd.query_devices = lambda device, kind: {"max_input_channels": max_input_channels} - return fake_sd + +def test_max_input_channels_defaults_to_zero_when_absent_or_non_int(): + # A device dict missing the key, or carrying a non-int value, must read as 0 channels (so + # the caller raises the actionable no-input error) rather than a truthy bogus count. 
+ assert _max_input_channels(_FakeSoundDevice({}), None) == 0 # key absent -> 0, not get()'s + assert _max_input_channels(_FakeSoundDevice({"max_input_channels": None}), None) == 0 # non-int + assert _max_input_channels(_FakeSoundDevice({"max_input_channels": 2}), None) == 2 # int passes def test_default_mic_stream_falls_back_to_stereo_downmix(monkeypatch): @@ -337,6 +372,7 @@ def test_default_mic_stream_zero_input_channels_raises_permission_error(monkeypa _default_mic_stream(sample_rate=16000, device=None) assert opened == [1] # only the mono attempt; no pointless stereo retry assert exc.value.error_type == "mic_error" + assert exc.value.exit_code == 1 assert "no input channels" in exc.value.message.lower() assert exc.value.suggestion is not None assert "Microphone" in exc.value.suggestion @@ -365,3 +401,50 @@ def boom(**_kwargs): list(mic) assert exc.value is err # passed through unchanged assert exc.value.suggestion == "grant it" + + +def test_ignore_interrupt_during_shutdown_sets_sig_ign(): + # The guard drops a second Ctrl-C during teardown so it can't raise inside + # sounddevice's atexit PortAudio terminate. Save/restore the global disposition. 
+ before = signal.getsignal(signal.SIGINT) + try: + _ignore_interrupt_during_shutdown() + assert signal.getsignal(signal.SIGINT) is signal.SIG_IGN + finally: + signal.signal(signal.SIGINT, before) + + +def test_install_shutdown_interrupt_guard_registers_once(monkeypatch): + registered = [] + monkeypatch.setattr(microphone, "_shutdown_interrupt_guard_installed", False) + monkeypatch.setattr(microphone.atexit, "register", lambda fn: registered.append(fn)) + + _install_shutdown_interrupt_guard() + _install_shutdown_interrupt_guard() # idempotent: the flag short-circuits the second call + + assert registered == [_ignore_interrupt_during_shutdown] + + +def test_import_sounddevice_installs_shutdown_guard(monkeypatch): + registered = [] + monkeypatch.setattr(microphone, "_shutdown_interrupt_guard_installed", False) + monkeypatch.setattr(microphone.atexit, "register", lambda fn: registered.append(fn)) + monkeypatch.setitem(sys.modules, "sounddevice", types.ModuleType("sounddevice")) + + import_sounddevice() + + assert registered == [_ignore_interrupt_during_shutdown] + + +def test_import_sounddevice_missing_does_not_register_guard(monkeypatch): + # A broken install raises before the guard is reached, so nothing is registered. 
+ registered = [] + monkeypatch.setattr(microphone, "_shutdown_interrupt_guard_installed", False) + monkeypatch.setattr(microphone.atexit, "register", lambda fn: registered.append(fn)) + monkeypatch.setitem(sys.modules, "sounddevice", None) # import -> ImportError + + with pytest.raises(CLIError) as exc: + import_sounddevice() + + assert exc.value.error_type == "mic_missing" + assert registered == [] diff --git a/tests/test_sandbox_access.py b/tests/test_sandbox_access.py index ce947ec4..6fe112de 100644 --- a/tests/test_sandbox_access.py +++ b/tests/test_sandbox_access.py @@ -241,7 +241,9 @@ def test_help_hides_the_sandbox_surface_from_external_accounts_and_restores_it(m assert "--sandbox" not in external assert "--env" not in external assert "[sandbox]" not in external - assert "agent-cascade" not in external + # The [sandbox]-only `live` command's summary is hidden too (a token unique to it, + # since the bare word "live" also appears in other commands' descriptions). + assert "tool-using" not in external # …but the filter is surgical: non-sandbox flags and commands stay visible (this # also kills the mutant that would treat every option/command as sandbox). 
assert "--profile" in external @@ -255,4 +257,4 @@ def test_help_hides_the_sandbox_surface_from_external_accounts_and_restores_it(m assert "--sandbox" in internal assert "--env" in internal assert "[sandbox]" in internal - assert "agent-cascade" in internal + assert "tool-using" in internal diff --git a/tests/test_smoke.py b/tests/test_smoke.py index b9ba17ff..a66e2929 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -162,7 +162,7 @@ def test_help_lists_commands_in_workflow_order(): "stream", "dictate", "agent", - "agent-cascade", + "live", "speak", "llm", "clip", From 48326e7ac1e8e21233547ca8b2066176d2550aec Mon Sep 17 00:00:00 2001 From: Claude Date: Thu, 18 Jun 2026 17:54:34 +0000 Subject: [PATCH 3/3] Fix flaky Windows voice-leg thread teardown in `assembly code` TUI MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The voice legs run on daemon threads that call back onto the UI thread via call_from_thread. If the app stops (a quit, or a test's run_test block exiting) while a leg is mid-call, that callback raises RuntimeError in the daemon thread, which pytest's threadexception plugin escalates to a failure — surfacing as a flaky `tests (windows, py3.12)` run on test_submit_sets_thinking_phase. Route every leg through a guarded body that swallows the callback error once the app is no longer running (the spoken turn is moot then) while still surfacing a genuine failure that happens while the app is live. 
Co-Authored-By: Claude Opus 4.8 Claude-Session: https://claude.ai/code/session_01Ad72JciKrsz4TKG7ZY9GR6 --- aai_cli/code_agent/voice_ui.py | 21 +++++++++++++++++++- tests/test_code_tui_voice.py | 35 ++++++++++++++++++++++++++++++++++ 2 files changed, 55 insertions(+), 1 deletion(-) diff --git a/aai_cli/code_agent/voice_ui.py b/aai_cli/code_agent/voice_ui.py index cdac1e29..0fcaf531 100644 --- a/aai_cli/code_agent/voice_ui.py +++ b/aai_cli/code_agent/voice_ui.py @@ -56,7 +56,26 @@ def _voice_active(self) -> bool: def _spawn(self, target: Callable[[], None]) -> None: """Run ``target`` on a daemon thread — voice legs block, so they stay off the UI thread.""" - threading.Thread(target=target, daemon=True).start() # pragma: no mutate + thread = threading.Thread( + target=lambda: self._run_leg(target), + daemon=True, # pragma: no mutate — daemon flag only affects process exit, unassertable + ) + thread.start() + + def _run_leg(self, target: Callable[[], None]) -> None: + """Run one voice leg, dropping the callback error a torn-down app raises mid-flight. + + A leg calls back onto the UI thread (``call_from_thread``); if the app stops — a quit, + or a test's ``run_test`` block exiting — while the leg is mid-call, that callback raises + ``RuntimeError`` in this daemon thread, which would otherwise surface as an unhandled + thread exception (a flaky Windows CI failure). The spoken turn is moot once the app is + gone, so swallow it then; a genuine failure while the app is still live still propagates. 
+ """ + try: + target() + except Exception: + if self.is_running: + raise def _begin_listening(self) -> None: """Capture the next spoken turn on a background thread (no-op when voice is off).""" diff --git a/tests/test_code_tui_voice.py b/tests/test_code_tui_voice.py index 88072984..e402d2d0 100644 --- a/tests/test_code_tui_voice.py +++ b/tests/test_code_tui_voice.py @@ -316,6 +316,41 @@ async def go() -> None: _run(go()) +def test_run_leg_swallows_callback_error_after_the_app_stops() -> None: + # A voice leg still in flight when the app tears down calls back onto a dead UI thread; + # the resulting RuntimeError must be dropped (the spoken turn is moot), not surface as an + # unhandled thread exception. This app was never started, so is_running is False. + app = CodeAgentApp(agent=FakeAgent([]), voice=FakeVoice()) + assert app.is_running is False + ran: list[bool] = [] + + def boom() -> None: + ran.append(True) + raise RuntimeError("App is not running") + + app._run_leg(boom) # returns without raising — the teardown-race error is swallowed + assert ran == [True] # the leg body did run; only its post-teardown error was dropped + + +def test_run_leg_reraises_a_genuine_failure_while_the_app_is_live() -> None: + # While the app is running, a real exception in a leg is a bug and must propagate (so it's + # reported), not be silently swallowed like the teardown race above. + async def go() -> None: + app = CodeAgentApp(agent=FakeAgent([]), voice=FakeVoice()) + app._voice_paused = True # no auto-listen thread racing this assertion + async with app.run_test(size=(100, 30)) as pilot: + await pilot.pause() + assert app.is_running is True + + def boom() -> None: + raise ValueError("genuine bug") + + with pytest.raises(ValueError, match="genuine bug"): + app._run_leg(boom) + + _run(go()) + + def test_toggle_voice_without_session_notifies_and_stays_off() -> None: # With no voice front-end the toggle is a no-op (notice only) and never marks a pause. async def go() -> None: