AssemblyAI · alexkroman · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026
diff --git a/.importlinter b/.importlinter
@@ -19,12 +19,14 @@ source_modules =
     aai_cli.config_builder
     aai_cli.context
     aai_cli.debuglog
+    aai_cli.dictate_exec
     aai_cli.environments
     aai_cli.errors
     aai_cli.eval_data
     aai_cli.follow
     aai_cli.help_panels
     aai_cli.help_text
+    aai_cli.hotkey
     aai_cli.init
     aai_cli.llm
     aai_cli.llm_exec
@@ -37,6 +39,7 @@ source_modules =
     aai_cli.stdio
     aai_cli.stream_exec
     aai_cli.streaming
+    aai_cli.sync_stt
     aai_cli.telemetry
     aai_cli.theme
     aai_cli.transcribe_batch
@@ -59,6 +62,7 @@ modules =
     aai_cli.commands.clip
     aai_cli.commands.deploy
     aai_cli.commands.dev
+    aai_cli.commands.dictate
     aai_cli.commands.doctor
     aai_cli.commands.evaluate
     aai_cli.commands.init
@@ -87,7 +91,9 @@ source_modules =
     aai_cli.environments
     aai_cli.errors
     aai_cli.eval_data
+    aai_cli.hotkey
     aai_cli.llm
+    aai_cli.sync_stt
     aai_cli.telemetry
     aai_cli.wer
 forbidden_modules =

diff --git a/AGENTS.md b/AGENTS.md
@@ -171,9 +171,9 @@ A Typer CLI. `aai_cli/main.py` builds the `app`, registers each command sub-app,
 
 ### Command layer
 
-Each file in `aai_cli/commands/` is a Typer sub-app (`transcribe`, `stream`, `agent`, `speak`, `llm`, `clip`, `transcripts`, `login` (login/logout/whoami), `doctor`, `init`, `dev`, `share`, `deploy`, `setup`, `onboard`, `account` (balance/usage/limits), `keys`, `sessions`, `audit`, `telemetry` (status/enable/disable), `webhooks` (listen)). Command bodies run through `context.run_command(ctx, fn, json=...)`, which maps any `CLIError` to clean stderr output + the error's exit code. Commands never print tracebacks for expected failures.
+Each file in `aai_cli/commands/` is a Typer sub-app (`transcribe`, `stream`, `dictate`, `agent`, `speak`, `llm`, `clip`, `transcripts`, `login` (login/logout/whoami), `doctor`, `init`, `dev`, `share`, `deploy`, `setup`, `onboard`, `account` (balance/usage/limits), `keys`, `sessions`, `audit`, `telemetry` (status/enable/disable), `webhooks` (listen)). Command bodies run through `context.run_command(ctx, fn, json=...)`, which maps any `CLIError` to clean stderr output + the error's exit code. Commands never print tracebacks for expected failures.
 
-**Options/run split for flag-heavy commands** (gh-CLI style): the Typer function only parses argv into a frozen `<Cmd>Options` dataclass and hands it to a module-level `run_<cmd>(opts, state, *, json_mode)` through a thin lambda adapter in `run_command(ctx, ..., json=...)`. The six run commands follow it — `aai_cli/stream_exec.py` (the reference implementation), `transcribe_exec.py`, `agent_exec.py`, `speak_exec.py`, `llm_exec.py`, `clip_exec.py`. Because the run path is a plain function of data, tests construct options directly (`dataclasses.replace` off a defaults instance, see `tests/test_stream_exec.py` and `tests/test_command_options_seam.py`) instead of round-tripping argv through `CliRunner` — which is also the cheap way to kill mutation-gate mutants on orchestration lines. Follow this for new or heavily-reworked commands with long bodies; small commands keep the inline `body()` closure — the dataclass is pure ceremony there.
+**Options/run split for flag-heavy commands** (gh-CLI style): the Typer function only parses argv into a frozen `<Cmd>Options` dataclass and hands it to a module-level `run_<cmd>(opts, state, *, json_mode)` through a thin lambda adapter in `run_command(ctx, ..., json=...)`. The seven run commands follow it — `aai_cli/stream_exec.py` (the reference implementation), `transcribe_exec.py`, `agent_exec.py`, `speak_exec.py`, `llm_exec.py`, `clip_exec.py`, `dictate_exec.py`. Because the run path is a plain function of data, tests construct options directly (`dataclasses.replace` off a defaults instance, see `tests/test_stream_exec.py` and `tests/test_command_options_seam.py`) instead of round-tripping argv through `CliRunner` — which is also the cheap way to kill mutation-gate mutants on orchestration lines. Follow this for new or heavily-reworked commands with long bodies; small commands keep the inline `body()` closure — the dataclass is pure ceremony there.
 
 ### Cross-cutting state (resolution order matters)
 
@@ -187,6 +187,7 @@ Each file in `aai_cli/commands/` is a Typer sub-app (`transcribe`, `stream`, `ag
 ### Feature subsystems
 
 - **`streaming/`** + `client.stream_audio` — v3 realtime API. Event callbacks run on the SDK reader thread and guard against `BrokenPipeError` (`stdio.silence_stdout()`) so a closed pipe never dumps a thread traceback.
+- **`sync_stt.py`** + **`hotkey.py`** + `commands/dictate.py` — `assembly dictate`: push-to-talk dictation over the **Sync STT API** (`Environment.sync_base`, one POST `/transcribe` per utterance with the required `X-AAI-Model: u3-sync-pro` header; 80 ms–120 s of PCM/WAV). `hotkey.TerminalKeys` scopes stdin into cbreak (Ctrl-C still signals) and reads single keypresses; `dictate_exec._record` polls it with a zero timeout between ~100 ms mic chunks. All three boundaries (keys, mic, HTTP) are injectable, so the suite never needs a real terminal — `tests/test_hotkey.py` drives a pty pair for the termios behavior.
 - **`agent/`** — full-duplex voice agent (mic in, TTS out via `voices.py`).
 - **`tts/`** + `commands/speak.py` — `assembly speak` synthesizes text to speech over the sandbox streaming-TTS WebSocket (`streaming-tts.sandbox000.…`). **Sandbox-only:** `session.is_available()` is false in production (empty `Environment.streaming_tts_host`), so the command exits 2 with a `--sandbox` hint. `session.synthesize` drives a Begin→Generate→Flush→Audio→Terminate protocol with an injectable `connect` for hermetic tests (mirrors `agent/session.py`); `audio.py` plays the PCM (default) or writes a WAV (`--out`).
 - **`code_gen/`** — backs `--show-code` on `transcribe`/`stream`/`agent`: builds a ready-to-run Python SDK script from exactly the flags passed (no API key needed; generated code reads `ASSEMBLYAI_API_KEY`).

diff --git a/README.md b/README.md
@@ -131,7 +131,7 @@ uv tool install "git+https://github.com/AssemblyAI/cli.git"
 If your default interpreter is older than Python 3.12, add `--python python3.12` (pipx) or
 `--python 3.12` (uv) to the install command.
 
-Only the live-audio commands need anything extra: `stream` and `agent` use PortAudio for
+Only the live-audio commands need anything extra: `stream`, `dictate`, and `agent` use PortAudio for
 microphone capture (Debian/Ubuntu: `sudo apt-get install libportaudio2`; Fedora:
 `sudo dnf install portaudio`) and [`ffmpeg`](https://ffmpeg.org) on `PATH` to stream
 non-WAV audio. Plain `transcribe` uploads your file directly and needs neither.
@@ -179,6 +179,7 @@ assembly init                  # scaffold a starter app
 - **Transcription**: `assembly transcribe` handles files, URLs, and YouTube/podcast pages, with flags for speaker labels, PII redaction, summarization, sentiment, chapters, and more.
 - **Batch transcription**: point `assembly transcribe` at a directory or glob (or pipe paths with `--from-stdin`) to transcribe everything concurrently, with sidecar files that make re-runs resumable. Add `--llm "prompt"` to run an LLM prompt over each finished transcript, saved into the sidecars.
 - **Real-time streaming**: `assembly stream` transcribes the microphone, a file, or a URL live — on macOS it can capture system audio too.
+- **Dictation**: `assembly dictate` is push-to-talk for your terminal — press Enter to record, Enter again to get the utterance back instantly from the Sync API (up to 120 s per utterance).
 - **Voice agent**: `assembly agent` runs a full-duplex spoken conversation in your terminal.
 - **LLM Gateway**: `assembly llm` prompts an LLM over a transcript, stdin, or a live stream (`assembly stream --llm "summarize as I talk"`).
 - **Transcript-driven clipping**: `assembly clip` cuts an audio/video file (or a YouTube/podcast URL) with ffmpeg by diarized speaker (`--speaker A`), text match (`--search "pricing"`), LLM pick (`--llm "the three best moments"`), or explicit time range (`--range 1:30-2:45`) — transcribing on the fly, reusing a finished transcript with `-t ID`, or reading one from a pipe (`assembly transcribe x.mp4 --speaker-labels --json | assembly clip x.mp4 -t - --llm "…"`). Clip boundaries snap into nearby silence (ffmpeg `silencedetect`) so cuts don't land mid-word; `--no-snap` cuts at the exact selected times.

diff --git a/aai_cli/commands/dictate.py b/aai_cli/commands/dictate.py
@@ -0,0 +1,74 @@
+from __future__ import annotations
+
+import typer
+
+from aai_cli import dictate_exec, help_panels, options
+from aai_cli.context import run_command
+from aai_cli.help_text import examples_epilog
+from aai_cli.sync_stt import MAX_AUDIO_SECONDS
+
+app = typer.Typer()
+
+
+# Typer entry point for `assembly dictate`: it only declares the flags and hands
+# the parsed values to dictate_exec.run_dictate (the options/run split).
+@app.command(
+    rich_help_panel=help_panels.TRANSCRIPTION,
+    epilog=examples_epilog(
+        [
+            ("Dictate: Enter starts a recording, Enter transcribes it", "assembly dictate"),
+            ("One utterance, then exit", "assembly dictate --once"),
+            ("Dictate in Spanish", "assembly dictate --language es"),
+            (
+                "Bias recognition toward tricky terms",
+                "assembly dictate --word-boost AssemblyAI --word-boost LeMUR",
+            ),
+            ("One JSON object per utterance", "assembly dictate --json"),
+        ]
+    ),
+)
+def dictate(
+    ctx: typer.Context,
+    language: str | None = typer.Option(
+        None,
+        "--language",
+        help="ISO 639-1 language code, or a comma-separated list for "
+        "code-switching audio (default: en).",
+    ),
+    prompt: str | None = typer.Option(
+        None,
+        "--prompt",
+        help="Custom transcription prompt (overrides --language).",
+    ),
+    word_boost: list[str] | None = typer.Option(
+        None, "--word-boost", help="Bias recognition toward a term (repeatable)."
+    ),
+    device: int | None = typer.Option(None, "--device", help="Microphone device index."),
+    once: bool = typer.Option(False, "--once", help="Transcribe one utterance, then exit."),
+    # The default and the upper bound are both sync_stt.MAX_AUDIO_SECONDS; the
+    # lower bound is one second.
+    max_seconds: float = typer.Option(
+        float(MAX_AUDIO_SECONDS),
+        "--max-seconds",
+        help="Auto-stop a recording after this many seconds.",
+        min=1.0,
+        max=float(MAX_AUDIO_SECONDS),
+    ),
+    json_out: bool = options.json_option("Emit one JSON object per utterance."),
+) -> None:
+    """Dictate with a hotkey: record the mic, get the transcript back instantly.
+
+    Press Enter (or Space) to start recording and press it again to stop; the
+    utterance is sent to the AssemblyAI Sync API and the transcript prints
+    immediately — no polling. Press q (or Esc/Ctrl-C) to finish. Each utterance
+    can be up to 120 seconds long.
+    """
+    # Bundle the parsed flags as plain data. --json is deliberately not part of
+    # the options: run_command resolves it and passes it on as json_mode.
+    opts = dictate_exec.DictateOptions(
+        language=language,
+        prompt=prompt,
+        word_boost=word_boost,
+        device=device,
+        once=once,
+        max_seconds=max_seconds,
+    )
+    # Thin adapter: run_command supplies (state, json_mode); the run function
+    # takes json_mode as a keyword-only argument.
+    run_command(
+        ctx,
+        lambda state, json_mode: dictate_exec.run_dictate(opts, state, json_mode=json_mode),
+        json=json_out,
+    )
diff --git a/aai_cli/dictate_exec.py b/aai_cli/dictate_exec.py
@@ -0,0 +1,187 @@
+"""Run logic for `assembly dictate`: the options/run split (see AGENTS.md).
+
+Push-to-talk dictation over the Sync STT API: wait for a hotkey, record the
+microphone until the hotkey is pressed again (or the duration cap), POST the
+utterance to the Sync API, print the transcript, repeat. The command module
+(aai_cli/commands/dictate.py) only parses argv into a ``DictateOptions``; tests
+drive the session by constructing options directly and injecting the key/mic/
+HTTP boundaries, with no CliRunner argv round-trip and no real terminal.
+"""
+
+from __future__ import annotations
+
+from dataclasses import dataclass
+
+from aai_cli import output, sync_stt
+from aai_cli.context import AppState
+from aai_cli.hotkey import CTRL_C, CTRL_D, ESC, TerminalKeys
+from aai_cli.microphone import MicrophoneSource
+
+# Capture is resampled to one rate the Sync API accepts; 16 kHz mono PCM16 keeps
+# a 120 s utterance (120 s x 32,000 B/s = 3.84 MB) well under the 40 MB upload cap.
+TARGET_RATE = 16000
+_BYTES_PER_SECOND = TARGET_RATE * 2  # PCM16 mono: 2 bytes per sample, 1 channel
+
+# Enter or Space toggles recording; q / Esc / Ctrl-D ends the session at the
+# idle prompt (Ctrl-C works anywhere — cbreak mode keeps SIGINT delivery).
+# While a recording is in progress, a key from either set stops the capture.
+# NOTE(review): CTRL_C is also listed in QUIT_KEYS, presumably for the case where
+# it arrives as a plain byte instead of a signal — confirm against hotkey.py.
+TOGGLE_KEYS = frozenset({"\r", "\n", " "})
+QUIT_KEYS = frozenset({"q", "Q", ESC, CTRL_C, CTRL_D})
+
+
+@dataclass(frozen=True)
+class DictateOptions:
+    """Every `assembly dictate` flag as plain data (``--json`` excluded: run_command
+    resolves it into the ``json_mode`` argument)."""
+
+    # Raw --language value: one code or a comma-separated list; folded into the
+    # request shape by _languages() just before each request.
+    language: str | None
+    # Custom transcription prompt, forwarded to the Sync API request as-is.
+    prompt: str | None
+    # Terms to bias recognition toward (one entry per --word-boost).
+    word_boost: list[str] | None
+    # Microphone device index handed to MicrophoneSource; None leaves it unset.
+    device: int | None
+    # End the session after the first recorded utterance has been handled.
+    once: bool
+    # Per-recording duration cap, in seconds.
+    max_seconds: float
+
+
+def _note(message: str, *, json_mode: bool, quiet: bool) -> None:
+    """A muted stderr hint guiding the interactive session; silent under --json
+    (stderr must stay machine-readable) and --quiet."""
+    if json_mode or quiet:
+        return
+    output.error_console.print(f"[aai.muted]{message}[/aai.muted]")
+
+
+def _languages(language: str | None) -> str | list[str] | None:
+    """Fold --language into the config shape: one ISO code as a string, a
+    comma-separated list (code-switching audio) as a list, blank as unset."""
+    if language is None:
+        return None
+    codes = [code.strip() for code in language.split(",") if code.strip()]
+    if not codes:
+        return None
+    return codes[0] if len(codes) == 1 else codes
+
+
+def _record(keys: TerminalKeys, mic: MicrophoneSource, *, max_seconds: float) -> bytes:
+    """Capture PCM until a hotkey is pressed again or the duration cap is hit.
+
+    The key poll runs between ~100 ms mic chunks with a zero timeout, so the mic
+    read loop is never blocked waiting on the keyboard.
+    """
+    pcm = bytearray()
+    frames = iter(mic)
+    try:
+        for chunk in frames:
+            pcm += chunk
+            if len(pcm) >= int(max_seconds * _BYTES_PER_SECOND):
+                break
+            # None (no key pending) is simply not in either set.
+            if keys.read(0) in TOGGLE_KEYS | QUIT_KEYS:
+                break
+    finally:
+        # MicrophoneSource yields from a generator whose cleanup releases the
+        # device; close it deterministically instead of waiting on GC. Injected
+        # fakes (a plain list iterator) may not have close().
+        close = getattr(frames, "close", None)
+        if callable(close):
+            close()
+    return bytes(pcm)
+
+
+def _emit(result: sync_stt.SyncTranscript, *, json_mode: bool) -> None:
+    """One utterance to stdout: the bare transcript text, or one NDJSON object."""
+    if json_mode:
+        output.emit_ndjson(
+            {
+                "text": result.text,
+                "confidence": result.confidence,
+                "audio_duration_ms": result.audio_duration_ms,
+                "session_id": result.session_id,
+            }
+        )
+    else:
+        output.emit_text(result.text)
+
+
+def _transcribe_utterance(
+    api_key: str,
+    pcm: bytes,
+    opts: DictateOptions,
+    state: AppState,
+    *,
+    json_mode: bool,
+) -> None:
+    """Send one recorded utterance to the Sync API and print the transcript.
+
+    A recording below the API's 80 ms floor (a double-tapped hotkey) is skipped
+    with a warning rather than bounced off the server as a 400.
+    """
+    if len(pcm) < sync_stt.MIN_AUDIO_MS * _BYTES_PER_SECOND // 1000:
+        output.emit_warning(
+            f"Recording was shorter than {sync_stt.MIN_AUDIO_MS} ms; nothing to transcribe.",
+            json_mode=json_mode,
+        )
+        return
+    with output.status("Transcribing…", json_mode=json_mode, quiet=state.quiet):
+        result = sync_stt.transcribe_pcm(
+            api_key,
+            pcm,
+            sample_rate=TARGET_RATE,
+            language_code=_languages(opts.language),
+            prompt=opts.prompt,
+            word_boost=opts.word_boost,
+        )
+    _emit(result, json_mode=json_mode)
+
+
+def _session(
+    keys: TerminalKeys,
+    api_key: str,
+    opts: DictateOptions,
+    state: AppState,
+    *,
+    json_mode: bool,
+) -> None:
+    """The dictation loop: idle until a toggle key, record, transcribe, repeat."""
+    while True:
+        key = keys.read(None)
+        if key is None or key in QUIT_KEYS:
+            return
+        if key not in TOGGLE_KEYS:
+            continue
+        mic = MicrophoneSource(
+            target_rate=TARGET_RATE,
+            device=opts.device,
+            on_open=lambda: _note(
+                "● Recording — press Enter to stop.", json_mode=json_mode, quiet=state.quiet
+            ),
+        )
+        pcm = _record(keys, mic, max_seconds=opts.max_seconds)
+        _transcribe_utterance(api_key, pcm, opts, state, json_mode=json_mode)
+        if opts.once:
+            return
+
+
+def run_dictate(opts: DictateOptions, state: AppState, *, json_mode: bool) -> None:
+    """Execute one `assembly dictate` invocation from already-parsed flags."""
+    try:
+        # Entering TerminalKeys validates the terminal (a usage precondition)
+        # before credentials, so a piped stdin reads as "needs a terminal" — not
+        # as a login prompt.
+        with TerminalKeys() as keys:
+            api_key = state.resolve_api_key()
+            if opts.prompt and opts.language:
+                # The server ignores language_code whenever a custom prompt is set;
+                # never drop a requested flag silently (mirrors the speak warnings).
+                output.emit_warning(
+                    "--language is ignored when --prompt is set; "
+                    "state the language inside the prompt.",
+                    json_mode=json_mode,
+                )
+            _note(
+                "Press Enter to start recording, Enter again to transcribe. q quits.",
+                json_mode=json_mode,
+                quiet=state.quiet,
+            )
+            _session(keys, api_key, opts, state, json_mode=json_mode)
+    except KeyboardInterrupt:
+        # Ctrl-C is the normal "done dictating" signal: end cleanly, not as an error.
+        return
diff --git a/aai_cli/environments.py b/aai_cli/environments.py
@@ -16,6 +16,7 @@ class Environment:
 
     name: str
     api_base: str  # SDK base_url for /v2/upload + /v2/transcript
+    sync_base: str  # Sync STT API base (one-shot POST /transcribe, used by `assembly dictate`)
     streaming_host: str  # StreamingClientOptions.api_host (SDK builds wss://host/v3/ws)
     streaming_tts_host: str  # streaming TTS host; empty when TTS isn't available (prod)
     agents_host: str  # Voice Agent host; the agent client builds wss://host/v1/ws
@@ -37,6 +38,7 @@ class Environment:
     "production": Environment(
         name="production",
         api_base="https://api.assemblyai.com",
+        sync_base="https://sync.assemblyai.com",
         streaming_host="streaming.assemblyai.com",
         streaming_tts_host="",
         agents_host="agents.assemblyai.com",
@@ -49,6 +51,7 @@ class Environment:
     "sandbox000": Environment(
         name="sandbox000",
         api_base="https://api.sandbox000.assemblyai-labs.com",
+        sync_base="https://sync.sandbox000.assemblyai-labs.com",
         streaming_host="streaming.sandbox000.assemblyai-labs.com",
         streaming_tts_host="streaming-tts.sandbox000.assemblyai-labs.com",
         agents_host="agents.sandbox000.assemblyai-labs.com",