diff --git a/aai_cli/commands/dictate/__init__.py b/aai_cli/commands/dictate/__init__.py index c763f1ad..b821b850 100644 --- a/aai_cli/commands/dictate/__init__.py +++ b/aai_cli/commands/dictate/__init__.py @@ -24,6 +24,10 @@ [ ("Dictate: Enter starts a recording, Enter transcribes it", "assembly dictate"), ("One utterance, then exit", "assembly dictate --once"), + ( + "Pipe one utterance into another command", + 'assembly dictate | assembly llm "write a conventional commit"', + ), ("Dictate in Spanish", "assembly dictate --language es"), ( "Bias recognition toward tricky terms", @@ -51,7 +55,7 @@ def dictate( None, "--word-boost", help="Bias recognition toward a term (repeatable)" ), device: int | None = typer.Option(None, "--device", help="Microphone device index"), - once: bool = typer.Option(False, "--once", help="Transcribe one utterance, then exit"), + once: bool = typer.Option(False, "--once", help="Record one utterance immediately, then exit"), max_seconds: float = typer.Option( float(MAX_AUDIO_SECONDS), "--max-seconds", @@ -72,7 +76,9 @@ def dictate( Press Enter (or Space) to start recording and press it again to stop; the utterance is sent to the AssemblyAI Sync API and the transcript prints immediately — no polling. Press q (or Esc/Ctrl-C) to finish. Each utterance - can be up to 120 seconds long. + can be up to 120 seconds long. With --once, or when stdout is piped, + recording starts immediately and dictate exits after one utterance so the + transcript flows to the next command. """ opts = dictate_exec.DictateOptions( language=language, diff --git a/aai_cli/commands/dictate/_exec.py b/aai_cli/commands/dictate/_exec.py index 371c2ee1..fe23c941 100644 --- a/aai_cli/commands/dictate/_exec.py +++ b/aai_cli/commands/dictate/_exec.py @@ -13,7 +13,7 @@ from dataclasses import dataclass from aai_cli.app.context import AppState -from aai_cli.core import choices, sync_stt +from aai_cli.core import choices, stdio, sync_stt from aai_cli.core.config_builder import split_csv from aai_cli.core.hotkey import CTRL_C, CTRL_D, ESC, TerminalKeys from aai_cli.core.microphone import MicrophoneSource @@ -138,6 +138,26 @@ def _transcribe_utterance( _emit(result, json_mode=json_mode) +def _capture_and_transcribe( + keys: TerminalKeys, + api_key: str, + opts: DictateOptions, + state: AppState, + *, + json_mode: bool, +) -> None: + """Record one utterance from the mic and print its transcript.""" + mic = MicrophoneSource( + target_rate=TARGET_RATE, + device=opts.device, + on_open=lambda: _note( + "● Recording — press Enter to stop.", json_mode=json_mode, quiet=state.quiet + ), + ) + pcm = _record(keys, mic, max_seconds=opts.max_seconds) + _transcribe_utterance(api_key, pcm, opts, state, json_mode=json_mode) + + def _session( keys: TerminalKeys, api_key: str, @@ -145,25 +165,25 @@ def _session( state: AppState, *, json_mode: bool, + single: bool, ) -> None: - """The dictation loop: idle until a toggle key, record, transcribe, repeat.""" + """Drive recording: one auto-started utterance, or the idle-toggle loop. + + ``single`` (a piped stdout or --once) starts recording immediately so a + one-off capture takes a single keystroke to stop and then exits — which + closes a piped stdout and unblocks the downstream command. Otherwise it's + the interactive loop: idle until a toggle key, record, transcribe, repeat. + """ + if single: + _capture_and_transcribe(keys, api_key, opts, state, json_mode=json_mode) + return while True: key = keys.read(None) if key is None or key in QUIT_KEYS: return if key not in TOGGLE_KEYS: continue - mic = MicrophoneSource( - target_rate=TARGET_RATE, - device=opts.device, - on_open=lambda: _note( - "● Recording — press Enter to stop.", json_mode=json_mode, quiet=state.quiet - ), - ) - pcm = _record(keys, mic, max_seconds=opts.max_seconds) - _transcribe_utterance(api_key, pcm, opts, state, json_mode=json_mode) - if opts.once: - return + _capture_and_transcribe(keys, api_key, opts, state, json_mode=json_mode) def run_dictate(opts: DictateOptions, state: AppState, *, json_mode: bool) -> None: @@ -187,12 +207,20 @@ def run_dictate(opts: DictateOptions, state: AppState, *, json_mode: bool) -> No "state the language inside the prompt.", json_mode=json_mode, ) - _note( - "Press Enter to start recording, Enter again to transcribe. q quits.", - json_mode=json_mode, - quiet=state.quiet, - ) - _session(keys, api_key, opts, state, json_mode=json_mode) + # A piped stdout (`assembly dictate | assembly llm …`) only closes when + # dictate exits, so a looping session would keep the downstream consumer + # blocked on stdin forever. Single-shot mode (piped or --once) records + # one utterance and exits so the transcript drains to the next stage. + single = opts.once or not stdio.stdout_is_tty() + if not single: + # Only the interactive loop needs a start prompt; single-shot + # auto-starts and announces "● Recording" when the mic opens. + _note( + "Press Enter to start recording, Enter again to transcribe. q quits.", + json_mode=json_mode, + quiet=state.quiet, + ) + _session(keys, api_key, opts, state, json_mode=json_mode, single=single) except KeyboardInterrupt: # Ctrl-C is the normal "done dictating" signal: end cleanly, not as an error. return diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr index ffc0ae1a..001704f6 100644 --- a/tests/__snapshots__/test_snapshots_help_run.ambr +++ b/tests/__snapshots__/test_snapshots_help_run.ambr @@ -353,7 +353,9 @@ Press Enter (or Space) to start recording and press it again to stop; the utterance is sent to the AssemblyAI Sync API and the transcript prints immediately — no polling. Press q (or Esc/Ctrl-C) to finish. Each utterance - can be up to 120 seconds long. + can be up to 120 seconds long. With --once, or when stdout is piped, + recording starts immediately and dictate exits after one utterance so the + transcript flows to the next command. ╭─ Options ────────────────────────────────────────────────────────────────────╮ │ --language TEXT ISO 639-1 language code, │ @@ -366,8 +368,8 @@ │ --word-boost TEXT Bias recognition toward a │ │ term (repeatable) │ │ --device INTEGER Microphone device index │ - │ --once Transcribe one utterance, │ - │ then exit │ + │ --once Record one utterance │ + │ immediately, then exit │ │ --max-seconds FLOAT RANGE Auto-stop a recording │ │ [1.0<=x<=120.0] after this many seconds │ │ [default: 120.0] │ @@ -386,6 +388,8 @@ $ assembly dictate One utterance, then exit $ assembly dictate --once + Pipe one utterance into another command + $ assembly dictate | assembly llm "write a conventional commit" Dictate in Spanish $ assembly dictate --language es Bias recognition toward tricky terms diff --git a/tests/test_dictate_exec.py b/tests/test_dictate_exec.py index 08af8245..7be9a069 100644 --- a/tests/test_dictate_exec.py +++ b/tests/test_dictate_exec.py @@ -70,6 +70,10 @@ def seams(monkeypatch): harness = {"keys": FakeKeys([]), "chunks": [CHUNK, CHUNK], "mic": {}, "calls": []} monkeypatch.setattr(dictate_exec, "TerminalKeys", lambda: harness["keys"]) + # Default to interactive stdout (a real terminal); the piped tests flip this. + # capsys leaves stdout a non-tty, which would otherwise force single-utterance + # mode and end every looping session after one utterance. + monkeypatch.setattr(dictate_exec.stdio, "stdout_is_tty", lambda: True) def fake_mic(*, target_rate, device=None, on_open=None): harness["mic"].update(target_rate=target_rate, device=device) @@ -189,6 +193,28 @@ def test_once_exits_after_a_single_utterance(seams): assert seams["keys"].script +def test_piped_stdout_auto_starts_one_utterance_then_exits(seams, monkeypatch, capsys): + # `assembly dictate | assembly llm …`: stdout is a pipe, not a tty. A looping + # session would keep the pipe open and hang the consumer, so recording + # auto-starts, the first Enter stops it, and the session exits on its own. + monkeypatch.setattr(dictate_exec.stdio, "stdout_is_tty", lambda: False) + # No leading toggle to *start* and no quit key: a single read(0) pops the + # Enter that stops the auto-started recording, then dictate exits. + seams["keys"] = FakeKeys(["\r", "\r", "\r"]) + _run() + assert len(seams["calls"]) == 1 + # Ended on the single-shot, not by draining the key script. + assert seams["keys"].script + # Auto-start: the only key read is the zero-timeout in-recording poll — no + # blocking idle read(None) waiting for a start keypress. + assert seams["keys"].timeouts == [0] + captured = capsys.readouterr() + assert captured.out.strip() == "hello world" + # The mic-open note fires immediately; the interactive start prompt is absent. + assert "Recording — press Enter to stop" in captured.err + assert "start recording" not in captured.err + + @pytest.mark.parametrize("quit_key", ["q", "Q", "\x1b", "\x04"]) def test_quit_keys_end_the_session_without_recording(seams, quit_key, capsys): seams["keys"] = FakeKeys([quit_key, "\r", "\r"])