Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions aai_cli/commands/dictate/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,10 @@
[
("Dictate: Enter starts a recording, Enter transcribes it", "assembly dictate"),
("One utterance, then exit", "assembly dictate --once"),
(
"Pipe one utterance into another command",
'assembly dictate | assembly llm "write a conventional commit"',
),
("Dictate in Spanish", "assembly dictate --language es"),
(
"Bias recognition toward tricky terms",
Expand Down Expand Up @@ -51,7 +55,7 @@ def dictate(
None, "--word-boost", help="Bias recognition toward a term (repeatable)"
),
device: int | None = typer.Option(None, "--device", help="Microphone device index"),
once: bool = typer.Option(False, "--once", help="Transcribe one utterance, then exit"),
once: bool = typer.Option(False, "--once", help="Record one utterance immediately, then exit"),
max_seconds: float = typer.Option(
float(MAX_AUDIO_SECONDS),
"--max-seconds",
Expand All @@ -72,7 +76,9 @@ def dictate(
Press Enter (or Space) to start recording and press it again to stop; the
utterance is sent to the AssemblyAI Sync API and the transcript prints
immediately — no polling. Press q (or Esc/Ctrl-C) to finish. Each utterance
can be up to 120 seconds long.
can be up to 120 seconds long. With --once, or when stdout is piped,
recording starts immediately and dictate exits after one utterance so the
transcript flows to the next command.
"""
opts = dictate_exec.DictateOptions(
language=language,
Expand Down
66 changes: 47 additions & 19 deletions aai_cli/commands/dictate/_exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from dataclasses import dataclass

from aai_cli.app.context import AppState
from aai_cli.core import choices, sync_stt
from aai_cli.core import choices, stdio, sync_stt
from aai_cli.core.config_builder import split_csv
from aai_cli.core.hotkey import CTRL_C, CTRL_D, ESC, TerminalKeys
from aai_cli.core.microphone import MicrophoneSource
Expand Down Expand Up @@ -138,32 +138,52 @@ def _transcribe_utterance(
_emit(result, json_mode=json_mode)


def _capture_and_transcribe(
    keys: TerminalKeys,
    api_key: str,
    opts: DictateOptions,
    state: AppState,
    *,
    json_mode: bool,
) -> None:
    """Capture a single utterance from the microphone and emit its transcript.

    Builds a ``MicrophoneSource`` at ``TARGET_RATE`` on ``opts.device``, hands
    it to ``_record`` together with ``keys`` and ``opts.max_seconds``, then
    passes the recorded audio to ``_transcribe_utterance``.
    """

    def announce_recording() -> None:
        # Supplied as the microphone's ``on_open`` hook so the note is
        # emitted by the mic source rather than up front.
        _note(
            "● Recording — press Enter to stop.",
            json_mode=json_mode,
            quiet=state.quiet,
        )

    microphone = MicrophoneSource(
        target_rate=TARGET_RATE,
        device=opts.device,
        on_open=announce_recording,
    )
    audio = _record(keys, microphone, max_seconds=opts.max_seconds)
    _transcribe_utterance(api_key, audio, opts, state, json_mode=json_mode)


def _session(
keys: TerminalKeys,
api_key: str,
opts: DictateOptions,
state: AppState,
*,
json_mode: bool,
single: bool,
) -> None:
"""The dictation loop: idle until a toggle key, record, transcribe, repeat."""
"""Drive recording: one auto-started utterance, or the idle-toggle loop.

``single`` (a piped stdout or --once) starts recording immediately so a
one-off capture takes a single keystroke to stop and then exits — which
closes a piped stdout and unblocks the downstream command. Otherwise it's
the interactive loop: idle until a toggle key, record, transcribe, repeat.
"""
if single:
_capture_and_transcribe(keys, api_key, opts, state, json_mode=json_mode)
return
while True:
key = keys.read(None)
if key is None or key in QUIT_KEYS:
return
if key not in TOGGLE_KEYS:
continue
mic = MicrophoneSource(
target_rate=TARGET_RATE,
device=opts.device,
on_open=lambda: _note(
"● Recording — press Enter to stop.", json_mode=json_mode, quiet=state.quiet
),
)
pcm = _record(keys, mic, max_seconds=opts.max_seconds)
_transcribe_utterance(api_key, pcm, opts, state, json_mode=json_mode)
if opts.once:
return
_capture_and_transcribe(keys, api_key, opts, state, json_mode=json_mode)


def run_dictate(opts: DictateOptions, state: AppState, *, json_mode: bool) -> None:
Expand All @@ -187,12 +207,20 @@ def run_dictate(opts: DictateOptions, state: AppState, *, json_mode: bool) -> No
"state the language inside the prompt.",
json_mode=json_mode,
)
_note(
"Press Enter to start recording, Enter again to transcribe. q quits.",
json_mode=json_mode,
quiet=state.quiet,
)
_session(keys, api_key, opts, state, json_mode=json_mode)
# A piped stdout (`assembly dictate | assembly llm …`) only closes when
# dictate exits, so a looping session would keep the downstream consumer
# blocked on stdin forever. Single-shot mode (piped or --once) records
# one utterance and exits so the transcript drains to the next stage.
single = opts.once or not stdio.stdout_is_tty()

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

single and _session(...) are inside if opts.prompt and opts.language, so dictation only runs when both flags are set; otherwise run_dictate() exits without recording or transcribing.

Details

✨ AI Reasoning
​The updated flow is trying to always run dictation while optionally warning when --prompt and --language are combined. However, the new session-start logic is placed under the same condition as the warning. That means the command loop runs only when both options are provided together. In the common case where that condition is false, execution exits the context manager without ever entering recording/transcription, so the command effectively does nothing. This is a definite logic bug caused by control-flow placement, not a style issue.

🔧 How do I fix it?
Trace execution paths carefully. Ensure precondition checks happen before using values, validate ranges before checking impossible conditions, and don't check for states that the code has already ruled out.

Reply @AikidoSec feedback: [FEEDBACK] to get better review comments in the future.
Reply @AikidoSec ignore: [REASON] to ignore this issue.
More info

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

@AikidoSec ignore: False positive — misread indentation. The if opts.prompt and opts.language: block contains only the emit_warning(...) call; single = … and _session(...) are dedented to the with TerminalKeys() scope and run unconditionally. Verified by test_hotkey_records_then_prints_bare_transcript, which records and transcribes with neither flag set, plus 100% patch coverage on this file.


Generated by Claude Code

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

✅ Based on your feedback, we ignored this issue because of the following reason:

False positive — misread indentation. The if opts.prompt and opts.language: block contains only the emit_warning(...) call; single = … and _session(...) are dedented to the with TerminalKeys() scope and run unconditionally. Verified by test_hotkey_records_then_prints_bare_transcript, which records and transcribes with neither flag set, plus 100% patch coverage on this file.


Generated by Claude Code

if not single:
# Only the interactive loop needs a start prompt; single-shot
# auto-starts and announces "● Recording" when the mic opens.
_note(
"Press Enter to start recording, Enter again to transcribe. q quits.",
json_mode=json_mode,
quiet=state.quiet,
)
_session(keys, api_key, opts, state, json_mode=json_mode, single=single)
except KeyboardInterrupt:
# Ctrl-C is the normal "done dictating" signal: end cleanly, not as an error.
return
10 changes: 7 additions & 3 deletions tests/__snapshots__/test_snapshots_help_run.ambr
Original file line number Diff line number Diff line change
Expand Up @@ -353,7 +353,9 @@
Press Enter (or Space) to start recording and press it again to stop; the
utterance is sent to the AssemblyAI Sync API and the transcript prints
immediately — no polling. Press q (or Esc/Ctrl-C) to finish. Each utterance
can be up to 120 seconds long.
can be up to 120 seconds long. With --once, or when stdout is piped,
recording starts immediately and dictate exits after one utterance so the
transcript flows to the next command.

╭─ Options ────────────────────────────────────────────────────────────────────╮
│ --language TEXT ISO 639-1 language code, │
Expand All @@ -366,8 +368,8 @@
│ --word-boost TEXT Bias recognition toward a │
│ term (repeatable) │
│ --device INTEGER Microphone device index │
│ --once Transcribe one utterance,
│ then exit
│ --once Record one utterance
immediately, then exit │
│ --max-seconds FLOAT RANGE Auto-stop a recording │
│ [1.0<=x<=120.0] after this many seconds │
│ [default: 120.0] │
Expand All @@ -386,6 +388,8 @@
$ assembly dictate
One utterance, then exit
$ assembly dictate --once
Pipe one utterance into another command
$ assembly dictate | assembly llm "write a conventional commit"
Dictate in Spanish
$ assembly dictate --language es
Bias recognition toward tricky terms
Expand Down
26 changes: 26 additions & 0 deletions tests/test_dictate_exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -70,6 +70,10 @@ def seams(monkeypatch):
harness = {"keys": FakeKeys([]), "chunks": [CHUNK, CHUNK], "mic": {}, "calls": []}

monkeypatch.setattr(dictate_exec, "TerminalKeys", lambda: harness["keys"])
# Default to interactive stdout (a real terminal); the piped tests flip this.
# capsys leaves stdout a non-tty, which would otherwise force single-utterance
# mode and end every looping session after one utterance.
monkeypatch.setattr(dictate_exec.stdio, "stdout_is_tty", lambda: True)

def fake_mic(*, target_rate, device=None, on_open=None):
harness["mic"].update(target_rate=target_rate, device=device)
Expand Down Expand Up @@ -189,6 +193,28 @@ def test_once_exits_after_a_single_utterance(seams):
assert seams["keys"].script


def test_piped_stdout_auto_starts_one_utterance_then_exits(seams, monkeypatch, capsys):
    """A piped stdout records exactly one auto-started utterance, then exits.

    Models ``assembly dictate | assembly llm …``: stdout is a pipe rather than
    a tty. A looping session would hold the pipe open and hang the consumer,
    so recording must begin without a start keypress, the first Enter must
    stop it, and the session must end by itself.
    """
    # Pretend stdout is a pipe.
    monkeypatch.setattr(dictate_exec.stdio, "stdout_is_tty", lambda: False)
    # The script has no leading "start" toggle and no quit key: one read(0)
    # consumes the Enter that stops the auto-started recording, after which
    # dictate exits and the remaining keys stay unread.
    seams["keys"] = FakeKeys(["\r", "\r", "\r"])

    _run()

    fake_keys = seams["keys"]
    out, err = capsys.readouterr()

    # Exactly one utterance was transcribed.
    assert len(seams["calls"]) == 1
    # The session stopped because it is single-shot, not because the key
    # script ran dry.
    assert fake_keys.script
    # Auto-start: the sole key read is the zero-timeout poll made while
    # recording; there is no blocking idle read(None) awaiting a start key.
    assert fake_keys.timeouts == [0]
    assert out.strip() == "hello world"
    # The mic-open note is shown; the interactive start prompt is not.
    assert "Recording — press Enter to stop" in err
    assert "start recording" not in err


@pytest.mark.parametrize("quit_key", ["q", "Q", "\x1b", "\x04"])
def test_quit_keys_end_the_session_without_recording(seams, quit_key, capsys):
seams["keys"] = FakeKeys([quit_key, "\r", "\r"])
Expand Down
Loading