Skip to content

Commit 6aaaec4

Browse files
committed
Clarify transcribe/stream CLI help text
Fix the stream docstring, which referenced a nonexistent --llm-gateway-prompt flag and described the LLM transform as running once the stream ends; --llm actually runs over the live transcript, refreshing on every finalized turn.

Tighten vague and cryptic option help across transcribe and stream:
- spell out value enumerations as readable lists ('Speech model: best, nano, slam-1, or universal' rather than a bare comma list), which also keeps the summary-model/summary-type values from truncating in narrow terminals
- give bare-value flags a leading noun (--encoding, --voice-focus, --redact-pii-sub, --summary-*) and clarify thresholds and turn-silence flags
- standardize --prompt and --redact-pii-sub wording across both commands

Refresh the help snapshots accordingly.
1 parent e9a63d3 commit 6aaaec4

3 files changed

Lines changed: 78 additions & 47 deletions

File tree

aai_cli/commands/stream.py

Lines changed: 16 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -66,20 +66,22 @@ def stream(
6666
speech_model: str = typer.Option(
6767
DEFAULT_SPEECH_MODEL, "--speech-model", help="Streaming speech model."
6868
),
69-
encoding: str | None = typer.Option(None, "--encoding", help="pcm_s16le or pcm_mulaw."),
69+
encoding: str | None = typer.Option(
70+
None, "--encoding", help="Audio encoding: pcm_s16le or pcm_mulaw."
71+
),
7072
language_detection: bool | None = typer.Option(
7173
None, "--language-detection", help="Auto-detect the spoken language."
7274
),
7375
domain: str | None = typer.Option(None, "--domain", help="Domain preset (e.g. medical)."),
7476
# turn detection
7577
end_of_turn_confidence_threshold: float | None = typer.Option(
76-
None, "--end-of-turn-confidence-threshold", help="0-1 end-of-turn confidence."
78+
None, "--end-of-turn-confidence-threshold", help="End-of-turn confidence (0-1)."
7779
),
7880
min_turn_silence: int | None = typer.Option(
79-
None, "--min-turn-silence", help="Min turn silence (ms)."
81+
None, "--min-turn-silence", help="Min silence to end a turn (ms)."
8082
),
8183
max_turn_silence: int | None = typer.Option(
82-
None, "--max-turn-silence", help="Max turn silence (ms)."
84+
None, "--max-turn-silence", help="Max silence before ending a turn (ms)."
8385
),
8486
vad_threshold: float | None = typer.Option(
8587
None, "--vad-threshold", help="Voice-activity threshold."
@@ -99,7 +101,9 @@ def stream(
99101
),
100102
speaker_labels: bool | None = typer.Option(None, "--speaker-labels", help="Label speakers."),
101103
max_speakers: int | None = typer.Option(None, "--max-speakers", help="Max speakers."),
102-
voice_focus: str | None = typer.Option(None, "--voice-focus", help="near_field or far_field."),
104+
voice_focus: str | None = typer.Option(
105+
None, "--voice-focus", help="Voice focus: near_field or far_field."
106+
),
103107
voice_focus_threshold: float | None = typer.Option(
104108
None, "--voice-focus-threshold", help="Voice-focus threshold."
105109
),
@@ -108,7 +112,7 @@ def stream(
108112
None, "--redact-pii-policy", help="Comma-separated PII policies."
109113
),
110114
redact_pii_sub: str | None = typer.Option(
111-
None, "--redact-pii-sub", help="hash or entity_name."
115+
None, "--redact-pii-sub", help="Replace redacted PII with: hash or entity_name."
112116
),
113117
inactivity_timeout: int | None = typer.Option(
114118
None, "--inactivity-timeout", help="Auto-close after N seconds idle."
@@ -125,7 +129,9 @@ def stream(
125129
None, "--config-file", help="JSON file of streaming fields."
126130
),
127131
# existing
128-
prompt: str | None = typer.Option(None, "--prompt", help="Bias the speech model (u3-pro)."),
132+
prompt: str | None = typer.Option(
133+
None, "--prompt", help="Prompt to bias the speech model (u3-pro)."
134+
),
129135
llm_prompt: list[str] | None = typer.Option(
130136
None,
131137
"--llm",
@@ -150,8 +156,9 @@ def stream(
150156
) -> None:
151157
"""Transcribe live audio in real time — from your mic, a file, or a URL.
152158
153-
--prompt biases the speech model. --llm-gateway-prompt transforms the full
154-
transcript through LLM Gateway once the stream ends (e.g. "summarize the call").
159+
--prompt biases the speech model. --llm runs a prompt over the live transcript
160+
through LLM Gateway, refreshing the answer on every finalized turn (e.g.
161+
"summarize action items").
155162
"""
156163

157164
def body(state: AppState, json_mode: bool) -> None:

aai_cli/commands/transcribe.py

Lines changed: 14 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -82,7 +82,7 @@ def transcribe(
8282
sample: bool = typer.Option(False, "--sample", help="Use the hosted wildfires.mp3 sample."),
8383
# model & language
8484
speech_model: str | None = typer.Option(
85-
None, "--speech-model", help="best, nano, slam-1, universal."
85+
None, "--speech-model", help="Speech model: best, nano, slam-1, or universal."
8686
),
8787
language_code: str | None = typer.Option(
8888
None, "--language-code", help="Force a language (e.g. en_us)."
@@ -96,22 +96,26 @@ def transcribe(
9696
temperature: float | None = typer.Option(
9797
None, "--temperature", help="Speech model temperature."
9898
),
99-
prompt: str | None = typer.Option(None, "--prompt", help="Bias the speech model (u3-pro)."),
99+
prompt: str | None = typer.Option(
100+
None, "--prompt", help="Prompt to bias the speech model (u3-pro)."
101+
),
100102
# formatting
101103
punctuate: bool | None = typer.Option(
102104
None, "--punctuate/--no-punctuate", help="Add punctuation."
103105
),
104106
format_text: bool | None = typer.Option(
105-
None, "--format-text/--no-format-text", help="Format text."
107+
None, "--format-text/--no-format-text", help="Apply text formatting (casing, numbers)."
108+
),
109+
disfluencies: bool | None = typer.Option(
110+
None, "--disfluencies", help="Keep filler words (e.g. um, uh)."
106111
),
107-
disfluencies: bool | None = typer.Option(None, "--disfluencies", help="Keep filler words."),
108112
# speakers & channels
109113
speaker_labels: bool = typer.Option(False, "--speaker-labels", help="Enable diarization."),
110114
speakers_expected: int | None = typer.Option(
111115
None, "--speakers-expected", help="Hint speaker count."
112116
),
113117
multichannel: bool | None = typer.Option(
114-
None, "--multichannel", help="Transcribe each channel."
118+
None, "--multichannel", help="Transcribe each audio channel separately."
115119
),
116120
# guardrails
117121
redact_pii: bool | None = typer.Option(
@@ -121,7 +125,7 @@ def transcribe(
121125
None, "--redact-pii-policy", help="Comma-separated PII policies (e.g. person_name,...)."
122126
),
123127
redact_pii_sub: str | None = typer.Option(
124-
None, "--redact-pii-sub", help="Substitution: hash or entity_name."
128+
None, "--redact-pii-sub", help="Replace redacted PII with: hash or entity_name."
125129
),
126130
redact_pii_audio: bool | None = typer.Option(
127131
None, "--redact-pii-audio", help="Also redact audio."
@@ -133,20 +137,20 @@ def transcribe(
133137
None, "--content-safety", help="Detect sensitive content."
134138
),
135139
content_safety_confidence: int | None = typer.Option(
136-
None, "--content-safety-confidence", help="Confidence threshold 25-100."
140+
None, "--content-safety-confidence", help="Content-safety confidence threshold (25-100)."
137141
),
138142
speech_threshold: float | None = typer.Option(
139-
None, "--speech-threshold", help="Minimum speech proportion 0-1."
143+
None, "--speech-threshold", help="Minimum proportion of speech required (0-1)."
140144
),
141145
# analysis
142146
summarization: bool | None = typer.Option(
143147
None, "--summarization", help="Summarize the transcript."
144148
),
145149
summary_model: str | None = typer.Option(
146-
None, "--summary-model", help="informative/conversational/catchy."
150+
None, "--summary-model", help="Summary model: informative, conversational, or catchy."
147151
),
148152
summary_type: str | None = typer.Option(
149-
None, "--summary-type", help="bullets/gist/headline/paragraph."
153+
None, "--summary-type", help="Summary format: bullets, gist, headline, or paragraph."
150154
),
151155
auto_chapters: bool | None = typer.Option(None, "--auto-chapters", help="Generate chapters."),
152156
sentiment_analysis: bool | None = typer.Option(

tests/__snapshots__/test_cli_output_snapshots.ambr

Lines changed: 48 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -517,9 +517,9 @@
517517

518518
Transcribe live audio in real time — from your mic, a file, or a URL.
519519

520-
--prompt biases the speech model. --llm-gateway-prompt transforms the full
521-
transcript through LLM Gateway once the stream ends (e.g. "summarize the
522-
call").
520+
--prompt biases the speech model. --llm runs a prompt over the live transcript
521+
through LLM Gateway, refreshing the answer on every finalized turn (e.g.
522+
"summarize action items").
523523

524524
╭─ Arguments ──────────────────────────────────────────────────────────────────╮
525525
│ source [SOURCE] Audio file path, URL, or YouTube URL to stream. Omit │
@@ -546,18 +546,19 @@
546546
│ --speech-model TEXT Streaming speech │
547547
│ model. │
548548
│ [default: u3-rt-pro] │
549-
│ --encoding TEXT pcm_s16le or │
549+
│ --encoding TEXT Audio encoding: │
550+
│ pcm_s16le or │
550551
│ pcm_mulaw. │
551552
│ --language-detection Auto-detect the │
552553
│ spoken language. │
553554
│ --domain TEXT Domain preset (e.g. │
554555
│ medical). │
555-
│ --end-of-turn-confi… FLOAT 0-1 end-of-turn │
556-
│ confidence. │
557-
│ --min-turn-silence INTEGER Min turn silence │
558-
│ (ms). │
559-
│ --max-turn-silence INTEGER Max turn silence │
560-
│ (ms). │
556+
│ --end-of-turn-confi… FLOAT End-of-turn │
557+
│ confidence (0-1). │
558+
│ --min-turn-silence INTEGER Min silence to end a │
559+
│ turn (ms). │
560+
│ --max-turn-silence INTEGER Max silence before │
561+
│ ending a turn (ms). │
561562
│ --vad-threshold FLOAT Voice-activity │
562563
│ threshold. │
563564
│ --format-turns --no-format-turns Punctuate/format │
@@ -568,15 +569,18 @@
568569
│ --filter-profanity Mask profanity. │
569570
│ --speaker-labels Label speakers. │
570571
│ --max-speakers INTEGER Max speakers. │
571-
│ --voice-focus TEXT near_field or │
572+
│ --voice-focus TEXT Voice focus: │
573+
│ near_field or │
572574
│ far_field. │
573575
│ --voice-focus-thres… FLOAT Voice-focus │
574576
│ threshold. │
575577
│ --redact-pii Redact PII from │
576578
│ turns. │
577579
│ --redact-pii-policy TEXT Comma-separated PII │
578580
│ policies. │
579-
│ --redact-pii-sub TEXT hash or entity_name. │
581+
│ --redact-pii-sub TEXT Replace redacted PII │
582+
│ with: hash or │
583+
│ entity_name. │
580584
│ --inactivity-timeout INTEGER Auto-close after N │
581585
│ seconds idle. │
582586
│ --webhook-url TEXT Webhook URL. │
@@ -588,8 +592,9 @@
588592
│ (repeatable). │
589593
│ --config-file TEXT JSON file of │
590594
│ streaming fields. │
591-
│ --prompt TEXT Bias the speech │
592-
│ model (u3-pro). │
595+
│ --prompt TEXT Prompt to bias the │
596+
│ speech model │
597+
│ (u3-pro). │
593598
│ --llm TEXT Run a prompt over │
594599
│ the live transcript │
595600
│ through LLM Gateway, │
@@ -658,7 +663,8 @@
658663
│ --sample Use the hosted │
659664
│ wildfires.mp3 │
660665
│ sample. │
661-
│ --speech-model TEXT best, nano, slam-1, │
666+
│ --speech-model TEXT Speech model: best, │
667+
│ nano, slam-1, or │
662668
│ universal. │
663669
│ --language-code TEXT Force a language │
664670
│ (e.g. en_us). │
@@ -668,34 +674,48 @@
668674
│ (repeatable). │
669675
│ --temperature FLOAT Speech model │
670676
│ temperature. │
671-
│ --prompt TEXT Bias the speech │
672-
│ model (u3-pro). │
677+
│ --prompt TEXT Prompt to bias the │
678+
│ speech model │
679+
│ (u3-pro). │
673680
│ --punctuate --no-punctuate Add punctuation. │
674-
│ --format-text --no-format-text Format text. │
675-
│ --disfluencies Keep filler words. │
681+
│ --format-text --no-format-text Apply text │
682+
│ formatting (casing, │
683+
│ numbers). │
684+
│ --disfluencies Keep filler words │
685+
│ (e.g. um, uh). │
676686
│ --speaker-labels Enable diarization. │
677687
│ --speakers-expected INTEGER Hint speaker count. │
678688
│ --multichannel Transcribe each │
679-
│ channel. │
689+
│ audio channel │
690+
│ separately. │
680691
│ --redact-pii Redact PII from the │
681692
│ transcript. │
682693
│ --redact-pii-policy TEXT Comma-separated PII │
683694
│ policies (e.g. │
684695
│ person_name,...). │
685-
│ --redact-pii-sub TEXT Substitution: hash │
686-
│ or entity_name. │
696+
│ --redact-pii-sub TEXT Replace redacted PII │
697+
│ with: hash or │
698+
│ entity_name. │
687699
│ --redact-pii-audio Also redact audio. │
688700
│ --filter-profanity Mask profanity. │
689701
│ --content-safety Detect sensitive │
690702
│ content. │
691-
│ --content-safety-con… INTEGER Confidence threshold │
692-
│ 25-100. │
693-
│ --speech-threshold FLOAT Minimum speech │
694-
│ proportion 0-1. │
703+
│ --content-safety-con… INTEGER Content-safety │
704+
│ confidence threshold │
705+
│ (25-100). │
706+
│ --speech-threshold FLOAT Minimum proportion │
707+
│ of speech required │
708+
│ (0-1). │
695709
│ --summarization Summarize the │
696710
│ transcript. │
697-
│ --summary-model TEXT informative/convers… │
698-
│ --summary-type TEXT bullets/gist/headli… │
711+
│ --summary-model TEXT Summary model: │
712+
│ informative, │
713+
│ conversational, or │
714+
│ catchy. │
715+
│ --summary-type TEXT Summary format: │
716+
│ bullets, gist, │
717+
│ headline, or │
718+
│ paragraph. │
699719
│ --auto-chapters Generate chapters. │
700720
│ --sentiment-analysis Analyze sentiment. │
701721
│ --entity-detection Detect entities. │

0 commit comments

Comments
 (0)