Clarify transcribe/stream CLI help text

claude · claude · commit 6aaaec4e41f2 · 2026-06-06T04:43:03.000Z
Fix the stream docstring, which referenced a nonexistent --llm-gateway-prompt
flag and described the LLM transform as running once the stream ends; --llm
actually runs over the live transcript, refreshing on every finalized turn.

Tighten vague and cryptic option help across transcribe and stream:
- spell out value enumerations as readable lists ('Speech model: best, nano,
  slam-1, or universal' rather than a bare comma- or slash-separated list);
  the added spaces also let the summary-model/summary-type values wrap instead
  of truncating in narrow terminals
- give bare-value flags a leading label (--encoding, --voice-focus,
  --redact-pii-sub, --summary-*) and clarify the threshold, turn-silence,
  --format-text, --disfluencies, and --multichannel descriptions
- standardize --prompt and --redact-pii-sub wording across both commands

Refresh the help snapshots accordingly.
diff --git a/aai_cli/commands/stream.py b/aai_cli/commands/stream.py
@@ -66,20 +66,22 @@ def stream(
     speech_model: str = typer.Option(
         DEFAULT_SPEECH_MODEL, "--speech-model", help="Streaming speech model."
     ),
-    encoding: str | None = typer.Option(None, "--encoding", help="pcm_s16le or pcm_mulaw."),
+    encoding: str | None = typer.Option(
+        None, "--encoding", help="Audio encoding: pcm_s16le or pcm_mulaw."
+    ),
     language_detection: bool | None = typer.Option(
         None, "--language-detection", help="Auto-detect the spoken language."
     ),
     domain: str | None = typer.Option(None, "--domain", help="Domain preset (e.g. medical)."),
     # turn detection
     end_of_turn_confidence_threshold: float | None = typer.Option(
-        None, "--end-of-turn-confidence-threshold", help="0-1 end-of-turn confidence."
+        None, "--end-of-turn-confidence-threshold", help="End-of-turn confidence (0-1)."
     ),
     min_turn_silence: int | None = typer.Option(
-        None, "--min-turn-silence", help="Min turn silence (ms)."
+        None, "--min-turn-silence", help="Min silence to end a turn (ms)."
     ),
     max_turn_silence: int | None = typer.Option(
-        None, "--max-turn-silence", help="Max turn silence (ms)."
+        None, "--max-turn-silence", help="Max silence before ending a turn (ms)."
     ),
     vad_threshold: float | None = typer.Option(
         None, "--vad-threshold", help="Voice-activity threshold."
@@ -99,7 +101,9 @@ def stream(
     ),
     speaker_labels: bool | None = typer.Option(None, "--speaker-labels", help="Label speakers."),
     max_speakers: int | None = typer.Option(None, "--max-speakers", help="Max speakers."),
-    voice_focus: str | None = typer.Option(None, "--voice-focus", help="near_field or far_field."),
+    voice_focus: str | None = typer.Option(
+        None, "--voice-focus", help="Voice focus: near_field or far_field."
+    ),
     voice_focus_threshold: float | None = typer.Option(
         None, "--voice-focus-threshold", help="Voice-focus threshold."
     ),
@@ -108,7 +112,7 @@ def stream(
         None, "--redact-pii-policy", help="Comma-separated PII policies."
     ),
     redact_pii_sub: str | None = typer.Option(
-        None, "--redact-pii-sub", help="hash or entity_name."
+        None, "--redact-pii-sub", help="Replace redacted PII with: hash or entity_name."
     ),
     inactivity_timeout: int | None = typer.Option(
         None, "--inactivity-timeout", help="Auto-close after N seconds idle."
@@ -125,7 +129,9 @@ def stream(
         None, "--config-file", help="JSON file of streaming fields."
     ),
     # existing
-    prompt: str | None = typer.Option(None, "--prompt", help="Bias the speech model (u3-pro)."),
+    prompt: str | None = typer.Option(
+        None, "--prompt", help="Prompt to bias the speech model (u3-pro)."
+    ),
     llm_prompt: list[str] | None = typer.Option(
         None,
         "--llm",
@@ -150,8 +156,9 @@ def stream(
 ) -> None:
     """Transcribe live audio in real time — from your mic, a file, or a URL.
 
-    --prompt biases the speech model. --llm-gateway-prompt transforms the full
-    transcript through LLM Gateway once the stream ends (e.g. "summarize the call").
+    --prompt biases the speech model. --llm runs a prompt over the live transcript
+    through LLM Gateway, refreshing the answer on every finalized turn (e.g.
+    "summarize action items").
     """
 
     def body(state: AppState, json_mode: bool) -> None:
diff --git a/aai_cli/commands/transcribe.py b/aai_cli/commands/transcribe.py
@@ -82,7 +82,7 @@ def transcribe(
     sample: bool = typer.Option(False, "--sample", help="Use the hosted wildfires.mp3 sample."),
     # model & language
     speech_model: str | None = typer.Option(
-        None, "--speech-model", help="best, nano, slam-1, universal."
+        None, "--speech-model", help="Speech model: best, nano, slam-1, or universal."
     ),
     language_code: str | None = typer.Option(
         None, "--language-code", help="Force a language (e.g. en_us)."
@@ -96,22 +96,26 @@ def transcribe(
     temperature: float | None = typer.Option(
         None, "--temperature", help="Speech model temperature."
     ),
-    prompt: str | None = typer.Option(None, "--prompt", help="Bias the speech model (u3-pro)."),
+    prompt: str | None = typer.Option(
+        None, "--prompt", help="Prompt to bias the speech model (u3-pro)."
+    ),
     # formatting
     punctuate: bool | None = typer.Option(
         None, "--punctuate/--no-punctuate", help="Add punctuation."
     ),
     format_text: bool | None = typer.Option(
-        None, "--format-text/--no-format-text", help="Format text."
+        None, "--format-text/--no-format-text", help="Apply text formatting (casing, numbers)."
+    ),
+    disfluencies: bool | None = typer.Option(
+        None, "--disfluencies", help="Keep filler words (e.g. um, uh)."
     ),
-    disfluencies: bool | None = typer.Option(None, "--disfluencies", help="Keep filler words."),
     # speakers & channels
     speaker_labels: bool = typer.Option(False, "--speaker-labels", help="Enable diarization."),
     speakers_expected: int | None = typer.Option(
         None, "--speakers-expected", help="Hint speaker count."
     ),
     multichannel: bool | None = typer.Option(
-        None, "--multichannel", help="Transcribe each channel."
+        None, "--multichannel", help="Transcribe each audio channel separately."
     ),
     # guardrails
     redact_pii: bool | None = typer.Option(
@@ -121,7 +125,7 @@ def transcribe(
         None, "--redact-pii-policy", help="Comma-separated PII policies (e.g. person_name,...)."
     ),
     redact_pii_sub: str | None = typer.Option(
-        None, "--redact-pii-sub", help="Substitution: hash or entity_name."
+        None, "--redact-pii-sub", help="Replace redacted PII with: hash or entity_name."
     ),
     redact_pii_audio: bool | None = typer.Option(
         None, "--redact-pii-audio", help="Also redact audio."
@@ -133,20 +137,20 @@ def transcribe(
         None, "--content-safety", help="Detect sensitive content."
     ),
     content_safety_confidence: int | None = typer.Option(
-        None, "--content-safety-confidence", help="Confidence threshold 25-100."
+        None, "--content-safety-confidence", help="Content-safety confidence threshold (25-100)."
     ),
     speech_threshold: float | None = typer.Option(
-        None, "--speech-threshold", help="Minimum speech proportion 0-1."
+        None, "--speech-threshold", help="Minimum proportion of speech required (0-1)."
     ),
     # analysis
     summarization: bool | None = typer.Option(
         None, "--summarization", help="Summarize the transcript."
     ),
     summary_model: str | None = typer.Option(
-        None, "--summary-model", help="informative/conversational/catchy."
+        None, "--summary-model", help="Summary model: informative, conversational, or catchy."
     ),
     summary_type: str | None = typer.Option(
-        None, "--summary-type", help="bullets/gist/headline/paragraph."
+        None, "--summary-type", help="Summary format: bullets, gist, headline, or paragraph."
     ),
     auto_chapters: bool | None = typer.Option(None, "--auto-chapters", help="Generate chapters."),
     sentiment_analysis: bool | None = typer.Option(
diff --git a/tests/__snapshots__/test_cli_output_snapshots.ambr b/tests/__snapshots__/test_cli_output_snapshots.ambr
@@ -517,9 +517,9 @@
   
    Transcribe live audio in real time — from your mic, a file, or a URL.
   
-   --prompt biases the speech model. --llm-gateway-prompt transforms the full
-   transcript through LLM Gateway once the stream ends (e.g. "summarize the
-   call").
+   --prompt biases the speech model. --llm runs a prompt over the live transcript
+   through LLM Gateway, refreshing the answer on every finalized turn (e.g.
+   "summarize action items").
   
   ╭─ Arguments ──────────────────────────────────────────────────────────────────╮
   │   source      [SOURCE]  Audio file path, URL, or YouTube URL to stream. Omit │
@@ -546,18 +546,19 @@
   │ --speech-model                                 TEXT     Streaming speech     │
   │                                                         model.               │
   │                                                         [default: u3-rt-pro] │
-  │ --encoding                                     TEXT     pcm_s16le or         │
+  │ --encoding                                     TEXT     Audio encoding:      │
+  │                                                         pcm_s16le or         │
   │                                                         pcm_mulaw.           │
   │ --language-detection                                    Auto-detect the      │
   │                                                         spoken language.     │
   │ --domain                                       TEXT     Domain preset (e.g.  │
   │                                                         medical).            │
-  │ --end-of-turn-confi…                           FLOAT    0-1 end-of-turn      │
-  │                                                         confidence.          │
-  │ --min-turn-silence                             INTEGER  Min turn silence     │
-  │                                                         (ms).                │
-  │ --max-turn-silence                             INTEGER  Max turn silence     │
-  │                                                         (ms).                │
+  │ --end-of-turn-confi…                           FLOAT    End-of-turn          │
+  │                                                         confidence (0-1).    │
+  │ --min-turn-silence                             INTEGER  Min silence to end a │
+  │                                                         turn (ms).           │
+  │ --max-turn-silence                             INTEGER  Max silence before   │
+  │                                                         ending a turn (ms).  │
   │ --vad-threshold                                FLOAT    Voice-activity       │
   │                                                         threshold.           │
   │ --format-turns            --no-format-turns             Punctuate/format     │
@@ -568,15 +569,18 @@
   │ --filter-profanity                                      Mask profanity.      │
   │ --speaker-labels                                        Label speakers.      │
   │ --max-speakers                                 INTEGER  Max speakers.        │
-  │ --voice-focus                                  TEXT     near_field or        │
+  │ --voice-focus                                  TEXT     Voice focus:         │
+  │                                                         near_field or        │
   │                                                         far_field.           │
   │ --voice-focus-thres…                           FLOAT    Voice-focus          │
   │                                                         threshold.           │
   │ --redact-pii                                            Redact PII from      │
   │                                                         turns.               │
   │ --redact-pii-policy                            TEXT     Comma-separated PII  │
   │                                                         policies.            │
-  │ --redact-pii-sub                               TEXT     hash or entity_name. │
+  │ --redact-pii-sub                               TEXT     Replace redacted PII │
+  │                                                         with: hash or        │
+  │                                                         entity_name.         │
   │ --inactivity-timeout                           INTEGER  Auto-close after N   │
   │                                                         seconds idle.        │
   │ --webhook-url                                  TEXT     Webhook URL.         │
@@ -588,8 +592,9 @@
   │                                                         (repeatable).        │
   │ --config-file                                  TEXT     JSON file of         │
   │                                                         streaming fields.    │
-  │ --prompt                                       TEXT     Bias the speech      │
-  │                                                         model (u3-pro).      │
+  │ --prompt                                       TEXT     Prompt to bias the   │
+  │                                                         speech model         │
+  │                                                         (u3-pro).            │
   │ --llm                                          TEXT     Run a prompt over    │
   │                                                         the live transcript  │
   │                                                         through LLM Gateway, │
@@ -658,7 +663,8 @@
   │ --sample                                                Use the hosted       │
   │                                                         wildfires.mp3        │
   │                                                         sample.              │
-  │ --speech-model                                 TEXT     best, nano, slam-1,  │
+  │ --speech-model                                 TEXT     Speech model: best,  │
+  │                                                         nano, slam-1, or     │
   │                                                         universal.           │
   │ --language-code                                TEXT     Force a language     │
   │                                                         (e.g. en_us).        │
@@ -668,34 +674,48 @@
   │                                                         (repeatable).        │
   │ --temperature                                  FLOAT    Speech model         │
   │                                                         temperature.         │
-  │ --prompt                                       TEXT     Bias the speech      │
-  │                                                         model (u3-pro).      │
+  │ --prompt                                       TEXT     Prompt to bias the   │
+  │                                                         speech model         │
+  │                                                         (u3-pro).            │
   │ --punctuate                --no-punctuate               Add punctuation.     │
-  │ --format-text              --no-format-text             Format text.         │
-  │ --disfluencies                                          Keep filler words.   │
+  │ --format-text              --no-format-text             Apply text           │
+  │                                                         formatting (casing,  │
+  │                                                         numbers).            │
+  │ --disfluencies                                          Keep filler words    │
+  │                                                         (e.g. um, uh).       │
   │ --speaker-labels                                        Enable diarization.  │
   │ --speakers-expected                            INTEGER  Hint speaker count.  │
   │ --multichannel                                          Transcribe each      │
-  │                                                         channel.             │
+  │                                                         audio channel        │
+  │                                                         separately.          │
   │ --redact-pii                                            Redact PII from the  │
   │                                                         transcript.          │
   │ --redact-pii-policy                            TEXT     Comma-separated PII  │
   │                                                         policies (e.g.       │
   │                                                         person_name,...).    │
-  │ --redact-pii-sub                               TEXT     Substitution: hash   │
-  │                                                         or entity_name.      │
+  │ --redact-pii-sub                               TEXT     Replace redacted PII │
+  │                                                         with: hash or        │
+  │                                                         entity_name.         │
   │ --redact-pii-audio                                      Also redact audio.   │
   │ --filter-profanity                                      Mask profanity.      │
   │ --content-safety                                        Detect sensitive     │
   │                                                         content.             │
-  │ --content-safety-con…                          INTEGER  Confidence threshold │
-  │                                                         25-100.              │
-  │ --speech-threshold                             FLOAT    Minimum speech       │
-  │                                                         proportion 0-1.      │
+  │ --content-safety-con…                          INTEGER  Content-safety       │
+  │                                                         confidence threshold │
+  │                                                         (25-100).            │
+  │ --speech-threshold                             FLOAT    Minimum proportion   │
+  │                                                         of speech required   │
+  │                                                         (0-1).               │
   │ --summarization                                         Summarize the        │
   │                                                         transcript.          │
-  │ --summary-model                                TEXT     informative/convers… │
-  │ --summary-type                                 TEXT     bullets/gist/headli… │
+  │ --summary-model                                TEXT     Summary model:       │
+  │                                                         informative,         │
+  │                                                         conversational, or   │
+  │                                                         catchy.              │
+  │ --summary-type                                 TEXT     Summary format:      │
+  │                                                         bullets, gist,       │
+  │                                                         headline, or         │
+  │                                                         paragraph.           │
   │ --auto-chapters                                         Generate chapters.   │
   │ --sentiment-analysis                                    Analyze sentiment.   │
   │ --entity-detection                                      Detect entities.     │