AssemblyAI · alexkroman · Jun 8, 2026 · Jun 8, 2026 · aikido-pr-checks · Jun 8, 2026
diff --git a/aai_cli/code_gen/__init__.py b/aai_cli/code_gen/__init__.py
@@ -5,11 +5,22 @@
 from aai_cli.code_gen import transcribe as _transcribe
 
 
-def gateway_options(prompts: list[str], model: str, max_tokens: int) -> dict[str, object] | None:
-    """The LLM-gateway options dict consumed by `transcribe`/`stream`, or None if no prompts."""
+def gateway_options(
+    prompts: list[str], model: str, max_tokens: int, *, interval: float = 0.0
+) -> dict[str, object] | None:
+    """The LLM-gateway options dict consumed by `transcribe`/`stream`, or None if no prompts.
+
+    `interval` (streaming only) is the seconds between summary refreshes baked into the
+    generated `stream --llm` loop; 0 refreshes on every turn. `transcribe` ignores it.
+    """
     if not prompts:
         return None
-    return {"prompts": list(prompts), "model": model, "max_tokens": max_tokens}
+    return {
+        "prompts": list(prompts),
+        "model": model,
+        "max_tokens": max_tokens,
+        "interval": interval,
+    }
 
 
 def agent(voice: str, system_prompt: str, greeting: str) -> str:
@@ -34,8 +45,8 @@ def stream(
 ) -> str:
     """Generate runnable Python that reproduces this streaming invocation.
 
-    With `llm` (a dict of ``prompts``/``model``/``max_tokens``), the script refreshes a
-    prompt-chain over the growing transcript on every finalized turn — the live sibling
-    of `transcribe --llm` — mirroring how `stream --llm` runs.
+    With `llm` (a dict of ``prompts``/``model``/``max_tokens``/``interval``), the script
+    refreshes a prompt-chain over the growing transcript every ``interval`` seconds (0 =
+    every turn) — the live sibling of `transcribe --llm` — mirroring how `stream --llm` runs.
     """
     return _stream.render(merged, llm=llm)
diff --git a/aai_cli/code_gen/stream.py b/aai_cli/code_gen/stream.py
@@ -40,6 +40,7 @@ def on_turn(client: StreamingClient, event: TurnEvent) -> None:
 """
 
 _LLM_PREAMBLE = """import os
+import time
 
 import assemblyai as aai
 from assemblyai.streaming.v3 import (
@@ -57,7 +58,12 @@ def on_turn(client: StreamingClient, event: TurnEvent) -> None:
 PROMPTS = [
 {prompts}
 ]
+# Turns accumulate continuously; the prompt chain re-runs at most once every
+# LLM_INTERVAL seconds (0 = on every finalized turn).
+LLM_INTERVAL = {interval}
 transcript: list[str] = []
+_summarized = 0
+_last_summary = float("-inf")
 
 
 def run_chain(text: str) -> str:
@@ -73,12 +79,26 @@ def run_chain(text: str) -> str:
     return result
 
 
+def summarize(*, final: bool = False) -> None:
+    # Refresh the answer over the growing transcript, throttled to LLM_INTERVAL. `final`
+    # forces a closing refresh so turns since the last tick aren't lost on stop.
+    global _summarized, _last_summary
+    turns = len(transcript)
+    if turns <= _summarized:
+        return
+    now = time.monotonic()
+    if not final and LLM_INTERVAL > 0 and now - _last_summary < LLM_INTERVAL:
+        return
+    _summarized = turns
+    _last_summary = now
+    print(run_chain(" ".join(transcript)))
+
+
 def on_turn(client: StreamingClient, event: TurnEvent) -> None:
-    # Refresh the answer on every finalized turn, over the growing transcript.
     if not event.end_of_turn or not event.transcript:
         return
     transcript.append(event.transcript)
-    print(run_chain(" ".join(transcript)))
+    summarize()
 
 
 client = StreamingClient(
@@ -95,6 +115,17 @@ def on_turn(client: StreamingClient, event: TurnEvent) -> None:
     client.disconnect(terminate=True)
 """
 
+# Same as _FOOTER, but flushes a closing summary (incl. on Ctrl-C) so the turns since the
+# last interval tick are reflected before disconnecting.
+_LLM_FOOTER = """
+print("Listening… press Ctrl-C to stop.")
+try:
+    client.stream(aai.extras.MicrophoneStream(sample_rate={rate}))
+finally:
+    summarize(final=True)
+    client.disconnect(terminate=True)
+"""
+
 
 def _imports_block(merged: dict[str, object]) -> str:
     """Sorted streaming-class import lines; SpeechModel only when a model kwarg is emitted."""
@@ -114,6 +145,7 @@ def _build_preamble(imports: str, llm: dict[str, object] | None) -> str:
             prompts=prompts,
             model=llm["model"],
             max_tokens=llm["max_tokens"],
+            interval=llm.get("interval", 0.0),
         )
     return _PREAMBLE.format(imports=imports)
 
@@ -138,4 +170,5 @@ def render(merged: dict[str, object], *, llm: dict[str, object] | None = None) -
     # Mic capture rate must match StreamingParameters.sample_rate, else audio is corrupt.
     rate = merged.get("sample_rate", 16000)
     connect = _build_connect(merged)
-    return preamble + "\n" + connect + "\n" + _FOOTER.format(rate=rate)
+    footer = _LLM_FOOTER if llm else _FOOTER
+    return preamble + "\n" + connect + "\n" + footer.format(rate=rate)
diff --git a/aai_cli/code_gen/transcribe.py b/aai_cli/code_gen/transcribe.py
@@ -86,7 +86,7 @@ def _llm_gateway_block(llm_gateway: dict[str, object]) -> list[str]:
         f'        content = prompt + "\\n\\n{llm.TRANSCRIPT_TAG}"',
         '        extra = {"transcript_id": transcript.id}',
         "    else:",
-        '        content = prompt + "\\n\\n" + result',
+        '        content = prompt + "\\n\\nTranscript:\\n" + result',
         "        extra = None",
         "    response = gateway.chat.completions.create(",
         f"        model={llm_gateway['model']!r},",

diff --git a/aai_cli/commands/stream.py b/aai_cli/commands/stream.py
@@ -195,7 +195,11 @@ def stream(
     ),
     # speakers
     speaker_labels: bool | None = typer.Option(
-        None, "--speaker-labels", help="Label speakers.", rich_help_panel=help_panels.OPT_SPEAKERS
+        None,
+        "--speaker-labels",
+        help='Diarize speakers. With system audio the mic stays "You"; only the system '
+        "audio is split into speakers.",
+        rich_help_panel=help_panels.OPT_SPEAKERS,
     ),
     max_speakers: int | None = typer.Option(
         None,
@@ -270,6 +274,13 @@ def stream(
         "one's response (a chain).",
         rich_help_panel=help_panels.OPT_LLM,
     ),
+    llm_interval: float = typer.Option(
+        30.0,
-    llm_interval: float = typer.Option(
-        30.0,
+    llm_interval: float = typer.Option(
+        0.0,
-    llm_interval: float = typer.Option(
-        30.0,
+    llm_interval: float = typer.Option(
+        0.0,
+        "--llm-interval",
+        help="Seconds between --llm summary refreshes (0 refreshes on every turn).",
+        min=0.0,
+        rich_help_panel=help_panels.OPT_LLM,
+    ),
     model: str = typer.Option(
         llm.DEFAULT_MODEL,
         "--model",
@@ -367,7 +378,9 @@ def body(state: AppState, json_mode: bool) -> None:
                 overrides=config_kv,
                 config_file=config_file,
             )
-            gateway = code_gen.gateway_options(list(llm_prompt or []), model, max_tokens)
+            gateway = code_gen.gateway_options(
+                list(llm_prompt or []), model, max_tokens, interval=llm_interval
+            )
             output.print_code(code_gen.stream(merged, llm=gateway))
             return
 
@@ -385,6 +398,7 @@ def body(state: AppState, json_mode: bool) -> None:
             llm_prompts=llm_prompts,
             model=model,
             max_tokens=max_tokens,
+            llm_interval=llm_interval,
         )
         _dispatch(session, opts)
 

diff --git a/aai_cli/streaming/render.py b/aai_cli/streaming/render.py
@@ -6,8 +6,39 @@
 from rich.console import Console
 from rich.text import Text
 
+from aai_cli import theme
 from aai_cli.render import BaseRenderer
 
+# Source label -> (display text, Rich style). System audio borrows the agent color;
+# the microphone ("you") its own. Unknown sources fall back to the raw label.
+_SOURCE_LABELS: dict[str, tuple[str, str]] = {
+    "system": ("System", "aai.agent"),
+    "you": ("You", "aai.you"),
+}
+
+
+def speaker_prefix(source: str | None, speaker: str | None) -> tuple[str, str] | None:
+    """The lead-in label and Rich style for a turn, or None when it has neither a
+    source nor a diarized speaker.
+
+    - source + speaker -> "System (A)" (system audio diarized via --speaker-labels)
+    - source only      -> "System"     (parallel system/you streams)
+    - speaker only      -> "Speaker A"  (single-stream diarization, no source label)
+
+    When a speaker is present the whole label is tinted by `theme.speaker_style` so each
+    speaker reads in its own color (matching batch transcribe's diarized output); a
+    sourced turn with no speaker keeps the source's own color.
+    """
+    label, style = (None, "aai.label")
+    if source is not None:
+        label, style = _SOURCE_LABELS.get(source, (source, "aai.label"))
+    if speaker is not None:
+        style = theme.speaker_style(speaker)
+        return (f"{label} ({speaker})" if label is not None else f"Speaker {speaker}"), style
+    if label is not None:
+        return label, style
+    return None
+
 
 class StreamRenderer(BaseRenderer):
     """Renders streaming events in one of three modes.
@@ -46,25 +77,16 @@ def _with_source(payload: dict[str, object], source: str | None) -> dict[str, ob
         return payload
 
     @staticmethod
-    def _source_label(source: str) -> tuple[str, str]:
-        labels = {
-            "system": ("System", "aai.agent"),
-            "you": ("You", "aai.you"),
-        }
-        return labels.get(source, (source, "aai.label"))
-
-    @classmethod
-    def _label(cls, text: str, source: str | None) -> str:
-        if source is None:
-            return text
-        label, _style = cls._source_label(source)
-        return f"{label}: {text}"
+    def _label(text: str, source: str | None, speaker: str | None = None) -> str:
+        prefix = speaker_prefix(source, speaker)
+        return text if prefix is None else f"{prefix[0]}: {text}"
 
-    @classmethod
-    def _styled_label(cls, text: str, source: str | None) -> str | Text:
-        if source is None:
+    @staticmethod
+    def _styled_label(text: str, source: str | None, speaker: str | None = None) -> str | Text:
+        prefix = speaker_prefix(source, speaker)
+        if prefix is None:
             return text
-        label, style = cls._source_label(source)
+        label, style = prefix
         rendered = Text()
         rendered.append(f"{label}: ", style=style)
         rendered.append(text)
@@ -90,21 +112,24 @@ def listening(self) -> None:
     def turn(self, event: object, *, source: str | None = None) -> None:
         text = getattr(event, "transcript", "") or ""
         end = bool(getattr(event, "end_of_turn", False))
+        speaker = getattr(event, "speaker_label", None)  # set when --speaker-labels diarizes
         with self._lock:
             if self.json_mode:
-                self._emit(
-                    self._with_source(
-                        {"type": "turn", "transcript": text, "end_of_turn": end},
-                        source,
-                    )
-                )
+                payload: dict[str, object] = {
+                    "type": "turn",
+                    "transcript": text,
+                    "end_of_turn": end,
+                }
+                if speaker is not None:
+                    payload["speaker"] = speaker
+                self._emit(self._with_source(payload, source))
             elif self.text_mode:
                 if end and text:
-                    self._write(self._label(text, source) + "\n")  # plain finalized line
+                    self._write(self._label(text, source, speaker) + "\n")  # plain finalized line
             elif end:
-                self._finalize_line(self._styled_label(text, source))
+                self._finalize_line(self._styled_label(text, source, speaker))
             else:
-                self._update_line(self._styled_label(text, source))
+                self._update_line(self._styled_label(text, source, speaker))
 
     def termination(self, event: object, *, source: str | None = None) -> None:
         with self._lock: