Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 17 additions & 6 deletions aai_cli/code_gen/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,11 +5,22 @@
from aai_cli.code_gen import transcribe as _transcribe


def gateway_options(prompts: list[str], model: str, max_tokens: int) -> dict[str, object] | None:
"""The LLM-gateway options dict consumed by `transcribe`/`stream`, or None if no prompts."""
def gateway_options(
prompts: list[str], model: str, max_tokens: int, *, interval: float = 0.0
) -> dict[str, object] | None:
"""The LLM-gateway options dict consumed by `transcribe`/`stream`, or None if no prompts.

`interval` (streaming only) is the seconds between summary refreshes baked into the
generated `stream --llm` loop; 0 refreshes on every turn. `transcribe` ignores it.
"""
if not prompts:
return None
return {"prompts": list(prompts), "model": model, "max_tokens": max_tokens}
return {
"prompts": list(prompts),
"model": model,
"max_tokens": max_tokens,
"interval": interval,
}


def agent(voice: str, system_prompt: str, greeting: str) -> str:
Expand All @@ -34,8 +45,8 @@ def stream(
) -> str:
"""Generate runnable Python that reproduces this streaming invocation.

With `llm` (a dict of ``prompts``/``model``/``max_tokens``), the script refreshes a
prompt-chain over the growing transcript on every finalized turn — the live sibling
of `transcribe --llm` — mirroring how `stream --llm` runs.
With `llm` (a dict of ``prompts``/``model``/``max_tokens``/``interval``), the script
refreshes a prompt-chain over the growing transcript every ``interval`` seconds (0 =
every turn) — the live sibling of `transcribe --llm` — mirroring how `stream --llm` runs.
"""
return _stream.render(merged, llm=llm)
39 changes: 36 additions & 3 deletions aai_cli/code_gen/stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,7 @@ def on_turn(client: StreamingClient, event: TurnEvent) -> None:
"""

_LLM_PREAMBLE = """import os
import time

import assemblyai as aai
from assemblyai.streaming.v3 import (
Expand All @@ -57,7 +58,12 @@ def on_turn(client: StreamingClient, event: TurnEvent) -> None:
PROMPTS = [
{prompts}
]
# Turns accumulate continuously; the prompt chain re-runs at most once every
# LLM_INTERVAL seconds (0 = on every finalized turn).
LLM_INTERVAL = {interval}
transcript: list[str] = []
_summarized = 0
_last_summary = float("-inf")


def run_chain(text: str) -> str:
Expand All @@ -73,12 +79,26 @@ def run_chain(text: str) -> str:
return result


def summarize(*, final: bool = False) -> None:
# Refresh the answer over the growing transcript, throttled to LLM_INTERVAL. `final`
# forces a closing refresh so turns since the last tick aren't lost on stop.
global _summarized, _last_summary
turns = len(transcript)
if turns <= _summarized:
return
now = time.monotonic()
if not final and LLM_INTERVAL > 0 and now - _last_summary < LLM_INTERVAL:
return
_summarized = turns
_last_summary = now
print(run_chain(" ".join(transcript)))


def on_turn(client: StreamingClient, event: TurnEvent) -> None:
# Refresh the answer on every finalized turn, over the growing transcript.
if not event.end_of_turn or not event.transcript:
return
transcript.append(event.transcript)
print(run_chain(" ".join(transcript)))
summarize()


client = StreamingClient(
Expand All @@ -95,6 +115,17 @@ def on_turn(client: StreamingClient, event: TurnEvent) -> None:
client.disconnect(terminate=True)
"""

# Same as _FOOTER, but flushes a closing summary (incl. on Ctrl-C) so the turns since the
# last interval tick are reflected before disconnecting.
_LLM_FOOTER = """
print("Listening… press Ctrl-C to stop.")
try:
client.stream(aai.extras.MicrophoneStream(sample_rate={rate}))
finally:
summarize(final=True)
client.disconnect(terminate=True)
"""


def _imports_block(merged: dict[str, object]) -> str:
"""Sorted streaming-class import lines; SpeechModel only when a model kwarg is emitted."""
Expand All @@ -114,6 +145,7 @@ def _build_preamble(imports: str, llm: dict[str, object] | None) -> str:
prompts=prompts,
model=llm["model"],
max_tokens=llm["max_tokens"],
interval=llm.get("interval", 0.0),
)
return _PREAMBLE.format(imports=imports)

Expand All @@ -138,4 +170,5 @@ def render(merged: dict[str, object], *, llm: dict[str, object] | None = None) -
# Mic capture rate must match StreamingParameters.sample_rate, else audio is corrupt.
rate = merged.get("sample_rate", 16000)
connect = _build_connect(merged)
return preamble + "\n" + connect + "\n" + _FOOTER.format(rate=rate)
footer = _LLM_FOOTER if llm else _FOOTER
return preamble + "\n" + connect + "\n" + footer.format(rate=rate)
2 changes: 1 addition & 1 deletion aai_cli/code_gen/transcribe.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,7 @@ def _llm_gateway_block(llm_gateway: dict[str, object]) -> list[str]:
f' content = prompt + "\\n\\n{llm.TRANSCRIPT_TAG}"',
' extra = {"transcript_id": transcript.id}',
" else:",
' content = prompt + "\\n\\n" + result',
' content = prompt + "\\n\\nTranscript:\\n" + result',
" extra = None",
" response = gateway.chat.completions.create(",
f" model={llm_gateway['model']!r},",
Expand Down
18 changes: 16 additions & 2 deletions aai_cli/commands/stream.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,11 @@ def stream(
),
# speakers
speaker_labels: bool | None = typer.Option(
None, "--speaker-labels", help="Label speakers.", rich_help_panel=help_panels.OPT_SPEAKERS
None,
"--speaker-labels",
help='Diarize speakers. With system audio the mic stays "You"; only the system '
"audio is split into speakers.",
rich_help_panel=help_panels.OPT_SPEAKERS,
),
max_speakers: int | None = typer.Option(
None,
Expand Down Expand Up @@ -270,6 +274,13 @@ def stream(
"one's response (a chain).",
rich_help_panel=help_panels.OPT_LLM,
),
llm_interval: float = typer.Option(
30.0,
Comment on lines +277 to +278

Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Default llm_interval=30.0 conflicts with nearby --llm descriptions claiming refresh on every finalized turn, so behavior and command semantics diverge.

Suggested change
llm_interval: float = typer.Option(
30.0,
llm_interval: float = typer.Option(
0.0,
Details

✨ AI Reasoning
​​1) The code adds a new interval-based throttle with a nonzero default, so refreshes are no longer guaranteed per finalized turn.
​2) Existing user-facing wording in the same command path still describes per-turn refresh behavior.
​3) This creates contradictory runtime assumptions for users and generated expectations.
​4) The contradiction is concrete and introduced by this change set, not speculative.

Reply @AikidoSec feedback: [FEEDBACK] to get better review comments in the future.
Reply @AikidoSec ignore: [REASON] to ignore this issue.
More info

"--llm-interval",
help="Seconds between --llm summary refreshes (0 refreshes on every turn).",
min=0.0,
rich_help_panel=help_panels.OPT_LLM,
),
model: str = typer.Option(
llm.DEFAULT_MODEL,
"--model",
Expand Down Expand Up @@ -367,7 +378,9 @@ def body(state: AppState, json_mode: bool) -> None:
overrides=config_kv,
config_file=config_file,
)
gateway = code_gen.gateway_options(list(llm_prompt or []), model, max_tokens)
gateway = code_gen.gateway_options(
list(llm_prompt or []), model, max_tokens, interval=llm_interval
)
output.print_code(code_gen.stream(merged, llm=gateway))
return

Expand All @@ -385,6 +398,7 @@ def body(state: AppState, json_mode: bool) -> None:
llm_prompts=llm_prompts,
model=model,
max_tokens=max_tokens,
llm_interval=llm_interval,
)
_dispatch(session, opts)

Expand Down
77 changes: 51 additions & 26 deletions aai_cli/streaming/render.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,39 @@
from rich.console import Console
from rich.text import Text

from aai_cli import theme
from aai_cli.render import BaseRenderer

# Source label -> (display text, Rich style). System audio borrows the agent color;
# the microphone ("you") its own. Unknown sources fall back to the raw label.
_SOURCE_LABELS: dict[str, tuple[str, str]] = {
"system": ("System", "aai.agent"),
"you": ("You", "aai.you"),
}


def speaker_prefix(source: str | None, speaker: str | None) -> tuple[str, str] | None:
"""The lead-in label and Rich style for a turn, or None when it has neither a
source nor a diarized speaker.

- source + speaker -> "System (A)" (system audio diarized via --speaker-labels)
- source only -> "System" (parallel system/you streams)
- speaker only -> "Speaker A" (single-stream diarization, no source label)

When a speaker is present the whole label is tinted by `theme.speaker_style` so each
speaker reads in its own color (matching batch transcribe's diarized output); a
sourced turn with no speaker keeps the source's own color.
"""
label, style = (None, "aai.label")
if source is not None:
label, style = _SOURCE_LABELS.get(source, (source, "aai.label"))
if speaker is not None:
style = theme.speaker_style(speaker)
return (f"{label} ({speaker})" if label is not None else f"Speaker {speaker}"), style
if label is not None:
return label, style
return None


class StreamRenderer(BaseRenderer):
"""Renders streaming events in one of three modes.
Expand Down Expand Up @@ -46,25 +77,16 @@ def _with_source(payload: dict[str, object], source: str | None) -> dict[str, ob
return payload

@staticmethod
def _source_label(source: str) -> tuple[str, str]:
labels = {
"system": ("System", "aai.agent"),
"you": ("You", "aai.you"),
}
return labels.get(source, (source, "aai.label"))

@classmethod
def _label(cls, text: str, source: str | None) -> str:
if source is None:
return text
label, _style = cls._source_label(source)
return f"{label}: {text}"
def _label(text: str, source: str | None, speaker: str | None = None) -> str:
prefix = speaker_prefix(source, speaker)
return text if prefix is None else f"{prefix[0]}: {text}"

@classmethod
def _styled_label(cls, text: str, source: str | None) -> str | Text:
if source is None:
@staticmethod
def _styled_label(text: str, source: str | None, speaker: str | None = None) -> str | Text:
prefix = speaker_prefix(source, speaker)
if prefix is None:
return text
label, style = cls._source_label(source)
label, style = prefix
rendered = Text()
rendered.append(f"{label}: ", style=style)
rendered.append(text)
Expand All @@ -90,21 +112,24 @@ def listening(self) -> None:
def turn(self, event: object, *, source: str | None = None) -> None:
text = getattr(event, "transcript", "") or ""
end = bool(getattr(event, "end_of_turn", False))
speaker = getattr(event, "speaker_label", None) # set when --speaker-labels diarizes
with self._lock:
if self.json_mode:
self._emit(
self._with_source(
{"type": "turn", "transcript": text, "end_of_turn": end},
source,
)
)
payload: dict[str, object] = {
"type": "turn",
"transcript": text,
"end_of_turn": end,
}
if speaker is not None:
payload["speaker"] = speaker
self._emit(self._with_source(payload, source))
elif self.text_mode:
if end and text:
self._write(self._label(text, source) + "\n") # plain finalized line
self._write(self._label(text, source, speaker) + "\n") # plain finalized line
elif end:
self._finalize_line(self._styled_label(text, source))
self._finalize_line(self._styled_label(text, source, speaker))
else:
self._update_line(self._styled_label(text, source))
self._update_line(self._styled_label(text, source, speaker))

def termination(self, event: object, *, source: str | None = None) -> None:
with self._lock:
Expand Down
Loading
Loading