Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion aai_cli/commands/dub.py
Original file line number Diff line number Diff line change
Expand Up @@ -60,7 +60,8 @@ def dub(
[],
"--voice",
help="Voice id for every speaker (e.g. jane, michael, paul), or SPEAKER=VOICE "
"to pin a diarized speaker (repeatable, e.g. --voice A=jane).",
"to pin a diarized speaker (repeatable, e.g. --voice A=jane). Default: the "
"target language's native voice(s).",
),
model: str = typer.Option(
llm.DEFAULT_MODEL,
Expand Down
11 changes: 8 additions & 3 deletions aai_cli/commands/speak.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,8 +19,8 @@
[
("Speak text aloud (sandbox only)", 'assembly --sandbox speak "Hello there, friend."'),
(
"Pick a voice and language",
'assembly --sandbox speak "Bonjour" --voice jane --language French',
"Pick a language (its native voice is selected automatically)",
'assembly --sandbox speak "Bonjour" --language French',
),
(
"Speak a diarized transcript, one voice per speaker",
Expand All @@ -46,7 +46,12 @@ def speak(
help="Voice id (e.g. jane, michael, mary, paul, eve, george), or SPEAKER=VOICE "
"for diarized input (repeatable, e.g. --voice A=jane).",
),
language: str = typer.Option(DEFAULT_LANGUAGE, "--language", help="Language of the text."),
language: str = typer.Option(
DEFAULT_LANGUAGE,
"--language",
help="Language of the text. The default voice follows it "
"(e.g. --language Italian speaks with giovanni).",
),
sample_rate: int | None = typer.Option(
None,
"--sample-rate",
Expand Down
13 changes: 8 additions & 5 deletions aai_cli/dub_exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from aai_cli import llm as gateway
from aai_cli.context import AppState
from aai_cli.errors import APIError, CLIError, UsageError
from aai_cli.tts import audio, dialogue, session
from aai_cli.tts import audio, dialogue, session, voices
from aai_cli.tts.session import SpeakConfig

# ISO-639-1 codes accepted by --lang, mapped to the language *name* both the
Expand Down Expand Up @@ -349,15 +349,18 @@ def _assign_voices(
utterances: list[_Utterance],
translations: list[str],
voice_values: list[str],
language: str,
) -> tuple[list[tuple[str, str]], dict[str, str]]:
"""Resolve each translated utterance to ``(voice, text)`` plus the speaker→voice map.

A bare ``--voice`` dubs every speaker with that one voice; ``SPEAKER=VOICE``
mappings pin individual speakers; everyone else takes the rotation in
first-appearance order (the same rules as `assembly speak`).
mappings pin individual speakers; everyone else takes the target language's
rotation in first-appearance order (the same rules as `assembly speak`) —
each voice speaks one language, so a non-English dub switches to that
language's native voice(s).
"""
bare_voice, overrides = dialogue.parse_voice_overrides(voice_values)
rotation = (bare_voice,) if bare_voice is not None else dialogue.DEFAULT_VOICE_ROTATION
rotation = (bare_voice,) if bare_voice is not None else voices.rotation_for(language)
segments = [
dialogue.Segment(utterance.speaker, translated)
# strict=True is an invariant guard only: _translate returns exactly one
Expand All @@ -384,7 +387,7 @@ def run_dub(opts: DubOptions, state: AppState, *, json_mode: bool) -> None:
translations = _translate(
api_key, utterances, language, opts, json_mode=json_mode, quiet=state.quiet
)
resolved, speakers = _assign_voices(utterances, translations, opts.voice)
resolved, speakers = _assign_voices(utterances, translations, opts.voice, language)
pcm_segments, sample_rate = _synthesize(
api_key, resolved, language, json_mode=json_mode, quiet=state.quiet
)
Expand Down
14 changes: 7 additions & 7 deletions aai_cli/speak_exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,12 +14,12 @@
from aai_cli import environments, output, stdio
from aai_cli.context import AppState
from aai_cli.errors import CLIError, UsageError
from aai_cli.tts import audio, dialogue, session
from aai_cli.tts import audio, dialogue, session, voices

# The streaming-TTS reference client defaults to the PocketTTS "jane" voice and
# English, so the CLI sends the same and a bare `assembly speak` works out of the box.
# Override either with --voice/--language.
DEFAULT_VOICE = "jane"
# The streaming-TTS reference client defaults to English, so the CLI does the
# same. The default voice follows the language (voices.default_voice): each
# voice speaks one language, so e.g. --language Italian switches to giovanni
# unless --voice overrides it.
DEFAULT_LANGUAGE = "English"


Expand Down Expand Up @@ -161,7 +161,7 @@ def _speak_dialogue(
json_mode=json_mode,
)
resolved, speakers = dialogue.assign_voices(
segments, dialogue.DEFAULT_VOICE_ROTATION, overrides
segments, voices.rotation_for(opts.language), overrides
)
with output.status("Synthesizing speech…", json_mode=json_mode, quiet=quiet):
result = session.synthesize_dialogue(
Expand Down Expand Up @@ -209,7 +209,7 @@ def run_speak(opts: SpeakOptions, state: AppState, *, json_mode: bool) -> None:
_speak_single(
api_key,
spoken,
bare_voice or DEFAULT_VOICE,
bare_voice or voices.default_voice(opts.language),
opts,
json_mode=json_mode,
quiet=state.quiet,
Expand Down
3 changes: 0 additions & 3 deletions aai_cli/tts/dialogue.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,9 +65,6 @@ def flush() -> None:
return [turn for turn in merged if turn.text]


DEFAULT_VOICE_ROTATION = ("jane", "michael", "mary", "paul", "eve", "george")


def parse_voice_overrides(values: list[str]) -> tuple[str | None, dict[str, str]]:
"""Split repeatable ``--voice`` values into ``(bare_voice, {speaker_id: voice})``.

Expand Down
90 changes: 90 additions & 0 deletions aai_cli/tts/voices.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
"""The streaming-TTS voice catalog: every voice speaks exactly one language.

When no ``--voice`` is chosen, `assembly speak` and `assembly dub` pick the
voice from the requested language: a non-English language switches to that
language's native voice(s) — most ship exactly one, so the language alone
selects the voice — while English keeps the curated multi-speaker rotation.
"""

from __future__ import annotations

# Voice id -> ISO 639-1 code of the (single) language the voice speaks.
# Entry order matters: rotation_for builds a non-English rotation by iterating
# this dict, so a language's voices rotate in the order they are listed here.
VOICE_LANGUAGES: dict[str, str] = {
"alba": "en",
"anna": "en",
"azelma": "en",
"bill_boerst": "en",
"caro_davy": "en",
"charles": "en",
"cosette": "en",
"eponine": "en",
"estelle": "fr",
"eve": "en",
"fantine": "en",
"george": "en",
"giovanni": "it",
"jane": "en",
"javert": "en",
"jean": "en",
"juergen": "de",
"lola": "es",
"marius": "en",
"mary": "en",
"michael": "en",
"paul": "en",
"peter_yearsley": "en",
"rafael": "pt",
"stuart_bell": "en",
"vera": "en",
}

# The language names the TTS `language` param uses, keyed by ISO code. Only
# languages with at least one catalog voice belong here (rotation_for relies
# on that invariant to never resolve to an empty rotation). Deliberately
# narrower than dub_exec.LANGUAGE_NAMES, which also lists voiceless languages
# the translator supports.
_LANGUAGE_NAMES: dict[str, str] = {
"de": "German",
"en": "English",
"es": "Spanish",
"fr": "French",
"it": "Italian",
"pt": "Portuguese",
}

# Reverse lookup: casefolded language name -> ISO code, so a language given by
# name can be matched regardless of case.
_NAME_TO_CODE = {name.casefold(): code for code, name in _LANGUAGE_NAMES.items()}

# English has many voices; this curated subset keeps multi-speaker output
# varied with the confirmed-working voices. Non-English languages rotate
# through their own (usually single) native voices instead.
ENGLISH_ROTATION = ("jane", "michael", "mary", "paul", "eve", "george")


def language_code(language: str | None) -> str | None:
    """Normalize a language value to its ISO code in the voice catalog.

    The value may be an ISO code or a language name, in any case and with
    surrounding whitespace. Returns None for None, and for any language that
    has no catalog voices — the TTS service and the dub translator accept more
    languages than the catalog covers.
    """
    if language is None:
        return None
    key = language.strip().casefold()
    # A value that is already a known code is returned as-is; anything else is
    # tried as a language name, yielding None when it is not in the catalog.
    return key if key in _LANGUAGE_NAMES else _NAME_TO_CODE.get(key)


def rotation_for(language: str | None) -> tuple[str, ...]:
    """The default voice rotation for a language.

    English — and any language without catalog voices — keeps the curated
    English rotation; a language with native voices rotates through those (in
    the order they appear in VOICE_LANGUAGES), so a single-voice language
    always switches to its one voice.

    ``language`` is an ISO code or language name in any case, or None. The
    result is never empty, so callers may safely index its first voice.
    """
    code = language_code(language)
    if code is None or code == "en":
        return ENGLISH_ROTATION
    native = tuple(voice for voice, spoken in VOICE_LANGUAGES.items() if spoken == code)
    # Every language in _LANGUAGE_NAMES is meant to have at least one catalog
    # voice, but nothing enforces that. If the two tables ever drift apart,
    # fall back to the English rotation instead of returning an empty tuple
    # (which would make default_voice raise IndexError).
    return native or ENGLISH_ROTATION


def default_voice(language: str | None) -> str:
    """Return the voice used when the caller chose none.

    This is the first voice of the language's default rotation.
    """
    rotation = rotation_for(language)
    return rotation[0]
12 changes: 8 additions & 4 deletions tests/__snapshots__/test_snapshots_help_run.ambr
Original file line number Diff line number Diff line change
Expand Up @@ -244,7 +244,8 @@
│ --voice TEXT Voice id for every speaker (e.g. jane, │
│ michael, paul), or SPEAKER=VOICE to pin a │
│ diarized speaker (repeatable, e.g. --voice │
│ A=jane). │
│ A=jane). Default: the target language's │
│ native voice(s). │
│ --out PATH Output file (default: │
│ <name>.dub.<lang><ext> next to the input). │
│ --json -j Emit JSON describing the dubbed file. │
Expand Down Expand Up @@ -426,7 +427,10 @@
│ SPEAKER=VOICE for diarized │
│ input (repeatable, e.g. --voice │
│ A=jane). │
│ --language TEXT Language of the text. │
│ --language TEXT Language of the text. The │
│ default voice follows it (e.g. │
│ --language Italian speaks with │
│ giovanni). │
│ [default: English] │
│ --sample-rate INTEGER RANGE [x>=1] Output sample rate in Hz │
│ (positive). Server default if │
Expand All @@ -441,8 +445,8 @@
Examples
Speak text aloud (sandbox only)
$ assembly --sandbox speak "Hello there, friend."
Pick a voice and language
$ assembly --sandbox speak "Bonjour" --voice jane --language French
Pick a language (its native voice is selected automatically)
$ assembly --sandbox speak "Bonjour" --language French
Speak a diarized transcript, one voice per speaker
$ assembly transcribe meeting.mp3 --speaker-labels | assembly --sandbox speak
Override a speaker's voice
Expand Down
36 changes: 29 additions & 7 deletions tests/test_dub_pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,10 +93,11 @@ def test_run_dub_pipeline_end_to_end(
assert "dubbing" in system["content"]
assert "German" in system["content"]

# Synthesis: the translated text, rotation voices in speaker order, target language.
# Synthesis: the translated text in the target language, every speaker on
# German's one native voice (the language selects the voice).
assert [(cfg.voice, cfg.text) for cfg in fake_synthesize] == [
("jane", "DE:Hello."),
("michael", "DE:World."),
("juergen", "DE:Hello."),
("juergen", "DE:World."),
]
assert all(cfg.language == "German" for cfg in fake_synthesize)

Expand Down Expand Up @@ -137,7 +138,7 @@ def test_run_dub_pipeline_end_to_end(
"language": "German",
"transcript_id": "tr_dub",
"utterances": 2,
"speakers": {"A": "jane", "B": "michael"},
"speakers": {"A": "juergen", "B": "juergen"},
"sample_rate": SAMPLE_RATE,
"audio_duration_seconds": 5.0,
}
Expand All @@ -157,7 +158,7 @@ def test_run_dub_human_summary(
assert "dub.de.mp4" in out
assert "dubbed to German" in out
assert "2 utterances" in out
assert "A=jane, B=michael" in out
assert "A=juergen, B=juergen" in out


def test_bare_voice_dubs_every_speaker(
Expand All @@ -173,8 +174,29 @@ def test_voice_overrides_pin_speakers_without_consuming_rotation(
):
opts = dataclasses.replace(DEFAULTS, media=str(media), voice=["A=mary"])
_run(opts, json_mode=True)
# A is pinned; B still takes the first rotation voice (overrides don't consume slots).
assert [cfg.voice for cfg in fake_synthesize] == ["mary", "jane"]
# A is pinned; B still takes German's native voice from the rotation.
assert [cfg.voice for cfg in fake_synthesize] == ["mary", "juergen"]


def test_english_dub_keeps_the_multi_voice_rotation(
    media, fake_transcribe, fake_translate, fake_synthesize, fake_ffmpeg
):
    # With English as the target there are many voices to choose from, so each
    # speaker gets the next voice of the curated rotation rather than all
    # speakers sharing a single voice.
    english_opts = dataclasses.replace(DEFAULTS, media=str(media), language="en")
    _run(english_opts, json_mode=True)
    voices_used = [cfg.voice for cfg in fake_synthesize]
    assert voices_used == ["jane", "michael"]


def test_language_without_a_native_voice_falls_back_to_english_rotation(
    media, fake_transcribe, fake_translate, fake_synthesize, fake_ffmpeg
):
    # Japanese can be translated but the catalog has no voice for it: the dub
    # must still run, using the English rotation while keeping Japanese as the
    # synthesis language.
    japanese_opts = dataclasses.replace(DEFAULTS, media=str(media), language="ja")
    _run(japanese_opts, json_mode=True)
    voices_used = [cfg.voice for cfg in fake_synthesize]
    assert voices_used == ["jane", "michael"]
    assert not any(cfg.language != "Japanese" for cfg in fake_synthesize)


def test_transcript_id_reuses_existing_transcript(
Expand Down
31 changes: 31 additions & 0 deletions tests/test_speak.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,6 +115,27 @@ def test_voice_and_language_flow_into_config(monkeypatch, fake_synthesize):
assert cfg.query_params() == {"voice": "jane", "language": "English"}


def test_default_voice_follows_the_language(monkeypatch, fake_synthesize):
    # Every voice speaks a single language, so when --voice is omitted a
    # non-English --language must select that language's native voice rather
    # than the English "jane".
    def _no_playback(*args, **kwargs):
        return None

    # Replace audio playback with a no-op for the duration of the test.
    monkeypatch.setattr("aai_cli.speak_exec.audio.play_pcm", _no_playback)
    argv = ["--sandbox", "speak", "Ciao", "--language", "Italian"]
    result = runner.invoke(app, argv)
    assert result.exit_code == 0
    sent = fake_synthesize["cfg"]
    assert sent.voice == "giovanni"
    assert sent.language == "Italian"


def test_explicit_voice_beats_the_language_default(monkeypatch, fake_synthesize):
    def _no_playback(*args, **kwargs):
        return None

    # Replace audio playback with a no-op for the duration of the test.
    monkeypatch.setattr("aai_cli.speak_exec.audio.play_pcm", _no_playback)
    argv = ["--sandbox", "speak", "Bonjour", "--voice", "jane", "--language", "French"]
    result = runner.invoke(app, argv)
    assert result.exit_code == 0
    # An explicitly chosen voice takes precedence; the language only decides
    # the voice when none was given.
    chosen_voice = fake_synthesize["cfg"].voice
    assert chosen_voice == "jane"


def test_json_mode_emits_metadata_object_on_stdout(monkeypatch, fake_synthesize):
monkeypatch.setattr("aai_cli.speak_exec.audio.play_pcm", lambda *a, **k: None)
result = runner.invoke(app, ["--sandbox", "speak", "Hi", "--voice", "jane", "--json"])
Expand Down Expand Up @@ -166,6 +187,16 @@ def test_labeled_stdin_uses_dialogue_path_with_default_rotation(fake_dialogue):
]


def test_dialogue_rotation_follows_the_language(fake_dialogue):
    # French ships a single native voice, so in dialogue mode both speakers
    # end up on it: the language picks the voice here as well.
    labeled_input = "Speaker A: Bonjour.\nSpeaker B: Salut."
    argv = ["--sandbox", "speak", "--language", "French"]
    result = runner.invoke(app, argv, input=labeled_input)
    assert result.exit_code == 0
    expected_segments = [("estelle", "Bonjour."), ("estelle", "Salut.")]
    assert fake_dialogue["segments"] == expected_segments
    assert fake_dialogue["language"] == "French"


def test_speaker_voice_override_is_applied(fake_dialogue):
text = "Speaker A: One.\nSpeaker B: Two."
result = runner.invoke(
Expand Down
4 changes: 0 additions & 4 deletions tests/test_tts_dialogue.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,3 @@ def test_assign_voices_override_beats_rotation_without_consuming_a_slot():
resolved, mapping = dialogue.assign_voices(segs, ["jane", "michael"], {"a": "vera"})
assert [v for v, _ in resolved] == ["vera", "jane"]
assert mapping == {"A": "vera", "B": "jane"}


def test_default_rotation_is_the_confirmed_working_voices():
assert dialogue.DEFAULT_VOICE_ROTATION == ("jane", "michael", "mary", "paul", "eve", "george")
Loading
Loading