AssemblyAI · alexkroman · Jun 12, 2026 · Jun 12, 2026
diff --git a/aai_cli/commands/dub.py b/aai_cli/commands/dub.py
@@ -60,7 +60,8 @@ def dub(
         [],
         "--voice",
         help="Voice id for every speaker (e.g. jane, michael, paul), or SPEAKER=VOICE "
-        "to pin a diarized speaker (repeatable, e.g. --voice A=jane).",
+        "to pin a diarized speaker (repeatable, e.g. --voice A=jane). Default: the "
+        "target language's native voice(s).",
     ),
     model: str = typer.Option(
         llm.DEFAULT_MODEL,

diff --git a/aai_cli/commands/speak.py b/aai_cli/commands/speak.py
@@ -19,8 +19,8 @@
         [
             ("Speak text aloud (sandbox only)", 'assembly --sandbox speak "Hello there, friend."'),
             (
-                "Pick a voice and language",
-                'assembly --sandbox speak "Bonjour" --voice jane --language French',
+                "Pick a language (its native voice is selected automatically)",
+                'assembly --sandbox speak "Bonjour" --language French',
             ),
             (
                 "Speak a diarized transcript, one voice per speaker",
@@ -46,7 +46,12 @@ def speak(
         help="Voice id (e.g. jane, michael, mary, paul, eve, george), or SPEAKER=VOICE "
         "for diarized input (repeatable, e.g. --voice A=jane).",
     ),
-    language: str = typer.Option(DEFAULT_LANGUAGE, "--language", help="Language of the text."),
+    language: str = typer.Option(
+        DEFAULT_LANGUAGE,
+        "--language",
+        help="Language of the text. The default voice follows it "
+        "(e.g. --language Italian speaks with giovanni).",
+    ),
     sample_rate: int | None = typer.Option(
         None,
         "--sample-rate",

diff --git a/aai_cli/dub_exec.py b/aai_cli/dub_exec.py
@@ -31,7 +31,7 @@
 from aai_cli import llm as gateway
 from aai_cli.context import AppState
 from aai_cli.errors import APIError, CLIError, UsageError
-from aai_cli.tts import audio, dialogue, session
+from aai_cli.tts import audio, dialogue, session, voices
 from aai_cli.tts.session import SpeakConfig
 
 # ISO-639-1 codes accepted by --lang, mapped to the language *name* both the
@@ -349,15 +349,18 @@ def _assign_voices(
     utterances: list[_Utterance],
     translations: list[str],
     voice_values: list[str],
+    language: str,
 ) -> tuple[list[tuple[str, str]], dict[str, str]]:
     """Resolve each translated utterance to ``(voice, text)`` plus the speaker→voice map.
 
     A bare ``--voice`` dubs every speaker with that one voice; ``SPEAKER=VOICE``
-    mappings pin individual speakers; everyone else takes the rotation in
-    first-appearance order (the same rules as `assembly speak`).
+    mappings pin individual speakers; everyone else takes the target language's
+    rotation in first-appearance order (the same rules as `assembly speak`) —
+    each voice speaks one language, so a non-English dub switches to that
+    language's native voice(s).
     """
     bare_voice, overrides = dialogue.parse_voice_overrides(voice_values)
-    rotation = (bare_voice,) if bare_voice is not None else dialogue.DEFAULT_VOICE_ROTATION
+    rotation = (bare_voice,) if bare_voice is not None else voices.rotation_for(language)
     segments = [
         dialogue.Segment(utterance.speaker, translated)
         # strict=True is an invariant guard only: _translate returns exactly one
@@ -384,7 +387,7 @@ def run_dub(opts: DubOptions, state: AppState, *, json_mode: bool) -> None:
     translations = _translate(
         api_key, utterances, language, opts, json_mode=json_mode, quiet=state.quiet
     )
-    resolved, speakers = _assign_voices(utterances, translations, opts.voice)
+    resolved, speakers = _assign_voices(utterances, translations, opts.voice, language)
     pcm_segments, sample_rate = _synthesize(
         api_key, resolved, language, json_mode=json_mode, quiet=state.quiet
     )

diff --git a/aai_cli/speak_exec.py b/aai_cli/speak_exec.py
@@ -14,12 +14,12 @@
 from aai_cli import environments, output, stdio
 from aai_cli.context import AppState
 from aai_cli.errors import CLIError, UsageError
-from aai_cli.tts import audio, dialogue, session
+from aai_cli.tts import audio, dialogue, session, voices
 
-# The streaming-TTS reference client defaults to the PocketTTS "jane" voice and
-# English, so the CLI sends the same and a bare `assembly speak` works out of the box.
-# Override either with --voice/--language.
-DEFAULT_VOICE = "jane"
+# The streaming-TTS reference client defaults to English, so the CLI does the
+# same. The default voice follows the language (voices.default_voice): each
+# voice speaks one language, so e.g. --language Italian switches to giovanni
+# unless --voice overrides it.
 DEFAULT_LANGUAGE = "English"
 
 
@@ -161,7 +161,7 @@ def _speak_dialogue(
             json_mode=json_mode,
         )
     resolved, speakers = dialogue.assign_voices(
-        segments, dialogue.DEFAULT_VOICE_ROTATION, overrides
+        segments, voices.rotation_for(opts.language), overrides
     )
     with output.status("Synthesizing speech…", json_mode=json_mode, quiet=quiet):
         result = session.synthesize_dialogue(
@@ -209,7 +209,7 @@ def run_speak(opts: SpeakOptions, state: AppState, *, json_mode: bool) -> None:
         _speak_single(
             api_key,
             spoken,
-            bare_voice or DEFAULT_VOICE,
+            bare_voice or voices.default_voice(opts.language),
             opts,
             json_mode=json_mode,
             quiet=state.quiet,

diff --git a/aai_cli/tts/dialogue.py b/aai_cli/tts/dialogue.py
@@ -65,9 +65,6 @@ def flush() -> None:
     return [turn for turn in merged if turn.text]
 
 
-DEFAULT_VOICE_ROTATION = ("jane", "michael", "mary", "paul", "eve", "george")
-
-
 def parse_voice_overrides(values: list[str]) -> tuple[str | None, dict[str, str]]:
     """Split repeatable ``--voice`` values into ``(bare_voice, {speaker_id: voice})``.
 

diff --git a/aai_cli/tts/voices.py b/aai_cli/tts/voices.py
@@ -0,0 +1,90 @@
+"""The streaming-TTS voice catalog: every voice speaks exactly one language.
+
+When no ``--voice`` is chosen, `assembly speak` and `assembly dub` pick the
+voice from the requested language: a non-English language switches to that
+language's native voice(s) — most ship exactly one, so the language alone
+selects the voice — while English keeps the curated multi-speaker rotation.
+"""
+
+from __future__ import annotations
+
+# Voice id -> ISO 639-1 code of its one language; dict order is the rotation order.
+VOICE_LANGUAGES: dict[str, str] = {
+    "alba": "en",
+    "anna": "en",
+    "azelma": "en",
+    "bill_boerst": "en",
+    "caro_davy": "en",
+    "charles": "en",
+    "cosette": "en",
+    "eponine": "en",
+    "estelle": "fr",
+    "eve": "en",
+    "fantine": "en",
+    "george": "en",
+    "giovanni": "it",
+    "jane": "en",
+    "javert": "en",
+    "jean": "en",
+    "juergen": "de",
+    "lola": "es",
+    "marius": "en",
+    "mary": "en",
+    "michael": "en",
+    "paul": "en",
+    "peter_yearsley": "en",
+    "rafael": "pt",
+    "stuart_bell": "en",
+    "vera": "en",
+}
+
+# The language names the TTS `language` param uses, keyed by ISO code. Only
+# languages with at least one catalog voice belong here (rotation_for relies
+# on that invariant to never resolve to an empty rotation). Deliberately
+# narrower than dub_exec.LANGUAGE_NAMES, which also lists voiceless languages
+# the translator supports.
+_LANGUAGE_NAMES: dict[str, str] = {
+    "de": "German",
+    "en": "English",
+    "es": "Spanish",
+    "fr": "French",
+    "it": "Italian",
+    "pt": "Portuguese",
+}
+
+_NAME_TO_CODE = {name.casefold(): code for code, name in _LANGUAGE_NAMES.items()}
+
+# English has many voices; this curated subset keeps multi-speaker output
+# varied with the confirmed-working voices. Non-English languages rotate
+# through their own (usually single) native voices instead.
+ENGLISH_ROTATION = ("jane", "michael", "mary", "paul", "eve", "george")
+
+
+def language_code(language: str | None) -> str | None:
+    """Map an ISO code or language name (any case, padded or not) to its code.
+
+    Returns None for a missing value or a language with no catalog voices; the
+    TTS service and the dub translator accept more languages than the catalog.
+    """
+    if language is None:
+        return None
+    key = language.strip().casefold()
+    return key if key in _LANGUAGE_NAMES else _NAME_TO_CODE.get(key)
+
+
+def rotation_for(language: str | None) -> tuple[str, ...]:
+    """Return the default voice rotation for ``language`` (ISO code or name).
+
+    English, and any language without catalog voices, keeps ENGLISH_ROTATION;
+    other languages rotate through their native voices in catalog order. The
+    result is never empty, so default_voice() can always index element 0.
+    """
+    code = language_code(language)
+    native = tuple(v for v, spoken in VOICE_LANGUAGES.items() if spoken == code)
+    # Empty covers both an unknown language and a name listed without a voice.
+    return ENGLISH_ROTATION if code == "en" or not native else native
+
+
+def default_voice(language: str | None) -> str:
+    """Return the voice used when none is chosen: element 0 of rotation_for(language)."""
+    return rotation_for(language)[0]
diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr
@@ -244,7 +244,8 @@
   │    --voice                  TEXT  Voice id for every speaker (e.g. jane,     │
   │                                   michael, paul), or SPEAKER=VOICE to pin a  │
   │                                   diarized speaker (repeatable, e.g. --voice │
-  │                                   A=jane).                                   │
+  │                                   A=jane). Default: the target language's    │
+  │                                   native voice(s).                           │
   │    --out                    PATH  Output file (default:                      │
   │                                   <name>.dub.<lang><ext> next to the input). │
   │    --json           -j            Emit JSON describing the dubbed file.      │
@@ -426,7 +427,10 @@
   │                                              SPEAKER=VOICE for diarized      │
   │                                              input (repeatable, e.g. --voice │
   │                                              A=jane).                        │
-  │ --language             TEXT                  Language of the text.           │
+  │ --language             TEXT                  Language of the text. The       │
+  │                                              default voice follows it (e.g.  │
+  │                                              --language Italian speaks with  │
+  │                                              giovanni).                      │
   │                                              [default: English]              │
   │ --sample-rate          INTEGER RANGE [x>=1]  Output sample rate in Hz        │
   │                                              (positive). Server default if   │
@@ -441,8 +445,8 @@
    Examples
    Speak text aloud (sandbox only)
    $ assembly --sandbox speak "Hello there, friend."
-   Pick a voice and language
-   $ assembly --sandbox speak "Bonjour" --voice jane --language French
+   Pick a language (its native voice is selected automatically)
+   $ assembly --sandbox speak "Bonjour" --language French
    Speak a diarized transcript, one voice per speaker
    $ assembly transcribe meeting.mp3 --speaker-labels | assembly --sandbox speak
    Override a speaker's voice

diff --git a/tests/test_dub_pipeline.py b/tests/test_dub_pipeline.py
@@ -93,10 +93,11 @@ def test_run_dub_pipeline_end_to_end(
         assert "dubbing" in system["content"]
         assert "German" in system["content"]
 
-    # Synthesis: the translated text, rotation voices in speaker order, target language.
+    # Synthesis: the translated text in the target language, every speaker on
+    # German's one native voice (the language selects the voice).
     assert [(cfg.voice, cfg.text) for cfg in fake_synthesize] == [
-        ("jane", "DE:Hello."),
-        ("michael", "DE:World."),
+        ("juergen", "DE:Hello."),
+        ("juergen", "DE:World."),
     ]
     assert all(cfg.language == "German" for cfg in fake_synthesize)
 
@@ -137,7 +138,7 @@ def test_run_dub_pipeline_end_to_end(
         "language": "German",
         "transcript_id": "tr_dub",
         "utterances": 2,
-        "speakers": {"A": "jane", "B": "michael"},
+        "speakers": {"A": "juergen", "B": "juergen"},
         "sample_rate": SAMPLE_RATE,
         "audio_duration_seconds": 5.0,
     }
@@ -157,7 +158,7 @@ def test_run_dub_human_summary(
     assert "dub.de.mp4" in out
     assert "dubbed to German" in out
     assert "2 utterances" in out
-    assert "A=jane, B=michael" in out
+    assert "A=juergen, B=juergen" in out
 
 
 def test_bare_voice_dubs_every_speaker(
@@ -173,8 +174,29 @@ def test_voice_overrides_pin_speakers_without_consuming_rotation(
 ):
     opts = dataclasses.replace(DEFAULTS, media=str(media), voice=["A=mary"])
     _run(opts, json_mode=True)
-    # A is pinned; B still takes the first rotation voice (overrides don't consume slots).
-    assert [cfg.voice for cfg in fake_synthesize] == ["mary", "jane"]
+    # A is pinned; B still takes German's native voice from the rotation.
+    assert [cfg.voice for cfg in fake_synthesize] == ["mary", "juergen"]
+
+
+def test_english_dub_keeps_the_multi_voice_rotation(
+    media, fake_transcribe, fake_translate, fake_synthesize, fake_ffmpeg
+):
+    # An English target must not collapse onto a single voice: each speaker
+    # takes the next entry of the curated English rotation.
+    english = dataclasses.replace(DEFAULTS, media=str(media), language="en")
+    _run(english, json_mode=True)
+    assert [cfg.voice for cfg in fake_synthesize] == ["jane", "michael"]
+
+
+def test_language_without_a_native_voice_falls_back_to_english_rotation(
+    media, fake_transcribe, fake_translate, fake_synthesize, fake_ffmpeg
+):
+    # "ja" translates fine but owns no catalog voice, so synthesis borrows the
+    # English rotation while still requesting Japanese speech.
+    japanese = dataclasses.replace(DEFAULTS, media=str(media), language="ja")
+    _run(japanese, json_mode=True)
+    assert [cfg.voice for cfg in fake_synthesize] == ["jane", "michael"]
+    assert all(cfg.language == "Japanese" for cfg in fake_synthesize)
 
 
 def test_transcript_id_reuses_existing_transcript(

diff --git a/tests/test_speak.py b/tests/test_speak.py
@@ -115,6 +115,27 @@ def test_voice_and_language_flow_into_config(monkeypatch, fake_synthesize):
     assert cfg.query_params() == {"voice": "jane", "language": "English"}
 
 
+def test_default_voice_follows_the_language(monkeypatch, fake_synthesize):
+    # No --voice given: Italian must resolve to its native "giovanni" rather
+    # than the English default "jane", and the language passes through as-is.
+    monkeypatch.setattr("aai_cli.speak_exec.audio.play_pcm", lambda *a, **k: None)
+    argv = ["--sandbox", "speak", "Ciao", "--language", "Italian"]
+    outcome = runner.invoke(app, argv)
+    assert outcome.exit_code == 0
+    sent = fake_synthesize["cfg"]
+    assert (sent.voice, sent.language) == ("giovanni", "Italian")
+
+
+def test_explicit_voice_beats_the_language_default(monkeypatch, fake_synthesize):
+    # --voice outranks the language-derived default: French would otherwise
+    # select its native voice, but the explicit "jane" must be what is sent.
+    monkeypatch.setattr("aai_cli.speak_exec.audio.play_pcm", lambda *a, **k: None)
+    argv = ["--sandbox", "speak", "Bonjour", "--voice", "jane", "--language", "French"]
+    outcome = runner.invoke(app, argv)
+    assert outcome.exit_code == 0
+    assert fake_synthesize["cfg"].voice == "jane"
+
+
 def test_json_mode_emits_metadata_object_on_stdout(monkeypatch, fake_synthesize):
     monkeypatch.setattr("aai_cli.speak_exec.audio.play_pcm", lambda *a, **k: None)
     result = runner.invoke(app, ["--sandbox", "speak", "Hi", "--voice", "jane", "--json"])
@@ -166,6 +187,16 @@ def test_labeled_stdin_uses_dialogue_path_with_default_rotation(fake_dialogue):
     ]
 
 
+def test_dialogue_rotation_follows_the_language(fake_dialogue):
+    # Dialogue mode obeys the language as well: "estelle" is the only French
+    # voice in the catalog, so both speakers end up sharing it.
+    script = "Speaker A: Bonjour.\nSpeaker B: Salut."
+    outcome = runner.invoke(app, ["--sandbox", "speak", "--language", "French"], input=script)
+    assert outcome.exit_code == 0
+    assert fake_dialogue["segments"] == [("estelle", "Bonjour."), ("estelle", "Salut.")]
+    assert fake_dialogue["language"] == "French"
+
+
 def test_speaker_voice_override_is_applied(fake_dialogue):
     text = "Speaker A: One.\nSpeaker B: Two."
     result = runner.invoke(

diff --git a/tests/test_tts_dialogue.py b/tests/test_tts_dialogue.py
@@ -101,7 +101,3 @@ def test_assign_voices_override_beats_rotation_without_consuming_a_slot():
     resolved, mapping = dialogue.assign_voices(segs, ["jane", "michael"], {"a": "vera"})
     assert [v for v, _ in resolved] == ["vera", "jane"]
     assert mapping == {"A": "vera", "B": "jane"}
-
-
-def test_default_rotation_is_the_confirmed_working_voices():
-    assert dialogue.DEFAULT_VOICE_ROTATION == ("jane", "michael", "mary", "paul", "eve", "george")