diff --git a/aai_cli/commands/dub.py b/aai_cli/commands/dub.py index 1d3dc552..a1f72125 100644 --- a/aai_cli/commands/dub.py +++ b/aai_cli/commands/dub.py @@ -60,7 +60,8 @@ def dub( [], "--voice", help="Voice id for every speaker (e.g. jane, michael, paul), or SPEAKER=VOICE " - "to pin a diarized speaker (repeatable, e.g. --voice A=jane).", + "to pin a diarized speaker (repeatable, e.g. --voice A=jane). Default: the " + "target language's native voice(s).", ), model: str = typer.Option( llm.DEFAULT_MODEL, diff --git a/aai_cli/commands/speak.py b/aai_cli/commands/speak.py index 948def0b..578a1927 100644 --- a/aai_cli/commands/speak.py +++ b/aai_cli/commands/speak.py @@ -19,8 +19,8 @@ [ ("Speak text aloud (sandbox only)", 'assembly --sandbox speak "Hello there, friend."'), ( - "Pick a voice and language", - 'assembly --sandbox speak "Bonjour" --voice jane --language French', + "Pick a language (its native voice is selected automatically)", + 'assembly --sandbox speak "Bonjour" --language French', ), ( "Speak a diarized transcript, one voice per speaker", @@ -46,7 +46,12 @@ def speak( help="Voice id (e.g. jane, michael, mary, paul, eve, george), or SPEAKER=VOICE " "for diarized input (repeatable, e.g. --voice A=jane).", ), - language: str = typer.Option(DEFAULT_LANGUAGE, "--language", help="Language of the text."), + language: str = typer.Option( + DEFAULT_LANGUAGE, + "--language", + help="Language of the text. The default voice follows it " + "(e.g. --language Italian speaks with giovanni).", + ), sample_rate: int | None = typer.Option( None, "--sample-rate", diff --git a/aai_cli/dub_exec.py b/aai_cli/dub_exec.py index 26040d49..af479640 100644 --- a/aai_cli/dub_exec.py +++ b/aai_cli/dub_exec.py @@ -31,7 +31,7 @@ from aai_cli import llm as gateway from aai_cli.context import AppState from aai_cli.errors import APIError, CLIError, UsageError -from aai_cli.tts import audio, dialogue, session +from aai_cli.tts import audio, dialogue, session, voices from aai_cli.tts.session import SpeakConfig # ISO-639-1 codes accepted by --lang, mapped to the language *name* both the @@ -349,15 +349,18 @@ def _assign_voices( utterances: list[_Utterance], translations: list[str], voice_values: list[str], + language: str, ) -> tuple[list[tuple[str, str]], dict[str, str]]: """Resolve each translated utterance to ``(voice, text)`` plus the speaker→voice map. A bare ``--voice`` dubs every speaker with that one voice; ``SPEAKER=VOICE`` - mappings pin individual speakers; everyone else takes the rotation in - first-appearance order (the same rules as `assembly speak`). + mappings pin individual speakers; everyone else takes the target language's + rotation in first-appearance order (the same rules as `assembly speak`) — + each voice speaks one language, so a non-English dub switches to that + language's native voice(s). """ bare_voice, overrides = dialogue.parse_voice_overrides(voice_values) - rotation = (bare_voice,) if bare_voice is not None else dialogue.DEFAULT_VOICE_ROTATION + rotation = (bare_voice,) if bare_voice is not None else voices.rotation_for(language) segments = [ dialogue.Segment(utterance.speaker, translated) # strict=True is an invariant guard only: _translate returns exactly one @@ -384,7 +387,7 @@ def run_dub(opts: DubOptions, state: AppState, *, json_mode: bool) -> None: translations = _translate( api_key, utterances, language, opts, json_mode=json_mode, quiet=state.quiet ) - resolved, speakers = _assign_voices(utterances, translations, opts.voice) + resolved, speakers = _assign_voices(utterances, translations, opts.voice, language) pcm_segments, sample_rate = _synthesize( api_key, resolved, language, json_mode=json_mode, quiet=state.quiet ) diff --git a/aai_cli/speak_exec.py b/aai_cli/speak_exec.py index 1dd16e58..56395d15 100644 --- a/aai_cli/speak_exec.py +++ b/aai_cli/speak_exec.py @@ -14,12 +14,12 @@ from aai_cli import environments, output, stdio from aai_cli.context import AppState from aai_cli.errors import CLIError, UsageError -from aai_cli.tts import audio, dialogue, session +from aai_cli.tts import audio, dialogue, session, voices -# The streaming-TTS reference client defaults to the PocketTTS "jane" voice and -# English, so the CLI sends the same and a bare `assembly speak` works out of the box. -# Override either with --voice/--language. -DEFAULT_VOICE = "jane" +# The streaming-TTS reference client defaults to English, so the CLI does the +# same. The default voice follows the language (voices.default_voice): each +# voice speaks one language, so e.g. --language Italian switches to giovanni +# unless --voice overrides it. DEFAULT_LANGUAGE = "English" @@ -161,7 +161,7 @@ def _speak_dialogue( json_mode=json_mode, ) resolved, speakers = dialogue.assign_voices( - segments, dialogue.DEFAULT_VOICE_ROTATION, overrides + segments, voices.rotation_for(opts.language), overrides ) with output.status("Synthesizing speech…", json_mode=json_mode, quiet=quiet): result = session.synthesize_dialogue( @@ -209,7 +209,7 @@ def run_speak(opts: SpeakOptions, state: AppState, *, json_mode: bool) -> None: _speak_single( api_key, spoken, - bare_voice or DEFAULT_VOICE, + bare_voice or voices.default_voice(opts.language), opts, json_mode=json_mode, quiet=state.quiet, diff --git a/aai_cli/tts/dialogue.py b/aai_cli/tts/dialogue.py index 4cef86ea..cbf7c5d3 100644 --- a/aai_cli/tts/dialogue.py +++ b/aai_cli/tts/dialogue.py @@ -65,9 +65,6 @@ def flush() -> None: return [turn for turn in merged if turn.text] -DEFAULT_VOICE_ROTATION = ("jane", "michael", "mary", "paul", "eve", "george") - - def parse_voice_overrides(values: list[str]) -> tuple[str | None, dict[str, str]]: """Split repeatable ``--voice`` values into ``(bare_voice, {speaker_id: voice})``. diff --git a/aai_cli/tts/voices.py b/aai_cli/tts/voices.py new file mode 100644 index 00000000..b878eed5 --- /dev/null +++ b/aai_cli/tts/voices.py @@ -0,0 +1,90 @@ +"""The streaming-TTS voice catalog: every voice speaks exactly one language. + +When no ``--voice`` is chosen, `assembly speak` and `assembly dub` pick the +voice from the requested language: a non-English language switches to that +language's native voice(s) — most ship exactly one, so the language alone +selects the voice — while English keeps the curated multi-speaker rotation. +""" + +from __future__ import annotations + +# Voice id -> ISO 639-1 code of the (single) language the voice speaks. +VOICE_LANGUAGES: dict[str, str] = { + "alba": "en", + "anna": "en", + "azelma": "en", + "bill_boerst": "en", + "caro_davy": "en", + "charles": "en", + "cosette": "en", + "eponine": "en", + "estelle": "fr", + "eve": "en", + "fantine": "en", + "george": "en", + "giovanni": "it", + "jane": "en", + "javert": "en", + "jean": "en", + "juergen": "de", + "lola": "es", + "marius": "en", + "mary": "en", + "michael": "en", + "paul": "en", + "peter_yearsley": "en", + "rafael": "pt", + "stuart_bell": "en", + "vera": "en", +} + +# The language names the TTS `language` param uses, keyed by ISO code. Only +# languages with at least one catalog voice belong here (rotation_for relies +# on that invariant to never resolve to an empty rotation). Deliberately +# narrower than dub_exec.LANGUAGE_NAMES, which also lists voiceless languages +# the translator supports. +_LANGUAGE_NAMES: dict[str, str] = { + "de": "German", + "en": "English", + "es": "Spanish", + "fr": "French", + "it": "Italian", + "pt": "Portuguese", +} + +_NAME_TO_CODE = {name.casefold(): code for code, name in _LANGUAGE_NAMES.items()} + +# English has many voices; this curated subset keeps multi-speaker output +# varied with the confirmed-working voices. Non-English languages rotate +# through their own (usually single) native voices instead. +ENGLISH_ROTATION = ("jane", "michael", "mary", "paul", "eve", "george") + + +def language_code(language: str | None) -> str | None: + """Normalize a language value (ISO code or name, any case) to its code, + or None when it has no catalog voices — the TTS service and the dub + translator accept more languages than the catalog covers.""" + if language is None: + return None + cleaned = language.strip().casefold() + if cleaned in _LANGUAGE_NAMES: + return cleaned + return _NAME_TO_CODE.get(cleaned) + + +def rotation_for(language: str | None) -> tuple[str, ...]: + """The default voice rotation for a language. + + English — and any language without catalog voices — keeps the curated + English rotation; a language with native voices rotates through those, so + a single-voice language always switches to its one voice. + """ + code = language_code(language) + if code is None or code == "en": + return ENGLISH_ROTATION + return tuple(voice for voice, spoken in VOICE_LANGUAGES.items() if spoken == code) + + +def default_voice(language: str | None) -> str: + """The voice used when none is chosen: the language's first rotation voice.""" + return rotation_for(language)[0] diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr index b4476fdd..cd06d620 100644 --- a/tests/__snapshots__/test_snapshots_help_run.ambr +++ b/tests/__snapshots__/test_snapshots_help_run.ambr @@ -244,7 +244,8 @@ │ --voice TEXT Voice id for every speaker (e.g. jane, │ │ michael, paul), or SPEAKER=VOICE to pin a │ │ diarized speaker (repeatable, e.g. --voice │ - │ A=jane). │ + │ A=jane). Default: the target language's │ + │ native voice(s). │ │ --out PATH Output file (default: │ │ .dub. next to the input). │ │ --json -j Emit JSON describing the dubbed file. │ @@ -426,7 +427,10 @@ │ SPEAKER=VOICE for diarized │ │ input (repeatable, e.g. --voice │ │ A=jane). │ - │ --language TEXT Language of the text. │ + │ --language TEXT Language of the text. The │ + │ default voice follows it (e.g. │ + │ --language Italian speaks with │ + │ giovanni). │ │ [default: English] │ │ --sample-rate INTEGER RANGE [x>=1] Output sample rate in Hz │ │ (positive). Server default if │ @@ -441,8 +445,8 @@ Examples Speak text aloud (sandbox only) $ assembly --sandbox speak "Hello there, friend." - Pick a voice and language - $ assembly --sandbox speak "Bonjour" --voice jane --language French + Pick a language (its native voice is selected automatically) + $ assembly --sandbox speak "Bonjour" --language French Speak a diarized transcript, one voice per speaker $ assembly transcribe meeting.mp3 --speaker-labels | assembly --sandbox speak Override a speaker's voice diff --git a/tests/test_dub_pipeline.py b/tests/test_dub_pipeline.py index 78990d0b..200e2e1a 100644 --- a/tests/test_dub_pipeline.py +++ b/tests/test_dub_pipeline.py @@ -93,10 +93,11 @@ def test_run_dub_pipeline_end_to_end( assert "dubbing" in system["content"] assert "German" in system["content"] - # Synthesis: the translated text, rotation voices in speaker order, target language. + # Synthesis: the translated text in the target language, every speaker on + # German's one native voice (the language selects the voice). assert [(cfg.voice, cfg.text) for cfg in fake_synthesize] == [ - ("jane", "DE:Hello."), - ("michael", "DE:World."), + ("juergen", "DE:Hello."), + ("juergen", "DE:World."), ] assert all(cfg.language == "German" for cfg in fake_synthesize) @@ -137,7 +138,7 @@ def test_run_dub_pipeline_end_to_end( "language": "German", "transcript_id": "tr_dub", "utterances": 2, - "speakers": {"A": "jane", "B": "michael"}, + "speakers": {"A": "juergen", "B": "juergen"}, "sample_rate": SAMPLE_RATE, "audio_duration_seconds": 5.0, } @@ -157,7 +158,7 @@ def test_run_dub_human_summary( assert "dub.de.mp4" in out assert "dubbed to German" in out assert "2 utterances" in out - assert "A=jane, B=michael" in out + assert "A=juergen, B=juergen" in out def test_bare_voice_dubs_every_speaker( @@ -173,8 +174,29 @@ def test_voice_overrides_pin_speakers_without_consuming_rotation( ): opts = dataclasses.replace(DEFAULTS, media=str(media), voice=["A=mary"]) _run(opts, json_mode=True) - # A is pinned; B still takes the first rotation voice (overrides don't consume slots). - assert [cfg.voice for cfg in fake_synthesize] == ["mary", "jane"] + # A is pinned; B still takes German's native voice from the rotation. + assert [cfg.voice for cfg in fake_synthesize] == ["mary", "juergen"] + + +def test_english_dub_keeps_the_multi_voice_rotation( + media, fake_transcribe, fake_translate, fake_synthesize, fake_ffmpeg +): + # English has many voices, so speakers still rotate through the curated set + # instead of collapsing onto one voice. + opts = dataclasses.replace(DEFAULTS, media=str(media), language="en") + _run(opts, json_mode=True) + assert [cfg.voice for cfg in fake_synthesize] == ["jane", "michael"] + + +def test_language_without_a_native_voice_falls_back_to_english_rotation( + media, fake_transcribe, fake_translate, fake_synthesize, fake_ffmpeg +): + # Japanese is translatable but has no catalog voice: the dub still runs, + # on the English rotation. + opts = dataclasses.replace(DEFAULTS, media=str(media), language="ja") + _run(opts, json_mode=True) + assert [cfg.voice for cfg in fake_synthesize] == ["jane", "michael"] + assert all(cfg.language == "Japanese" for cfg in fake_synthesize) def test_transcript_id_reuses_existing_transcript( diff --git a/tests/test_speak.py b/tests/test_speak.py index b25cc92b..91a0d782 100644 --- a/tests/test_speak.py +++ b/tests/test_speak.py @@ -115,6 +115,27 @@ def test_voice_and_language_flow_into_config(monkeypatch, fake_synthesize): assert cfg.query_params() == {"voice": "jane", "language": "English"} +def test_default_voice_follows_the_language(monkeypatch, fake_synthesize): + # Each voice speaks one language: with no --voice, a non-English --language + # switches to that language's native voice instead of English "jane". + monkeypatch.setattr("aai_cli.speak_exec.audio.play_pcm", lambda *a, **k: None) + result = runner.invoke(app, ["--sandbox", "speak", "Ciao", "--language", "Italian"]) + assert result.exit_code == 0 + cfg = fake_synthesize["cfg"] + assert cfg.voice == "giovanni" + assert cfg.language == "Italian" + + +def test_explicit_voice_beats_the_language_default(monkeypatch, fake_synthesize): + monkeypatch.setattr("aai_cli.speak_exec.audio.play_pcm", lambda *a, **k: None) + result = runner.invoke( + app, ["--sandbox", "speak", "Bonjour", "--voice", "jane", "--language", "French"] + ) + assert result.exit_code == 0 + # A chosen voice always wins; the language only drives the default. + assert fake_synthesize["cfg"].voice == "jane" + + def test_json_mode_emits_metadata_object_on_stdout(monkeypatch, fake_synthesize): monkeypatch.setattr("aai_cli.speak_exec.audio.play_pcm", lambda *a, **k: None) result = runner.invoke(app, ["--sandbox", "speak", "Hi", "--voice", "jane", "--json"]) @@ -166,6 +187,16 @@ def test_labeled_stdin_uses_dialogue_path_with_default_rotation(fake_dialogue): ] +def test_dialogue_rotation_follows_the_language(fake_dialogue): + # French has exactly one native voice, so every speaker switches to it — + # the language selects the voice in dialogue mode too. + text = "Speaker A: Bonjour.\nSpeaker B: Salut." + result = runner.invoke(app, ["--sandbox", "speak", "--language", "French"], input=text) + assert result.exit_code == 0 + assert fake_dialogue["segments"] == [("estelle", "Bonjour."), ("estelle", "Salut.")] + assert fake_dialogue["language"] == "French" + + def test_speaker_voice_override_is_applied(fake_dialogue): text = "Speaker A: One.\nSpeaker B: Two." result = runner.invoke( diff --git a/tests/test_tts_dialogue.py b/tests/test_tts_dialogue.py index 6f3dcbc8..164cc1b0 100644 --- a/tests/test_tts_dialogue.py +++ b/tests/test_tts_dialogue.py @@ -101,7 +101,3 @@ def test_assign_voices_override_beats_rotation_without_consuming_a_slot(): resolved, mapping = dialogue.assign_voices(segs, ["jane", "michael"], {"a": "vera"}) assert [v for v, _ in resolved] == ["vera", "jane"] assert mapping == {"A": "vera", "B": "jane"} - - -def test_default_rotation_is_the_confirmed_working_voices(): - assert dialogue.DEFAULT_VOICE_ROTATION == ("jane", "michael", "mary", "paul", "eve", "george") diff --git a/tests/test_tts_voices.py b/tests/test_tts_voices.py new file mode 100644 index 00000000..2024547b --- /dev/null +++ b/tests/test_tts_voices.py @@ -0,0 +1,99 @@ +"""Tests for the streaming-TTS voice catalog (aai_cli/tts/voices.py): the +voice -> language mapping and the language-driven default-voice selection +`assembly speak` and `assembly dub` share.""" + +from __future__ import annotations + +import pytest + +from aai_cli.tts import voices + + +def test_voice_languages_catalog(): + # An independent copy of the expected catalog: a silently edited entry in + # the shipped map must fail here, not just round-trip through itself. + assert voices.VOICE_LANGUAGES == { + "alba": "en", + "anna": "en", + "azelma": "en", + "bill_boerst": "en", + "caro_davy": "en", + "charles": "en", + "cosette": "en", + "eponine": "en", + "estelle": "fr", + "eve": "en", + "fantine": "en", + "george": "en", + "giovanni": "it", + "jane": "en", + "javert": "en", + "jean": "en", + "juergen": "de", + "lola": "es", + "marius": "en", + "mary": "en", + "michael": "en", + "paul": "en", + "peter_yearsley": "en", + "rafael": "pt", + "stuart_bell": "en", + "vera": "en", + } + + +def test_english_rotation_is_the_confirmed_working_voices(): + assert voices.ENGLISH_ROTATION == ("jane", "michael", "mary", "paul", "eve", "george") + # Every rotation voice must actually speak English. + assert all(voices.VOICE_LANGUAGES[voice] == "en" for voice in voices.ENGLISH_ROTATION) + + +def test_every_voice_language_has_a_name(): + # rotation_for relies on this: any code a catalog voice speaks must + # normalize through language_code, so a rotation can never come back empty. + for code in set(voices.VOICE_LANGUAGES.values()): + assert voices.language_code(code) == code + assert voices.rotation_for(code) + + +@pytest.mark.parametrize( + ("value", "expected"), + [ + ("de", "de"), # an ISO code passes through + (" DE ", "de"), # trimmed and case-insensitive + ("German", "de"), # a language name maps to its code + ("english", "en"), + ("Portuguese", "pt"), + (None, None), # no language requested + ("Klingon", None), # unknown language -> no catalog voices + ("ja", None), # a translatable language without a voice + ], +) +def test_language_code(value, expected): + assert voices.language_code(value) == expected + + +@pytest.mark.parametrize( + ("language", "expected"), + [ + ("English", voices.ENGLISH_ROTATION), + ("en", voices.ENGLISH_ROTATION), + (None, voices.ENGLISH_ROTATION), # server-default language -> English voices + ("Japanese", voices.ENGLISH_ROTATION), # no native voice -> English fallback + ("Italian", ("giovanni",)), + ("es", ("lola",)), + ("German", ("juergen",)), + ("pt", ("rafael",)), + ("French", ("estelle",)), + ], +) +def test_rotation_for(language, expected): + assert voices.rotation_for(language) == expected + + +@pytest.mark.parametrize( + ("language", "expected"), + [("English", "jane"), ("Italian", "giovanni"), ("fr", "estelle"), ("Klingon", "jane")], +) +def test_default_voice(language, expected): + assert voices.default_voice(language) == expected