From fc5a1beb8888304f4dafb09491834ea507d66c71 Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 12 Jun 2026 22:09:46 +0000 Subject: [PATCH 1/2] =?UTF-8?q?Add=20assembly=20dub:=20transcribe=20?= =?UTF-8?q?=E2=86=92=20translate=20=E2=86=92=20speak=20=E2=86=92=20ffmpeg?= =?UTF-8?q?=20track-swap?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit One command that runs the whole platform end to end: the media is transcribed with diarized utterance timestamps, each utterance is translated to the target language by an LLM Gateway model, each translation is synthesized with streaming TTS (rotation voice per speaker, --voice/SPEAKER=VOICE overrides like speak), the segments are laid on a silence timeline at their original start times, and ffmpeg swaps the new track over the original media with the video stream copied untouched (-map 0:v? -c:v copy, so audio-only input works too). Usage: assembly --sandbox dub talk.mp4 --lang de - --lang takes an ISO code (mapped to a language name) or a name as-is - --transcript-id reuses an existing diarized transcript - default output NAME.dub.LANG.EXT next to the input; --out overrides (and refuses to overwrite the input) - sandbox-only, like speak: streaming TTS has no production host yet Follows the options/run split (commands/dub.py parses argv into a frozen DubOptions; dub_exec.run_dub does the work), with the LLM, TTS, and ffmpeg boundaries seamed for hermetic tests. 
https://claude.ai/code/session_01Mcran5xqMHcrt4RUxSHrkX --- .importlinter | 2 + README.md | 9 + aai_cli/commands/dub.py | 106 +++++ aai_cli/dub_exec.py | 420 ++++++++++++++++++ aai_cli/main.py | 3 + .../test_snapshots_help_run.ambr | 62 +++ tests/_dub_helpers.py | 120 +++++ tests/_snapshot_surface.py | 4 +- tests/test_dub_command.py | 95 ++++ tests/test_dub_exec.py | 238 ++++++++++ tests/test_dub_pipeline.py | 278 ++++++++++++ tests/test_smoke.py | 1 + 12 files changed, 1337 insertions(+), 1 deletion(-) create mode 100644 aai_cli/commands/dub.py create mode 100644 aai_cli/dub_exec.py create mode 100644 tests/_dub_helpers.py create mode 100644 tests/test_dub_command.py create mode 100644 tests/test_dub_exec.py create mode 100644 tests/test_dub_pipeline.py diff --git a/.importlinter b/.importlinter index f835c632..7378b79a 100644 --- a/.importlinter +++ b/.importlinter @@ -19,6 +19,7 @@ source_modules = aai_cli.config_builder aai_cli.context aai_cli.debuglog + aai_cli.dub_exec aai_cli.environments aai_cli.errors aai_cli.eval_data @@ -60,6 +61,7 @@ modules = aai_cli.commands.deploy aai_cli.commands.dev aai_cli.commands.doctor + aai_cli.commands.dub aai_cli.commands.evaluate aai_cli.commands.init aai_cli.commands.keys diff --git a/README.md b/README.md index bcbbdbf2..dafbc37b 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,14 @@ assembly transcribe "https://www.youtube.com/watch?v=awmCtXzFsJo" --speaker-labe `speak` auto-detects `Speaker A:` labels, merges each speaker's turns, and rotates voices. (`speak` is sandbox-only today, hence `--sandbox`.) +**Dub a video into another language** — the whole platform in one command: transcription with utterance timestamps, per-utterance LLM translation, TTS for each line (one voice per speaker), and ffmpeg laying the new track over the original video: + +```sh +assembly --sandbox dub talk.mp4 --lang de +``` + +The video stream is copied untouched; each dubbed line lands at its original start time. 
(Sandbox-only, like `speak`.) + **Turn a podcast into audio** — Apple and Spotify podcast pages work too (yt-dlp ingestion): ```sh @@ -174,6 +182,7 @@ assembly init # scaffold a starter app - **Voice agent**: `assembly agent` runs a full-duplex spoken conversation in your terminal. - **LLM Gateway**: `assembly llm` prompts an LLM over a transcript, stdin, or a live stream (`assembly stream --llm "summarize as I talk"`). - **Transcript-driven clipping**: `assembly clip` cuts an audio/video file (or a YouTube/podcast URL) with ffmpeg by diarized speaker (`--speaker A`), text match (`--search "pricing"`), LLM pick (`--llm "the three best moments"`), or explicit time range (`--range 1:30-2:45`) — transcribing on the fly, reusing a finished transcript with `-t ID`, or reading one from a pipe (`assembly transcribe x.mp4 --speaker-labels --json | assembly clip x.mp4 -t - --llm "…"`). +- **Dubbing**: `assembly dub` re-voices an audio/video file in another language (`assembly --sandbox dub talk.mp4 --lang de`): diarized transcription, per-utterance LLM translation, streaming TTS per speaker, and an ffmpeg track-swap that leaves the video untouched. Sandbox-only today, like `speak`. - **Model evaluation**: `assembly eval` transcribes a Hugging Face dataset (with built-in aliases for common benchmarks: `assembly eval tedlium`) or a local `.csv`/`.jsonl` manifest and scores WER against its references — handy for picking a speech model. - **Starter apps**: `assembly init` scaffolds a self-contained FastAPI + HTML app (`audio-transcription`, `live-captions`, `voice-agent`); `assembly dev` runs it, `assembly share` exposes it on a public URL, and `assembly deploy` ships it to Vercel, Railway, or Fly.io. - **Webhook testing**: `assembly webhooks listen` opens a public dev URL (cloudflared quick tunnel) that prints webhook deliveries as they arrive and can forward them to your local app with `--forward-to`. 
diff --git a/aai_cli/commands/dub.py b/aai_cli/commands/dub.py new file mode 100644 index 00000000..1d3dc552 --- /dev/null +++ b/aai_cli/commands/dub.py @@ -0,0 +1,106 @@ +from __future__ import annotations + +from pathlib import Path + +import typer + +from aai_cli import dub_exec, help_panels, llm, options +from aai_cli.context import run_command +from aai_cli.help_text import examples_epilog + +app = typer.Typer() + + +@app.command( + rich_help_panel=help_panels.TRANSCRIPTION, + # --sandbox is a root flag, so it must come before the subcommand in every example. + epilog=examples_epilog( + [ + ("Dub a talk into German (sandbox only)", "assembly --sandbox dub talk.mp4 --lang de"), + ("Use a language name instead of a code", "assembly --sandbox dub talk.mp4 -l Spanish"), + ( + "Dub every speaker with one voice", + "assembly --sandbox dub talk.mp4 -l fr --voice paul", + ), + ( + "Pin a voice per diarized speaker", + "assembly --sandbox dub panel.mp4 -l de --voice A=jane --voice B=paul", + ), + ( + "Reuse a finished transcript instead of re-transcribing", + "assembly --sandbox dub talk.mp4 -l de -t TRANSCRIPT_ID", + ), + ( + "Choose the output file", + "assembly --sandbox dub talk.mp4 -l de --out talk-german.mp4", + ), + ] + ), +) +def dub( + ctx: typer.Context, + media: str = typer.Argument( + ..., + help="Local audio/video file to dub (the video stream is copied untouched).", + ), + lang: str = typer.Option( + ..., + "--lang", + "-l", + help="Target language: an ISO code (de, fr, es, …) or a language name (German).", + ), + transcript_id: str | None = typer.Option( + None, + "--transcript-id", + "-t", + help="Reuse an existing diarized transcript of this media instead of " + "transcribing it again.", + ), + voice: list[str] = typer.Option( + [], + "--voice", + help="Voice id for every speaker (e.g. jane, michael, paul), or SPEAKER=VOICE " + "to pin a diarized speaker (repeatable, e.g. 
--voice A=jane).", + ), + model: str = typer.Option( + llm.DEFAULT_MODEL, + "--model", + help="LLM Gateway model that translates the utterances.", + rich_help_panel=help_panels.OPT_LLM, + autocompletion=llm.complete_model, + ), + max_tokens: int = typer.Option( + llm.DEFAULT_MAX_TOKENS, + "--max-tokens", + help="Max tokens per utterance translation.", + rich_help_panel=help_panels.OPT_LLM, + ), + out: Path | None = typer.Option( + None, "--out", help="Output file (default: .dub. next to the input)." + ), + json_out: bool = options.json_option("Emit JSON describing the dubbed file."), +) -> None: + """Dub a video or audio file into another language (sandbox only). + + The whole platform in one command: the media is transcribed with diarized + utterance timestamps, each utterance is translated by an LLM Gateway model, + the translations are synthesized with streaming TTS (one voice per + speaker), and ffmpeg lays the new audio over the original — video copied + untouched. Streaming TTS only exists in the sandbox today — run it as + 'assembly --sandbox dub' (--sandbox goes before the subcommand). Requires + ffmpeg. + """ + opts = dub_exec.DubOptions( + media=media, + language=lang, + transcript_id=transcript_id, + voice=voice, + model=model, + max_tokens=max_tokens, + out=out, + ) + run_command( + ctx, + lambda state, json_mode: dub_exec.run_dub(opts, state, json_mode=json_mode), + json=json_out, + ) diff --git a/aai_cli/dub_exec.py b/aai_cli/dub_exec.py new file mode 100644 index 00000000..26040d49 --- /dev/null +++ b/aai_cli/dub_exec.py @@ -0,0 +1,420 @@ +"""Run logic for `assembly dub`: transcribe → translate → synthesize → ffmpeg track-swap. + +The command module (aai_cli/commands/dub.py) only parses argv — it builds a +``DubOptions`` and hands it to ``run_dub`` via ``context.run_command`` (the +options/run split, see AGENTS.md), so tests drive the whole pipeline by +constructing options directly. 
+ +The pipeline runs the platform end to end in one command: the media is +transcribed with diarized utterance timestamps, each utterance is translated to +the target language by an LLM Gateway model, each translation is synthesized +with streaming TTS (one voice per speaker), the segments are laid out on a +silence timeline at their original start times, and ffmpeg swaps the new track +over the original media (video stream copied untouched). Streaming TTS only +exists in the sandbox today, so — like `assembly speak` — the command is +sandbox-only. +""" + +from __future__ import annotations + +import re +import shutil +import subprocess +import tempfile +from dataclasses import dataclass +from pathlib import Path + +import assemblyai as aai +from rich.markup import escape + +from aai_cli import client, environments, jsonshape, output +from aai_cli import llm as gateway +from aai_cli.context import AppState +from aai_cli.errors import APIError, CLIError, UsageError +from aai_cli.tts import audio, dialogue, session +from aai_cli.tts.session import SpeakConfig + +# ISO-639-1 codes accepted by --lang, mapped to the language *name* both the +# translation prompt and the streaming-TTS `language` param expect. A value not +# listed passes through as typed, so a full name ("German") — or an unlisted +# language the gateway can translate to — still works. +LANGUAGE_NAMES = { + "ar": "Arabic", + "de": "German", + "en": "English", + "es": "Spanish", + "fr": "French", + "hi": "Hindi", + "it": "Italian", + "ja": "Japanese", + "ko": "Korean", + "nl": "Dutch", + "pl": "Polish", + "pt": "Portuguese", + "ru": "Russian", + "tr": "Turkish", + "vi": "Vietnamese", + "zh": "Chinese", +} + +# System prompt for the per-utterance translation calls. Length matters: the dub +# replaces speech that occupied a fixed window, so the model is told to keep the +# spoken length close to the original. +TRANSLATION_SYSTEM_TEMPLATE = ( + "You translate dialogue for dubbing. 
Translate the user's text to {language}. " + "Keep the meaning and register, and stay close to the original spoken length so " + "the dub fits the original timing. Reply with only the translated text — no " + "quotes, notes, or extra commentary." +) + + +@dataclass(frozen=True) +class DubOptions: + """Every `assembly dub` flag as plain data (``--json`` excluded: run_command + resolves it into the ``json_mode`` argument).""" + + media: str + language: str + transcript_id: str | None + voice: list[str] + model: str + max_tokens: int + out: Path | None + + +def resolve_language(value: str) -> str: + """The target language name: an ISO code maps to its name, anything else + passes through as typed (the gateway accepts more languages than the map).""" + cleaned = value.strip() + if not cleaned: + raise UsageError( + "--lang needs a language.", + suggestion="Pass an ISO code (--lang de) or a language name (--lang German).", + ) + return LANGUAGE_NAMES.get(cleaned.casefold(), cleaned) + + +def default_out_path(media: Path, language: str) -> Path: + """The default output file: ``.dub.`` next to the input.""" + slug = re.sub(r"[^a-z0-9]+", "-", language.casefold()).strip("-") + return media.parent / f"{media.stem}.dub.{slug}{media.suffix}" + + +def assemble_timeline( + placed: list[tuple[int, bytes]], + sample_rate: int, + total_seconds: float | None, +) -> bytes: + """Lay each ``(start_ms, pcm)`` segment onto a silence timeline. + + Gaps before a segment's start are filled with silence; a segment whose + predecessor overran its start time is appended immediately (the dub drifts + rather than dropping speech). The tail is padded out to ``total_seconds`` + (the source duration) so the dubbed track never ends early. 
+ """ + pcm = bytearray() + for start_ms, segment in placed: + gap = start_ms / 1000 - _pcm_seconds(pcm, sample_rate) + if gap > 0: + pcm.extend(audio.silence(sample_rate, gap)) + pcm.extend(segment) + if total_seconds is not None: + tail = total_seconds - _pcm_seconds(pcm, sample_rate) + if tail > 0: + pcm.extend(audio.silence(sample_rate, tail)) + return bytes(pcm) + + +def _pcm_seconds(pcm: bytes | bytearray, sample_rate: int) -> float: + """Seconds of audio in 16-bit mono PCM: two bytes per sample.""" + return len(pcm) / 2 / sample_rate + + +def _require_sandbox() -> None: + """`assembly dub` synthesizes with streaming TTS, which is sandbox-only today.""" + if not session.is_available(): + raise CLIError( + "assembly dub is only available in the sandbox (it uses streaming TTS).", + error_type="unsupported_environment", + exit_code=2, + suggestion="Re-run as: assembly --sandbox dub … " + f"(--sandbox goes before the command; or use --env {environments.SANDBOX_ENV}).", + ) + + +def _validate_media(media: Path) -> None: + """Reject a missing local source before credential resolution, so a typo'd + path reads as "file not found", never as a login prompt or an ffmpeg error.""" + if not media.exists(): + raise CLIError( + f"File not found: {media}", + error_type="file_not_found", + exit_code=2, + suggestion="Check the path. 
assembly dub needs a local audio/video file.", + ) + if not media.is_file(): + raise CLIError( + f"Not a file: {media}", + error_type="not_a_file", + exit_code=2, + suggestion="Pass a media file, not a directory.", + ) + + +def _validate_out(out: Path, media: Path) -> None: + """The dub must never overwrite its own input: ffmpeg would read and write the + same file concurrently, corrupting it.""" + if out.resolve() == media.resolve(): + raise UsageError( + "--out would overwrite the input file.", + suggestion="Pick a different output path.", + ) + + +def _require_ffmpeg() -> str: + """The ffmpeg executable; checked before any (billed) transcription work.""" + path = shutil.which("ffmpeg") + if path is None: + raise CLIError( + "ffmpeg is required to write the dubbed file, but it isn't on PATH.", + error_type="missing_dependency", + suggestion="Install it (brew install ffmpeg / apt install ffmpeg) and re-run.", + ) + return path + + +def _run_ffmpeg(args: list[str]) -> subprocess.CompletedProcess[str]: + """Boundary seam for tests: one ffmpeg invocation, output captured.""" + return subprocess.run(args, capture_output=True, text=True, check=False) + + +def _mux(ffmpeg: str, media: Path, track: Path, out: Path) -> None: + """Swap ``track`` in as the audio of ``media``, writing ``out``. + + ``-map 0:v?`` carries the video stream over untouched (``-c:v copy``) when + there is one, and maps nothing for audio-only input, so the same invocation + dubs both a video and a plain audio file. ``-y`` makes a re-run overwrite + its own earlier output instead of stalling on ffmpeg's prompt. 
+ """ + result = _run_ffmpeg( + [ + ffmpeg, + "-hide_banner", + "-loglevel", + "error", + "-y", + "-i", + str(media), + "-i", + str(track), + "-map", + "0:v?", + "-map", + "1:a", + "-c:v", + "copy", + str(out), + ] + ) + if result.returncode != 0: + detail = result.stderr.strip().splitlines() + reason = detail[-1] if detail else f"ffmpeg exited with code {result.returncode}" + raise CLIError( + f"Could not write {out.name}: {reason}", + error_type="dub_failed", + suggestion="Check that the input is a readable audio/video file.", + ) + + +@dataclass(frozen=True) +class _Utterance: + """One diarized utterance reduced to the fields the dub pipeline needs.""" + + start_ms: int + speaker: str + text: str + + +def _resolve_transcript( + opts: DubOptions, media: Path, state: AppState, *, json_mode: bool +) -> object: + """The diarized transcript driving the dub: fetched by id, or made fresh from + the (already local) media file — always with speaker labels, so each speaker + can keep a distinct voice in the dub.""" + if opts.transcript_id is not None: + return client.get_transcript(state.resolve_api_key(), opts.transcript_id) + config = aai.TranscriptionConfig(speaker_labels=True) + api_key = state.resolve_api_key() + with output.status("Transcribing for dubbing…", json_mode=json_mode, quiet=state.quiet): + return client.transcribe(api_key, str(media), config=config) + + +def _utterances_of(transcript: object) -> list[_Utterance]: + """The transcript's spoken utterances, with empty-text ones dropped.""" + utterances = [ + _Utterance( + start_ms=jsonshape.as_int(getattr(item, "start", 0)), + speaker=str(getattr(item, "speaker", None) or "A"), + text=str(getattr(item, "text", "") or "").strip(), + ) + for item in jsonshape.object_list(getattr(transcript, "utterances", None)) + ] + spoken = [utterance for utterance in utterances if utterance.text] + if not spoken: + transcript_id = str(getattr(transcript, "id", "")) + raise CLIError( + f"Transcript {transcript_id} has no 
utterances to dub.", + error_type="no_utterances", + exit_code=2, + suggestion=( + "Dubbing needs a diarized transcript. Pass a --transcript-id created " + "with --speaker-labels, or drop -t to let dub transcribe the file." + ), + ) + return spoken + + +def _total_seconds(transcript: object) -> float | None: + """The source duration in seconds (used to pad the dubbed track's tail).""" + duration = getattr(transcript, "audio_duration", None) + if isinstance(duration, int | float) and not isinstance(duration, bool): + return float(duration) + return None + + +def _translate( + api_key: str, + utterances: list[_Utterance], + language: str, + opts: DubOptions, + *, + json_mode: bool, + quiet: bool, +) -> list[str]: + """Translate each utterance to ``language`` with the LLM Gateway, in order. + + One call per utterance keeps the translation↔timestamp alignment exact — + no reply-parsing step that could shift a line against its window. + """ + system = TRANSLATION_SYSTEM_TEMPLATE.format(language=language) + translating = f"Translating {len(utterances)} utterance(s) to {language} with {opts.model}…" + translations: list[str] = [] + with output.status(translating, json_mode=json_mode, quiet=quiet): + for index, utterance in enumerate(utterances, 1): + messages = gateway.build_messages(utterance.text, system=system) + response = gateway.complete( + api_key, model=opts.model, messages=messages, max_tokens=opts.max_tokens + ) + translated = gateway.content_of(response).strip() + if not translated: + raise APIError( + f"The model returned an empty translation for utterance {index} " + f"({utterance.text[:50]!r})." + ) + translations.append(translated) + return translations + + +def _synthesize( + api_key: str, + segments: list[tuple[str, str]], + language: str, + *, + json_mode: bool, + quiet: bool, +) -> tuple[list[bytes], int]: + """Synthesize each ``(voice, text)`` segment; returns the PCM list + sample rate. 
+ + Every segment must come back at one rate — the timeline math places segments + by sample position, so a mid-run rate change would silently shift timing. + """ + synthesizing = f"Synthesizing {len(segments)} segment(s)…" + with output.status(synthesizing, json_mode=json_mode, quiet=quiet): + results = [ + session.synthesize( + api_key, + SpeakConfig(text=text, voice=voice, language=language), + on_warning=lambda m: output.emit_warning(m, json_mode=json_mode), + ) + for voice, text in segments + ] + rates = {result.sample_rate for result in results} + if len(rates) > 1: + raise APIError(f"TTS service returned mixed sample rates ({sorted(rates)}).") + # `segments` is never empty (_utterances_of raised otherwise), so results[0] exists. + return [result.pcm for result in results], results[0].sample_rate + + +def _assign_voices( + utterances: list[_Utterance], + translations: list[str], + voice_values: list[str], +) -> tuple[list[tuple[str, str]], dict[str, str]]: + """Resolve each translated utterance to ``(voice, text)`` plus the speaker→voice map. + + A bare ``--voice`` dubs every speaker with that one voice; ``SPEAKER=VOICE`` + mappings pin individual speakers; everyone else takes the rotation in + first-appearance order (the same rules as `assembly speak`). + """ + bare_voice, overrides = dialogue.parse_voice_overrides(voice_values) + rotation = (bare_voice,) if bare_voice is not None else dialogue.DEFAULT_VOICE_ROTATION + segments = [ + dialogue.Segment(utterance.speaker, translated) + # strict=True is an invariant guard only: _translate returns exactly one + # translation per utterance, so the lengths can never differ. 
+ for utterance, translated in zip(utterances, translations, strict=True) # pragma: no mutate + ] + return dialogue.assign_voices(segments, rotation, overrides) + + +def run_dub(opts: DubOptions, state: AppState, *, json_mode: bool) -> None: + """Execute one `assembly dub` invocation from already-parsed flags.""" + language = resolve_language(opts.language) + _require_sandbox() + media = Path(opts.media) + _validate_media(media) + out = opts.out if opts.out is not None else default_out_path(media, language) + _validate_out(out, media) + ffmpeg = _require_ffmpeg() + + transcript = _resolve_transcript(opts, media, state, json_mode=json_mode) + transcript_id = str(getattr(transcript, "id", "")) + utterances = _utterances_of(transcript) + api_key = state.resolve_api_key() + translations = _translate( + api_key, utterances, language, opts, json_mode=json_mode, quiet=state.quiet + ) + resolved, speakers = _assign_voices(utterances, translations, opts.voice) + pcm_segments, sample_rate = _synthesize( + api_key, resolved, language, json_mode=json_mode, quiet=state.quiet + ) + + # strict=True is an invariant guard only: _synthesize returns one PCM per segment. 
+ starts = (u.start_ms for u in utterances) + placed = list(zip(starts, pcm_segments, strict=True)) # pragma: no mutate + track = assemble_timeline(placed, sample_rate, _total_seconds(transcript)) + with tempfile.TemporaryDirectory(prefix="aai-dub-") as tmp: + wav = Path(tmp) / "dub.wav" + audio.write_wav(wav, track, sample_rate) + with output.status("Writing the dubbed file…", json_mode=json_mode, quiet=state.quiet): + _mux(ffmpeg, media, wav, out) + + duration = round(_pcm_seconds(track, sample_rate), 3) + voices = ", ".join(f"{speaker}={voice}" for speaker, voice in speakers.items()) + payload: dict[str, object] = { + "source": opts.media, + "out": str(out), + "language": language, + "transcript_id": transcript_id, + "utterances": len(utterances), + "speakers": speakers, + "sample_rate": sample_rate, + "audio_duration_seconds": duration, + } + output.emit( + payload, + lambda _: output.success( + f"{escape(str(out))} dubbed to {language} ({len(utterances)} utterances, {voices})" + ), + json_mode=json_mode, + ) diff --git a/aai_cli/main.py b/aai_cli/main.py index bd522934..08be0b4a 100644 --- a/aai_cli/main.py +++ b/aai_cli/main.py @@ -29,6 +29,7 @@ deploy, dev, doctor, + dub, evaluate, init, keys, @@ -70,6 +71,7 @@ "speak", "llm", "clip", + "dub", "eval", "webhooks", # Setup & Tools — get set up & maintain @@ -409,6 +411,7 @@ def main( app.add_typer(speak.app) app.add_typer(llm.app) app.add_typer(clip.app) +app.add_typer(dub.app) app.add_typer(evaluate.app) # eval app.add_typer(account.app) # balance, usage, limits app.add_typer(login.app) # login, logout, whoami diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr index d4f7c2b9..54c83017 100644 --- a/tests/__snapshots__/test_snapshots_help_run.ambr +++ b/tests/__snapshots__/test_snapshots_help_run.ambr @@ -144,6 +144,68 @@ + ''' +# --- +# name: test_command_help_matches_snapshot[dub] + ''' + + Usage: assembly dub [OPTIONS] MEDIA + + Dub a video or 
audio file into another language (sandbox only). + + The whole platform in one command: the media is transcribed with diarized + utterance timestamps, each utterance is translated by an LLM Gateway model, + the translations are synthesized with streaming TTS (one voice per + speaker), and ffmpeg lays the new audio over the original — video copied + untouched. Streaming TTS only exists in the sandbox today — run it as + 'assembly --sandbox dub' (--sandbox goes before the subcommand). Requires + ffmpeg. + + ╭─ Arguments ──────────────────────────────────────────────────────────────────╮ + │ * media TEXT Local audio/video file to dub (the video stream is │ + │ copied untouched). │ + │ [required] │ + ╰──────────────────────────────────────────────────────────────────────────────╯ + ╭─ Options ────────────────────────────────────────────────────────────────────╮ + │ * --lang -l TEXT Target language: an ISO code (de, fr, es, │ + │ …) or a language name (German). │ + │ [required] │ + │ --transcript-id -t TEXT Reuse an existing diarized transcript of │ + │ this media instead of transcribing it │ + │ again. │ + │ --voice TEXT Voice id for every speaker (e.g. jane, │ + │ michael, paul), or SPEAKER=VOICE to pin a │ + │ diarized speaker (repeatable, e.g. --voice │ + │ A=jane). │ + │ --out PATH Output file (default: │ + │ .dub. next to the input). │ + │ --json -j Emit JSON describing the dubbed file. │ + │ --help Show this message and exit. │ + ╰──────────────────────────────────────────────────────────────────────────────╯ + ╭─ LLM Transform ──────────────────────────────────────────────────────────────╮ + │ --model TEXT LLM Gateway model that translates the │ + │ utterances. │ + │ [default: claude-haiku-4-5-20251001] │ + │ --max-tokens INTEGER Max tokens per utterance translation. 
│ + │ [default: 1000] │ + ╰──────────────────────────────────────────────────────────────────────────────╯ + + Examples + Dub a talk into German (sandbox only) + $ assembly --sandbox dub talk.mp4 --lang de + Use a language name instead of a code + $ assembly --sandbox dub talk.mp4 -l Spanish + Dub every speaker with one voice + $ assembly --sandbox dub talk.mp4 -l fr --voice paul + Pin a voice per diarized speaker + $ assembly --sandbox dub panel.mp4 -l de --voice A=jane --voice B=paul + Reuse a finished transcript instead of re-transcribing + $ assembly --sandbox dub talk.mp4 -l de -t TRANSCRIPT_ID + Choose the output file + $ assembly --sandbox dub talk.mp4 -l de --out talk-german.mp4 + + + ''' # --- # name: test_command_help_matches_snapshot[eval] diff --git a/tests/_dub_helpers.py b/tests/_dub_helpers.py new file mode 100644 index 00000000..450d42ba --- /dev/null +++ b/tests/_dub_helpers.py @@ -0,0 +1,120 @@ +"""Shared builders for the `assembly dub` test modules. + +The dub suite is split across test_dub_exec.py (pure helpers + validation), +test_dub_pipeline.py (the faked transcribe → translate → synthesize → mux +runs), and test_dub_command.py (argv parsing); the option defaults, transcript +fakes, and boundary recorders they share live here. +""" + +from __future__ import annotations + +import subprocess +import wave +from pathlib import Path +from types import SimpleNamespace + +import pytest + +from aai_cli import client, config, dub_exec, llm +from aai_cli.dub_exec import DubOptions +from aai_cli.tts import session +from aai_cli.tts.session import SpeakResult + +# The CLI's flag defaults, as data. Tests override per-case with dataclasses.replace. 
+DEFAULTS = DubOptions( + media="talk.mp4", + language="de", + transcript_id=None, + voice=[], + model=llm.DEFAULT_MODEL, + max_tokens=llm.DEFAULT_MAX_TOKENS, + out=None, +) + +SAMPLE_RATE = 100 # tiny rate keeps the timeline byte math exact and readable + + +def utterance(start, speaker, text): + return SimpleNamespace(start=start, end=None, speaker=speaker, text=text) + + +def fake_transcript(utterances, *, audio_duration=5): + return SimpleNamespace(id="tr_dub", utterances=utterances, audio_duration=audio_duration) + + +def completion(text): + """The slice of an OpenAI ChatCompletion that gateway.content_of reads.""" + return SimpleNamespace(choices=[SimpleNamespace(message=SimpleNamespace(content=text))]) + + +def write_media(tmp_path: Path) -> Path: + path = tmp_path / "talk.mp4" + path.write_bytes(b"\x00fake-media") + return path + + +def enable_sandbox(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(session, "is_available", lambda: True) + + +def patch_api_key(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(config, "resolve_api_key", lambda **_: "test-key") + + +def record_transcribe(monkeypatch: pytest.MonkeyPatch) -> dict[str, object]: + """Record the transcription request and return a two-speaker transcript.""" + calls: dict[str, object] = {} + + def _fake(api_key, audio, *, config): + calls["api_key"] = api_key + calls["audio"] = audio + calls["config"] = config + return fake_transcript([utterance(1000, "A", "Hello."), utterance(3000, "B", "World.")]) + + monkeypatch.setattr(client, "transcribe", _fake) + return calls + + +def record_translate(monkeypatch: pytest.MonkeyPatch) -> list[dict[str, object]]: + """Record each gateway call and reply with a marked 'DE:' translation.""" + calls: list[dict[str, object]] = [] + + def _fake(api_key, *, model, messages, max_tokens=llm.DEFAULT_MAX_TOKENS, transcript_id=None): + calls.append({"model": model, "messages": messages, "max_tokens": max_tokens}) + return 
completion(f"DE:{messages[-1]['content']}") + + monkeypatch.setattr(llm, "complete", _fake) + return calls + + +def record_synthesize(monkeypatch: pytest.MonkeyPatch) -> list[object]: + """Record each TTS request; segment i comes back as 100 bytes of 0xA1+i.""" + calls: list[object] = [] + + def _fake(api_key, cfg, *, connect=None, on_warning=None): + calls.append(cfg) + pcm = bytes([0xA0 + len(calls)]) * 100 + return SpeakResult(pcm=pcm, sample_rate=SAMPLE_RATE, audio_duration_seconds=0.5) + + monkeypatch.setattr(session, "synthesize", _fake) + return calls + + +def record_ffmpeg(monkeypatch: pytest.MonkeyPatch) -> dict[str, object]: + """Resolve ffmpeg and record the invocation plus the WAV it was handed. + + The temp WAV is deleted right after the mux, so its contents are captured + here, while the file still exists. + """ + monkeypatch.setattr("shutil.which", lambda name: f"/usr/bin/{name}") + recorded: dict[str, object] = {} + + def run(args: list[str]) -> subprocess.CompletedProcess[str]: + recorded["args"] = args + with wave.open(args[8], "rb") as wav: # args[8] is the dub.wav input + recorded["wav_params"] = wav.getparams() + recorded["wav_frames"] = wav.readframes(wav.getnframes()) + return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="") + + monkeypatch.setattr(dub_exec, "_run_ffmpeg", run) + return recorded diff --git a/tests/_snapshot_surface.py b/tests/_snapshot_surface.py index a329c1c3..8b8727d5 100644 --- a/tests/_snapshot_surface.py +++ b/tests/_snapshot_surface.py @@ -23,7 +23,9 @@ # ``tests/test_snapshots_help_.py`` module suffixes. 
HELP_GROUPS: dict[str, frozenset[str]] = { "build": frozenset({"onboard", "init", "dev", "share", "deploy"}), - "run": frozenset({"transcribe", "stream", "agent", "speak", "llm", "clip", "eval", "webhooks"}), + "run": frozenset( + {"transcribe", "stream", "agent", "speak", "llm", "clip", "dub", "eval", "webhooks"} + ), "tools": frozenset({"doctor", "setup", "telemetry", "_update-check"}), "history": frozenset({"transcripts", "sessions"}), "account": frozenset( diff --git a/tests/test_dub_command.py b/tests/test_dub_command.py new file mode 100644 index 00000000..1a323e23 --- /dev/null +++ b/tests/test_dub_command.py @@ -0,0 +1,95 @@ +"""Argv parsing tests for `assembly dub` (aai_cli/commands/dub.py): the command +module only builds a DubOptions and hands it to dub_exec.run_dub, so these +tests pin the flag -> options mapping and the end-to-end sandbox guard; the +pipeline itself is covered in test_dub_exec.py.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest +from typer.testing import CliRunner + +from aai_cli import dub_exec, llm +from aai_cli.main import app + +runner = CliRunner() + + +@pytest.fixture +def captured_run(monkeypatch: pytest.MonkeyPatch): + """Capture the (opts, json_mode) the command hands to run_dub.""" + seen: dict[str, object] = {} + + def fake_run(opts, state, *, json_mode): + seen["opts"] = opts + seen["json_mode"] = json_mode + + monkeypatch.setattr(dub_exec, "run_dub", fake_run) + return seen + + +def test_lang_is_required(): + result = runner.invoke(app, ["dub", "talk.mp4"]) + assert result.exit_code == 2 + assert "--lang" in result.output + + +def test_production_env_is_rejected_with_sandbox_hint(): + result = runner.invoke(app, ["dub", "talk.mp4", "--lang", "de"]) # default = production + assert result.exit_code == 2 + assert "only available in the sandbox" in result.output + # The suggestion spells out the exact corrected invocation: --sandbox is a root + # flag, so it must go before the command, not 
after it. + assert "Re-run as: assembly --sandbox dub" in result.output + + +def test_defaults_map_to_options(captured_run): + result = runner.invoke(app, ["dub", "talk.mp4", "--lang", "de"]) + assert result.exit_code == 0 + assert captured_run["json_mode"] is False + assert captured_run["opts"] == dub_exec.DubOptions( + media="talk.mp4", + language="de", + transcript_id=None, + voice=[], + model=llm.DEFAULT_MODEL, + max_tokens=llm.DEFAULT_MAX_TOKENS, + out=None, + ) + + +def test_every_flag_maps_to_options(captured_run): + result = runner.invoke( + app, + [ + "dub", + "talk.mp4", + "--lang", + "German", + "-t", + "tr_1", + "--voice", + "A=jane", + "--voice", + "paul", + "--model", + "gpt-5", + "--max-tokens", + "7", + "--out", + "dubbed.mp4", + "--json", + ], + ) + assert result.exit_code == 0 + assert captured_run["json_mode"] is True + assert captured_run["opts"] == dub_exec.DubOptions( + media="talk.mp4", + language="German", + transcript_id="tr_1", + voice=["A=jane", "paul"], + model="gpt-5", + max_tokens=7, + out=Path("dubbed.mp4"), + ) diff --git a/tests/test_dub_exec.py b/tests/test_dub_exec.py new file mode 100644 index 00000000..99cf99fc --- /dev/null +++ b/tests/test_dub_exec.py @@ -0,0 +1,238 @@ +"""Direct tests of the `assembly dub` options/run seam (aai_cli/dub_exec.py): +the pure helpers (language resolution, output naming, timeline assembly, +utterance extraction) and run_dub's validation order. Constructed-options +tests (dataclasses.replace off the shared defaults) avoid any argv +round-trip. 
The faked pipeline runs live in test_dub_pipeline.py; argv +parsing in test_dub_command.py.""" + +from __future__ import annotations + +import dataclasses +import sys +from pathlib import Path +from types import SimpleNamespace + +import pytest + +from aai_cli import dub_exec +from aai_cli.context import AppState +from aai_cli.errors import CLIError, UsageError +from tests._dub_helpers import ( + DEFAULTS, + enable_sandbox, + fake_transcript, + patch_api_key, + utterance, + write_media, +) + + +@pytest.fixture +def media(tmp_path: Path) -> Path: + return write_media(tmp_path) + + +@pytest.fixture +def sandbox(monkeypatch: pytest.MonkeyPatch): + enable_sandbox(monkeypatch) + + +@pytest.fixture(autouse=True) +def _fake_key(monkeypatch: pytest.MonkeyPatch): + patch_api_key(monkeypatch) + + +# --- records and pure helpers -------------------------------------------------- + + +@pytest.mark.parametrize( + "instance", + [DEFAULTS, dub_exec._Utterance(start_ms=0, speaker="A", text="hi")], + ids=["options", "utterance"], +) +def test_records_are_immutable(instance): + field_name = dataclasses.fields(instance)[0].name + with pytest.raises(dataclasses.FrozenInstanceError): + setattr(instance, field_name, None) + + +def test_language_names_map_codes_to_names(): + # An independent copy of the expected table: a silently edited entry in the + # shipped map must fail here, not just round-trip through itself. 
+ assert dub_exec.LANGUAGE_NAMES == { + "ar": "Arabic", + "de": "German", + "en": "English", + "es": "Spanish", + "fr": "French", + "hi": "Hindi", + "it": "Italian", + "ja": "Japanese", + "ko": "Korean", + "nl": "Dutch", + "pl": "Polish", + "pt": "Portuguese", + "ru": "Russian", + "tr": "Turkish", + "vi": "Vietnamese", + "zh": "Chinese", + } + + +@pytest.mark.parametrize( + ("value", "expected"), + [ + ("de", "German"), + (" DE ", "German"), # codes are trimmed and case-insensitive + ("German", "German"), # a full name passes through + (" Klingon ", "Klingon"), # unlisted languages pass through, trimmed + ], +) +def test_resolve_language(value, expected): + assert dub_exec.resolve_language(value) == expected + + +def test_resolve_language_rejects_blank(): + with pytest.raises(UsageError) as exc: + dub_exec.resolve_language(" ") + assert "--lang needs a language" in exc.value.message + assert "--lang de" in (exc.value.suggestion or "") + + +@pytest.mark.parametrize( + ("language", "expected"), + [ + ("German", "talk.dub.german.mp4"), + ("Brazilian Portuguese", "talk.dub.brazilian-portuguese.mp4"), + ], +) +def test_default_out_path(language, expected): + out = dub_exec.default_out_path(Path("/x/talk.mp4"), language) + assert out == Path("/x") / expected + + +def test_assemble_timeline_fills_gaps_and_pads_tail(): + # rate 1000: one second of 16-bit mono PCM is 2000 bytes. + track = dub_exec.assemble_timeline([(500, b"\x01\x02")], 1000, total_seconds=1.0) + # 0.5 s leading silence, the segment, then a 0.499 s tail pad to 1.0 s. + assert track == b"\x00" * 1000 + b"\x01\x02" + b"\x00" * 998 + + +def test_assemble_timeline_overlap_appends_without_silence(): + # The first segment runs to 0.1 s; the second "starts" at 0.05 s, so it is + # appended immediately (the dub drifts) rather than overlapping or crashing. 
+ placed = [(0, b"\x01" * 200), (50, b"\x02\x02")] + track = dub_exec.assemble_timeline(placed, 1000, total_seconds=None) + assert track == b"\x01" * 200 + b"\x02\x02" + + +def test_assemble_timeline_skips_tail_when_track_is_long_enough(): + track = dub_exec.assemble_timeline([(0, b"\x01" * 200)], 1000, total_seconds=0.05) + assert track == b"\x01" * 200 + + +def test_utterances_of_defaults_and_filtering(): + transcript = fake_transcript( + [ + SimpleNamespace(speaker=None, text="Hi"), # no start attr, no speaker label + utterance(2000, "B", None), # no text -> dropped + utterance(3000, "B", " "), # blank text -> dropped + utterance(4000, "C", " Bye "), + ] + ) + assert dub_exec._utterances_of(transcript) == [ + dub_exec._Utterance(start_ms=0, speaker="A", text="Hi"), + dub_exec._Utterance(start_ms=4000, speaker="C", text="Bye"), + ] + + +@pytest.mark.parametrize( + "utterances", + [None, [], [utterance(0, "A", "")]], + ids=["missing", "empty", "all-blank"], +) +def test_utterances_of_requires_spoken_utterances(utterances): + with pytest.raises(CLIError) as exc: + dub_exec._utterances_of(SimpleNamespace(id="tr_x", utterances=utterances)) + assert exc.value.error_type == "no_utterances" + assert exc.value.exit_code == 2 + assert "Transcript tr_x has no utterances to dub" in exc.value.message + assert "--speaker-labels" in (exc.value.suggestion or "") + + +@pytest.mark.parametrize( + ("duration", "expected"), + [(12, 12.0), (4.5, 4.5), (None, None), (True, None), ("90", None)], + ids=["int", "float", "none", "bool", "str"], +) +def test_total_seconds(duration, expected): + transcript = SimpleNamespace(audio_duration=duration) + assert dub_exec._total_seconds(transcript) == expected + + +def test_run_ffmpeg_captures_output_and_does_not_raise(): + # The real boundary (not the fake): output is captured as text and a non-zero + # exit must not raise — _mux turns the exit code into a CLIError itself. 
+ result = dub_exec._run_ffmpeg( + [ + sys.executable, + "-c", + "import sys; print('out'); print('err', file=sys.stderr); sys.exit(3)", + ] + ) + assert result.returncode == 3 + assert result.stdout == "out\n" + assert result.stderr == "err\n" + + +# --- validation order (cheap local checks before any credential or network) ---- + + +def test_run_dub_rejects_blank_language_first(): + opts = dataclasses.replace(DEFAULTS, language=" ") + with pytest.raises(UsageError): # not the sandbox CLIError: language wins + dub_exec.run_dub(opts, AppState(), json_mode=False) + + +def test_run_dub_requires_sandbox(): + # The active environment defaults to production, which has no streaming-TTS host. + with pytest.raises(CLIError) as exc: + dub_exec.run_dub(DEFAULTS, AppState(), json_mode=False) + assert exc.value.error_type == "unsupported_environment" + assert exc.value.exit_code == 2 + assert "only available in the sandbox" in exc.value.message + assert "Re-run as: assembly --sandbox dub" in (exc.value.suggestion or "") + + +def test_run_dub_rejects_missing_file(sandbox, tmp_path): + opts = dataclasses.replace(DEFAULTS, media=str(tmp_path / "nope.mp4")) + with pytest.raises(CLIError) as exc: + dub_exec.run_dub(opts, AppState(), json_mode=False) + assert exc.value.error_type == "file_not_found" + assert exc.value.exit_code == 2 + assert "local audio/video file" in (exc.value.suggestion or "") + + +def test_run_dub_rejects_directory(sandbox, tmp_path): + opts = dataclasses.replace(DEFAULTS, media=str(tmp_path)) + with pytest.raises(CLIError) as exc: + dub_exec.run_dub(opts, AppState(), json_mode=False) + assert exc.value.error_type == "not_a_file" + assert exc.value.exit_code == 2 + assert "not a directory" in (exc.value.suggestion or "") + + +def test_run_dub_refuses_to_overwrite_the_input(sandbox, media): + opts = dataclasses.replace(DEFAULTS, media=str(media), out=media) + with pytest.raises(UsageError) as exc: + dub_exec.run_dub(opts, AppState(), json_mode=False) + assert 
"overwrite the input file" in exc.value.message + + +def test_run_dub_requires_ffmpeg(sandbox, media, monkeypatch): + monkeypatch.setattr("shutil.which", lambda name: None) + opts = dataclasses.replace(DEFAULTS, media=str(media)) + with pytest.raises(CLIError) as exc: + dub_exec.run_dub(opts, AppState(), json_mode=False) + assert exc.value.error_type == "missing_dependency" + assert "ffmpeg" in exc.value.message diff --git a/tests/test_dub_pipeline.py b/tests/test_dub_pipeline.py new file mode 100644 index 00000000..072fb43e --- /dev/null +++ b/tests/test_dub_pipeline.py @@ -0,0 +1,278 @@ +"""Faked end-to-end runs of the `assembly dub` pipeline (aai_cli/dub_exec.py): +the transcribe → translate → synthesize → ffmpeg mux orchestration, voice +assignment, and the failure modes of each boundary. The LLM Gateway, streaming +TTS, and ffmpeg are faked at the modules dub_exec calls into (`llm.complete`, +`session.synthesize`, `client.transcribe`) and at `dub_exec._run_ffmpeg`; the +pure helpers and validation order live in test_dub_exec.py.""" + +from __future__ import annotations + +import dataclasses +import json +import subprocess +from pathlib import Path +from types import SimpleNamespace + +import pytest + +from aai_cli import client, dub_exec, llm +from aai_cli.context import AppState +from aai_cli.errors import APIError, CLIError +from aai_cli.tts import session +from aai_cli.tts.session import SpeakResult +from tests._dub_helpers import ( + DEFAULTS, + SAMPLE_RATE, + completion, + enable_sandbox, + fake_transcript, + patch_api_key, + record_ffmpeg, + record_synthesize, + record_transcribe, + record_translate, + utterance, + write_media, +) + + +@pytest.fixture +def media(tmp_path: Path) -> Path: + return write_media(tmp_path) + + +@pytest.fixture(autouse=True) +def _sandbox_and_key(monkeypatch: pytest.MonkeyPatch): + enable_sandbox(monkeypatch) + patch_api_key(monkeypatch) + + +@pytest.fixture +def fake_transcribe(monkeypatch: pytest.MonkeyPatch): + return 
record_transcribe(monkeypatch) + + +@pytest.fixture +def fake_translate(monkeypatch: pytest.MonkeyPatch): + return record_translate(monkeypatch) + + +@pytest.fixture +def fake_synthesize(monkeypatch: pytest.MonkeyPatch): + return record_synthesize(monkeypatch) + + +@pytest.fixture +def fake_ffmpeg(monkeypatch: pytest.MonkeyPatch): + return record_ffmpeg(monkeypatch) + + +def _run(opts, *, json_mode): + dub_exec.run_dub(opts, AppState(), json_mode=json_mode) + + +def test_run_dub_pipeline_end_to_end( + media, fake_transcribe, fake_translate, fake_synthesize, fake_ffmpeg, capsys +): + opts = dataclasses.replace(DEFAULTS, media=str(media)) + _run(opts, json_mode=True) + + # Transcription: the local file, diarized so speakers keep distinct voices. + assert fake_transcribe["audio"] == str(media) + assert fake_transcribe["config"].speaker_labels is True + + # Translation: one gateway call per utterance, in order, with the dubbing + # system prompt naming the resolved language ("de" -> "German"). + assert [c["messages"][-1]["content"] for c in fake_translate] == ["Hello.", "World."] + for call in fake_translate: + assert call["model"] == llm.DEFAULT_MODEL + assert call["max_tokens"] == llm.DEFAULT_MAX_TOKENS + system = call["messages"][0] + assert system["role"] == "system" + assert "dubbing" in system["content"] + assert "German" in system["content"] + + # Synthesis: the translated text, rotation voices in speaker order, target language. + assert [(cfg.voice, cfg.text) for cfg in fake_synthesize] == [ + ("jane", "DE:Hello."), + ("michael", "DE:World."), + ] + assert all(cfg.language == "German" for cfg in fake_synthesize) + + # The dubbed track: silence to 1.0 s, segment 1, silence to 3.0 s, segment 2, + # then a tail pad out to the source's 5 s duration (rate 100 -> 200 bytes/s). 
+ expected_track = b"\x00" * 200 + b"\xa1" * 100 + b"\x00" * 300 + b"\xa2" * 100 + b"\x00" * 300 + assert fake_ffmpeg["wav_frames"] == expected_track + params = fake_ffmpeg["wav_params"] + assert (params.nchannels, params.sampwidth, params.framerate) == (1, 2, SAMPLE_RATE) + + # The mux: video copied, WAV swapped in as the only audio, default out path. + out = media.parent / "talk.dub.german.mp4" + wav_path = fake_ffmpeg["args"][8] + assert fake_ffmpeg["args"] == [ + "/usr/bin/ffmpeg", + "-hide_banner", + "-loglevel", + "error", + "-y", + "-i", + str(media), + "-i", + wav_path, + "-map", + "0:v?", + "-map", + "1:a", + "-c:v", + "copy", + str(out), + ] + assert wav_path.endswith("dub.wav") + + payload = json.loads(capsys.readouterr().out) + assert payload == { + "source": str(media), + "out": str(out), + "language": "German", + "transcript_id": "tr_dub", + "utterances": 2, + "speakers": {"A": "jane", "B": "michael"}, + "sample_rate": SAMPLE_RATE, + "audio_duration_seconds": 5.0, + } + + +def test_run_dub_human_summary( + media, fake_transcribe, fake_translate, fake_synthesize, fake_ffmpeg, capsys +): + # A short --out keeps the one-line summary under the 80-column console width: + # with the default (tmp_path-prefixed) out path, Rich would hard-wrap the line + # mid-word and these substring asserts would depend on where the break lands. 
+ opts = dataclasses.replace(DEFAULTS, media=str(media), out=Path("dub.de.mp4")) + _run(opts, json_mode=False) + out = capsys.readouterr().out + assert "dub.de.mp4" in out + assert "dubbed to German" in out + assert "2 utterances" in out + assert "A=jane, B=michael" in out + + +def test_bare_voice_dubs_every_speaker( + media, fake_transcribe, fake_translate, fake_synthesize, fake_ffmpeg +): + opts = dataclasses.replace(DEFAULTS, media=str(media), voice=["paul"]) + _run(opts, json_mode=True) + assert [cfg.voice for cfg in fake_synthesize] == ["paul", "paul"] + + +def test_voice_overrides_pin_speakers_without_consuming_rotation( + media, fake_transcribe, fake_translate, fake_synthesize, fake_ffmpeg +): + opts = dataclasses.replace(DEFAULTS, media=str(media), voice=["A=mary"]) + _run(opts, json_mode=True) + # A is pinned; B still takes the first rotation voice (overrides don't consume slots). + assert [cfg.voice for cfg in fake_synthesize] == ["mary", "jane"] + + +def test_transcript_id_reuses_existing_transcript( + media, fake_translate, fake_ffmpeg, monkeypatch, capsys +): + fetched: dict[str, str] = {} + + def get_transcript(api_key, transcript_id): + fetched["id"] = transcript_id + return SimpleNamespace( + id=transcript_id, + utterances=[utterance(0, "A", "Hello.")], + audio_duration=None, # duration unknown -> no tail pad + ) + + monkeypatch.setattr(client, "get_transcript", get_transcript) + monkeypatch.setattr( + client, + "transcribe", + lambda *a, **k: pytest.fail("must not re-transcribe with --transcript-id"), + ) + monkeypatch.setattr( + session, + "synthesize", + lambda api_key, cfg, **_: SpeakResult( + pcm=b"\xaa" * 2000, sample_rate=300, audio_duration_seconds=0.0 + ), + ) + + opts = dataclasses.replace(DEFAULTS, media=str(media), transcript_id="tr_99") + _run(opts, json_mode=True) + assert fetched["id"] == "tr_99" + payload = json.loads(capsys.readouterr().out) + assert payload["transcript_id"] == "tr_99" + # 1000 samples at 300 Hz, rounded to 
milliseconds: 3.3333... -> 3.333. + assert payload["audio_duration_seconds"] == 3.333 + + +def test_empty_translation_is_an_api_error(media, fake_synthesize, fake_ffmpeg, monkeypatch): + long_text = "a" * 50 + "TAIL!" + transcript = fake_transcript([utterance(0, "A", "Hello."), utterance(1000, "B", long_text)]) + monkeypatch.setattr(client, "transcribe", lambda *a, **k: transcript) + replies = iter(["Hallo.", " "]) + monkeypatch.setattr(llm, "complete", lambda *a, **k: completion(next(replies))) + + opts = dataclasses.replace(DEFAULTS, media=str(media)) + with pytest.raises(APIError) as exc: + _run(opts, json_mode=False) + # The 1-based index and the (50-char) text preview pin which utterance failed. + assert f"empty translation for utterance 2 ({'a' * 50!r})." in exc.value.message + + +def test_mixed_sample_rates_are_an_api_error( + media, fake_transcribe, fake_translate, fake_ffmpeg, monkeypatch +): + rates = iter([100, 200]) + monkeypatch.setattr( + session, + "synthesize", + lambda api_key, cfg, **_: SpeakResult( + pcm=b"\x01\x02", sample_rate=next(rates), audio_duration_seconds=0.0 + ), + ) + opts = dataclasses.replace(DEFAULTS, media=str(media)) + with pytest.raises(APIError) as exc: + _run(opts, json_mode=False) + assert "mixed sample rates ([100, 200])" in exc.value.message + + +def test_ffmpeg_failure_reports_last_stderr_line( + media, fake_transcribe, fake_translate, fake_synthesize, monkeypatch +): + monkeypatch.setattr("shutil.which", lambda name: "/usr/bin/ffmpeg") + monkeypatch.setattr( + dub_exec, + "_run_ffmpeg", + lambda args: subprocess.CompletedProcess( + args=args, returncode=1, stdout="", stderr="noise\nInvalid data found\n" + ), + ) + opts = dataclasses.replace(DEFAULTS, media=str(media)) + with pytest.raises(CLIError) as exc: + _run(opts, json_mode=False) + assert exc.value.error_type == "dub_failed" + assert "Could not write talk.dub.german.mp4" in exc.value.message + # The last stderr line is the reason ffmpeg gives; earlier noise is 
dropped. + assert "Invalid data found" in exc.value.message + assert "noise" not in exc.value.message + assert "readable audio/video file" in (exc.value.suggestion or "") + + +def test_ffmpeg_silent_failure_reports_exit_code( + media, fake_transcribe, fake_translate, fake_synthesize, monkeypatch +): + monkeypatch.setattr("shutil.which", lambda name: "/usr/bin/ffmpeg") + monkeypatch.setattr( + dub_exec, + "_run_ffmpeg", + lambda args: subprocess.CompletedProcess(args=args, returncode=3, stdout="", stderr=""), + ) + opts = dataclasses.replace(DEFAULTS, media=str(media)) + with pytest.raises(CLIError) as exc: + _run(opts, json_mode=False) + assert "ffmpeg exited with code 3" in exc.value.message diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 9e32a297..54a9dc28 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -157,6 +157,7 @@ def test_help_lists_commands_in_workflow_order(): "speak", "llm", "clip", + "dub", "eval", "webhooks", # Setup & Tools From f8248f59e92f25d489a4edb39fb892be6850008e Mon Sep 17 00:00:00 2001 From: Claude Date: Fri, 12 Jun 2026 22:28:44 +0000 Subject: [PATCH 2/2] Strip ANSI color in dub CLI-output asserts (CI forced-color failure) CI renders CliRunner output with color, so style codes interleave inside flag names ("--lang") and the human summary line, breaking substring asserts that pass locally without color. Strip SGR sequences first via a shared plain() helper, the same convention test_help_rendering and the clip suite use. 
https://claude.ai/code/session_01Mcran5xqMHcrt4RUxSHrkX --- tests/_dub_helpers.py | 9 +++++++++ tests/test_dub_command.py | 8 +++++--- tests/test_dub_pipeline.py | 5 ++++- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/tests/_dub_helpers.py b/tests/_dub_helpers.py index 450d42ba..d1670d9e 100644 --- a/tests/_dub_helpers.py +++ b/tests/_dub_helpers.py @@ -8,6 +8,7 @@ from __future__ import annotations +import re import subprocess import wave from pathlib import Path @@ -33,6 +34,14 @@ SAMPLE_RATE = 100 # tiny rate keeps the timeline byte math exact and readable +_ANSI_SGR = re.compile(r"\x1b\[[0-9;]*m") + + +def plain(text: str) -> str: + """Strip SGR color codes (CI forces color on, splitting flags like --lang + with style sequences) for substring assertions.""" + return _ANSI_SGR.sub("", text) + def utterance(start, speaker, text): return SimpleNamespace(start=start, end=None, speaker=speaker, text=text) diff --git a/tests/test_dub_command.py b/tests/test_dub_command.py index 1a323e23..b30cb784 100644 --- a/tests/test_dub_command.py +++ b/tests/test_dub_command.py @@ -12,6 +12,7 @@ from aai_cli import dub_exec, llm from aai_cli.main import app +from tests._dub_helpers import plain runner = CliRunner() @@ -32,16 +33,17 @@ def fake_run(opts, state, *, json_mode): def test_lang_is_required(): result = runner.invoke(app, ["dub", "talk.mp4"]) assert result.exit_code == 2 - assert "--lang" in result.output + assert "--lang" in plain(result.output) def test_production_env_is_rejected_with_sandbox_hint(): result = runner.invoke(app, ["dub", "talk.mp4", "--lang", "de"]) # default = production assert result.exit_code == 2 - assert "only available in the sandbox" in result.output + output = plain(result.output) + assert "only available in the sandbox" in output # The suggestion spells out the exact corrected invocation: --sandbox is a root # flag, so it must go before the command, not after it. 
- assert "Re-run as: assembly --sandbox dub" in result.output + assert "Re-run as: assembly --sandbox dub" in output def test_defaults_map_to_options(captured_run): diff --git a/tests/test_dub_pipeline.py b/tests/test_dub_pipeline.py index 072fb43e..78990d0b 100644 --- a/tests/test_dub_pipeline.py +++ b/tests/test_dub_pipeline.py @@ -27,6 +27,7 @@ enable_sandbox, fake_transcript, patch_api_key, + plain, record_ffmpeg, record_synthesize, record_transcribe, @@ -150,7 +151,9 @@ def test_run_dub_human_summary( # mid-word and these substring asserts would depend on where the break lands. opts = dataclasses.replace(DEFAULTS, media=str(media), out=Path("dub.de.mp4")) _run(opts, json_mode=False) - out = capsys.readouterr().out + # plain(): under FORCE_COLOR (CI) Rich's repr highlighter interleaves style + # codes inside the line ("(2 utterances" renders with the 2 colored). + out = plain(capsys.readouterr().out) assert "dub.de.mp4" in out assert "dubbed to German" in out assert "2 utterances" in out