diff --git a/.importlinter b/.importlinter index 15dd3336..9a666fdc 100644 --- a/.importlinter +++ b/.importlinter @@ -20,6 +20,7 @@ source_modules = aai_cli.context aai_cli.debuglog aai_cli.dictate_exec + aai_cli.dub_exec aai_cli.environments aai_cli.errors aai_cli.eval_data @@ -65,6 +66,7 @@ modules = aai_cli.commands.dev aai_cli.commands.dictate aai_cli.commands.doctor + aai_cli.commands.dub aai_cli.commands.evaluate aai_cli.commands.init aai_cli.commands.keys diff --git a/README.md b/README.md index 50b32b85..a83cb5bb 100644 --- a/README.md +++ b/README.md @@ -32,6 +32,14 @@ assembly transcribe "https://www.youtube.com/watch?v=awmCtXzFsJo" --speaker-labe `speak` auto-detects `Speaker A:` labels, merges each speaker's turns, and rotates voices. (`speak` is sandbox-only today, hence `--sandbox`.) +**Dub a video into another language** — the whole platform in one command: transcription with utterance timestamps, per-utterance LLM translation, TTS for each line (one voice per speaker), and ffmpeg laying the new track over the original video: + +```sh +assembly --sandbox dub talk.mp4 --lang de +``` + +The video stream is copied untouched; each dubbed line lands at its original start time. (Sandbox-only, like `speak`.) + **Turn a podcast into audio** — Apple and Spotify podcast pages work too (yt-dlp ingestion): ```sh @@ -183,6 +191,7 @@ assembly init # scaffold a starter app - **Voice agent**: `assembly agent` runs a full-duplex spoken conversation in your terminal. - **LLM Gateway**: `assembly llm` prompts an LLM over a transcript, stdin, or a live stream (`assembly stream --llm "summarize as I talk"`). 
- **Transcript-driven clipping**: `assembly clip` cuts an audio/video file (or a YouTube/podcast URL) with ffmpeg by diarized speaker (`--speaker A`), text match (`--search "pricing"`), LLM pick (`--llm "the three best moments"`), or explicit time range (`--range 1:30-2:45`) — transcribing on the fly, reusing a finished transcript with `-t ID`, or reading one from a pipe (`assembly transcribe x.mp4 --speaker-labels --json | assembly clip x.mp4 -t - --llm "…"`). Clip boundaries snap into nearby silence (ffmpeg `silencedetect`) so cuts don't land mid-word; `--no-snap` cuts at the exact selected times. +- **Dubbing**: `assembly dub` re-voices an audio/video file in another language (`assembly --sandbox dub talk.mp4 --lang de`): diarized transcription, per-utterance LLM translation, streaming TTS per speaker, and an ffmpeg track-swap that leaves the video untouched. Sandbox-only today, like `speak`. - **Model evaluation**: `assembly eval` transcribes a Hugging Face dataset (with built-in aliases for common benchmarks: `assembly eval tedlium`) or a local `.csv`/`.jsonl` manifest and scores WER against its references — handy for picking a speech model. - **Starter apps**: `assembly init` scaffolds a self-contained FastAPI + HTML app (`audio-transcription`, `live-captions`, `voice-agent`); `assembly dev` runs it, `assembly share` exposes it on a public URL, and `assembly deploy` ships it to Vercel, Railway, or Fly.io. - **Webhook testing**: `assembly webhooks listen` opens a public dev URL (cloudflared quick tunnel) that prints webhook deliveries as they arrive and can forward them to your local app with `--forward-to`. 
diff --git a/aai_cli/commands/dub.py b/aai_cli/commands/dub.py new file mode 100644 index 00000000..1d3dc552 --- /dev/null +++ b/aai_cli/commands/dub.py @@ -0,0 +1,106 @@ +from __future__ import annotations + +from pathlib import Path + +import typer + +from aai_cli import dub_exec, help_panels, llm, options +from aai_cli.context import run_command +from aai_cli.help_text import examples_epilog + +app = typer.Typer() + + +@app.command( + rich_help_panel=help_panels.TRANSCRIPTION, + # --sandbox is a root flag, so it must come before the subcommand in every example. + epilog=examples_epilog( + [ + ("Dub a talk into German (sandbox only)", "assembly --sandbox dub talk.mp4 --lang de"), + ("Use a language name instead of a code", "assembly --sandbox dub talk.mp4 -l Spanish"), + ( + "Dub every speaker with one voice", + "assembly --sandbox dub talk.mp4 -l fr --voice paul", + ), + ( + "Pin a voice per diarized speaker", + "assembly --sandbox dub panel.mp4 -l de --voice A=jane --voice B=paul", + ), + ( + "Reuse a finished transcript instead of re-transcribing", + "assembly --sandbox dub talk.mp4 -l de -t TRANSCRIPT_ID", + ), + ( + "Choose the output file", + "assembly --sandbox dub talk.mp4 -l de --out talk-german.mp4", + ), + ] + ), +) +def dub( + ctx: typer.Context, + media: str = typer.Argument( + ..., + help="Local audio/video file to dub (the video stream is copied untouched).", + ), + lang: str = typer.Option( + ..., + "--lang", + "-l", + help="Target language: an ISO code (de, fr, es, …) or a language name (German).", + ), + transcript_id: str | None = typer.Option( + None, + "--transcript-id", + "-t", + help="Reuse an existing diarized transcript of this media instead of " + "transcribing it again.", + ), + voice: list[str] = typer.Option( + [], + "--voice", + help="Voice id for every speaker (e.g. jane, michael, paul), or SPEAKER=VOICE " + "to pin a diarized speaker (repeatable, e.g. 
--voice A=jane).", + ), + model: str = typer.Option( + llm.DEFAULT_MODEL, + "--model", + help="LLM Gateway model that translates the utterances.", + rich_help_panel=help_panels.OPT_LLM, + autocompletion=llm.complete_model, + ), + max_tokens: int = typer.Option( + llm.DEFAULT_MAX_TOKENS, + "--max-tokens", + help="Max tokens per utterance translation.", + rich_help_panel=help_panels.OPT_LLM, + ), + out: Path | None = typer.Option( + None, "--out", help="Output file (default: .dub. next to the input)." + ), + json_out: bool = options.json_option("Emit JSON describing the dubbed file."), +) -> None: + """Dub a video or audio file into another language (sandbox only). + + The whole platform in one command: the media is transcribed with diarized + utterance timestamps, each utterance is translated by an LLM Gateway model, + the translations are synthesized with streaming TTS (one voice per + speaker), and ffmpeg lays the new audio over the original — video copied + untouched. Streaming TTS only exists in the sandbox today — run it as + 'assembly --sandbox dub' (--sandbox goes before the subcommand). Requires + ffmpeg. + """ + opts = dub_exec.DubOptions( + media=media, + language=lang, + transcript_id=transcript_id, + voice=voice, + model=model, + max_tokens=max_tokens, + out=out, + ) + run_command( + ctx, + lambda state, json_mode: dub_exec.run_dub(opts, state, json_mode=json_mode), + json=json_out, + ) diff --git a/aai_cli/dub_exec.py b/aai_cli/dub_exec.py new file mode 100644 index 00000000..26040d49 --- /dev/null +++ b/aai_cli/dub_exec.py @@ -0,0 +1,420 @@ +"""Run logic for `assembly dub`: transcribe → translate → synthesize → ffmpeg track-swap. + +The command module (aai_cli/commands/dub.py) only parses argv — it builds a +``DubOptions`` and hands it to ``run_dub`` via ``context.run_command`` (the +options/run split, see AGENTS.md), so tests drive the whole pipeline by +constructing options directly. 
+ +The pipeline runs the platform end to end in one command: the media is +transcribed with diarized utterance timestamps, each utterance is translated to +the target language by an LLM Gateway model, each translation is synthesized +with streaming TTS (one voice per speaker), the segments are laid out on a +silence timeline at their original start times, and ffmpeg swaps the new track +over the original media (video stream copied untouched). Streaming TTS only +exists in the sandbox today, so — like `assembly speak` — the command is +sandbox-only. +""" + +from __future__ import annotations + +import re +import shutil +import subprocess +import tempfile +from dataclasses import dataclass +from pathlib import Path + +import assemblyai as aai +from rich.markup import escape + +from aai_cli import client, environments, jsonshape, output +from aai_cli import llm as gateway +from aai_cli.context import AppState +from aai_cli.errors import APIError, CLIError, UsageError +from aai_cli.tts import audio, dialogue, session +from aai_cli.tts.session import SpeakConfig + +# ISO-639-1 codes accepted by --lang, mapped to the language *name* both the +# translation prompt and the streaming-TTS `language` param expect. A value not +# listed passes through as typed, so a full name ("German") — or an unlisted +# language the gateway can translate to — still works. +LANGUAGE_NAMES = { + "ar": "Arabic", + "de": "German", + "en": "English", + "es": "Spanish", + "fr": "French", + "hi": "Hindi", + "it": "Italian", + "ja": "Japanese", + "ko": "Korean", + "nl": "Dutch", + "pl": "Polish", + "pt": "Portuguese", + "ru": "Russian", + "tr": "Turkish", + "vi": "Vietnamese", + "zh": "Chinese", +} + +# System prompt for the per-utterance translation calls. Length matters: the dub +# replaces speech that occupied a fixed window, so the model is told to keep the +# spoken length close to the original. +TRANSLATION_SYSTEM_TEMPLATE = ( + "You translate dialogue for dubbing. 
Translate the user's text to {language}. " + "Keep the meaning and register, and stay close to the original spoken length so " + "the dub fits the original timing. Reply with only the translated text — no " + "quotes, notes, or extra commentary." +) + + +@dataclass(frozen=True) +class DubOptions: + """Every `assembly dub` flag as plain data (``--json`` excluded: run_command + resolves it into the ``json_mode`` argument).""" + + media: str + language: str + transcript_id: str | None + voice: list[str] + model: str + max_tokens: int + out: Path | None + + +def resolve_language(value: str) -> str: + """The target language name: an ISO code maps to its name, anything else + passes through as typed (the gateway accepts more languages than the map).""" + cleaned = value.strip() + if not cleaned: + raise UsageError( + "--lang needs a language.", + suggestion="Pass an ISO code (--lang de) or a language name (--lang German).", + ) + return LANGUAGE_NAMES.get(cleaned.casefold(), cleaned) + + +def default_out_path(media: Path, language: str) -> Path: + """The default output file: ``{stem}.dub.{language-slug}{suffix}`` next to the input.""" + slug = re.sub(r"[^a-z0-9]+", "-", language.casefold()).strip("-") + return media.parent / f"{media.stem}.dub.{slug}{media.suffix}" + + +def assemble_timeline( + placed: list[tuple[int, bytes]], + sample_rate: int, + total_seconds: float | None, +) -> bytes: + """Lay each ``(start_ms, pcm)`` segment onto a silence timeline. + + Gaps before a segment's start are filled with silence; a segment whose + predecessor overran its start time is appended immediately (the dub drifts + rather than dropping speech). The tail is padded out to ``total_seconds`` + (the source duration) so the dubbed track never ends early. 
+ """ + pcm = bytearray() + for start_ms, segment in placed: + gap = start_ms / 1000 - _pcm_seconds(pcm, sample_rate) + if gap > 0: + pcm.extend(audio.silence(sample_rate, gap)) + pcm.extend(segment) + if total_seconds is not None: + tail = total_seconds - _pcm_seconds(pcm, sample_rate) + if tail > 0: + pcm.extend(audio.silence(sample_rate, tail)) + return bytes(pcm) + + +def _pcm_seconds(pcm: bytes | bytearray, sample_rate: int) -> float: + """Seconds of audio in 16-bit mono PCM: two bytes per sample.""" + return len(pcm) / 2 / sample_rate + + +def _require_sandbox() -> None: + """`assembly dub` synthesizes with streaming TTS, which is sandbox-only today.""" + if not session.is_available(): + raise CLIError( + "assembly dub is only available in the sandbox (it uses streaming TTS).", + error_type="unsupported_environment", + exit_code=2, + suggestion="Re-run as: assembly --sandbox dub … " + f"(--sandbox goes before the command; or use --env {environments.SANDBOX_ENV}).", + ) + + +def _validate_media(media: Path) -> None: + """Reject a missing local source before credential resolution, so a typo'd + path reads as "file not found", never as a login prompt or an ffmpeg error.""" + if not media.exists(): + raise CLIError( + f"File not found: {media}", + error_type="file_not_found", + exit_code=2, + suggestion="Check the path. 
assembly dub needs a local audio/video file.", + ) + if not media.is_file(): + raise CLIError( + f"Not a file: {media}", + error_type="not_a_file", + exit_code=2, + suggestion="Pass a media file, not a directory.", + ) + + +def _validate_out(out: Path, media: Path) -> None: + """The dub must never overwrite its own input: ffmpeg would read and write the + same file concurrently, corrupting it.""" + if out.resolve() == media.resolve(): + raise UsageError( + "--out would overwrite the input file.", + suggestion="Pick a different output path.", + ) + + +def _require_ffmpeg() -> str: + """The ffmpeg executable; checked before any (billed) transcription work.""" + path = shutil.which("ffmpeg") + if path is None: + raise CLIError( + "ffmpeg is required to write the dubbed file, but it isn't on PATH.", + error_type="missing_dependency", + suggestion="Install it (brew install ffmpeg / apt install ffmpeg) and re-run.", + ) + return path + + +def _run_ffmpeg(args: list[str]) -> subprocess.CompletedProcess[str]: + """Boundary seam for tests: one ffmpeg invocation, output captured.""" + return subprocess.run(args, capture_output=True, text=True, check=False) + + +def _mux(ffmpeg: str, media: Path, track: Path, out: Path) -> None: + """Swap ``track`` in as the audio of ``media``, writing ``out``. + + ``-map 0:v?`` carries the video stream over untouched (``-c:v copy``) when + there is one, and maps nothing for audio-only input, so the same invocation + dubs both a video and a plain audio file. ``-y`` makes a re-run overwrite + its own earlier output instead of stalling on ffmpeg's prompt. 
+ """ + result = _run_ffmpeg( + [ + ffmpeg, + "-hide_banner", + "-loglevel", + "error", + "-y", + "-i", + str(media), + "-i", + str(track), + "-map", + "0:v?", + "-map", + "1:a", + "-c:v", + "copy", + str(out), + ] + ) + if result.returncode != 0: + detail = result.stderr.strip().splitlines() + reason = detail[-1] if detail else f"ffmpeg exited with code {result.returncode}" + raise CLIError( + f"Could not write {out.name}: {reason}", + error_type="dub_failed", + suggestion="Check that the input is a readable audio/video file.", + ) + + +@dataclass(frozen=True) +class _Utterance: + """One diarized utterance reduced to the fields the dub pipeline needs.""" + + start_ms: int + speaker: str + text: str + + +def _resolve_transcript( + opts: DubOptions, media: Path, state: AppState, *, json_mode: bool +) -> object: + """The diarized transcript driving the dub: fetched by id, or made fresh from + the (already local) media file — always with speaker labels, so each speaker + can keep a distinct voice in the dub.""" + if opts.transcript_id is not None: + return client.get_transcript(state.resolve_api_key(), opts.transcript_id) + config = aai.TranscriptionConfig(speaker_labels=True) + api_key = state.resolve_api_key() + with output.status("Transcribing for dubbing…", json_mode=json_mode, quiet=state.quiet): + return client.transcribe(api_key, str(media), config=config) + + +def _utterances_of(transcript: object) -> list[_Utterance]: + """The transcript's spoken utterances, with empty-text ones dropped.""" + utterances = [ + _Utterance( + start_ms=jsonshape.as_int(getattr(item, "start", 0)), + speaker=str(getattr(item, "speaker", None) or "A"), + text=str(getattr(item, "text", "") or "").strip(), + ) + for item in jsonshape.object_list(getattr(transcript, "utterances", None)) + ] + spoken = [utterance for utterance in utterances if utterance.text] + if not spoken: + transcript_id = str(getattr(transcript, "id", "")) + raise CLIError( + f"Transcript {transcript_id} has no 
utterances to dub.", + error_type="no_utterances", + exit_code=2, + suggestion=( + "Dubbing needs a diarized transcript. Pass a --transcript-id created " + "with --speaker-labels, or drop -t to let dub transcribe the file." + ), + ) + return spoken + + +def _total_seconds(transcript: object) -> float | None: + """The source duration in seconds (used to pad the dubbed track's tail).""" + duration = getattr(transcript, "audio_duration", None) + if isinstance(duration, int | float) and not isinstance(duration, bool): + return float(duration) + return None + + +def _translate( + api_key: str, + utterances: list[_Utterance], + language: str, + opts: DubOptions, + *, + json_mode: bool, + quiet: bool, +) -> list[str]: + """Translate each utterance to ``language`` with the LLM Gateway, in order. + + One call per utterance keeps the translation↔timestamp alignment exact — + no reply-parsing step that could shift a line against its window. + """ + system = TRANSLATION_SYSTEM_TEMPLATE.format(language=language) + translating = f"Translating {len(utterances)} utterance(s) to {language} with {opts.model}…" + translations: list[str] = [] + with output.status(translating, json_mode=json_mode, quiet=quiet): + for index, utterance in enumerate(utterances, 1): + messages = gateway.build_messages(utterance.text, system=system) + response = gateway.complete( + api_key, model=opts.model, messages=messages, max_tokens=opts.max_tokens + ) + translated = gateway.content_of(response).strip() + if not translated: + raise APIError( + f"The model returned an empty translation for utterance {index} " + f"({utterance.text[:50]!r})." + ) + translations.append(translated) + return translations + + +def _synthesize( + api_key: str, + segments: list[tuple[str, str]], + language: str, + *, + json_mode: bool, + quiet: bool, +) -> tuple[list[bytes], int]: + """Synthesize each ``(voice, text)`` segment; returns the PCM list + sample rate. 
+ + Every segment must come back at one rate — the timeline math places segments + by sample position, so a mid-run rate change would silently shift timing. + """ + synthesizing = f"Synthesizing {len(segments)} segment(s)…" + with output.status(synthesizing, json_mode=json_mode, quiet=quiet): + results = [ + session.synthesize( + api_key, + SpeakConfig(text=text, voice=voice, language=language), + on_warning=lambda m: output.emit_warning(m, json_mode=json_mode), + ) + for voice, text in segments + ] + rates = {result.sample_rate for result in results} + if len(rates) > 1: + raise APIError(f"TTS service returned mixed sample rates ({sorted(rates)}).") + # `segments` is never empty (_utterances_of raised otherwise), so results[0] exists. + return [result.pcm for result in results], results[0].sample_rate + + +def _assign_voices( + utterances: list[_Utterance], + translations: list[str], + voice_values: list[str], +) -> tuple[list[tuple[str, str]], dict[str, str]]: + """Resolve each translated utterance to ``(voice, text)`` plus the speaker→voice map. + + A bare ``--voice`` dubs every speaker with that one voice; ``SPEAKER=VOICE`` + mappings pin individual speakers; everyone else takes the rotation in + first-appearance order (the same rules as `assembly speak`). + """ + bare_voice, overrides = dialogue.parse_voice_overrides(voice_values) + rotation = (bare_voice,) if bare_voice is not None else dialogue.DEFAULT_VOICE_ROTATION + segments = [ + dialogue.Segment(utterance.speaker, translated) + # strict=True is an invariant guard only: _translate returns exactly one + # translation per utterance, so the lengths can never differ. 
+ for utterance, translated in zip(utterances, translations, strict=True) # pragma: no mutate + ] + return dialogue.assign_voices(segments, rotation, overrides) + + +def run_dub(opts: DubOptions, state: AppState, *, json_mode: bool) -> None: + """Execute one `assembly dub` invocation from already-parsed flags.""" + language = resolve_language(opts.language) + _require_sandbox() + media = Path(opts.media) + _validate_media(media) + out = opts.out if opts.out is not None else default_out_path(media, language) + _validate_out(out, media) + ffmpeg = _require_ffmpeg() + + transcript = _resolve_transcript(opts, media, state, json_mode=json_mode) + transcript_id = str(getattr(transcript, "id", "")) + utterances = _utterances_of(transcript) + api_key = state.resolve_api_key() + translations = _translate( + api_key, utterances, language, opts, json_mode=json_mode, quiet=state.quiet + ) + resolved, speakers = _assign_voices(utterances, translations, opts.voice) + pcm_segments, sample_rate = _synthesize( + api_key, resolved, language, json_mode=json_mode, quiet=state.quiet + ) + + # strict=True is an invariant guard only: _synthesize returns one PCM per segment. 
+ starts = (u.start_ms for u in utterances) + placed = list(zip(starts, pcm_segments, strict=True)) # pragma: no mutate + track = assemble_timeline(placed, sample_rate, _total_seconds(transcript)) + with tempfile.TemporaryDirectory(prefix="aai-dub-") as tmp: + wav = Path(tmp) / "dub.wav" + audio.write_wav(wav, track, sample_rate) + with output.status("Writing the dubbed file…", json_mode=json_mode, quiet=state.quiet): + _mux(ffmpeg, media, wav, out) + + duration = round(_pcm_seconds(track, sample_rate), 3) + voices = ", ".join(f"{speaker}={voice}" for speaker, voice in speakers.items()) + payload: dict[str, object] = { + "source": opts.media, + "out": str(out), + "language": language, + "transcript_id": transcript_id, + "utterances": len(utterances), + "speakers": speakers, + "sample_rate": sample_rate, + "audio_duration_seconds": duration, + } + output.emit( + payload, + lambda _: output.success( + f"{escape(str(out))} dubbed to {language} ({len(utterances)} utterances, {voices})" + ), + json_mode=json_mode, + ) diff --git a/aai_cli/main.py b/aai_cli/main.py index e3d9def5..001c2434 100644 --- a/aai_cli/main.py +++ b/aai_cli/main.py @@ -30,6 +30,7 @@ dev, dictate, doctor, + dub, evaluate, init, keys, @@ -72,6 +73,7 @@ "speak", "llm", "clip", + "dub", "eval", "webhooks", # Setup & Tools — get set up & maintain @@ -412,6 +414,7 @@ def main( app.add_typer(speak.app) app.add_typer(llm.app) app.add_typer(clip.app) +app.add_typer(dub.app) app.add_typer(evaluate.app) # eval app.add_typer(account.app) # balance, usage, limits app.add_typer(login.app) # login, logout, whoami diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr index 98c8b63d..b4476fdd 100644 --- a/tests/__snapshots__/test_snapshots_help_run.ambr +++ b/tests/__snapshots__/test_snapshots_help_run.ambr @@ -212,6 +212,68 @@ + ''' +# --- +# name: test_command_help_matches_snapshot[dub] + ''' + + Usage: assembly dub [OPTIONS] MEDIA + + Dub a video or 
audio file into another language (sandbox only). + + The whole platform in one command: the media is transcribed with diarized + utterance timestamps, each utterance is translated by an LLM Gateway model, + the translations are synthesized with streaming TTS (one voice per + speaker), and ffmpeg lays the new audio over the original — video copied + untouched. Streaming TTS only exists in the sandbox today — run it as + 'assembly --sandbox dub' (--sandbox goes before the subcommand). Requires + ffmpeg. + + ╭─ Arguments ──────────────────────────────────────────────────────────────────╮ + │ * media TEXT Local audio/video file to dub (the video stream is │ + │ copied untouched). │ + │ [required] │ + ╰──────────────────────────────────────────────────────────────────────────────╯ + ╭─ Options ────────────────────────────────────────────────────────────────────╮ + │ * --lang -l TEXT Target language: an ISO code (de, fr, es, │ + │ …) or a language name (German). │ + │ [required] │ + │ --transcript-id -t TEXT Reuse an existing diarized transcript of │ + │ this media instead of transcribing it │ + │ again. │ + │ --voice TEXT Voice id for every speaker (e.g. jane, │ + │ michael, paul), or SPEAKER=VOICE to pin a │ + │ diarized speaker (repeatable, e.g. --voice │ + │ A=jane). │ + │ --out PATH Output file (default: │ + │ .dub. next to the input). │ + │ --json -j Emit JSON describing the dubbed file. │ + │ --help Show this message and exit. │ + ╰──────────────────────────────────────────────────────────────────────────────╯ + ╭─ LLM Transform ──────────────────────────────────────────────────────────────╮ + │ --model TEXT LLM Gateway model that translates the │ + │ utterances. │ + │ [default: claude-haiku-4-5-20251001] │ + │ --max-tokens INTEGER Max tokens per utterance translation. 
│ + │ [default: 1000] │ + ╰──────────────────────────────────────────────────────────────────────────────╯ + + Examples + Dub a talk into German (sandbox only) + $ assembly --sandbox dub talk.mp4 --lang de + Use a language name instead of a code + $ assembly --sandbox dub talk.mp4 -l Spanish + Dub every speaker with one voice + $ assembly --sandbox dub talk.mp4 -l fr --voice paul + Pin a voice per diarized speaker + $ assembly --sandbox dub panel.mp4 -l de --voice A=jane --voice B=paul + Reuse a finished transcript instead of re-transcribing + $ assembly --sandbox dub talk.mp4 -l de -t TRANSCRIPT_ID + Choose the output file + $ assembly --sandbox dub talk.mp4 -l de --out talk-german.mp4 + + + ''' # --- # name: test_command_help_matches_snapshot[eval] diff --git a/tests/_dub_helpers.py b/tests/_dub_helpers.py new file mode 100644 index 00000000..d1670d9e --- /dev/null +++ b/tests/_dub_helpers.py @@ -0,0 +1,129 @@ +"""Shared builders for the `assembly dub` test modules. + +The dub suite is split across test_dub_exec.py (pure helpers + validation), +test_dub_pipeline.py (the faked transcribe → translate → synthesize → mux +runs), and test_dub_command.py (argv parsing); the option defaults, transcript +fakes, and boundary recorders they share live here. +""" + +from __future__ import annotations + +import re +import subprocess +import wave +from pathlib import Path +from types import SimpleNamespace + +import pytest + +from aai_cli import client, config, dub_exec, llm +from aai_cli.dub_exec import DubOptions +from aai_cli.tts import session +from aai_cli.tts.session import SpeakResult + +# The CLI's flag defaults, as data. Tests override per-case with dataclasses.replace. 
+DEFAULTS = DubOptions( + media="talk.mp4", + language="de", + transcript_id=None, + voice=[], + model=llm.DEFAULT_MODEL, + max_tokens=llm.DEFAULT_MAX_TOKENS, + out=None, +) + +SAMPLE_RATE = 100 # tiny rate keeps the timeline byte math exact and readable + +_ANSI_SGR = re.compile(r"\x1b\[[0-9;]*m") + + +def plain(text: str) -> str: + """Strip SGR color codes (CI forces color on, splitting flags like --lang + with style sequences) for substring assertions.""" + return _ANSI_SGR.sub("", text) + + +def utterance(start, speaker, text): + return SimpleNamespace(start=start, end=None, speaker=speaker, text=text) + + +def fake_transcript(utterances, *, audio_duration=5): + return SimpleNamespace(id="tr_dub", utterances=utterances, audio_duration=audio_duration) + + +def completion(text): + """The slice of an OpenAI ChatCompletion that gateway.content_of reads.""" + return SimpleNamespace(choices=[SimpleNamespace(message=SimpleNamespace(content=text))]) + + +def write_media(tmp_path: Path) -> Path: + path = tmp_path / "talk.mp4" + path.write_bytes(b"\x00fake-media") + return path + + +def enable_sandbox(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(session, "is_available", lambda: True) + + +def patch_api_key(monkeypatch: pytest.MonkeyPatch) -> None: + monkeypatch.setattr(config, "resolve_api_key", lambda **_: "test-key") + + +def record_transcribe(monkeypatch: pytest.MonkeyPatch) -> dict[str, object]: + """Record the transcription request and return a two-speaker transcript.""" + calls: dict[str, object] = {} + + def _fake(api_key, audio, *, config): + calls["api_key"] = api_key + calls["audio"] = audio + calls["config"] = config + return fake_transcript([utterance(1000, "A", "Hello."), utterance(3000, "B", "World.")]) + + monkeypatch.setattr(client, "transcribe", _fake) + return calls + + +def record_translate(monkeypatch: pytest.MonkeyPatch) -> list[dict[str, object]]: + """Record each gateway call and reply with a marked 'DE:' translation.""" + calls: 
list[dict[str, object]] = [] + + def _fake(api_key, *, model, messages, max_tokens=llm.DEFAULT_MAX_TOKENS, transcript_id=None): + calls.append({"model": model, "messages": messages, "max_tokens": max_tokens}) + return completion(f"DE:{messages[-1]['content']}") + + monkeypatch.setattr(llm, "complete", _fake) + return calls + + +def record_synthesize(monkeypatch: pytest.MonkeyPatch) -> list[object]: + """Record each TTS request; segment i comes back as 100 bytes of 0xA1+i.""" + calls: list[object] = [] + + def _fake(api_key, cfg, *, connect=None, on_warning=None): + calls.append(cfg) + pcm = bytes([0xA0 + len(calls)]) * 100 + return SpeakResult(pcm=pcm, sample_rate=SAMPLE_RATE, audio_duration_seconds=0.5) + + monkeypatch.setattr(session, "synthesize", _fake) + return calls + + +def record_ffmpeg(monkeypatch: pytest.MonkeyPatch) -> dict[str, object]: + """Resolve ffmpeg and record the invocation plus the WAV it was handed. + + The temp WAV is deleted right after the mux, so its contents are captured + here, while the file still exists. 
+ """ + monkeypatch.setattr("shutil.which", lambda name: f"/usr/bin/{name}") + recorded: dict[str, object] = {} + + def run(args: list[str]) -> subprocess.CompletedProcess[str]: + recorded["args"] = args + with wave.open(args[8], "rb") as wav: # args[8] is the dub.wav input + recorded["wav_params"] = wav.getparams() + recorded["wav_frames"] = wav.readframes(wav.getnframes()) + return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="") + + monkeypatch.setattr(dub_exec, "_run_ffmpeg", run) + return recorded diff --git a/tests/_snapshot_surface.py b/tests/_snapshot_surface.py index 8d0c26b9..447a63d1 100644 --- a/tests/_snapshot_surface.py +++ b/tests/_snapshot_surface.py @@ -24,7 +24,18 @@ HELP_GROUPS: dict[str, frozenset[str]] = { "build": frozenset({"onboard", "init", "dev", "share", "deploy"}), "run": frozenset( - {"transcribe", "stream", "dictate", "agent", "speak", "llm", "clip", "eval", "webhooks"} + { + "transcribe", + "stream", + "dictate", + "agent", + "speak", + "llm", + "clip", + "dub", + "eval", + "webhooks", + } ), "tools": frozenset({"doctor", "setup", "telemetry", "_update-check"}), "history": frozenset({"transcripts", "sessions"}), diff --git a/tests/test_dub_command.py b/tests/test_dub_command.py new file mode 100644 index 00000000..b30cb784 --- /dev/null +++ b/tests/test_dub_command.py @@ -0,0 +1,97 @@ +"""Argv parsing tests for `assembly dub` (aai_cli/commands/dub.py): the command +module only builds a DubOptions and hands it to dub_exec.run_dub, so these +tests pin the flag -> options mapping and the end-to-end sandbox guard; the +pipeline itself is covered in test_dub_exec.py and test_dub_pipeline.py.""" + +from __future__ import annotations + +from pathlib import Path + +import pytest +from typer.testing import CliRunner + +from aai_cli import dub_exec, llm +from aai_cli.main import app +from tests._dub_helpers import plain + +runner = CliRunner() + + +@pytest.fixture +def captured_run(monkeypatch: pytest.MonkeyPatch): + """Capture the (opts, 
json_mode) the command hands to run_dub.""" + seen: dict[str, object] = {} + + def fake_run(opts, state, *, json_mode): + seen["opts"] = opts + seen["json_mode"] = json_mode + + monkeypatch.setattr(dub_exec, "run_dub", fake_run) + return seen + + +def test_lang_is_required(): + result = runner.invoke(app, ["dub", "talk.mp4"]) + assert result.exit_code == 2 + assert "--lang" in plain(result.output) + + +def test_production_env_is_rejected_with_sandbox_hint(): + result = runner.invoke(app, ["dub", "talk.mp4", "--lang", "de"]) # default = production + assert result.exit_code == 2 + output = plain(result.output) + assert "only available in the sandbox" in output + # The suggestion spells out the exact corrected invocation: --sandbox is a root + # flag, so it must go before the command, not after it. + assert "Re-run as: assembly --sandbox dub" in output + + +def test_defaults_map_to_options(captured_run): + result = runner.invoke(app, ["dub", "talk.mp4", "--lang", "de"]) + assert result.exit_code == 0 + assert captured_run["json_mode"] is False + assert captured_run["opts"] == dub_exec.DubOptions( + media="talk.mp4", + language="de", + transcript_id=None, + voice=[], + model=llm.DEFAULT_MODEL, + max_tokens=llm.DEFAULT_MAX_TOKENS, + out=None, + ) + + +def test_every_flag_maps_to_options(captured_run): + result = runner.invoke( + app, + [ + "dub", + "talk.mp4", + "--lang", + "German", + "-t", + "tr_1", + "--voice", + "A=jane", + "--voice", + "paul", + "--model", + "gpt-5", + "--max-tokens", + "7", + "--out", + "dubbed.mp4", + "--json", + ], + ) + assert result.exit_code == 0 + assert captured_run["json_mode"] is True + assert captured_run["opts"] == dub_exec.DubOptions( + media="talk.mp4", + language="German", + transcript_id="tr_1", + voice=["A=jane", "paul"], + model="gpt-5", + max_tokens=7, + out=Path("dubbed.mp4"), + ) diff --git a/tests/test_dub_exec.py b/tests/test_dub_exec.py new file mode 100644 index 00000000..99cf99fc --- /dev/null +++ b/tests/test_dub_exec.py 
@@ -0,0 +1,238 @@ +"""Direct tests of the `assembly dub` options/run seam (aai_cli/dub_exec.py): +the pure helpers (language resolution, output naming, timeline assembly, +utterance extraction) and run_dub's validation order. Constructed-options +tests (dataclasses.replace off the shared defaults) avoid any argv +round-trip. The faked pipeline runs live in test_dub_pipeline.py; argv +parsing in test_dub_command.py.""" + +from __future__ import annotations + +import dataclasses +import sys +from pathlib import Path +from types import SimpleNamespace + +import pytest + +from aai_cli import dub_exec +from aai_cli.context import AppState +from aai_cli.errors import CLIError, UsageError +from tests._dub_helpers import ( + DEFAULTS, + enable_sandbox, + fake_transcript, + patch_api_key, + utterance, + write_media, +) + + +@pytest.fixture +def media(tmp_path: Path) -> Path: + return write_media(tmp_path) + + +@pytest.fixture +def sandbox(monkeypatch: pytest.MonkeyPatch): + enable_sandbox(monkeypatch) + + +@pytest.fixture(autouse=True) +def _fake_key(monkeypatch: pytest.MonkeyPatch): + patch_api_key(monkeypatch) + + +# --- records and pure helpers -------------------------------------------------- + + +@pytest.mark.parametrize( + "instance", + [DEFAULTS, dub_exec._Utterance(start_ms=0, speaker="A", text="hi")], + ids=["options", "utterance"], +) +def test_records_are_immutable(instance): + field_name = dataclasses.fields(instance)[0].name + with pytest.raises(dataclasses.FrozenInstanceError): + setattr(instance, field_name, None) + + +def test_language_names_map_codes_to_names(): + # An independent copy of the expected table: a silently edited entry in the + # shipped map must fail here, not just round-trip through itself. 
+ assert dub_exec.LANGUAGE_NAMES == { + "ar": "Arabic", + "de": "German", + "en": "English", + "es": "Spanish", + "fr": "French", + "hi": "Hindi", + "it": "Italian", + "ja": "Japanese", + "ko": "Korean", + "nl": "Dutch", + "pl": "Polish", + "pt": "Portuguese", + "ru": "Russian", + "tr": "Turkish", + "vi": "Vietnamese", + "zh": "Chinese", + } + + +@pytest.mark.parametrize( + ("value", "expected"), + [ + ("de", "German"), + (" DE ", "German"), # codes are trimmed and case-insensitive + ("German", "German"), # a full name passes through + (" Klingon ", "Klingon"), # unlisted languages pass through, trimmed + ], +) +def test_resolve_language(value, expected): + assert dub_exec.resolve_language(value) == expected + + +def test_resolve_language_rejects_blank(): + with pytest.raises(UsageError) as exc: + dub_exec.resolve_language(" ") + assert "--lang needs a language" in exc.value.message + assert "--lang de" in (exc.value.suggestion or "") + + +@pytest.mark.parametrize( + ("language", "expected"), + [ + ("German", "talk.dub.german.mp4"), + ("Brazilian Portuguese", "talk.dub.brazilian-portuguese.mp4"), + ], +) +def test_default_out_path(language, expected): + out = dub_exec.default_out_path(Path("/x/talk.mp4"), language) + assert out == Path("/x") / expected + + +def test_assemble_timeline_fills_gaps_and_pads_tail(): + # rate 1000: one second of 16-bit mono PCM is 2000 bytes. + track = dub_exec.assemble_timeline([(500, b"\x01\x02")], 1000, total_seconds=1.0) + # 0.5 s leading silence, the segment, then a 0.499 s tail pad to 1.0 s. + assert track == b"\x00" * 1000 + b"\x01\x02" + b"\x00" * 998 + + +def test_assemble_timeline_overlap_appends_without_silence(): + # The first segment runs to 0.1 s; the second "starts" at 0.05 s, so it is + # appended immediately (the dub drifts) rather than overlapping or crashing. 
+ placed = [(0, b"\x01" * 200), (50, b"\x02\x02")] + track = dub_exec.assemble_timeline(placed, 1000, total_seconds=None) + assert track == b"\x01" * 200 + b"\x02\x02" + + +def test_assemble_timeline_skips_tail_when_track_is_long_enough(): + track = dub_exec.assemble_timeline([(0, b"\x01" * 200)], 1000, total_seconds=0.05) + assert track == b"\x01" * 200 + + +def test_utterances_of_defaults_and_filtering(): + transcript = fake_transcript( + [ + SimpleNamespace(speaker=None, text="Hi"), # no start attr, no speaker label + utterance(2000, "B", None), # no text -> dropped + utterance(3000, "B", " "), # blank text -> dropped + utterance(4000, "C", " Bye "), + ] + ) + assert dub_exec._utterances_of(transcript) == [ + dub_exec._Utterance(start_ms=0, speaker="A", text="Hi"), + dub_exec._Utterance(start_ms=4000, speaker="C", text="Bye"), + ] + + +@pytest.mark.parametrize( + "utterances", + [None, [], [utterance(0, "A", "")]], + ids=["missing", "empty", "all-blank"], +) +def test_utterances_of_requires_spoken_utterances(utterances): + with pytest.raises(CLIError) as exc: + dub_exec._utterances_of(SimpleNamespace(id="tr_x", utterances=utterances)) + assert exc.value.error_type == "no_utterances" + assert exc.value.exit_code == 2 + assert "Transcript tr_x has no utterances to dub" in exc.value.message + assert "--speaker-labels" in (exc.value.suggestion or "") + + +@pytest.mark.parametrize( + ("duration", "expected"), + [(12, 12.0), (4.5, 4.5), (None, None), (True, None), ("90", None)], + ids=["int", "float", "none", "bool", "str"], +) +def test_total_seconds(duration, expected): + transcript = SimpleNamespace(audio_duration=duration) + assert dub_exec._total_seconds(transcript) == expected + + +def test_run_ffmpeg_captures_output_and_does_not_raise(): + # The real boundary (not the fake): output is captured as text and a non-zero + # exit must not raise — _mux turns the exit code into a CLIError itself. 
+ result = dub_exec._run_ffmpeg( + [ + sys.executable, + "-c", + "import sys; print('out'); print('err', file=sys.stderr); sys.exit(3)", + ] + ) + assert result.returncode == 3 + assert result.stdout == "out\n" + assert result.stderr == "err\n" + + +# --- validation order (cheap local checks before any credential or network) ---- + + +def test_run_dub_rejects_blank_language_first(): + opts = dataclasses.replace(DEFAULTS, language=" ") + with pytest.raises(UsageError): # not the sandbox CLIError: language wins + dub_exec.run_dub(opts, AppState(), json_mode=False) + + +def test_run_dub_requires_sandbox(): + # The active environment defaults to production, which has no streaming-TTS host. + with pytest.raises(CLIError) as exc: + dub_exec.run_dub(DEFAULTS, AppState(), json_mode=False) + assert exc.value.error_type == "unsupported_environment" + assert exc.value.exit_code == 2 + assert "only available in the sandbox" in exc.value.message + assert "Re-run as: assembly --sandbox dub" in (exc.value.suggestion or "") + + +def test_run_dub_rejects_missing_file(sandbox, tmp_path): + opts = dataclasses.replace(DEFAULTS, media=str(tmp_path / "nope.mp4")) + with pytest.raises(CLIError) as exc: + dub_exec.run_dub(opts, AppState(), json_mode=False) + assert exc.value.error_type == "file_not_found" + assert exc.value.exit_code == 2 + assert "local audio/video file" in (exc.value.suggestion or "") + + +def test_run_dub_rejects_directory(sandbox, tmp_path): + opts = dataclasses.replace(DEFAULTS, media=str(tmp_path)) + with pytest.raises(CLIError) as exc: + dub_exec.run_dub(opts, AppState(), json_mode=False) + assert exc.value.error_type == "not_a_file" + assert exc.value.exit_code == 2 + assert "not a directory" in (exc.value.suggestion or "") + + +def test_run_dub_refuses_to_overwrite_the_input(sandbox, media): + opts = dataclasses.replace(DEFAULTS, media=str(media), out=media) + with pytest.raises(UsageError) as exc: + dub_exec.run_dub(opts, AppState(), json_mode=False) + assert 
"overwrite the input file" in exc.value.message + + +def test_run_dub_requires_ffmpeg(sandbox, media, monkeypatch): + monkeypatch.setattr("shutil.which", lambda name: None) + opts = dataclasses.replace(DEFAULTS, media=str(media)) + with pytest.raises(CLIError) as exc: + dub_exec.run_dub(opts, AppState(), json_mode=False) + assert exc.value.error_type == "missing_dependency" + assert "ffmpeg" in exc.value.message diff --git a/tests/test_dub_pipeline.py b/tests/test_dub_pipeline.py new file mode 100644 index 00000000..78990d0b --- /dev/null +++ b/tests/test_dub_pipeline.py @@ -0,0 +1,281 @@ +"""Faked end-to-end runs of the `assembly dub` pipeline (aai_cli/dub_exec.py): +the transcribe → translate → synthesize → ffmpeg mux orchestration, voice +assignment, and the failure modes of each boundary. The LLM Gateway, streaming +TTS, and ffmpeg are faked at the modules dub_exec calls into (`llm.complete`, +`session.synthesize`, `client.transcribe`) and at `dub_exec._run_ffmpeg`; the +pure helpers and validation order live in test_dub_exec.py.""" + +from __future__ import annotations + +import dataclasses +import json +import subprocess +from pathlib import Path +from types import SimpleNamespace + +import pytest + +from aai_cli import client, dub_exec, llm +from aai_cli.context import AppState +from aai_cli.errors import APIError, CLIError +from aai_cli.tts import session +from aai_cli.tts.session import SpeakResult +from tests._dub_helpers import ( + DEFAULTS, + SAMPLE_RATE, + completion, + enable_sandbox, + fake_transcript, + patch_api_key, + plain, + record_ffmpeg, + record_synthesize, + record_transcribe, + record_translate, + utterance, + write_media, +) + + +@pytest.fixture +def media(tmp_path: Path) -> Path: + return write_media(tmp_path) + + +@pytest.fixture(autouse=True) +def _sandbox_and_key(monkeypatch: pytest.MonkeyPatch): + enable_sandbox(monkeypatch) + patch_api_key(monkeypatch) + + +@pytest.fixture +def fake_transcribe(monkeypatch: pytest.MonkeyPatch): + 
return record_transcribe(monkeypatch) + + +@pytest.fixture +def fake_translate(monkeypatch: pytest.MonkeyPatch): + return record_translate(monkeypatch) + + +@pytest.fixture +def fake_synthesize(monkeypatch: pytest.MonkeyPatch): + return record_synthesize(monkeypatch) + + +@pytest.fixture +def fake_ffmpeg(monkeypatch: pytest.MonkeyPatch): + return record_ffmpeg(monkeypatch) + + +def _run(opts, *, json_mode): + dub_exec.run_dub(opts, AppState(), json_mode=json_mode) + + +def test_run_dub_pipeline_end_to_end( + media, fake_transcribe, fake_translate, fake_synthesize, fake_ffmpeg, capsys +): + opts = dataclasses.replace(DEFAULTS, media=str(media)) + _run(opts, json_mode=True) + + # Transcription: the local file, diarized so speakers keep distinct voices. + assert fake_transcribe["audio"] == str(media) + assert fake_transcribe["config"].speaker_labels is True + + # Translation: one gateway call per utterance, in order, with the dubbing + # system prompt naming the resolved language ("de" -> "German"). + assert [c["messages"][-1]["content"] for c in fake_translate] == ["Hello.", "World."] + for call in fake_translate: + assert call["model"] == llm.DEFAULT_MODEL + assert call["max_tokens"] == llm.DEFAULT_MAX_TOKENS + system = call["messages"][0] + assert system["role"] == "system" + assert "dubbing" in system["content"] + assert "German" in system["content"] + + # Synthesis: the translated text, rotation voices in speaker order, target language. + assert [(cfg.voice, cfg.text) for cfg in fake_synthesize] == [ + ("jane", "DE:Hello."), + ("michael", "DE:World."), + ] + assert all(cfg.language == "German" for cfg in fake_synthesize) + + # The dubbed track: silence to 1.0 s, segment 1, silence to 3.0 s, segment 2, + # then a tail pad out to the source's 5 s duration (rate 100 -> 200 bytes/s). 
+ expected_track = b"\x00" * 200 + b"\xa1" * 100 + b"\x00" * 300 + b"\xa2" * 100 + b"\x00" * 300 + assert fake_ffmpeg["wav_frames"] == expected_track + params = fake_ffmpeg["wav_params"] + assert (params.nchannels, params.sampwidth, params.framerate) == (1, 2, SAMPLE_RATE) + + # The mux: video copied, WAV swapped in as the only audio, default out path. + out = media.parent / "talk.dub.german.mp4" + wav_path = fake_ffmpeg["args"][8] + assert fake_ffmpeg["args"] == [ + "/usr/bin/ffmpeg", + "-hide_banner", + "-loglevel", + "error", + "-y", + "-i", + str(media), + "-i", + wav_path, + "-map", + "0:v?", + "-map", + "1:a", + "-c:v", + "copy", + str(out), + ] + assert wav_path.endswith("dub.wav") + + payload = json.loads(capsys.readouterr().out) + assert payload == { + "source": str(media), + "out": str(out), + "language": "German", + "transcript_id": "tr_dub", + "utterances": 2, + "speakers": {"A": "jane", "B": "michael"}, + "sample_rate": SAMPLE_RATE, + "audio_duration_seconds": 5.0, + } + + +def test_run_dub_human_summary( + media, fake_transcribe, fake_translate, fake_synthesize, fake_ffmpeg, capsys +): + # A short --out keeps the one-line summary under the 80-column console width: + # with the default (tmp_path-prefixed) out path, Rich would hard-wrap the line + # mid-word and these substring asserts would depend on where the break lands. + opts = dataclasses.replace(DEFAULTS, media=str(media), out=Path("dub.de.mp4")) + _run(opts, json_mode=False) + # plain(): under FORCE_COLOR (CI) Rich's repr highlighter interleaves style + # codes inside the line ("(2 utterances" renders with the 2 colored). 
+ out = plain(capsys.readouterr().out) + assert "dub.de.mp4" in out + assert "dubbed to German" in out + assert "2 utterances" in out + assert "A=jane, B=michael" in out + + +def test_bare_voice_dubs_every_speaker( + media, fake_transcribe, fake_translate, fake_synthesize, fake_ffmpeg +): + opts = dataclasses.replace(DEFAULTS, media=str(media), voice=["paul"]) + _run(opts, json_mode=True) + assert [cfg.voice for cfg in fake_synthesize] == ["paul", "paul"] + + +def test_voice_overrides_pin_speakers_without_consuming_rotation( + media, fake_transcribe, fake_translate, fake_synthesize, fake_ffmpeg +): + opts = dataclasses.replace(DEFAULTS, media=str(media), voice=["A=mary"]) + _run(opts, json_mode=True) + # A is pinned; B still takes the first rotation voice (overrides don't consume slots). + assert [cfg.voice for cfg in fake_synthesize] == ["mary", "jane"] + + +def test_transcript_id_reuses_existing_transcript( + media, fake_translate, fake_ffmpeg, monkeypatch, capsys +): + fetched: dict[str, str] = {} + + def get_transcript(api_key, transcript_id): + fetched["id"] = transcript_id + return SimpleNamespace( + id=transcript_id, + utterances=[utterance(0, "A", "Hello.")], + audio_duration=None, # duration unknown -> no tail pad + ) + + monkeypatch.setattr(client, "get_transcript", get_transcript) + monkeypatch.setattr( + client, + "transcribe", + lambda *a, **k: pytest.fail("must not re-transcribe with --transcript-id"), + ) + monkeypatch.setattr( + session, + "synthesize", + lambda api_key, cfg, **_: SpeakResult( + pcm=b"\xaa" * 2000, sample_rate=300, audio_duration_seconds=0.0 + ), + ) + + opts = dataclasses.replace(DEFAULTS, media=str(media), transcript_id="tr_99") + _run(opts, json_mode=True) + assert fetched["id"] == "tr_99" + payload = json.loads(capsys.readouterr().out) + assert payload["transcript_id"] == "tr_99" + # 1000 samples at 300 Hz, rounded to milliseconds: 3.3333... -> 3.333. 
+ assert payload["audio_duration_seconds"] == 3.333 + + +def test_empty_translation_is_an_api_error(media, fake_synthesize, fake_ffmpeg, monkeypatch): + long_text = "a" * 50 + "TAIL!" + transcript = fake_transcript([utterance(0, "A", "Hello."), utterance(1000, "B", long_text)]) + monkeypatch.setattr(client, "transcribe", lambda *a, **k: transcript) + replies = iter(["Hallo.", " "]) + monkeypatch.setattr(llm, "complete", lambda *a, **k: completion(next(replies))) + + opts = dataclasses.replace(DEFAULTS, media=str(media)) + with pytest.raises(APIError) as exc: + _run(opts, json_mode=False) + # The 1-based index and the (50-char) text preview pin which utterance failed. + assert f"empty translation for utterance 2 ({'a' * 50!r})." in exc.value.message + + +def test_mixed_sample_rates_are_an_api_error( + media, fake_transcribe, fake_translate, fake_ffmpeg, monkeypatch +): + rates = iter([100, 200]) + monkeypatch.setattr( + session, + "synthesize", + lambda api_key, cfg, **_: SpeakResult( + pcm=b"\x01\x02", sample_rate=next(rates), audio_duration_seconds=0.0 + ), + ) + opts = dataclasses.replace(DEFAULTS, media=str(media)) + with pytest.raises(APIError) as exc: + _run(opts, json_mode=False) + assert "mixed sample rates ([100, 200])" in exc.value.message + + +def test_ffmpeg_failure_reports_last_stderr_line( + media, fake_transcribe, fake_translate, fake_synthesize, monkeypatch +): + monkeypatch.setattr("shutil.which", lambda name: "/usr/bin/ffmpeg") + monkeypatch.setattr( + dub_exec, + "_run_ffmpeg", + lambda args: subprocess.CompletedProcess( + args=args, returncode=1, stdout="", stderr="noise\nInvalid data found\n" + ), + ) + opts = dataclasses.replace(DEFAULTS, media=str(media)) + with pytest.raises(CLIError) as exc: + _run(opts, json_mode=False) + assert exc.value.error_type == "dub_failed" + assert "Could not write talk.dub.german.mp4" in exc.value.message + # The last stderr line is the reason ffmpeg gives; earlier noise is dropped. 
+ assert "Invalid data found" in exc.value.message + assert "noise" not in exc.value.message + assert "readable audio/video file" in (exc.value.suggestion or "") + + +def test_ffmpeg_silent_failure_reports_exit_code( + media, fake_transcribe, fake_translate, fake_synthesize, monkeypatch +): + monkeypatch.setattr("shutil.which", lambda name: "/usr/bin/ffmpeg") + monkeypatch.setattr( + dub_exec, + "_run_ffmpeg", + lambda args: subprocess.CompletedProcess(args=args, returncode=3, stdout="", stderr=""), + ) + opts = dataclasses.replace(DEFAULTS, media=str(media)) + with pytest.raises(CLIError) as exc: + _run(opts, json_mode=False) + assert "ffmpeg exited with code 3" in exc.value.message diff --git a/tests/test_smoke.py b/tests/test_smoke.py index 48e5bc79..bc16fee1 100644 --- a/tests/test_smoke.py +++ b/tests/test_smoke.py @@ -158,6 +158,7 @@ def test_help_lists_commands_in_workflow_order(): "speak", "llm", "clip", + "dub", "eval", "webhooks", # Setup & Tools