AssemblyAI · alexkroman · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026
diff --git a/.importlinter b/.importlinter
@@ -20,6 +20,7 @@ source_modules =
     aai_cli.context
     aai_cli.debuglog
     aai_cli.dictate_exec
+    aai_cli.dub_exec
     aai_cli.environments
     aai_cli.errors
     aai_cli.eval_data
@@ -65,6 +66,7 @@ modules =
     aai_cli.commands.dev
     aai_cli.commands.dictate
     aai_cli.commands.doctor
+    aai_cli.commands.dub
     aai_cli.commands.evaluate
     aai_cli.commands.init
     aai_cli.commands.keys

diff --git a/README.md b/README.md
@@ -32,6 +32,14 @@ assembly transcribe "https://www.youtube.com/watch?v=awmCtXzFsJo" --speaker-labe
 
 `speak` auto-detects `Speaker A:` labels, merges each speaker's turns, and rotates voices. (`speak` is sandbox-only today, hence `--sandbox`.)
 
+**Dub a video into another language** — the whole platform in one command: transcription with utterance timestamps, per-utterance LLM translation, TTS for each line (one voice per speaker), and ffmpeg laying the new track over the original video:
+
+```sh
+assembly --sandbox dub talk.mp4 --lang de
+```
+
+The video stream is copied untouched; each dubbed line lands at its original start time. (Sandbox-only, like `speak`.)
+
 **Turn a podcast into audio** — Apple and Spotify podcast pages work too (yt-dlp ingestion):
 
 ```sh
@@ -183,6 +191,7 @@ assembly init                  # scaffold a starter app
 - **Voice agent**: `assembly agent` runs a full-duplex spoken conversation in your terminal.
 - **LLM Gateway**: `assembly llm` prompts an LLM over a transcript, stdin, or a live stream (`assembly stream --llm "summarize as I talk"`).
 - **Transcript-driven clipping**: `assembly clip` cuts an audio/video file (or a YouTube/podcast URL) with ffmpeg by diarized speaker (`--speaker A`), text match (`--search "pricing"`), LLM pick (`--llm "the three best moments"`), or explicit time range (`--range 1:30-2:45`) — transcribing on the fly, reusing a finished transcript with `-t ID`, or reading one from a pipe (`assembly transcribe x.mp4 --speaker-labels --json | assembly clip x.mp4 -t - --llm "…"`). Clip boundaries snap into nearby silence (ffmpeg `silencedetect`) so cuts don't land mid-word; `--no-snap` cuts at the exact selected times.
+- **Dubbing**: `assembly dub` re-voices an audio/video file in another language (`assembly --sandbox dub talk.mp4 --lang de`): diarized transcription, per-utterance LLM translation, streaming TTS per speaker, and an ffmpeg track-swap that leaves the video untouched. Sandbox-only today, like `speak`.
 - **Model evaluation**: `assembly eval` transcribes a Hugging Face dataset (with built-in aliases for common benchmarks: `assembly eval tedlium`) or a local `.csv`/`.jsonl` manifest and scores WER against its references — handy for picking a speech model.
 - **Starter apps**: `assembly init` scaffolds a self-contained FastAPI + HTML app (`audio-transcription`, `live-captions`, `voice-agent`); `assembly dev` runs it, `assembly share` exposes it on a public URL, and `assembly deploy` ships it to Vercel, Railway, or Fly.io.
 - **Webhook testing**: `assembly webhooks listen` opens a public dev URL (cloudflared quick tunnel) that prints webhook deliveries as they arrive and can forward them to your local app with `--forward-to`.

diff --git a/aai_cli/commands/dub.py b/aai_cli/commands/dub.py
@@ -0,0 +1,106 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+import typer
+
+from aai_cli import dub_exec, help_panels, llm, options
+from aai_cli.context import run_command
+from aai_cli.help_text import examples_epilog
+
+app = typer.Typer()  # Typer app for this module; the dub command below registers on it
+
+
+@app.command(  # registers dub() as a command on this module's Typer app
+    rich_help_panel=help_panels.TRANSCRIPTION,
+    # --sandbox is a root flag, so it must come before the subcommand in every example.
+    epilog=examples_epilog(  # each entry is a (description, command line) pair
+        [
+            ("Dub a talk into German (sandbox only)", "assembly --sandbox dub talk.mp4 --lang de"),
+            ("Use a language name instead of a code", "assembly --sandbox dub talk.mp4 -l Spanish"),
+            (
+                "Dub every speaker with one voice",
+                "assembly --sandbox dub talk.mp4 -l fr --voice paul",
+            ),
+            (
+                "Pin a voice per diarized speaker",
+                "assembly --sandbox dub panel.mp4 -l de --voice A=jane --voice B=paul",
+            ),
+            (
+                "Reuse a finished transcript instead of re-transcribing",
+                "assembly --sandbox dub talk.mp4 -l de -t TRANSCRIPT_ID",
+            ),
+            (
+                "Choose the output file",
+                "assembly --sandbox dub talk.mp4 -l de --out talk-german.mp4",
+            ),
+        ]
+    ),
+)
+def dub(
+    ctx: typer.Context,
+    media: str = typer.Argument(
+        ...,  # Ellipsis default: the MEDIA argument is required
+        help="Local audio/video file to dub (the video stream is copied untouched).",
+    ),
+    lang: str = typer.Option(
+        ...,  # Ellipsis default: --lang is required, there is no fallback language
+        "--lang",
+        "-l",
+        help="Target language: an ISO code (de, fr, es, …) or a language name (German).",
+    ),
+    transcript_id: str | None = typer.Option(
+        None,  # None means no transcript id was supplied
+        "--transcript-id",
+        "-t",
+        help="Reuse an existing diarized transcript of this media instead of "
+        "transcribing it again.",
+    ),
+    voice: list[str] = typer.Option(
+        [],  # no --voice given -> empty list
+        "--voice",
+        help="Voice id for every speaker (e.g. jane, michael, paul), or SPEAKER=VOICE "
+        "to pin a diarized speaker (repeatable, e.g. --voice A=jane).",
+    ),
+    model: str = typer.Option(
+        llm.DEFAULT_MODEL,
+        "--model",
+        help="LLM Gateway model that translates the utterances.",
+        rich_help_panel=help_panels.OPT_LLM,
+        autocompletion=llm.complete_model,
+    ),
+    max_tokens: int = typer.Option(
+        llm.DEFAULT_MAX_TOKENS,
+        "--max-tokens",
+        help="Max tokens per utterance translation.",
+        rich_help_panel=help_panels.OPT_LLM,
+    ),
+    out: Path | None = typer.Option(
+        None, "--out", help="Output file (default: <name>.dub.<lang><ext> next to the input)."
+    ),
+    json_out: bool = options.json_option("Emit JSON describing the dubbed file."),
+) -> None:
+    """Dub a video or audio file into another language (sandbox only).
+
+    The whole platform in one command: the media is transcribed with diarized
+    utterance timestamps, each utterance is translated by an LLM Gateway model,
+    the translations are synthesized with streaming TTS (one voice per
+    speaker), and ffmpeg lays the new audio over the original — video copied
+    untouched. Streaming TTS only exists in the sandbox today — run it as
+    'assembly --sandbox dub' (--sandbox goes before the subcommand). Requires
+    ffmpeg.
+    """
+    opts = dub_exec.DubOptions(  # parsed CLI values; --lang is passed as `language`
+        media=media,
+        language=lang,
+        transcript_id=transcript_id,
+        voice=voice,
+        model=model,
+        max_tokens=max_tokens,
+        out=out,
+    )
+    run_command(  # all dubbing work is delegated to dub_exec.run_dub via the callback
+        ctx,
+        lambda state, json_mode: dub_exec.run_dub(opts, state, json_mode=json_mode),
+        json=json_out,  # value of the JSON flag declared with options.json_option
+    )