Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .importlinter
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ source_modules =
aai_cli.context
aai_cli.debuglog
aai_cli.dictate_exec
aai_cli.dub_exec
aai_cli.environments
aai_cli.errors
aai_cli.eval_data
Expand Down Expand Up @@ -65,6 +66,7 @@ modules =
aai_cli.commands.dev
aai_cli.commands.dictate
aai_cli.commands.doctor
aai_cli.commands.dub
aai_cli.commands.evaluate
aai_cli.commands.init
aai_cli.commands.keys
Expand Down
9 changes: 9 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,14 @@ assembly transcribe "https://www.youtube.com/watch?v=awmCtXzFsJo" --speaker-labe

`speak` auto-detects `Speaker A:` labels, merges each speaker's turns, and rotates voices. (`speak` is sandbox-only today, hence `--sandbox`.)

**Dub a video into another language** — the whole platform in one command: transcription with utterance timestamps, per-utterance LLM translation, TTS for each line (one voice per speaker), and ffmpeg laying the new track over the original video:

```sh
assembly --sandbox dub talk.mp4 --lang de
```

The video stream is copied untouched; each dubbed line lands at its original start time. (Sandbox-only, like `speak`.)

**Turn a podcast into audio** — Apple and Spotify podcast pages work too (yt-dlp ingestion):

```sh
Expand Down Expand Up @@ -183,6 +191,7 @@ assembly init # scaffold a starter app
- **Voice agent**: `assembly agent` runs a full-duplex spoken conversation in your terminal.
- **LLM Gateway**: `assembly llm` prompts an LLM over a transcript, stdin, or a live stream (`assembly stream --llm "summarize as I talk"`).
- **Transcript-driven clipping**: `assembly clip` cuts an audio/video file (or a YouTube/podcast URL) with ffmpeg by diarized speaker (`--speaker A`), text match (`--search "pricing"`), LLM pick (`--llm "the three best moments"`), or explicit time range (`--range 1:30-2:45`) — transcribing on the fly, reusing a finished transcript with `-t ID`, or reading one from a pipe (`assembly transcribe x.mp4 --speaker-labels --json | assembly clip x.mp4 -t - --llm "…"`). Clip boundaries snap into nearby silence (ffmpeg `silencedetect`) so cuts don't land mid-word; `--no-snap` cuts at the exact selected times.
- **Dubbing**: `assembly dub` re-voices an audio/video file in another language (`assembly --sandbox dub talk.mp4 --lang de`): diarized transcription, per-utterance LLM translation, streaming TTS per speaker, and an ffmpeg track-swap that leaves the video untouched. Sandbox-only today, like `speak`.
- **Model evaluation**: `assembly eval` transcribes a Hugging Face dataset (with built-in aliases for common benchmarks: `assembly eval tedlium`) or a local `.csv`/`.jsonl` manifest and scores WER against its references — handy for picking a speech model.
- **Starter apps**: `assembly init` scaffolds a self-contained FastAPI + HTML app (`audio-transcription`, `live-captions`, `voice-agent`); `assembly dev` runs it, `assembly share` exposes it on a public URL, and `assembly deploy` ships it to Vercel, Railway, or Fly.io.
- **Webhook testing**: `assembly webhooks listen` opens a public dev URL (cloudflared quick tunnel) that prints webhook deliveries as they arrive and can forward them to your local app with `--forward-to`.
Expand Down
106 changes: 106 additions & 0 deletions aai_cli/commands/dub.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
from __future__ import annotations

from pathlib import Path

import typer

from aai_cli import dub_exec, help_panels, llm, options
from aai_cli.context import run_command
from aai_cli.help_text import examples_epilog

# Typer application object; the `dub` command below registers itself on it via @app.command.
app = typer.Typer()


# Example invocations rendered in the command's help epilog.
# --sandbox is a root-level flag, so every example places it ahead of the subcommand.
_DUB_EXAMPLES = [
    ("Dub a talk into German (sandbox only)", "assembly --sandbox dub talk.mp4 --lang de"),
    ("Use a language name instead of a code", "assembly --sandbox dub talk.mp4 -l Spanish"),
    (
        "Dub every speaker with one voice",
        "assembly --sandbox dub talk.mp4 -l fr --voice paul",
    ),
    (
        "Pin a voice per diarized speaker",
        "assembly --sandbox dub panel.mp4 -l de --voice A=jane --voice B=paul",
    ),
    (
        "Reuse a finished transcript instead of re-transcribing",
        "assembly --sandbox dub talk.mp4 -l de -t TRANSCRIPT_ID",
    ),
    (
        "Choose the output file",
        "assembly --sandbox dub talk.mp4 -l de --out talk-german.mp4",
    ),
]


@app.command(
    rich_help_panel=help_panels.TRANSCRIPTION,
    epilog=examples_epilog(_DUB_EXAMPLES),
)
def dub(
    ctx: typer.Context,
    media: str = typer.Argument(
        ...,
        help="Local audio/video file to dub (the video stream is copied untouched).",
    ),
    lang: str = typer.Option(
        ...,
        "--lang",
        "-l",
        help="Target language: an ISO code (de, fr, es, …) or a language name (German).",
    ),
    transcript_id: str | None = typer.Option(
        None,
        "--transcript-id",
        "-t",
        help=(
            "Reuse an existing diarized transcript of this media instead of "
            "transcribing it again."
        ),
    ),
    voice: list[str] = typer.Option(
        [],
        "--voice",
        help=(
            "Voice id for every speaker (e.g. jane, michael, paul), or SPEAKER=VOICE "
            "to pin a diarized speaker (repeatable, e.g. --voice A=jane)."
        ),
    ),
    model: str = typer.Option(
        llm.DEFAULT_MODEL,
        "--model",
        help="LLM Gateway model that translates the utterances.",
        rich_help_panel=help_panels.OPT_LLM,
        autocompletion=llm.complete_model,
    ),
    max_tokens: int = typer.Option(
        llm.DEFAULT_MAX_TOKENS,
        "--max-tokens",
        help="Max tokens per utterance translation.",
        rich_help_panel=help_panels.OPT_LLM,
    ),
    out: Path | None = typer.Option(
        None,
        "--out",
        help="Output file (default: <name>.dub.<lang><ext> next to the input).",
    ),
    json_out: bool = options.json_option("Emit JSON describing the dubbed file."),
) -> None:
    """Dub a video or audio file into another language (sandbox only).

    The whole platform in one command: the media is transcribed with diarized
    utterance timestamps, each utterance is translated by an LLM Gateway model,
    the translations are synthesized with streaming TTS (one voice per
    speaker), and ffmpeg lays the new audio over the original — video copied
    untouched. Streaming TTS only exists in the sandbox today — run it as
    'assembly --sandbox dub' (--sandbox goes before the subcommand). Requires
    ffmpeg.
    """
    # NOTE: the docstring above doubles as the CLI help text, so it is user-facing.

    # Bundle the parsed CLI values into the options object the executor consumes.
    request = dub_exec.DubOptions(
        media=media,
        language=lang,
        transcript_id=transcript_id,
        voice=voice,
        model=model,
        max_tokens=max_tokens,
        out=out,
    )

    def _execute(state, json_mode):
        # Callback handed to run_command; forwards to the dub executor.
        return dub_exec.run_dub(request, state, json_mode=json_mode)

    run_command(ctx, _execute, json=json_out)
Loading
Loading