AssemblyAI · alexkroman · Jun 12, 2026 · Jun 12, 2026 · Jun 12, 2026
diff --git a/.importlinter b/.importlinter
@@ -10,6 +10,7 @@ source_modules =
     aai_cli.agent_exec
     aai_cli.argscan
     aai_cli.auth
+    aai_cli.caption_exec
     aai_cli.client
     aai_cli.clip_exec
     aai_cli.clip_select
@@ -61,6 +62,7 @@ modules =
     aai_cli.commands.account
     aai_cli.commands.agent
     aai_cli.commands.audit
+    aai_cli.commands.caption
     aai_cli.commands.clip
     aai_cli.commands.deploy
     aai_cli.commands.dev

diff --git a/aai_cli/caption_exec.py b/aai_cli/caption_exec.py
@@ -0,0 +1,238 @@
+"""Run logic for `assembly caption`: transcribe → SRT export → ffmpeg burn-in.
+
+The command module (aai_cli/commands/caption.py) only parses argv — it builds a
+``CaptionOptions`` and hands it to ``run_caption`` via ``context.run_command``
+(the options/run split, see AGENTS.md), so tests drive the whole pipeline by
+constructing options directly.
+
+The pipeline: the video is transcribed (or an existing transcript is reused via
+``--transcript-id``), the transcript's SRT captions are fetched from the export
+endpoint, and ffmpeg's ``subtitles`` filter burns them into the picture (open
+captions, always visible) while the audio stream is copied untouched. A
+YouTube/media-page URL is downloaded first — always the full video, since the
+captions are burned into it.
+"""
+
+from __future__ import annotations
+
+import shutil
+import subprocess
+import tempfile
+from dataclasses import dataclass
+from pathlib import Path
+
+import assemblyai as aai
+from rich.markup import escape
+
+from aai_cli import client, output, youtube
+from aai_cli.context import AppState
+from aai_cli.errors import CLIError, UsageError
+
+
+@dataclass(frozen=True)
+class CaptionOptions:
+    """Every `assembly caption` flag as plain data (``--json`` excluded:
+    run_command resolves it into the ``json_mode`` argument)."""
+
+    # The raw source as typed: a local path, or a downloadable media-page URL
+    # (a pathlib.Path would collapse the "//" in "https://").
+    media: str
+    transcript_id: str | None  # --transcript-id; None means transcribe afresh
+    chars_per_caption: int | None  # --chars-per-caption; passed to the SRT export
+    font_size: int | None  # --font-size; None adds no force_style override
+    out: Path | None  # --out; None derives <stem>.captioned<ext> from the source
+
+
+def default_out_path(media: Path) -> Path:
+    """Sibling of ``media`` named ``<stem>.captioned<ext>`` — the default output."""
+    return media.parent.joinpath(media.stem + ".captioned" + media.suffix)
+
+
+# ffmpeg unescapes a `-vf` argument twice: first the filtergraph parser
+# (\ ' [ ] , ;), then the filter's own option parser (\ ' :). A path embedded in
+# `subtitles=…` is therefore escaped once per level, innermost first; escaped
+# only once, a ":" or "\" in it (any Windows temp dir) would split the options.
+_OPTION_ESCAPES = str.maketrans({ch: f"\\{ch}" for ch in "\\':"})
+_FILTER_ESCAPES = str.maketrans({ch: f"\\{ch}" for ch in "\\',;[]"})
+
+
+def subtitles_filter(srt: Path, font_size: int | None) -> str:
+    """The ``-vf`` filtergraph that burns ``srt`` in (``font_size`` None: default style)."""
+    spec = f"subtitles={str(srt).translate(_OPTION_ESCAPES).translate(_FILTER_ESCAPES)}"
+    return spec if font_size is None else f"{spec}:force_style=FontSize={font_size}"
+
+
+def _validate_media(media: Path) -> None:
+    """Reject a missing or non-file local source with a clear CLIError (exit code 2)."""
+    if media.is_file():
+        return
+    if media.exists():
+        raise CLIError(
+            f"Not a file: {media}",
+            error_type="not_a_file",
+            exit_code=2,
+            suggestion="Pass a video file, not a directory.",
+        )
+    raise CLIError(
+        f"File not found: {media}",
+        error_type="file_not_found",
+        exit_code=2,
+        suggestion="Check the path. assembly caption needs a local video file.",
+    )
+
+
+def _validate_out(out: Path, media: Path) -> None:
+    """Refuse an output path that resolves to the input itself: ffmpeg would be
+    reading and writing one file at the same time and corrupt it."""
+    if out.resolve() != media.resolve():
+        return
+    raise UsageError(
+        "--out would overwrite the input file.", suggestion="Pick a different output path."
+    )
+
+
+def _require_ffmpeg() -> str:
+    """Path of the ffmpeg executable found on PATH; CLIError when there is none."""
+    located = shutil.which("ffmpeg")
+    if located is not None:
+        return located
+    raise CLIError(
+        "ffmpeg is required to burn captions into video, but it isn't on PATH.",
+        error_type="missing_dependency",
+        suggestion="Install it (brew install ffmpeg / apt install ffmpeg) and re-run.",
+    )
+
+
+def _run_ffmpeg(args: list[str]) -> subprocess.CompletedProcess[str]:
+    """Run one ffmpeg command line with captured text output; the seam tests replace."""
+    return subprocess.run(args, check=False, text=True, capture_output=True)
+
+
+def _burn(ffmpeg: str, media: Path, srt: Path, out: Path, font_size: int | None) -> None:
+    """Re-encode ``media`` with the ``srt`` captions drawn into the picture, writing ``out``.
+
+    The captions become pixels, so the video is always re-encoded; the audio
+    stream, if any, is copied as-is. A non-zero ffmpeg exit raises CLIError
+    (``caption_failed``) quoting the last line of ffmpeg's stderr, or the exit
+    code when stderr is empty.
+    """
+    command = [
+        ffmpeg,
+        "-hide_banner",
+        *("-loglevel", "error"),
+        # Overwrite an earlier output rather than stall on ffmpeg's prompt.
+        "-y",
+        *("-i", str(media)),
+        *("-vf", subtitles_filter(srt, font_size)),
+        # Explicit video map: audio-only input fails ("matches no streams")
+        # instead of yielding a silent uncaptioned copy.
+        *("-map", "0:v"),
+        # Optional audio map: a video with no audio track stays legal.
+        *("-map", "0:a?"),
+        # Audio is stream-copied; only the video is re-encoded.
+        *("-c:a", "copy"),
+        str(out),
+    ]
+    finished = _run_ffmpeg(command)
+    if finished.returncode == 0:
+        return
+    stderr_lines = finished.stderr.strip().splitlines()
+    if stderr_lines:
+        reason = stderr_lines[-1]
+    else:
+        reason = f"ffmpeg exited with code {finished.returncode}"
+    raise CLIError(
+        f"Could not write {out.name}: {reason}",
+        error_type="caption_failed",
+        suggestion="Check that the input is a readable video file — captions "
+        "can't be burned into audio-only media.",
+    )
+
+
+def _resolve_transcript(
+    opts: CaptionOptions, media: Path, state: AppState, *, json_mode: bool
+) -> object:
+    """Fetch the transcript named by ``--transcript-id``, or — when none was
+    given — transcribe the local ``media`` file inside an ``output.status``."""
+    api_key = state.resolve_api_key()
+    if opts.transcript_id is None:
+        with output.status("Transcribing for captions…", json_mode=json_mode, quiet=state.quiet):
+            return client.transcribe(api_key, str(media), config=aai.TranscriptionConfig())
+    return client.get_transcript(api_key, opts.transcript_id)
+
+
+def _fetch_srt(transcript: object, opts: CaptionOptions, *, json_mode: bool, quiet: bool) -> str:
+    """Export ``transcript`` as SRT text; a blank export raises ``no_captions``."""
+    with output.status("Fetching captions…", json_mode=json_mode, quiet=quiet):
+        exported = client.select_transcript_field(
+            transcript, "srt", chars_per_caption=opts.chars_per_caption
+        )
+    if exported.strip():
+        return exported
+    transcript_id = str(getattr(transcript, "id", ""))
+    raise CLIError(
+        f"Transcript {transcript_id} has no captions to burn in.",
+        error_type="no_captions",
+        exit_code=2,
+        suggestion="The media may contain no speech; check it with "
+        "'assembly transcribe <file>'.",
+    )
+
+
+def run_caption(opts: CaptionOptions, state: AppState, *, json_mode: bool) -> None:
+    """Run one `assembly caption` invocation described by already-parsed ``opts``."""
+    ffmpeg = _require_ffmpeg()
+    if not youtube.is_downloadable_url(opts.media):
+        if opts.media.startswith(("http://", "https://")):
+            raise UsageError(
+                "assembly caption can't fetch this URL; it captions a local file or a "
+                "media-page URL yt-dlp can download (YouTube, …).",
+                suggestion="Download the video first, then caption the local copy.",
+            )
+        media = Path(opts.media)
+        _validate_media(media)
+        out = opts.out if opts.out is not None else default_out_path(media)
+        _validate_out(out, media)
+        _caption_and_emit(opts, media, out, ffmpeg, state, json_mode=json_mode)
+        return
+    # Media-page URL (YouTube, …): the full video is downloaded once into a
+    # temporary directory. That directory is removed afterwards, so the default
+    # output is placed in the current directory rather than next to the download.
+    with tempfile.TemporaryDirectory(prefix="aai-caption-src-") as td:
+        with output.status("Downloading video…", json_mode=json_mode, quiet=state.quiet):
+            local = youtube.download_media(opts.media, Path(td), video=True)
+        out = opts.out if opts.out is not None else Path.cwd() / default_out_path(local).name
+        _validate_out(out, local)
+        _caption_and_emit(opts, local, out, ffmpeg, state, json_mode=json_mode)
+
+
+def _caption_and_emit(
+    opts: CaptionOptions,
+    media: Path,
+    out: Path,
+    ffmpeg: str,
+    state: AppState,
+    *,
+    json_mode: bool,
+) -> None:
+    """Caption the already-local ``media`` into ``out``, then report via ``output.emit``."""
+    transcript = _resolve_transcript(opts, media, state, json_mode=json_mode)
+    transcript_id = str(getattr(transcript, "id", ""))
+    srt = _fetch_srt(transcript, opts, json_mode=json_mode, quiet=state.quiet)
+    cue_count = srt.count("-->")  # every SRT cue's timing line carries one arrow
+    with tempfile.TemporaryDirectory(prefix="aai-caption-") as tmp:
+        srt_path = Path(tmp, "captions.srt")  # staged on disk for the subtitles filter
+        srt_path.write_text(srt, encoding="utf-8")
+        with output.status("Burning captions…", json_mode=json_mode, quiet=state.quiet):
+            _burn(ffmpeg, media, srt_path, out, opts.font_size)
+    payload: dict[str, object] = {
+        "source": opts.media,
+        "out": str(out),
+        "transcript_id": transcript_id,
+        "captions": cue_count,
+    }
+    output.emit(
+        payload,
+        lambda _: output.success(f"{escape(str(out))}  {cue_count} caption(s) burned in"),
+        json_mode=json_mode,
+    )
diff --git a/aai_cli/clip_exec.py b/aai_cli/clip_exec.py
@@ -54,6 +54,7 @@ class ClipOptions:
     padding: float
     snap: bool
     out_dir: Path | None
+    video: bool
 
 
 def _llm_segments(
@@ -347,15 +348,19 @@ def run_clip(opts: ClipOptions, state: AppState, *, json_mode: bool) -> None:
     """Execute one `assembly clip` invocation from already-parsed flags."""
     _validate_out_dir(opts.out_dir)
     _validate_selection(opts)
+    youtube.validate_video_flag(opts.media, video=opts.video)
     explicit = [clip_select.parse_range(value) for value in opts.ranges]
     ffmpeg = _require_ffmpeg()
     if youtube.is_downloadable_url(opts.media):
-        # A media-page URL (YouTube, podcast page, …) is downloaded once and
-        # clipped locally. The download dir is temporary, so the clips land in
-        # --out-dir or the current directory — never next to the temp file.
+        # A media-page URL (YouTube, podcast page, …) is downloaded once — the
+        # audio track by default, the full video with --video so the clips carry
+        # video too — and clipped locally. The download dir is temporary, so the
+        # clips land in --out-dir or the current directory — never next to the
+        # temp file.
+        downloading = "Downloading video…" if opts.video else "Downloading audio…"
         with tempfile.TemporaryDirectory(prefix="aai-clip-") as td:
-            with output.status("Downloading audio…", json_mode=json_mode, quiet=state.quiet):
-                local = youtube.download_audio(opts.media, Path(td))
+            with output.status(downloading, json_mode=json_mode, quiet=state.quiet):
+                local = youtube.download_media(opts.media, Path(td), video=opts.video)
             out_dir = opts.out_dir if opts.out_dir is not None else Path.cwd()
             _cut_and_emit(opts, local, out_dir, explicit, ffmpeg, state, json_mode=json_mode)
         return

diff --git a/aai_cli/commands/caption.py b/aai_cli/commands/caption.py
@@ -0,0 +1,85 @@
+from __future__ import annotations
+
+from pathlib import Path
+
+import typer
+
+from aai_cli import caption_exec, help_panels, options
+from aai_cli.context import run_command
+from aai_cli.help_text import examples_epilog
+
+app = typer.Typer()  # Typer app that the `caption` command below registers on
+
+
+@app.command(
+    rich_help_panel=help_panels.TRANSCRIPTION,
+    epilog=examples_epilog(  # (description, command line) pairs
+        [
+            ("Burn captions into a video", "assembly caption talk.mp4"),
+            (
+                "Caption a YouTube video (downloaded via yt-dlp)",
+                'assembly caption "https://youtube.com/watch?v=ID"',
+            ),
+            (
+                "Reuse a finished transcript instead of re-transcribing",
+                "assembly caption talk.mp4 -t TRANSCRIPT_ID",
+            ),
+            (
+                "Shorter caption lines in a bigger font",
+                "assembly caption talk.mp4 --chars-per-caption 32 --font-size 28",
+            ),
+            ("Choose the output file", "assembly caption talk.mp4 --out talk-captioned.mp4"),
+        ]
+    ),
+)
+def caption(
+    ctx: typer.Context,
+    media: str = typer.Argument(  # a str, not a Path: the source may be a URL
+        ...,
+        help="Video to caption: a local file, or a YouTube/media-page URL "
+        "(the full video is downloaded via yt-dlp).",
+    ),
+    transcript_id: str | None = typer.Option(
+        None,
+        "--transcript-id",
+        "-t",
+        help="Reuse an existing transcript of this media instead of transcribing it again.",
+    ),
+    chars_per_caption: int | None = typer.Option(
+        None,
+        "--chars-per-caption",
+        min=1,
+        help="Max characters per caption line.",
+    ),
+    font_size: int | None = typer.Option(
+        None,
+        "--font-size",
+        min=1,
+        help="Font size of the burned-in captions (ffmpeg's default styling when omitted).",
+    ),
+    out: Path | None = typer.Option(
+        None, "--out", help="Output file (default: <name>.captioned<ext> next to the input)."
+    ),
+    json_out: bool = options.json_option("Emit JSON describing the captioned file."),
+) -> None:
+    """Burn always-visible captions into a video.
+
+    The video is transcribed (or an existing transcript is reused with
+    --transcript-id), the transcript's SRT captions are fetched, and ffmpeg
+    (which must be installed) burns them into the picture as open captions —
+    the audio stream is copied untouched. A YouTube/media-page URL is
+    downloaded first (always the full video); its output lands in --out or
+    the current directory.
+    """
+    opts = caption_exec.CaptionOptions(  # argv parsing only; the pipeline runs in caption_exec
+        media=media,
+        transcript_id=transcript_id,
+        chars_per_caption=chars_per_caption,
+        font_size=font_size,
+        out=out,
+    )
+    run_command(
+        ctx,
+        lambda state, json_mode: caption_exec.run_caption(opts, state, json_mode=json_mode),
+        json=json_out,  # --json is not part of CaptionOptions; it is handed to run_command
+    )