Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .importlinter
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ source_modules =
aai_cli.agent_exec
aai_cli.argscan
aai_cli.auth
aai_cli.caption_exec
aai_cli.client
aai_cli.clip_exec
aai_cli.clip_select
Expand Down Expand Up @@ -61,6 +62,7 @@ modules =
aai_cli.commands.account
aai_cli.commands.agent
aai_cli.commands.audit
aai_cli.commands.caption
aai_cli.commands.clip
aai_cli.commands.deploy
aai_cli.commands.dev
Expand Down
238 changes: 238 additions & 0 deletions aai_cli/caption_exec.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,238 @@
"""Run logic for `assembly caption`: transcribe → SRT export → ffmpeg burn-in.

The command module (aai_cli/commands/caption.py) only parses argv — it builds a
``CaptionOptions`` and hands it to ``run_caption`` via ``context.run_command``
(the options/run split, see AGENTS.md), so tests drive the whole pipeline by
constructing options directly.

The pipeline: the video is transcribed (or an existing transcript is reused via
``--transcript-id``), the transcript's SRT captions are fetched from the export
endpoint, and ffmpeg's ``subtitles`` filter burns them into the picture (open
captions, always visible) while the audio stream is copied untouched. A
YouTube/media-page URL is downloaded first — always the full video, since the
captions are burned into it.
"""

from __future__ import annotations

import shutil
import subprocess
import tempfile
from dataclasses import dataclass
from pathlib import Path

import assemblyai as aai
from rich.markup import escape

from aai_cli import client, output, youtube
from aai_cli.context import AppState
from aai_cli.errors import CLIError, UsageError


@dataclass(frozen=True)
class CaptionOptions:
    """Every `assembly caption` flag as plain, immutable data.

    ``--json`` is excluded on purpose: run_command resolves it into the
    ``json_mode`` argument instead of storing it here.
    """

    # The raw source as typed: a local path, or a downloadable media-page URL.
    # Kept as ``str`` because a pathlib.Path would collapse the "//" in
    # "https://".
    media: str
    # Existing transcript to reuse; None means the media is transcribed afresh.
    transcript_id: str | None
    # Forwarded to the SRT export as ``chars_per_caption``; None leaves it unset.
    chars_per_caption: int | None
    # FontSize forced onto the burned-in subtitles; None adds no force_style.
    font_size: int | None
    # Explicit output file; None selects the default ``<stem>.captioned<ext>``.
    out: Path | None


def default_out_path(media: Path) -> Path:
    """Return the fallback output path: ``<stem>.captioned<ext>`` beside ``media``."""
    captioned_name = "".join((media.stem, ".captioned", media.suffix))
    return media.parent / captioned_name


# ffmpeg unescapes a path embedded in `-vf subtitles=…` twice, stripping one
# layer of backslashes each time:
#   1. the filtergraph parser, where \ ' [ ] , ; are special (filter/chain
#      separators, stream labels, quoting), and then
#   2. the filter's own option parser, where \ ' : are special (option
#      separator, quoting).
# A single layer of escaping is consumed by pass 1, so a ':' , "'" or '\' in
# the path (a Windows drive letter, a TMPDIR with an apostrophe) would reach
# pass 2 bare and corrupt the filter spec. The path is therefore escaped for
# the option parser first, and that result is escaped again for the
# filtergraph parser.
_OPTION_ESCAPES = str.maketrans({ch: f"\\{ch}" for ch in "\\':"})
_FILTER_ESCAPES = str.maketrans({ch: f"\\{ch}" for ch in "\\'[],;"})


def subtitles_filter(srt: Path, font_size: int | None) -> str:
    """Build the ``-vf`` filtergraph that burns ``srt`` into the video.

    Args:
        srt: Path of the SRT file handed to ffmpeg's ``subtitles`` filter.
        font_size: When not None, appended as ``force_style=FontSize=<n>``.

    Returns:
        The filter spec, with the path escaped at both the filter-option and
        the filtergraph level so ffmpeg recovers it verbatim.
    """
    # Order matters: inner (option) level first, outer (filtergraph) level last.
    escaped = str(srt).translate(_OPTION_ESCAPES).translate(_FILTER_ESCAPES)
    spec = f"subtitles={escaped}"
    if font_size is not None:
        spec += f":force_style=FontSize={font_size}"
    return spec


def _validate_media(media: Path) -> None:
    """Fail fast when the local source is missing or is not a regular file.

    Runs ahead of credential resolution, so a mistyped path is reported as
    "file not found" rather than as a login prompt or an ffmpeg error.

    Raises:
        CLIError: ``file_not_found`` when nothing exists at ``media``, or
            ``not_a_file`` when something exists there that is not a file.
            Both use exit code 2.
    """
    if not media.exists():
        message, kind = f"File not found: {media}", "file_not_found"
        hint = "Check the path. assembly caption needs a local video file."
    elif not media.is_file():
        message, kind = f"Not a file: {media}", "not_a_file"
        hint = "Pass a video file, not a directory."
    else:
        return
    raise CLIError(message, error_type=kind, exit_code=2, suggestion=hint)


def _validate_out(out: Path, media: Path) -> None:
    """Refuse an output path that resolves to the input file itself.

    ffmpeg would otherwise read and write the same file concurrently and
    corrupt it.

    Raises:
        UsageError: when ``out`` and ``media`` resolve to the same path.
    """
    target, source = out.resolve(), media.resolve()
    if target != source:
        return
    raise UsageError(
        "--out would overwrite the input file.",
        suggestion="Pick a different output path.",
    )


def _require_ffmpeg() -> str:
    """Locate the ffmpeg executable on PATH and return its path.

    Called before any (billed) transcription work so a missing dependency is
    reported up front.

    Raises:
        CLIError: ``missing_dependency`` when ffmpeg is not on PATH.
    """
    located = shutil.which("ffmpeg")
    if located is not None:
        return located
    raise CLIError(
        "ffmpeg is required to burn captions into video, but it isn't on PATH.",
        error_type="missing_dependency",
        suggestion="Install it (brew install ffmpeg / apt install ffmpeg) and re-run.",
    )


def _run_ffmpeg(args: list[str]) -> subprocess.CompletedProcess[str]:
    """Run one ffmpeg command line and hand back the completed process.

    This is the boundary seam for tests. stdout and stderr are captured as
    text, and a non-zero exit status is returned to the caller rather than
    raised (``check=False``).
    """
    completed = subprocess.run(args, capture_output=True, text=True, check=False)
    return completed


def _burn(ffmpeg: str, media: Path, srt: Path, out: Path, font_size: int | None) -> None:
    """Render ``out`` from ``media`` with the ``srt`` captions drawn into the picture.

    The video stream has to be re-encoded, because the captions become
    pixels; the audio is carried over untouched with ``-c:a copy``.

    Raises:
        CLIError: ``caption_failed`` when ffmpeg exits non-zero. The reason is
            the last line of ffmpeg's stderr, or the exit code when stderr is
            empty.
    """
    command = [ffmpeg, "-hide_banner", "-loglevel", "error"]
    # -y: a re-run overwrites its own earlier output instead of stalling on
    # ffmpeg's interactive overwrite prompt.
    command.append("-y")
    command += ["-i", str(media)]
    command += ["-vf", subtitles_filter(srt, font_size)]
    # The explicit "0:v" turns audio-only input into an ffmpeg error ("matches
    # no streams") rather than a silent uncaptioned copy; the trailing "?" on
    # "0:a?" keeps a video without audio legal.
    command += ["-map", "0:v", "-map", "0:a?"]
    command += ["-c:a", "copy"]
    command.append(str(out))

    completed = _run_ffmpeg(command)
    if completed.returncode == 0:
        return

    stderr_lines = completed.stderr.strip().splitlines()
    if stderr_lines:
        reason = stderr_lines[-1]
    else:
        reason = f"ffmpeg exited with code {completed.returncode}"
    raise CLIError(
        f"Could not write {out.name}: {reason}",
        error_type="caption_failed",
        suggestion="Check that the input is a readable video file — captions "
        "can't be burned into audio-only media.",
    )


def _resolve_transcript(
    opts: CaptionOptions, media: Path, state: AppState, *, json_mode: bool
) -> object:
    """Return the transcript whose captions get burned in.

    With ``opts.transcript_id`` set, that transcript is fetched by id.
    Otherwise the (already local) ``media`` file is transcribed with a default
    ``TranscriptionConfig`` while a status message is shown.
    """
    api_key = state.resolve_api_key()
    existing_id = opts.transcript_id
    if existing_id is None:
        with output.status("Transcribing for captions…", json_mode=json_mode, quiet=state.quiet):
            fresh = client.transcribe(api_key, str(media), config=aai.TranscriptionConfig())
        return fresh
    return client.get_transcript(api_key, existing_id)


def _fetch_srt(transcript: object, opts: CaptionOptions, *, json_mode: bool, quiet: bool) -> str:
    """Fetch the transcript's SRT captions from the export endpoint.

    ``opts.chars_per_caption`` is forwarded to the export unchanged.

    Raises:
        CLIError: ``no_captions`` (exit code 2) when the SRT is empty or only
            whitespace.
    """
    with output.status("Fetching captions…", json_mode=json_mode, quiet=quiet):
        srt = client.select_transcript_field(
            transcript,
            "srt",
            chars_per_caption=opts.chars_per_caption,
        )
    if srt.strip():
        return srt

    # Nothing to burn in: name the transcript in the error so it can be inspected.
    transcript_id = str(getattr(transcript, "id", ""))
    raise CLIError(
        f"Transcript {transcript_id} has no captions to burn in.",
        error_type="no_captions",
        exit_code=2,
        suggestion="The media may contain no speech; check it with "
        "'assembly transcribe <file>'.",
    )


def run_caption(opts: CaptionOptions, state: AppState, *, json_mode: bool) -> None:
    """Run a single `assembly caption` invocation from already-parsed flags.

    ffmpeg is located first. The source is then handled as one of three cases:
    a local file, a downloadable media-page URL, or any other http(s) URL,
    which is rejected.

    Raises:
        UsageError: for an http(s) URL that cannot be downloaded.
    """
    ffmpeg = _require_ffmpeg()
    source = opts.media

    if not youtube.is_downloadable_url(source):
        if source.startswith(("http://", "https://")):
            raise UsageError(
                "assembly caption can't fetch this URL; it captions a local file or a "
                "media-page URL yt-dlp can download (YouTube, …).",
                suggestion="Download the video first, then caption the local copy.",
            )
        media = Path(source)
        _validate_media(media)
        out = default_out_path(media) if opts.out is None else opts.out
        _validate_out(out, media)
        _caption_and_emit(opts, media, out, ffmpeg, state, json_mode=json_mode)
        return

    # A media-page URL (YouTube, …) is downloaded once, and always as the full
    # video because the captions are burned into the picture. The download
    # directory is temporary, so the default output goes to the current
    # directory instead of next to the downloaded file.
    with tempfile.TemporaryDirectory(prefix="aai-caption-src-") as td:
        with output.status("Downloading video…", json_mode=json_mode, quiet=state.quiet):
            local = youtube.download_media(source, Path(td), video=True)
        out = opts.out
        if out is None:
            out = Path.cwd() / default_out_path(local).name
        _validate_out(out, local)
        _caption_and_emit(opts, local, out, ffmpeg, state, json_mode=json_mode)


def _caption_and_emit(
    opts: CaptionOptions,
    media: Path,
    out: Path,
    ffmpeg: str,
    state: AppState,
    *,
    json_mode: bool,
) -> None:
    """Caption a video file that is already local, write ``out``, and report it.

    Steps: resolve the transcript, fetch its SRT, write the SRT into a
    temporary directory, burn it into ``media`` with ffmpeg, then emit a
    payload with ``source``, ``out``, ``transcript_id`` and ``captions``.
    """
    transcript = _resolve_transcript(opts, media, state, json_mode=json_mode)
    transcript_id = str(getattr(transcript, "id", ""))
    srt_text = _fetch_srt(transcript, opts, json_mode=json_mode, quiet=state.quiet)
    # Each SRT cue has one "-->" timing arrow, so the arrow count is the cue count.
    cue_count = srt_text.count("-->")

    # ffmpeg reads the captions from disk; the file only has to live for the burn.
    with tempfile.TemporaryDirectory(prefix="aai-caption-") as workdir:
        srt_file = Path(workdir, "captions.srt")
        srt_file.write_text(srt_text, encoding="utf-8")
        with output.status("Burning captions…", json_mode=json_mode, quiet=state.quiet):
            _burn(ffmpeg, media, srt_file, out, opts.font_size)

    summary = f"{escape(str(out))} {cue_count} caption(s) burned in"
    payload: dict[str, object] = {
        "source": opts.media,
        "out": str(out),
        "transcript_id": transcript_id,
        "captions": cue_count,
    }
    output.emit(payload, lambda _: output.success(summary), json_mode=json_mode)
15 changes: 10 additions & 5 deletions aai_cli/clip_exec.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,7 @@ class ClipOptions:
padding: float
snap: bool
out_dir: Path | None
video: bool


def _llm_segments(
Expand Down Expand Up @@ -347,15 +348,19 @@ def run_clip(opts: ClipOptions, state: AppState, *, json_mode: bool) -> None:
"""Execute one `assembly clip` invocation from already-parsed flags."""
_validate_out_dir(opts.out_dir)
_validate_selection(opts)
youtube.validate_video_flag(opts.media, video=opts.video)
explicit = [clip_select.parse_range(value) for value in opts.ranges]
ffmpeg = _require_ffmpeg()
if youtube.is_downloadable_url(opts.media):
# A media-page URL (YouTube, podcast page, …) is downloaded once and
# clipped locally. The download dir is temporary, so the clips land in
# --out-dir or the current directory — never next to the temp file.
# A media-page URL (YouTube, podcast page, …) is downloaded once — the
# audio track by default, the full video with --video so the clips carry
# video too — and clipped locally. The download dir is temporary, so the
# clips land in --out-dir or the current directory — never next to the
# temp file.
downloading = "Downloading video…" if opts.video else "Downloading audio…"
with tempfile.TemporaryDirectory(prefix="aai-clip-") as td:
with output.status("Downloading audio…", json_mode=json_mode, quiet=state.quiet):
local = youtube.download_audio(opts.media, Path(td))
with output.status(downloading, json_mode=json_mode, quiet=state.quiet):
local = youtube.download_media(opts.media, Path(td), video=opts.video)
out_dir = opts.out_dir if opts.out_dir is not None else Path.cwd()
_cut_and_emit(opts, local, out_dir, explicit, ffmpeg, state, json_mode=json_mode)
return
Expand Down
85 changes: 85 additions & 0 deletions aai_cli/commands/caption.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
from __future__ import annotations

from pathlib import Path

import typer

from aai_cli import caption_exec, help_panels, options
from aai_cli.context import run_command
from aai_cli.help_text import examples_epilog

app = typer.Typer()

# (description, command line) pairs handed to examples_epilog to build the
# command's epilog.
_CAPTION_EXAMPLES = [
    ("Burn captions into a video", "assembly caption talk.mp4"),
    (
        "Caption a YouTube video (downloaded via yt-dlp)",
        'assembly caption "https://youtube.com/watch?v=ID"',
    ),
    (
        "Reuse a finished transcript instead of re-transcribing",
        "assembly caption talk.mp4 -t TRANSCRIPT_ID",
    ),
    (
        "Shorter caption lines in a bigger font",
        "assembly caption talk.mp4 --chars-per-caption 32 --font-size 28",
    ),
    ("Choose the output file", "assembly caption talk.mp4 --out talk-captioned.mp4"),
]


@app.command(
    rich_help_panel=help_panels.TRANSCRIPTION,
    epilog=examples_epilog(_CAPTION_EXAMPLES),
)
def caption(
    ctx: typer.Context,
    media: str = typer.Argument(
        ...,
        help="Video to caption: a local file, or a YouTube/media-page URL "
        "(the full video is downloaded via yt-dlp).",
    ),
    transcript_id: str | None = typer.Option(
        None,
        "--transcript-id",
        "-t",
        help="Reuse an existing transcript of this media instead of transcribing it again.",
    ),
    chars_per_caption: int | None = typer.Option(
        None,
        "--chars-per-caption",
        min=1,
        help="Max characters per caption line.",
    ),
    font_size: int | None = typer.Option(
        None,
        "--font-size",
        min=1,
        help="Font size of the burned-in captions (ffmpeg's default styling when omitted).",
    ),
    out: Path | None = typer.Option(
        None, "--out", help="Output file (default: <name>.captioned<ext> next to the input)."
    ),
    json_out: bool = options.json_option("Emit JSON describing the captioned file."),
) -> None:
    """Burn always-visible captions into a video.

    The video is transcribed (or an existing transcript is reused with
    --transcript-id), the transcript's SRT captions are fetched, and ffmpeg
    (which must be installed) burns them into the picture as open captions —
    the audio stream is copied untouched. A YouTube/media-page URL is
    downloaded first (always the full video); its output lands in --out or
    the current directory.
    """
    # This function only turns argv into plain data; the pipeline itself lives
    # in caption_exec.run_caption. --json is not stored on the options object:
    # it goes to run_command, which supplies the resolved json_mode.
    opts = caption_exec.CaptionOptions(
        out=out,
        font_size=font_size,
        chars_per_caption=chars_per_caption,
        transcript_id=transcript_id,
        media=media,
    )
    run_command(
        ctx,
        lambda state, json_mode: caption_exec.run_caption(opts, state, json_mode=json_mode),
        json=json_out,
    )
Loading
Loading