AssemblyAI · alexkroman · Jun 12, 2026 · Jun 12, 2026
diff --git a/aai_cli/choices.py b/aai_cli/choices.py
@@ -17,6 +17,7 @@ class TranscriptOutput(enum.StrEnum):
     status = "status"
     utterances = "utterances"
     srt = "srt"
+    vtt = "vtt"
     json = "json"
 
 

diff --git a/aai_cli/client.py b/aai_cli/client.py
@@ -3,6 +3,7 @@
 import contextlib
 import json
 import re
+from abc import abstractmethod
 from collections.abc import Callable, Generator, Iterable
 from pathlib import Path
 from typing import Any, Literal, Protocol
@@ -222,24 +223,61 @@ def _render_utterances(transcript: Any) -> str:
     )
 
 
-def _export_srt(transcript: Any) -> str:
+class _SubtitleTranscript(Protocol):
+    """The slice of ``aai.Transcript`` the subtitle renderers touch."""
+
+    @abstractmethod
+    def export_subtitles_srt(self, chars_per_caption: int | None) -> str:
+        """Fetch the transcript's SRT captions."""
+
+    @abstractmethod
+    def export_subtitles_vtt(self, chars_per_caption: int | None) -> str:
+        """Fetch the transcript's VTT captions."""
+
+
+def _export_srt(transcript: _SubtitleTranscript, chars_per_caption: int | None) -> str:
     # The SDK fetches SRT from the `/srt` export endpoint, so this hits the network.
     with _sdk_errors("Could not export SRT subtitles"):
-        return str(transcript.export_subtitles_srt())
+        return str(transcript.export_subtitles_srt(chars_per_caption=chars_per_caption))
+
+
+def _export_vtt(transcript: _SubtitleTranscript, chars_per_caption: int | None) -> str:
+    # The SDK fetches VTT from the `/vtt` export endpoint, so this hits the network.
+    with _sdk_errors("Could not export VTT subtitles"):
+        return str(transcript.export_subtitles_vtt(chars_per_caption=chars_per_caption))
+
 
+# Subtitle fields hit an export endpoint and take the --chars-per-caption knob.
+_SUBTITLE_RENDERERS: dict[str, Callable[[_SubtitleTranscript, int | None], str]] = {
+    "srt": _export_srt,
+    "vtt": _export_vtt,
+}
 
 # Output field -> renderer. Fields absent here fall back to the plain transcript text.
 _FIELD_RENDERERS: dict[str, Callable[[Any], str]] = {
     "id": lambda t: str(getattr(t, "id", "") or ""),
     "status": status_str,
     "utterances": _render_utterances,
-    "srt": _export_srt,
     "json": lambda t: json.dumps(transcript_json_payload(t), default=str),
 }
 
 
-def select_transcript_field(transcript: Any, field: str) -> str:
+def validate_chars_per_caption(chars_per_caption: int | None, field: str | None) -> None:
+    """Reject ``--chars-per-caption`` unless ``-o`` selects a subtitle export (srt or vtt)."""
+    if chars_per_caption is not None and field not in _SUBTITLE_RENDERERS:
+        raise UsageError(
+            "--chars-per-caption only applies to subtitle output.",
+            suggestion="Add -o srt or -o vtt.",
+        )
+
+
+def select_transcript_field(
+    transcript: Any, field: str, *, chars_per_caption: int | None = None
+) -> str:
     """Render a single transcript field for ``-o/--output``."""
+    subtitles = _SUBTITLE_RENDERERS.get(field)
+    if subtitles is not None:
+        return subtitles(transcript, chars_per_caption)
     return _FIELD_RENDERERS.get(field, _transcript_text)(transcript)
 
 

diff --git a/aai_cli/code_gen/transcribe.py b/aai_cli/code_gen/transcribe.py
@@ -7,7 +7,7 @@
 
 # ``-o/--output`` choice -> printed-result code, mirroring the run path's
 # ``client._FIELD_RENDERERS`` semantics: plain fields, the speaker-labeled
-# utterances loop, the SRT export endpoint, and the raw ``json_response`` payload.
+# utterances loop, the SRT/VTT export endpoints, and the raw ``json_response`` payload.
 _OUTPUT_SNIPPETS: dict[str, str] = {
     "text": "print(transcript.text)",
     "id": "print(transcript.id)",
@@ -16,16 +16,21 @@
         'for utt in transcript.utterances or []:\n    print(f"Speaker {utt.speaker}: {utt.text}")'
     ),
     "srt": "print(transcript.export_subtitles_srt())",
+    "vtt": "print(transcript.export_subtitles_vtt())",
     "json": "print(json.dumps(transcript.json_response, default=str))",
 }
 
+# The subtitle exports take the --chars-per-caption knob as a kwarg.
+_SUBTITLE_FORMATS = ("srt", "vtt")
+
 
 def render(
     merged: dict[str, object],
     source: str,
     *,
     llm_gateway: dict[str, object] | None = None,
     output: str | None = None,
+    chars_per_caption: int | None = None,
     download_sections: list[str] | None = None,
 ) -> str:
     """Generate a runnable transcribe script reproducing this CLI invocation.
@@ -37,7 +42,7 @@ def render(
 
     When `output` (a ``-o/--output`` field name) is given, the script prints that one
     field instead — and, as in the real command, it takes precedence over the LLM chain
-    and the analysis sections.
+    and the analysis sections. `chars_per_caption` shapes the srt/vtt export calls.
 
     When `download_sections` (yt-dlp ``--download-sections`` specs) is given for a
     downloadable URL, the generated yt-dlp call fetches only those parts of the source.
@@ -57,7 +62,7 @@ def render(
             has_sections=ranges_expr is not None,
         )
         + _transcribe_block(merged, source, needs_download=needs_download, ranges_expr=ranges_expr)
-        + _result_block(merged, llm_gateway, output)
+        + _result_block(merged, llm_gateway, output, chars_per_caption)
     )
     parts.append("")
     return "\n".join(parts)
@@ -183,10 +188,18 @@ def _transcribe_block(
 
 
 def _result_block(
-    merged: dict[str, object], llm_gateway: dict[str, object] | None, output: str | None
+    merged: dict[str, object],
+    llm_gateway: dict[str, object] | None,
+    output: str | None,
+    chars_per_caption: int | None,
 ) -> list[str]:
     """The printed-result lines: one ``-o`` field, the LLM chain, or the analysis sections."""
     if output is not None:
+        if output in _SUBTITLE_FORMATS and chars_per_caption is not None:
+            return [
+                f"print(transcript.export_subtitles_{output}"
+                f"(chars_per_caption={chars_per_caption}))"
+            ]
         # Unknown names fall back to the plain text, like select_transcript_field does.
         return [_OUTPUT_SNIPPETS.get(output, _OUTPUT_SNIPPETS["text"])]
     if llm_gateway:

diff --git a/aai_cli/commands/transcribe.py b/aai_cli/commands/transcribe.py
@@ -320,8 +320,9 @@ def transcribe(
         None,
         "-o",
         "--output",
-        help="Print one field: text, id, status, utterances, srt (captions), or json.",
+        help="Print one field: text, id, status, utterances, srt or vtt (captions), or json.",
     ),
+    chars_per_caption: int | None = options.chars_per_caption_option(),
     out: Path | None = typer.Option(
         None,
         "--out",
@@ -397,6 +398,7 @@ def transcribe(
         model=model,
         max_tokens=max_tokens,
         output_field=output_field,
+        chars_per_caption=chars_per_caption,
         out=out,
         show_code=show_code,
     )

diff --git a/aai_cli/commands/transcripts.py b/aai_cli/commands/transcripts.py
@@ -62,6 +62,7 @@ def render(data: list[dict[str, object]]) -> object:
             ("Fetch a transcript's text by id", "assembly transcripts get 5551234-abcd"),
             ("Speaker-labeled turns", "assembly transcripts get 5551234-abcd -o utterances"),
             ("Save SRT subtitles", "assembly transcripts get 5551234-abcd -o srt > captions.srt"),
+            ("Save VTT subtitles", "assembly transcripts get 5551234-abcd -o vtt > captions.vtt"),
             ("Get the raw JSON", "assembly transcripts get 5551234-abcd --json"),
         ]
     )
@@ -75,14 +76,16 @@ def get(
         "--output",
         help="Print one field of the result.",
     ),
+    chars_per_caption: int | None = options.chars_per_caption_option(),
     json_out: bool = options.json_option(),
 ) -> None:
     """Fetch a past transcript by id and print its text."""
 
     def body(state: AppState, json_mode: bool) -> None:
-        # Cheap local id validation first: a malformed id is a usage error whether
-        # or not the user is signed in, so it must not trigger auth/login first.
+        # Cheap local validation first: a malformed id or flag conflict is a usage
+        # error whether or not the user is signed in, so it must not trigger auth.
         client.validate_transcript_id(transcript_id)
+        client.validate_chars_per_caption(chars_per_caption, output_field)
         api_key = state.resolve_api_key()
         transcript = client.get_transcript(api_key, transcript_id)
         if client.status_str(transcript) == "error":
@@ -92,7 +95,11 @@ def body(state: AppState, json_mode: bool) -> None:
             )
         if output_field is not None:
             # Raw single-field output for pipelines (overrides --json), matching `transcribe`.
-            output.emit_text(client.select_transcript_field(transcript, output_field))
+            output.emit_text(
+                client.select_transcript_field(
+                    transcript, output_field, chars_per_caption=chars_per_caption
+                )
+            )
             return
         if json_mode:
             # The full SDK payload, identical to `assembly transcribe … --json`, so the

diff --git a/aai_cli/options.py b/aai_cli/options.py
@@ -19,6 +19,17 @@ def json_option(help_text: str = "Output raw JSON.") -> bool:
     return flag
 
 
+def chars_per_caption_option() -> int | None:
+    """The ``--chars-per-caption`` knob for the ``-o srt``/``-o vtt`` subtitle exports."""
+    value: int | None = typer.Option(
+        None,
+        "--chars-per-caption",
+        min=1,
+        help="Max characters per caption line (only with -o srt or -o vtt).",
+    )
+    return value
+
+
 # Batch-mode flags for `transcribe` (see transcribe_batch.py). Defined here because
 # this module owns the FBT003 carve-out for Typer's boolean positional defaults.
 

diff --git a/aai_cli/skills/aai-cli/references/history.md b/aai_cli/skills/aai-cli/references/history.md
@@ -32,8 +32,9 @@ Fetch a past transcript by id and print its text.
 
 Key options:
 
-- `-o/--output text|id|status|utterances|srt|json` — print one field; omit for
-  the default human view.
+- `-o/--output text|id|status|utterances|srt|vtt|json` — print one field; omit
+  for the default human view. `--chars-per-caption N` caps caption line length
+  for the srt/vtt exports.
 - `--json` — full raw JSON.
 
 Examples:

diff --git a/aai_cli/skills/aai-cli/references/transcription.md b/aai_cli/skills/aai-cli/references/transcription.md
@@ -25,7 +25,8 @@ High-value flags (run `assembly transcribe --help` for the full set):
   `--config-file config.json`.
 - Post-process: `--llm "PROMPT"` (repeatable; chains over the transcript via LLM
   Gateway), `--translate-to es` (repeatable).
-- Output: `-o text|id|status|utterances|srt|json`, `--json`, `--show-code`.
+- Output: `-o text|id|status|utterances|srt|vtt|json`, `--chars-per-caption N`
+  (caption line length, with `-o srt`/`-o vtt`), `--json`, `--show-code`.
 
 Examples:
 

diff --git a/aai_cli/transcribe_exec.py b/aai_cli/transcribe_exec.py
@@ -135,12 +135,15 @@ def out_payload(
     transcript: aai.Transcript,
     output_field: choices.TranscriptOutput | None,
     *,
+    chars_per_caption: int | None,
     json_mode: bool,
 ) -> str:
     """The text to write for ``--out``: the chosen ``-o`` field, the ``--json`` payload,
     or the plain transcript text — the same content stdout would get, as a file artifact."""
     if output_field is not None:
-        return client.select_transcript_field(transcript, output_field)
+        return client.select_transcript_field(
+            transcript, output_field, chars_per_caption=chars_per_caption
+        )
     if json_mode:
         return json.dumps(client.transcript_json_payload(transcript), default=str)
     return client.select_transcript_field(transcript, choices.TranscriptOutput.text)
@@ -197,6 +200,7 @@ def deliver_result(
     api_key: str,
     out: Path | None,
     output_field: choices.TranscriptOutput | None,
+    chars_per_caption: int | None,
     transform: TransformOptions,
     json_mode: bool,
     quiet: bool,
@@ -206,14 +210,23 @@ def deliver_result(
     if out is not None:
         # Write a clean file artifact and confirm on stderr; stdout stays empty.
         # The path itself was validated up front by validate_out_path.
-        out.write_text(out_payload(transcript, output_field, json_mode=json_mode) + "\n")
+        out.write_text(
+            out_payload(
+                transcript, output_field, chars_per_caption=chars_per_caption, json_mode=json_mode
+            )
+            + "\n"
+        )
         if not quiet:
             output.error_console.print(output.success(f"Saved to {escape(str(out))}"))
         return
 
     if output_field is not None:
         # Raw single-field output for pipelines (overrides --json and analysis render).
-        output.emit_text(client.select_transcript_field(transcript, output_field))
+        output.emit_text(
+            client.select_transcript_field(
+                transcript, output_field, chars_per_caption=chars_per_caption
+            )
+        )
         return
 
     if transform.prompts:
@@ -295,6 +308,7 @@ class TranscribeOptions:
     model: str
     max_tokens: int
     output_field: choices.TranscriptOutput | None
+    chars_per_caption: int | None
     out: Path | None
     show_code: bool
 
@@ -366,6 +380,7 @@ def _print_show_code(opts: TranscribeOptions, merged: dict[str, object]) -> None
             audio,
             llm_gateway=gateway,
             output=opts.output_field,
+            chars_per_caption=opts.chars_per_caption,
             download_sections=list(opts.download_sections or []),
         )
     )
@@ -384,6 +399,7 @@ def run_transcribe(opts: TranscribeOptions, state: AppState, *, json_mode: bool)
     validate_out_with_llm(opts.out, opts.llm_prompt)
     validate_out_path(opts.out)
     validate_json_with_output(opts.output_field, json_mode=json_mode)
+    client.validate_chars_per_caption(opts.chars_per_caption, opts.output_field)
 
     merged = config_builder.merge_transcribe_config(
         flags=flags, overrides=opts.config_kv, config_file=opts.config_file
@@ -438,6 +454,7 @@ def run_transcribe(opts: TranscribeOptions, state: AppState, *, json_mode: bool)
         api_key=api_key,
         out=opts.out,
         output_field=opts.output_field,
+        chars_per_caption=opts.chars_per_caption,
         transform=TransformOptions(
             prompts=list(opts.llm_prompt or []), model=opts.model, max_tokens=opts.max_tokens
         ),

diff --git a/tests/__snapshots__/test_snapshots_help_history.ambr b/tests/__snapshots__/test_snapshots_help_history.ambr
@@ -69,10 +69,14 @@
   │ *    transcript_id      TEXT  Transcript id. [required]                      │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   ╭─ Options ────────────────────────────────────────────────────────────────────╮
-  │ --output  -o      [text|id|status|utterances|s  Print one field of the       │
-  │                   rt|json]                      result.                      │
-  │ --json    -j                                    Output raw JSON.             │
-  │ --help                                          Show this message and exit.  │
+  │ --output             -o      [text|id|status|uttera  Print one field of the  │
+  │                              nces|srt|vtt|json]      result.                 │
+  │ --chars-per-caption          INTEGER RANGE [x>=1]    Max characters per      │
+  │                                                      caption line (only with │
+  │                                                      -o srt or -o vtt).      │
+  │ --json               -j                              Output raw JSON.        │
+  │ --help                                               Show this message and   │
+  │                                                      exit.                   │
   ╰──────────────────────────────────────────────────────────────────────────────╯
 
    Examples
@@ -82,6 +86,8 @@
    $ assembly transcripts get 5551234-abcd -o utterances
    Save SRT subtitles
    $ assembly transcripts get 5551234-abcd -o srt > captions.srt
+   Save VTT subtitles
+   $ assembly transcripts get 5551234-abcd -o vtt > captions.vtt
    Get the raw JSON
    $ assembly transcripts get 5551234-abcd --json