AssemblyAI · alexkroman · Jun 12, 2026 · Jun 12, 2026
diff --git a/README.md b/README.md
@@ -39,6 +39,14 @@ assembly transcribe "https://podcasts.apple.com/us/podcast/id1516093381" --speak
   | assembly --sandbox speak --out episode.wav
 ```
 
+**Cut the highlight reel from a speech** — `clip` downloads the audio, transcribes it, has an LLM pick the windows, and cuts each one into its own file with ffmpeg (here: Steve Jobs' Stanford commencement address):
+
+```sh
+assembly clip "https://www.youtube.com/watch?v=UF8uR6Z6KLc" \
+  --llm "the most quotable 20-40 seconds from each of the stories" \
+  --padding 0.5 --out-dir .
+```
+
 **Burn karaoke subtitles into a music video** — `-o srt` prints captions to stdout, and `--chars-per-caption` keeps the lines short so they flip with the vocals; ffmpeg renders them onto the video (`-f srt -i pipe:` muxes a toggleable soft-subtitle track instead, no re-encode):
 
 ```sh
@@ -173,7 +181,7 @@ assembly init                  # scaffold a starter app
 - **Real-time streaming**: `assembly stream` transcribes the microphone, a file, or a URL live — on macOS it can capture system audio too.
 - **Voice agent**: `assembly agent` runs a full-duplex spoken conversation in your terminal.
 - **LLM Gateway**: `assembly llm` prompts an LLM over a transcript, stdin, or a live stream (`assembly stream --llm "summarize as I talk"`).
-- **Transcript-driven clipping**: `assembly clip` cuts an audio/video file (or a YouTube/podcast URL) with ffmpeg by diarized speaker (`--speaker A`), text match (`--search "pricing"`), LLM pick (`--llm "the three best moments"`), or explicit time range (`--range 1:30-2:45`) — transcribing on the fly, reusing a finished transcript with `-t ID`, or reading one from a pipe (`assembly transcribe x.mp4 --speaker-labels --json | assembly clip x.mp4 -t - --llm "…"`).
+- **Transcript-driven clipping**: `assembly clip` cuts an audio/video file (or a YouTube/podcast URL) with ffmpeg by diarized speaker (`--speaker A`), text match (`--search "pricing"`), LLM pick (`--llm "the three best moments"`), or explicit time range (`--range 1:30-2:45`) — transcribing on the fly, reusing a finished transcript with `-t ID`, or reading one from a pipe (`assembly transcribe x.mp4 --speaker-labels --json | assembly clip x.mp4 -t - --llm "…"`). Clip boundaries snap into nearby silence (ffmpeg `silencedetect`) so cuts don't land mid-word; `--no-snap` cuts at the exact selected times.
 - **Model evaluation**: `assembly eval` transcribes a Hugging Face dataset (with built-in aliases for common benchmarks: `assembly eval tedlium`) or a local `.csv`/`.jsonl` manifest and scores WER against its references — handy for picking a speech model.
 - **Starter apps**: `assembly init` scaffolds a self-contained FastAPI + HTML app (`audio-transcription`, `live-captions`, `voice-agent`); `assembly dev` runs it, `assembly share` exposes it on a public URL, and `assembly deploy` ships it to Vercel, Railway, or Fly.io.
 - **Webhook testing**: `assembly webhooks listen` opens a public dev URL (cloudflared quick tunnel) that prints webhook deliveries as they arrive and can forward them to your local app with `--forward-to`.

diff --git a/aai_cli/clip_exec.py b/aai_cli/clip_exec.py
@@ -12,8 +12,9 @@
 ``--transcript-id``, or piped on stdin with ``-t -``), ``--llm`` hands the
 timestamped utterances to the LLM Gateway and lets the model pick the windows,
 and ``--range`` adds explicit ones. The selected segments are padded, merged
-where they touch, and each surviving segment is re-encoded into its own file
-with ffmpeg.
+where they touch, snapped into nearby silence (one ffmpeg ``silencedetect``
+pass over the source, skipped with ``--no-snap``), and each surviving segment
+is re-encoded into its own file with ffmpeg.
 """
 
 from __future__ import annotations
@@ -51,6 +52,7 @@ class ClipOptions:
     max_tokens: int
     ranges: list[str]
     padding: float
+    snap: bool
     out_dir: Path | None
 
 
@@ -250,6 +252,37 @@ def _run_ffmpeg(args: list[str]) -> subprocess.CompletedProcess[str]:
     return subprocess.run(args, capture_output=True, text=True, check=False)
 
 
+# Below -30dB (noise) for at least 0.2s (d) reads as a pause in normal speech recordings.
+_SILENCE_FILTER = "silencedetect=noise=-30dB:d=0.2"
+
+
+def _detect_silences(ffmpeg: str, media: Path) -> list[Segment]:
+    """The silence intervals ffmpeg hears in ``media`` (one decode pass).
+
+    Snapping is best-effort: a failed detection returns no silences (so the
+    cut proceeds at the selected times) rather than failing the command.
+    silencedetect logs at info level on stderr, so the usual ``-loglevel
+    error`` would silence the very lines this parses.
+    """
+    result = _run_ffmpeg(
+        [
+            ffmpeg,
+            "-hide_banner",
+            "-nostats",  # keep progress lines out of the stderr parsed below
+            "-i",
+            str(media),  # NOTE(review): "-vn" would skip video decoding — confirm
+            "-af",
+            _SILENCE_FILTER,
+            "-f",
+            "null",  # null muxer: decode for the filter's log, write no output
+            "-",
+        ]
+    )
+    if result.returncode != 0:
+        return []  # ffmpeg failed: report no silences so the caller cuts unsnapped
+    return clip_select.parse_silences(result.stderr)
+
+
 def _cut_clip(ffmpeg: str, media: Path, segment: Segment, dest: Path) -> None:
     """Re-encode one segment of ``media`` into ``dest``.
 
@@ -350,6 +383,10 @@ def _cut_and_emit(
     """Select, cut, and report the clips for an already-local media file."""
     matched, transcript_id = _transcript_segments(opts, media, state, json_mode=json_mode)
     segments = clip_select.merge_segments([*matched, *explicit], opts.padding)
+    if opts.snap:
+        with output.status("Detecting silence…", json_mode=json_mode, quiet=state.quiet):
+            silences = _detect_silences(ffmpeg, media)
+        segments = clip_select.snap_to_silences(segments, silences)
     written: list[WrittenClip] = []
     cutting = f"Cutting {len(segments)} clip(s)…"
     with output.status(cutting, json_mode=json_mode, quiet=state.quiet):

diff --git a/aai_cli/clip_select.py b/aai_cli/clip_select.py
@@ -3,14 +3,16 @@
 Everything here turns user selectors into :class:`Segment` lists — parsing
 ``--range`` values, filtering diarized utterances for ``--speaker``/``--search``,
 rendering the timestamped listing an ``--llm`` model selects from, parsing the
-model's reply, and merging the combined selection. The orchestration (transcript
-fetch, LLM call, ffmpeg) lives in ``clip_exec``.
+model's reply, merging the combined selection, and snapping the merged
+boundaries into detected silence. The orchestration (transcript fetch, LLM
+call, ffmpeg — including the silencedetect pass) lives in ``clip_exec``.
 """
 
 from __future__ import annotations
 
 import json
 import math
+import re
 from dataclasses import dataclass
 
 from aai_cli import jsonshape
@@ -88,6 +90,76 @@ def merge_segments(segments: list[Segment], padding: float) -> list[Segment]:
     return merged
 
 
+# One silencedetect log edge: "silence_start: 12.34", "silence_end: 13.01" (or "2e-05" form).
+_SILENCE_EDGE = re.compile(r"silence_(start|end):\s*(-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)")
+
+SNAP_REACH = 1.5  # how far (seconds) a boundary may move to reach silence
+SNAP_LEAD = 0.25  # silence kept next to the speech once a boundary snaps
+
+
+def parse_silences(detect_log: str) -> list[Segment]:
+    """The silence intervals in ffmpeg ``silencedetect`` output, in order.
+
+    A trailing ``silence_start`` with no matching end runs to end-of-file; a
+    small negative start (decoder priming samples) clamps to 0.
+    """
+    silences: list[Segment] = []
+    pending: float | None = None  # start of a silence whose end is not yet seen
+    for kind, value in _SILENCE_EDGE.findall(detect_log):
+        if kind == "start":
+            pending = max(0.0, float(value))  # a later start replaces an unclosed one
+        elif pending is not None:  # an end with no open start is ignored
+            silences.append(Segment(pending, float(value)))
+            pending = None
+    if pending is not None:
+        silences.append(Segment(pending, math.inf))  # still silent when the log ends
+    return silences
+
+
+def _snap_start(t: float, silences: list[Segment]) -> float:
+    """A clip start moved back into the silence just before its speech.
+
+    A start already inside silence stays put (it honors --padding exactly);
+    one that lands mid-speech moves into the last silence before it, to
+    ``SNAP_LEAD`` before the speech resumes — only ever widening the clip, so
+    selected content is never dropped. With no silence within ``SNAP_REACH``
+    (continuous speech) it stays where asked. ``silences`` must be sorted.
+    """
+    gap = next((s for s in reversed(silences) if s.start <= t), None)
+    if gap is None:
+        return t  # before the first silence ever begins
+    if t <= gap.end:
+        return t  # already in silence
+    if t - gap.end > SNAP_REACH:
+        return t  # continuous speech as far back as snapping may reach
+    return max(gap.start, gap.end - SNAP_LEAD)  # never earlier than the silence itself
+
+
+def _snap_end(t: float, silences: list[Segment]) -> float:
+    """A clip end moved forward into the next silence (mirror of _snap_start)."""
+    gap = next((s for s in silences if s.end >= t), None)
+    if gap is None:
+        return t  # after the last silence ends
+    if gap.start <= t:
+        return t  # already in silence
+    if gap.start - t > SNAP_REACH:
+        return t  # continuous speech as far ahead as snapping may reach
+    return min(gap.end, gap.start + SNAP_LEAD)  # never later than the silence itself
+
+
+def snap_to_silences(segments: list[Segment], silences: list[Segment]) -> list[Segment]:
+    """Segments with any boundary that lands on speech moved into adjacent
+    silence, so cuts don't fall mid-word; re-coalesced afterwards since a
+    start moving back can reach the previous segment's (moved) end."""
+    if not silences:
+        return segments  # nothing to snap to: the selection is returned unchanged
+    ordered = sorted(silences, key=lambda s: s.start)  # the snap helpers need start order
+    snapped = [
+        Segment(_snap_start(seg.start, ordered), _snap_end(seg.end, ordered)) for seg in segments
+    ]
+    return merge_segments(snapped, 0.0)
+
+
 def matching_utterances(
     utterances: list[object], speakers: list[str], search: str | None
 ) -> list[object]:

diff --git a/aai_cli/commands/clip.py b/aai_cli/commands/clip.py
@@ -95,6 +95,12 @@ def clip(
     padding: float = typer.Option(
         0.0, "--padding", min=0.0, help="Seconds of padding to add around each clip."
     ),
+    snap: bool = typer.Option(
+        True,
+        "--snap/--no-snap",
+        help="Snap clip boundaries into nearby silence (detected with ffmpeg) so cuts "
+        "don't land mid-word; --no-snap cuts at the exact selected times.",
+    ),
     out_dir: Path | None = typer.Option(
         None, "--out-dir", help="Directory for the clip files (default: next to the input)."
     ),
@@ -104,10 +110,11 @@ def clip(
 
     --speaker and --search select from a diarized transcript (made on the fly,
     or reused with --transcript-id); --llm has an LLM Gateway model pick the
-    windows; --range adds explicit ones. Overlapping selections merge, and each
-    surviving segment is written as <name>.clipNN<ext> using ffmpeg (which must
-    be installed). A YouTube/media-page source is downloaded first; its clips
-    land in --out-dir or the current directory.
+    windows; --range adds explicit ones. Overlapping selections merge, clip
+    boundaries snap into nearby silence so cuts don't land mid-word (--no-snap
+    disables), and each surviving segment is written as <name>.clipNN<ext>
+    using ffmpeg (which must be installed). A YouTube/media-page source is
+    downloaded first; its clips land in --out-dir or the current directory.
     """
     opts = clip_exec.ClipOptions(
         media=media,
@@ -119,6 +126,7 @@ def clip(
         max_tokens=max_tokens,
         ranges=ranges,
         padding=padding,
+        snap=snap,
         out_dir=out_dir,
     )
     run_command(

diff --git a/aai_cli/skills/aai-cli/references/transcription.md b/aai_cli/skills/aai-cli/references/transcription.md
@@ -125,16 +125,18 @@ fly, or pass `-t/--transcript-id` (an id, or `-` to read an id or
 `transcribe --json` output from stdin). `--llm "instruction"` sends the
 timestamped utterances to LLM Gateway and the model picks the windows.
 `--range START-END` adds explicit windows (seconds or `[HH:]MM:SS`).
-Overlapping selections merge; each surviving segment is written as
-`<name>.clipNN<ext>`.
+Overlapping selections merge, and clip boundaries snap into nearby silence
+(one ffmpeg `silencedetect` pass) so cuts don't land mid-word; each surviving
+segment is written as `<name>.clipNN<ext>`.
 
 High-value flags:
 
 - Selection: `--speaker A` (repeatable), `--search "topic"` (case-insensitive),
   `--llm "the best moments"` (composes with the filters), `--range 1:30-2:45`
   (repeatable).
 - LLM: `--model` (default `claude-haiku-4-5-20251001`), `--max-tokens N`.
-- Shaping: `--padding 0.5` (seconds around each clip), `--out-dir clips/`.
+- Shaping: `--padding 0.5` (seconds around each clip), `--no-snap` (cut at the
+  exact selected times instead of snapping into silence), `--out-dir clips/`.
 - Output: `--json` (paths + start/end/duration of each clip written).
 
 Examples: