diff --git a/README.md b/README.md
index bcbbdbf2..1d99d469 100644
--- a/README.md
+++ b/README.md
@@ -39,6 +39,14 @@ assembly transcribe "https://podcasts.apple.com/us/podcast/id1516093381" --speak
   | assembly --sandbox speak --out episode.wav
 ```
 
+**Cut the highlight reel from a speech** — `clip` downloads the audio, transcribes it, has an LLM pick the windows, and cuts each one into its own file with ffmpeg (here: Steve Jobs' Stanford commencement address):
+
+```sh
+assembly clip "https://www.youtube.com/watch?v=UF8uR6Z6KLc" \
+  --llm "the most quotable 20-40 seconds from each of the stories" \
+  --padding 0.5 --out-dir .
+```
+
 **Burn karaoke subtitles into a music video** — `-o srt` prints captions to stdout, and `--chars-per-caption` keeps the lines short so they flip with the vocals; ffmpeg renders them onto the video (`-f srt -i pipe:` muxes a toggleable soft-subtitle track instead, no re-encode):
 
 ```sh
@@ -173,7 +181,7 @@ assembly init                  # scaffold a starter app
 - **Real-time streaming**: `assembly stream` transcribes the microphone, a file, or a URL live — on macOS it can capture system audio too.
 - **Voice agent**: `assembly agent` runs a full-duplex spoken conversation in your terminal.
 - **LLM Gateway**: `assembly llm` prompts an LLM over a transcript, stdin, or a live stream (`assembly stream --llm "summarize as I talk"`).
-- **Transcript-driven clipping**: `assembly clip` cuts an audio/video file (or a YouTube/podcast URL) with ffmpeg by diarized speaker (`--speaker A`), text match (`--search "pricing"`), LLM pick (`--llm "the three best moments"`), or explicit time range (`--range 1:30-2:45`) — transcribing on the fly, reusing a finished transcript with `-t ID`, or reading one from a pipe (`assembly transcribe x.mp4 --speaker-labels --json | assembly clip x.mp4 -t - --llm "…"`).
+- **Transcript-driven clipping**: `assembly clip` cuts an audio/video file (or a YouTube/podcast URL) with ffmpeg by diarized speaker (`--speaker A`), text match (`--search "pricing"`), LLM pick (`--llm "the three best moments"`), or explicit time range (`--range 1:30-2:45`) — transcribing on the fly, reusing a finished transcript with `-t ID`, or reading one from a pipe (`assembly transcribe x.mp4 --speaker-labels --json | assembly clip x.mp4 -t - --llm "…"`). Clip boundaries snap into nearby silence (ffmpeg `silencedetect`) so cuts don't land mid-word; `--no-snap` cuts at the exact selected times.
 - **Model evaluation**: `assembly eval` transcribes a Hugging Face dataset (with built-in aliases for common benchmarks: `assembly eval tedlium`) or a local `.csv`/`.jsonl` manifest and scores WER against its references — handy for picking a speech model.
 - **Starter apps**: `assembly init` scaffolds a self-contained FastAPI + HTML app (`audio-transcription`, `live-captions`, `voice-agent`); `assembly dev` runs it, `assembly share` exposes it on a public URL, and `assembly deploy` ships it to Vercel, Railway, or Fly.io.
 - **Webhook testing**: `assembly webhooks listen` opens a public dev URL (cloudflared quick tunnel) that prints webhook deliveries as they arrive and can forward them to your local app with `--forward-to`.
diff --git a/aai_cli/clip_exec.py b/aai_cli/clip_exec.py
index 090f2c5d..aae96f48 100644
--- a/aai_cli/clip_exec.py
+++ b/aai_cli/clip_exec.py
@@ -12,8 +12,9 @@
 ``--transcript-id``, or piped on stdin with ``-t -``), ``--llm`` hands the
 timestamped utterances to the LLM Gateway and lets the model pick the windows,
 and ``--range`` adds explicit ones. The selected segments are padded, merged
-where they touch, and each surviving segment is re-encoded into its own file
-with ffmpeg.
+where they touch, snapped into nearby silence (one ffmpeg ``silencedetect``
+pass over the source, skipped with ``--no-snap``), and each surviving segment
+is re-encoded into its own file with ffmpeg.
 """
 
 from __future__ import annotations
@@ -51,6 +52,7 @@ class ClipOptions:
     max_tokens: int
     ranges: list[str]
     padding: float
+    snap: bool
     out_dir: Path | None
 
 
@@ -250,6 +252,37 @@ def _run_ffmpeg(args: list[str]) -> subprocess.CompletedProcess[str]:
     return subprocess.run(args, capture_output=True, text=True, check=False)
 
 
+# -30dB for at least 0.2s reads as a pause in normal speech recordings.
+_SILENCE_FILTER = "silencedetect=noise=-30dB:d=0.2"
+
+
+def _detect_silences(ffmpeg: str, media: Path) -> list[Segment]:
+    """The silence intervals ffmpeg hears in ``media`` (one decode pass).
+
+    Snapping is best-effort: a failed detection returns no silences (so the
+    cut proceeds at the selected times) rather than failing the command.
+    silencedetect logs at info level on stderr, so the usual ``-loglevel
+    error`` would silence the very lines this parses.
+    """
+    result = _run_ffmpeg(
+        [
+            ffmpeg,
+            "-hide_banner",
+            "-nostats",
+            "-i",
+            str(media),
+            "-af",
+            _SILENCE_FILTER,
+            "-f",
+            "null",  # null muxer: the output is discarded, only the stderr log is parsed
+            "-",
+        ]
+    )
+    if result.returncode != 0:
+        return []  # best-effort: with no silences reported, nothing gets snapped
+    return clip_select.parse_silences(result.stderr)
+
+
 def _cut_clip(ffmpeg: str, media: Path, segment: Segment, dest: Path) -> None:
     """Re-encode one segment of ``media`` into ``dest``.
 
@@ -350,6 +383,10 @@ def _cut_and_emit(
     """Select, cut, and report the clips for an already-local media file."""
     matched, transcript_id = _transcript_segments(opts, media, state, json_mode=json_mode)
     segments = clip_select.merge_segments([*matched, *explicit], opts.padding)
+    if opts.snap:
+        with output.status("Detecting silence…", json_mode=json_mode, quiet=state.quiet):
+            silences = _detect_silences(ffmpeg, media)
+        segments = clip_select.snap_to_silences(segments, silences)
     written: list[WrittenClip] = []
     cutting = f"Cutting {len(segments)} clip(s)…"
     with output.status(cutting, json_mode=json_mode, quiet=state.quiet):
diff --git a/aai_cli/clip_select.py b/aai_cli/clip_select.py
index ab1d4b1e..1b4d21bb 100644
--- a/aai_cli/clip_select.py
+++ b/aai_cli/clip_select.py
@@ -3,14 +3,16 @@
 Everything here turns user selectors into :class:`Segment` lists — parsing
 ``--range`` values, filtering diarized utterances for ``--speaker``/``--search``,
 rendering the timestamped listing an ``--llm`` model selects from, parsing the
-model's reply, and merging the combined selection. The orchestration (transcript
-fetch, LLM call, ffmpeg) lives in ``clip_exec``.
+model's reply, merging the combined selection, and snapping the merged
+boundaries into detected silence. The orchestration (transcript fetch, LLM
+call, ffmpeg — including the silencedetect pass) lives in ``clip_exec``.
 """
 
 from __future__ import annotations
 
 import json
 import math
+import re
 from dataclasses import dataclass
 
 from aai_cli import jsonshape
@@ -88,6 +90,76 @@ def merge_segments(segments: list[Segment], padding: float) -> list[Segment]:
     return merged
 
 
+# One silencedetect log edge: "silence_start: 12.34" or "silence_end: 2.3e-05" (%g exponents too).
+_SILENCE_EDGE = re.compile(r"silence_(start|end):\s*(-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)")
+
+SNAP_REACH = 1.5  # how far (seconds) a boundary may move to reach silence
+SNAP_LEAD = 0.25  # silence kept next to the speech once a boundary snaps
+
+
+def parse_silences(detect_log: str) -> list[Segment]:
+    """The silence intervals in ffmpeg ``silencedetect`` output, in order.
+
+    A trailing ``silence_start`` with no matching end runs to end-of-file; a
+    small negative start (decoder priming samples) clamps to 0.
+    """
+    silences: list[Segment] = []
+    pending: float | None = None  # start of a silence still waiting for its end edge
+    for kind, value in _SILENCE_EDGE.findall(detect_log):
+        if kind == "start":
+            pending = max(0.0, float(value))
+        elif pending is not None:  # an end edge with no open start is ignored
+            silences.append(Segment(pending, float(value)))
+            pending = None
+    if pending is not None:
+        silences.append(Segment(pending, math.inf))  # never closed: silent to end-of-file
+    return silences
+
+
+def _snap_start(t: float, silences: list[Segment]) -> float:
+    """A clip start moved back into the silence just before its speech.
+
+    A start already inside silence stays put (it honors --padding exactly);
+    one that lands mid-speech moves into the last silence before it, to
+    ``SNAP_LEAD`` before the speech resumes — only ever widening the clip, so
+    selected content is never dropped. With no silence within ``SNAP_REACH``
+    (continuous speech) it stays where asked. ``silences`` must be sorted.
+    """
+    gap = next((s for s in reversed(silences) if s.start <= t), None)
+    if gap is None:
+        return t  # before the first silence ever begins
+    if t <= gap.end:
+        return t  # already in silence
+    if t - gap.end > SNAP_REACH:
+        return t  # continuous speech as far back as snapping may reach
+    return max(gap.start, gap.end - SNAP_LEAD)  # a silence shorter than SNAP_LEAD caps the move
+
+
+def _snap_end(t: float, silences: list[Segment]) -> float:
+    """A clip end moved forward into the next silence (mirror of _snap_start)."""
+    gap = next((s for s in silences if s.end >= t), None)  # first silence ending at or after t
+    if gap is None:
+        return t  # after the last silence ends
+    if gap.start <= t:
+        return t  # already in silence
+    if gap.start - t > SNAP_REACH:
+        return t  # continuous speech as far ahead as snapping may reach
+    return min(gap.end, gap.start + SNAP_LEAD)  # a silence shorter than SNAP_LEAD caps the move
+
+
+def snap_to_silences(segments: list[Segment], silences: list[Segment]) -> list[Segment]:
+    """Segments with any boundary that lands on speech moved into adjacent
+    silence, so cuts don't fall mid-word; re-coalesced afterwards since a
+    start moving back can reach the previous segment's (moved) end."""
+    if not silences:
+        return segments  # nothing to snap to: keep the selected times untouched
+    ordered = sorted(silences, key=lambda s: s.start)
+    snapped = [
+        Segment(_snap_start(seg.start, ordered), _snap_end(seg.end, ordered)) for seg in segments
+    ]
+    return merge_segments(snapped, 0.0)  # zero padding: re-coalesce without widening further
+
+
 def matching_utterances(
     utterances: list[object], speakers: list[str], search: str | None
 ) -> list[object]:
diff --git a/aai_cli/commands/clip.py b/aai_cli/commands/clip.py
index 7a5b3d52..8f9ba866 100644
--- a/aai_cli/commands/clip.py
+++ b/aai_cli/commands/clip.py
@@ -95,6 +95,12 @@ def clip(
     padding: float = typer.Option(
         0.0, "--padding", min=0.0, help="Seconds of padding to add around each clip."
     ),
+    snap: bool = typer.Option(
+        True,
+        "--snap/--no-snap",
+        help="Snap clip boundaries into nearby silence (detected with ffmpeg) so cuts "
+        "don't land mid-word; --no-snap cuts at the exact selected times.",
+    ),
     out_dir: Path | None = typer.Option(
         None, "--out-dir", help="Directory for the clip files (default: next to the input)."
     ),
@@ -104,10 +110,11 @@ def clip(
 
     --speaker and --search select from a diarized transcript (made on the fly,
     or reused with --transcript-id); --llm has an LLM Gateway model pick the
-    windows; --range adds explicit ones. Overlapping selections merge, and each
-    surviving segment is written as <name>.clipNN<ext> using ffmpeg (which must
-    be installed). A YouTube/media-page source is downloaded first; its clips
-    land in --out-dir or the current directory.
+    windows; --range adds explicit ones. Overlapping selections merge, clip
+    boundaries snap into nearby silence so cuts don't land mid-word (--no-snap
+    disables), and each surviving segment is written as <name>.clipNN<ext>
+    using ffmpeg (which must be installed). A YouTube/media-page source is
+    downloaded first; its clips land in --out-dir or the current directory.
     """
     opts = clip_exec.ClipOptions(
         media=media,
@@ -119,6 +126,7 @@ def clip(
         max_tokens=max_tokens,
         ranges=ranges,
         padding=padding,
+        snap=snap,
         out_dir=out_dir,
     )
     run_command(
diff --git a/aai_cli/skills/aai-cli/references/transcription.md b/aai_cli/skills/aai-cli/references/transcription.md
index 1d05a300..6575d6ef 100644
--- a/aai_cli/skills/aai-cli/references/transcription.md
+++ b/aai_cli/skills/aai-cli/references/transcription.md
@@ -125,8 +125,9 @@ fly, or pass `-t/--transcript-id` (an id, or `-` to read an id or
 `transcribe --json` output from stdin). `--llm "instruction"` sends the
 timestamped utterances to LLM Gateway and the model picks the windows.
 `--range START-END` adds explicit windows (seconds or `[HH:]MM:SS`).
-Overlapping selections merge; each surviving segment is written as
-`<name>.clipNN<ext>`.
+Overlapping selections merge, and clip boundaries snap into nearby silence
+(one ffmpeg `silencedetect` pass) so cuts don't land mid-word; each surviving
+segment is written as `<name>.clipNN<ext>`.
 
 High-value flags:
 
@@ -134,7 +135,8 @@ High-value flags:
   `--llm "the best moments"` (composes with the filters), `--range 1:30-2:45`
   (repeatable).
 - LLM: `--model` (default `claude-haiku-4-5-20251001`), `--max-tokens N`.
-- Shaping: `--padding 0.5` (seconds around each clip), `--out-dir clips/`.
+- Shaping: `--padding 0.5` (seconds around each clip), `--no-snap` (cut at the
+  exact selected times instead of snapping into silence), `--out-dir clips/`.
 - Output: `--json` (paths + start/end/duration of each clip written).
 
 Examples:
diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr
index d4f7c2b9..90848c89 100644
--- a/tests/__snapshots__/test_snapshots_help_run.ambr
+++ b/tests/__snapshots__/test_snapshots_help_run.ambr
@@ -76,10 +76,11 @@
   
    --speaker and --search select from a diarized transcript (made on the fly,
    or reused with --transcript-id); --llm has an LLM Gateway model pick the
-   windows; --range adds explicit ones. Overlapping selections merge, and each
-   surviving segment is written as <name>.clipNN<ext> using ffmpeg (which must
-   be installed). A YouTube/media-page source is downloaded first; its clips
-   land in --out-dir or the current directory.
+   windows; --range adds explicit ones. Overlapping selections merge, clip
+   boundaries snap into nearby silence so cuts don't land mid-word (--no-snap
+   disables), and each surviving segment is written as <name>.clipNN<ext>
+   using ffmpeg (which must be installed). A YouTube/media-page source is
+   downloaded first; its clips land in --out-dir or the current directory.
   
   ╭─ Arguments ──────────────────────────────────────────────────────────────────╮
   │ *    media      TEXT  Audio/video to cut clips from: a local file, or a      │
@@ -87,30 +88,47 @@
   │                       [required]                                             │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   ╭─ Options ────────────────────────────────────────────────────────────────────╮
-  │ --transcript-id  -t      TEXT                  Reuse an existing transcript  │
-  │                                                of this media instead of      │
-  │                                                transcribing it again: an id, │
-  │                                                or '-' to read an id or       │
-  │                                                'transcribe --json' output    │
-  │                                                from stdin.                   │
-  │ --speaker                TEXT                  Keep segments spoken by this  │
-  │                                                diarized speaker label        │
-  │                                                (repeatable, e.g. --speaker   │
-  │                                                A).                           │
-  │ --search                 TEXT                  Keep segments whose text      │
-  │                                                contains this                 │
-  │                                                (case-insensitive).           │
-  │ --range                  TEXT                  Keep an explicit START-END    │
-  │                                                window (seconds or            │
-  │                                                [HH:]MM:SS; repeatable).      │
-  │ --padding                FLOAT RANGE [x>=0.0]  Seconds of padding to add     │
-  │                                                around each clip.             │
-  │                                                [default: 0.0]                │
-  │ --out-dir                PATH                  Directory for the clip files  │
-  │                                                (default: next to the input). │
-  │ --json           -j                            Emit JSON describing the      │
-  │                                                clips written.                │
-  │ --help                                         Show this message and exit.   │
+  │ --transcript-id  -t               TEXT                  Reuse an existing    │
+  │                                                         transcript of this   │
+  │                                                         media instead of     │
+  │                                                         transcribing it      │
+  │                                                         again: an id, or '-' │
+  │                                                         to read an id or     │
+  │                                                         'transcribe --json'  │
+  │                                                         output from stdin.   │
+  │ --speaker                         TEXT                  Keep segments spoken │
+  │                                                         by this diarized     │
+  │                                                         speaker label        │
+  │                                                         (repeatable, e.g.    │
+  │                                                         --speaker A).        │
+  │ --search                          TEXT                  Keep segments whose  │
+  │                                                         text contains this   │
+  │                                                         (case-insensitive).  │
+  │ --range                           TEXT                  Keep an explicit     │
+  │                                                         START-END window     │
+  │                                                         (seconds or          │
+  │                                                         [HH:]MM:SS;          │
+  │                                                         repeatable).         │
+  │ --padding                         FLOAT RANGE [x>=0.0]  Seconds of padding   │
+  │                                                         to add around each   │
+  │                                                         clip.                │
+  │                                                         [default: 0.0]       │
+  │ --snap               --no-snap                          Snap clip boundaries │
+  │                                                         into nearby silence  │
+  │                                                         (detected with       │
+  │                                                         ffmpeg) so cuts      │
+  │                                                         don't land mid-word; │
+  │                                                         --no-snap cuts at    │
+  │                                                         the exact selected   │
+  │                                                         times.               │
+  │                                                         [default: snap]      │
+  │ --out-dir                         PATH                  Directory for the    │
+  │                                                         clip files (default: │
+  │                                                         next to the input).  │
+  │ --json           -j                                     Emit JSON describing │
+  │                                                         the clips written.   │
+  │ --help                                                  Show this message    │
+  │                                                         and exit.            │
   ╰──────────────────────────────────────────────────────────────────────────────╯
   ╭─ LLM Transform ──────────────────────────────────────────────────────────────╮
   │ --llm               TEXT     Let an LLM Gateway model pick the windows to    │
diff --git a/tests/_clip_helpers.py b/tests/_clip_helpers.py
index 8dcdba6f..86e969fe 100644
--- a/tests/_clip_helpers.py
+++ b/tests/_clip_helpers.py
@@ -30,6 +30,7 @@
     max_tokens=llm.DEFAULT_MAX_TOKENS,
     ranges=[],
     padding=0.0,
+    snap=True,
     out_dir=None,
 )
 
@@ -54,14 +55,20 @@ def fake_transcript(utterances):
     return SimpleNamespace(id="tr_123", utterances=utterances)
 
 
-def record_ffmpeg(monkeypatch: pytest.MonkeyPatch) -> list[list[str]]:
-    """Resolve ffmpeg and record every invocation, succeeding with no output."""
+def record_ffmpeg(monkeypatch: pytest.MonkeyPatch, detect_log: str = "") -> list[list[str]]:
+    """Resolve ffmpeg and record every invocation, succeeding with no output.
+
+    With snapping on (the default) the first recorded call is the silencedetect
+    pass; ``detect_log`` is what it reports on stderr (empty: no silences, so
+    snapping is a no-op and cut times stay exactly as selected).
+    """
     monkeypatch.setattr("shutil.which", lambda name: f"/usr/bin/{name}")
     calls: list[list[str]] = []
 
     def run(args: list[str]) -> subprocess.CompletedProcess[str]:
         calls.append(args)
-        return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="")
+        stderr = detect_log if "-af" in args else ""
+        return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr=stderr)
 
     monkeypatch.setattr(clip_exec, "_run_ffmpeg", run)
     return calls
diff --git a/tests/test_clip_command.py b/tests/test_clip_command.py
index 9c7d0454..9cfafe0c 100644
--- a/tests/test_clip_command.py
+++ b/tests/test_clip_command.py
@@ -60,6 +60,7 @@ def test_clip_parses_every_flag_into_options(monkeypatch, tmp_path):
             "1:30-2:00",
             "--padding",
             "0.5",
+            "--no-snap",
             "--out-dir",
             str(tmp_path),
             "--json",
@@ -76,6 +77,7 @@ def test_clip_parses_every_flag_into_options(monkeypatch, tmp_path):
         max_tokens=64,
         ranges=["5-10", "1:30-2:00"],
         padding=0.5,
+        snap=False,
         out_dir=tmp_path,
     )
     assert captured["json_mode"] is True
@@ -95,6 +97,7 @@ def test_clip_defaults_when_only_media_is_given(monkeypatch):
         max_tokens=llm.DEFAULT_MAX_TOKENS,
         ranges=[],
         padding=0.0,
+        snap=True,
         out_dir=None,
     )
     assert captured["json_mode"] is False
@@ -138,7 +141,8 @@ def fake_run(args: list[str]) -> subprocess.CompletedProcess[str]:
     monkeypatch.setattr(clip_exec, "_run_ffmpeg", fake_run)
     result = runner.invoke(app, ["clip", str(media), "--range", "1-2", "--json"])
     assert result.exit_code == 0, result.output
-    assert calls[0][-1] == str(tmp_path / "talk.clip01.mp3")
+    # calls[0] is the silencedetect pass; calls[1] the cut.
+    assert calls[1][-1] == str(tmp_path / "talk.clip01.mp3")
     payload = json.loads(result.output.strip().splitlines()[-1])
     assert payload["clips"][0]["duration"] == 1.0
 
diff --git a/tests/test_clip_exec.py b/tests/test_clip_exec.py
index 0e399495..2f365071 100644
--- a/tests/test_clip_exec.py
+++ b/tests/test_clip_exec.py
@@ -138,6 +138,18 @@ def test_run_clip_range_only_cuts_and_emits_json(media, fake_ffmpeg, capsys):
     clip_exec.run_clip(opts, AppState(), json_mode=True)
     dest = media.parent / "meeting.clip01.mp4"
     assert fake_ffmpeg == [
+        [
+            "/usr/bin/ffmpeg",
+            "-hide_banner",
+            "-nostats",
+            "-i",
+            str(media),
+            "-af",
+            "silencedetect=noise=-30dB:d=0.2",
+            "-f",
+            "null",
+            "-",
+        ],
         [
             "/usr/bin/ffmpeg",
             "-hide_banner",
@@ -151,7 +163,7 @@ def test_run_clip_range_only_cuts_and_emits_json(media, fake_ffmpeg, capsys):
             "-to",
             "12.500",
             str(dest),
-        ]
+        ],
     ]
     payload = json.loads(capsys.readouterr().out)
     assert payload == {
@@ -182,7 +194,7 @@ def test_run_clip_human_mode_prints_one_line_per_clip(tmp_path, fake_ffmpeg, cap
 def test_run_clip_applies_padding_to_explicit_ranges(media, fake_ffmpeg, capsys):
     opts = dataclasses.replace(DEFAULTS, media=str(media), ranges=["5-10"], padding=1.0)
     clip_exec.run_clip(opts, AppState(), json_mode=True)
-    assert fake_ffmpeg[0][7:11] == ["-ss", "4.000", "-to", "11.000"]
+    assert fake_ffmpeg[1][7:11] == ["-ss", "4.000", "-to", "11.000"]
     clips = json.loads(capsys.readouterr().out)["clips"]
     assert (clips[0]["start"], clips[0]["end"]) == (4.0, 11.0)
 
@@ -207,10 +219,63 @@ def test_run_clip_honors_out_dir(media, tmp_path, fake_ffmpeg, capsys):
     opts = dataclasses.replace(DEFAULTS, media=str(media), ranges=["1-2"], out_dir=out_dir)
     clip_exec.run_clip(opts, AppState(), json_mode=True)
     dest = out_dir / "meeting.clip01.mp4"
-    assert fake_ffmpeg[0][-1] == str(dest)
+    assert fake_ffmpeg[1][-1] == str(dest)
     assert json.loads(capsys.readouterr().out)["clips"][0]["path"] == str(dest)
 
 
+# --- silence snapping ---------------------------------------------------------
+
+DETECT_LOG = (
+    "[silencedetect @ 0x1] silence_start: 4\n"
+    "[silencedetect @ 0x1] silence_end: 4.6 | silence_duration: 0.6\n"
+    "[silencedetect @ 0x1] silence_start: 13\n"
+    "[silencedetect @ 0x1] silence_end: 14 | silence_duration: 1.0\n"
+)
+
+
+def test_run_clip_snaps_boundaries_into_detected_silence(media, capsys, monkeypatch):
+    # Both 5.0 and 12.5 land on speech: the start snaps back into the 4.0-4.6
+    # silence (0.25 before speech resumes at 4.6), the end snaps forward into
+    # the 13.0-14.0 silence (0.25 past where speech stops at 13.0).
+    calls = record_ffmpeg(monkeypatch, detect_log=DETECT_LOG)
+    opts = dataclasses.replace(DEFAULTS, media=str(media), ranges=["5-12.5"])
+    clip_exec.run_clip(opts, AppState(), json_mode=True)
+    assert calls[1][7:11] == ["-ss", "4.350", "-to", "13.250"]
+    clips = json.loads(capsys.readouterr().out)["clips"]
+    assert (clips[0]["start"], clips[0]["end"]) == (4.35, 13.25)
+
+
+def test_run_clip_failed_silence_detection_cuts_at_selected_times(media, capsys, monkeypatch):
+    # Snapping is best-effort: a broken silencedetect pass must not fail (or
+    # shift) the cut.
+    monkeypatch.setattr("shutil.which", lambda name: "/usr/bin/ffmpeg")
+
+    def run(args: list[str]) -> subprocess.CompletedProcess[str]:
+        if "-af" in args:
+            return subprocess.CompletedProcess(
+                args=args, returncode=1, stdout="", stderr=DETECT_LOG
+            )
+        return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="")
+
+    monkeypatch.setattr(clip_exec, "_run_ffmpeg", run)
+    opts = dataclasses.replace(DEFAULTS, media=str(media), ranges=["5-12.5"])
+    clip_exec.run_clip(opts, AppState(), json_mode=True)
+    clips = json.loads(capsys.readouterr().out)["clips"]
+    assert (clips[0]["start"], clips[0]["end"]) == (5.0, 12.5)
+
+
+def test_run_clip_no_snap_skips_detection(media, capsys, monkeypatch):
+    calls = record_ffmpeg(monkeypatch, detect_log=DETECT_LOG)
+    opts = dataclasses.replace(DEFAULTS, media=str(media), ranges=["5-12.5"], snap=False)
+    clip_exec.run_clip(opts, AppState(), json_mode=True)
+    # Exactly one ffmpeg call: the cut, at the exact selected times.
+    assert len(calls) == 1
+    assert "-af" not in calls[0]
+    assert calls[0][7:11] == ["-ss", "5.000", "-to", "12.500"]
+    clips = json.loads(capsys.readouterr().out)["clips"]
+    assert (clips[0]["start"], clips[0]["end"]) == (5.0, 12.5)
+
+
 def test_run_clip_surfaces_ffmpeg_failure(media, monkeypatch):
     monkeypatch.setattr("shutil.which", lambda name: "/usr/bin/ffmpeg")
 
@@ -282,8 +347,8 @@ def fake_transcribe(api_key, audio, *, config):
     assert payload["transcript_id"] == "tr_123"
     # Speaker A's two utterances: 1.5-2.5s and 5-6s.
     assert [(c["start"], c["end"]) for c in payload["clips"]] == [(1.5, 2.5), (5.0, 6.0)]
-    assert fake_ffmpeg[0][-1] == str(media.parent / "meeting.clip01.mp4")
-    assert fake_ffmpeg[1][-1] == str(media.parent / "meeting.clip02.mp4")
+    assert fake_ffmpeg[1][-1] == str(media.parent / "meeting.clip01.mp4")
+    assert fake_ffmpeg[2][-1] == str(media.parent / "meeting.clip02.mp4")
 
 
 def test_run_clip_reuses_transcript_by_id(media, fake_ffmpeg, capsys, monkeypatch):
@@ -359,4 +424,8 @@ def fake_status(message, *, json_mode, quiet):
     monkeypatch.setattr(clip_exec.output, "status", fake_status)
     opts = dataclasses.replace(DEFAULTS, media=str(media), speakers=["A"])
     clip_exec.run_clip(opts, AppState(), json_mode=False)
-    assert messages == ["Transcribing for clip selection…", "Cutting 2 clip(s)…"]
+    assert messages == [
+        "Transcribing for clip selection…",
+        "Detecting silence…",
+        "Cutting 2 clip(s)…",
+    ]
diff --git a/tests/test_clip_select.py b/tests/test_clip_select.py
index 7754ec7b..55e9ffc1 100644
--- a/tests/test_clip_select.py
+++ b/tests/test_clip_select.py
@@ -1,9 +1,11 @@
 """Tests for the pure clip selection logic (aai_cli/clip_select.py): --range
 parsing, segment merging, utterance filtering, the LLM listing/reply contract,
-and clock formatting."""
+silencedetect parsing, boundary snapping, and clock formatting."""
 
 from __future__ import annotations
 
+import math
+
 import pytest
 
 from aai_cli import clip_select
@@ -193,6 +195,120 @@ def test_segment_is_immutable():
         setattr(segment, field_name, 5.0)
 
 
+# --- silence detection & snapping ---------------------------------------------
+
+
+def test_parse_silences_pairs_start_end_edges_in_order():
+    log = (
+        "Stream mapping: ...\n"  # unrelated ffmpeg output: yields no segment
+        "[silencedetect @ 0x6000] silence_start: 4\n"
+        "[silencedetect @ 0x6000] silence_end: 4.6 | silence_duration: 0.6\n"
+        "[silencedetect @ 0x6000] silence_start: 13.25\n"
+        "[silencedetect @ 0x6000] silence_end: 14 | silence_duration: 0.75\n"
+    )
+    assert clip_select.parse_silences(log) == [Segment(4.0, 4.6), Segment(13.25, 14.0)]
+
+
+def test_parse_silences_trailing_start_runs_to_end_of_file():  # no silence_end -> math.inf
+    assert clip_select.parse_silences("silence_start: 7.5\n") == [Segment(7.5, math.inf)]
+
+
+def test_parse_silences_clamps_negative_start_to_zero():
+    # ffmpeg can report a small negative start (decoder priming samples); expect 0.0.
+    log = "silence_start: -0.011\nsilence_end: 1.5 | silence_duration: 1.511\n"
+    assert clip_select.parse_silences(log) == [Segment(0.0, 1.5)]
+
+
+def test_parse_silences_ignores_an_unpaired_end():
+    log = "silence_start: 1\nsilence_end: 2\nsilence_end: 3\n"  # second end has no start
+    assert clip_select.parse_silences(log) == [Segment(1.0, 2.0)]
+
+
+def test_parse_silences_empty_log_finds_nothing():  # "" in, [] out
+    assert clip_select.parse_silences("") == []
+
+
+SILENCES = [Segment(4.0, 4.6), Segment(13.0, 14.0)]  # shared by the snap tests below
+
+
+def test_snap_moves_speech_boundaries_into_adjacent_silence():
+    # 5.0 and 12.5 both fall in speech. The start backs up into the 4.0-4.6
+    # silence, landing SNAP_LEAD before speech resumes (4.6 -> 4.35); the end
+    # runs forward SNAP_LEAD past where speech stops (13.0 -> 13.25).
+    snapped = clip_select.snap_to_silences([Segment(5.0, 12.5)], SILENCES)
+    assert snapped == [Segment(4.35, 13.25)]
+
+
+def test_snap_clamps_inside_a_narrow_silence():
+    # 0.1 s silences are narrower than SNAP_LEAD: boundaries clamp to the silence's far edge.
+    silences = [Segment(4.5, 4.6), Segment(13.0, 13.1)]
+    snapped = clip_select.snap_to_silences([Segment(5.0, 12.5)], silences)
+    assert snapped == [Segment(4.5, 13.1)]
+
+
+def test_snap_leaves_boundaries_already_in_silence_alone():
+    # Boundaries already in silence (4.2 mid-silence; 13.0, 4.6 and 14.0 exactly
+    # on a silence edge) are left alone, honoring the selection (and --padding).
+    snapped = clip_select.snap_to_silences([Segment(4.2, 13.0)], SILENCES)
+    assert snapped == [Segment(4.2, 13.0)]
+    snapped = clip_select.snap_to_silences([Segment(4.6, 14.0)], SILENCES)
+    assert snapped == [Segment(4.6, 14.0)]
+
+
+def test_snap_prefers_the_silence_a_boundary_touches_over_a_nearby_one():
+    # The start sits exactly on one silence's start and the end on its end, so
+    # neither is pulled toward the neighbouring silences 0.2 s away on each side.
+    silences = [Segment(3.2, 4.4), Segment(4.6, 5.0), Segment(5.2, 6.0)]
+    snapped = clip_select.snap_to_silences([Segment(4.6, 5.0)], silences)
+    assert snapped == [Segment(4.6, 5.0)]
+
+
+@pytest.mark.parametrize(
+    ("segment", "expected"),
+    [
+        # Gaps of 1.5 s to the silences on both sides, exactly SNAP_REACH: still snaps.
+        (Segment(5.0, 12.5), Segment(3.25, 14.25)),
+        # Gaps of 1.6 s, just beyond SNAP_REACH on both sides: continuous speech, stays put.
+        (Segment(5.1, 12.4), Segment(5.1, 12.4)),
+    ],
+)
+def test_snap_reach_bounds_how_far_a_boundary_moves(segment, expected):
+    silences = [Segment(1.0, 3.5), Segment(14.0, 15.0)]
+    assert clip_select.snap_to_silences([segment], silences) == [expected]
+
+
+def test_snap_boundaries_beyond_all_silences_stay_put():  # no silence before 0.5 or after 20.0
+    snapped = clip_select.snap_to_silences([Segment(0.5, 20.0)], SILENCES)
+    assert snapped == [Segment(0.5, 20.0)]
+
+
+def test_snap_end_into_a_trailing_silence_that_runs_to_end_of_file():
+    silences = [Segment(10.0, math.inf)]  # open-ended: runs to end of file
+    snapped = clip_select.snap_to_silences([Segment(5.0, 9.9)], silences)
+    assert snapped == [Segment(5.0, 10.25)]
+
+
+def test_snap_sorts_the_silences_it_is_given():
+    silences = [Segment(13.0, 14.0), Segment(4.0, 4.6)]  # deliberately out of order
+    snapped = clip_select.snap_to_silences([Segment(14.2, 20.0)], silences)
+    assert snapped == [Segment(13.75, 20.0)]
+
+
+def test_snap_merges_segments_that_meet_inside_one_silence():
+    # The first clip's end and the second's start both snap into the one 4.6-5.0
+    # silence and cross over, so the clips merge into one, not repeating the pause.
+    silences = [Segment(4.6, 5.0)]
+    snapped = clip_select.snap_to_silences([Segment(2.0, 4.5), Segment(5.2, 7.0)], silences)
+    assert snapped == [Segment(2.0, 7.0)]
+
+
+def test_snap_without_silences_changes_nothing():
+    # No detected silences (or a failed detection): the segments come back
+    # untouched, and the overlapping 5.0-6.0 / 5.5-7.0 pair is not re-merged.
+    segments = [Segment(5.0, 6.0), Segment(5.5, 7.0)]
+    assert clip_select.snap_to_silences(segments, []) == segments
+
+
 # --- clock formatting --------------------------------------------------------
 
 
diff --git a/tests/test_clip_sources.py b/tests/test_clip_sources.py
index 2601ae2e..9db83d01 100644
--- a/tests/test_clip_sources.py
+++ b/tests/test_clip_sources.py
@@ -60,9 +60,9 @@ def test_run_clip_downloads_youtube_audio_into_cwd(
     assert fake_download["url"] == YT_URL
     # ffmpeg reads the downloaded temp file; the clip lands in the cwd, named
     # after the download (the temp dir is gone after the run).
-    assert fake_ffmpeg[0][6] == str(fake_download["path"])
+    assert fake_ffmpeg[1][6] == str(fake_download["path"])
     dest = tmp_path / "vid123.clip01.m4a"
-    assert fake_ffmpeg[0][-1] == str(dest)
+    assert fake_ffmpeg[1][-1] == str(dest)
     payload = json.loads(capsys.readouterr().out)
     assert payload["source"] == YT_URL
     assert payload["clips"][0]["path"] == str(dest)
@@ -73,7 +73,7 @@ def test_run_clip_youtube_honors_out_dir(tmp_path, fake_ffmpeg, fake_download, c
     out_dir.mkdir()
     opts = dataclasses.replace(DEFAULTS, media=YT_URL, ranges=["1-2"], out_dir=out_dir)
     clip_exec.run_clip(opts, AppState(), json_mode=True)
-    assert fake_ffmpeg[0][-1] == str(out_dir / "vid123.clip01.m4a")
+    assert fake_ffmpeg[1][-1] == str(out_dir / "vid123.clip01.m4a")
 
 
 def test_run_clip_youtube_transcribes_the_downloaded_file(
@@ -112,7 +112,7 @@ def fake_status(message, *, json_mode, quiet):
     monkeypatch.setattr(clip_exec.output, "status", fake_status)
     opts = dataclasses.replace(DEFAULTS, media=YT_URL, ranges=["1-2"])
     clip_exec.run_clip(opts, AppState(), json_mode=False)
-    assert messages == ["Downloading audio…", "Cutting 1 clip(s)…"]
+    assert messages == ["Downloading audio…", "Detecting silence…", "Cutting 1 clip(s)…"]
 
 
 # --- transcript piped on stdin (-t -) -------------------------------------------
@@ -216,7 +216,7 @@ def fake_transform(api_key, *, prompt, transcript_text, model, max_tokens):
     payload = json.loads(capsys.readouterr().out)
     assert payload["transcript_id"] == "tr_123"
     assert [(c["start"], c["end"]) for c in payload["clips"]] == [(1.5, 4.0)]
-    assert fake_ffmpeg[0][7:11] == ["-ss", "1.500", "-to", "4.000"]
+    assert fake_ffmpeg[1][7:11] == ["-ss", "1.500", "-to", "4.000"]
 
 
 def test_run_clip_llm_composes_with_speaker_filter(media, fake_ffmpeg, capsys, monkeypatch):
@@ -290,5 +290,6 @@ def fake_status(message, *, json_mode, quiet):
     assert messages == [
         "Transcribing for clip selection…",
         "Selecting segments with gpt-5…",
+        "Detecting silence…",
         "Cutting 1 clip(s)…",
     ]