diff --git a/README.md b/README.md index bcbbdbf2..1d99d469 100644 --- a/README.md +++ b/README.md @@ -39,6 +39,14 @@ assembly transcribe "https://podcasts.apple.com/us/podcast/id1516093381" --speak | assembly --sandbox speak --out episode.wav ``` +**Cut the highlight reel from a speech** — `clip` downloads the audio, transcribes it, has an LLM pick the windows, and cuts each one into its own file with ffmpeg (here: Steve Jobs' Stanford commencement address): + +```sh +assembly clip "https://www.youtube.com/watch?v=UF8uR6Z6KLc" \ + --llm "the most quotable 20-40 seconds from each of the stories" \ + --padding 0.5 --out-dir . +``` + **Burn karaoke subtitles into a music video** — `-o srt` prints captions to stdout, and `--chars-per-caption` keeps the lines short so they flip with the vocals; ffmpeg renders them onto the video (`-f srt -i pipe:` muxes a toggleable soft-subtitle track instead, no re-encode): ```sh @@ -173,7 +181,7 @@ assembly init # scaffold a starter app - **Real-time streaming**: `assembly stream` transcribes the microphone, a file, or a URL live — on macOS it can capture system audio too. - **Voice agent**: `assembly agent` runs a full-duplex spoken conversation in your terminal. - **LLM Gateway**: `assembly llm` prompts an LLM over a transcript, stdin, or a live stream (`assembly stream --llm "summarize as I talk"`). -- **Transcript-driven clipping**: `assembly clip` cuts an audio/video file (or a YouTube/podcast URL) with ffmpeg by diarized speaker (`--speaker A`), text match (`--search "pricing"`), LLM pick (`--llm "the three best moments"`), or explicit time range (`--range 1:30-2:45`) — transcribing on the fly, reusing a finished transcript with `-t ID`, or reading one from a pipe (`assembly transcribe x.mp4 --speaker-labels --json | assembly clip x.mp4 -t - --llm "…"`). 
+- **Transcript-driven clipping**: `assembly clip` cuts an audio/video file (or a YouTube/podcast URL) with ffmpeg by diarized speaker (`--speaker A`), text match (`--search "pricing"`), LLM pick (`--llm "the three best moments"`), or explicit time range (`--range 1:30-2:45`) — transcribing on the fly, reusing a finished transcript with `-t ID`, or reading one from a pipe (`assembly transcribe x.mp4 --speaker-labels --json | assembly clip x.mp4 -t - --llm "…"`). Clip boundaries snap into nearby silence (ffmpeg `silencedetect`) so cuts don't land mid-word; `--no-snap` cuts at the exact selected times. - **Model evaluation**: `assembly eval` transcribes a Hugging Face dataset (with built-in aliases for common benchmarks: `assembly eval tedlium`) or a local `.csv`/`.jsonl` manifest and scores WER against its references — handy for picking a speech model. - **Starter apps**: `assembly init` scaffolds a self-contained FastAPI + HTML app (`audio-transcription`, `live-captions`, `voice-agent`); `assembly dev` runs it, `assembly share` exposes it on a public URL, and `assembly deploy` ships it to Vercel, Railway, or Fly.io. - **Webhook testing**: `assembly webhooks listen` opens a public dev URL (cloudflared quick tunnel) that prints webhook deliveries as they arrive and can forward them to your local app with `--forward-to`. diff --git a/aai_cli/clip_exec.py b/aai_cli/clip_exec.py index 090f2c5d..aae96f48 100644 --- a/aai_cli/clip_exec.py +++ b/aai_cli/clip_exec.py @@ -12,8 +12,9 @@ ``--transcript-id``, or piped on stdin with ``-t -``), ``--llm`` hands the timestamped utterances to the LLM Gateway and lets the model pick the windows, and ``--range`` adds explicit ones. The selected segments are padded, merged -where they touch, and each surviving segment is re-encoded into its own file -with ffmpeg. 
+where they touch, snapped into nearby silence (one ffmpeg ``silencedetect`` +pass over the source, skipped with ``--no-snap``), and each surviving segment +is re-encoded into its own file with ffmpeg. """ from __future__ import annotations @@ -51,6 +52,7 @@ class ClipOptions: max_tokens: int ranges: list[str] padding: float + snap: bool out_dir: Path | None @@ -250,6 +252,37 @@ def _run_ffmpeg(args: list[str]) -> subprocess.CompletedProcess[str]: return subprocess.run(args, capture_output=True, text=True, check=False) +# -30dB for at least 0.2s reads as a pause in normal speech recordings. +_SILENCE_FILTER = "silencedetect=noise=-30dB:d=0.2" + + +def _detect_silences(ffmpeg: str, media: Path) -> list[Segment]: + """The silence intervals ffmpeg hears in ``media`` (one decode pass). + + Snapping is best-effort: a failed detection returns no silences (so the + cut proceeds at the selected times) rather than failing the command. + silencedetect logs at info level on stderr, so the usual ``-loglevel + error`` would silence the very lines this parses. + """ + result = _run_ffmpeg( + [ + ffmpeg, + "-hide_banner", + "-nostats", + "-i", + str(media), + "-af", + _SILENCE_FILTER, + "-f", + "null", + "-", + ] + ) + if result.returncode != 0: + return [] + return clip_select.parse_silences(result.stderr) + + def _cut_clip(ffmpeg: str, media: Path, segment: Segment, dest: Path) -> None: """Re-encode one segment of ``media`` into ``dest``. 
@@ -350,6 +383,10 @@ def _cut_and_emit( """Select, cut, and report the clips for an already-local media file.""" matched, transcript_id = _transcript_segments(opts, media, state, json_mode=json_mode) segments = clip_select.merge_segments([*matched, *explicit], opts.padding) + if opts.snap: + with output.status("Detecting silence…", json_mode=json_mode, quiet=state.quiet): + silences = _detect_silences(ffmpeg, media) + segments = clip_select.snap_to_silences(segments, silences) written: list[WrittenClip] = [] cutting = f"Cutting {len(segments)} clip(s)…" with output.status(cutting, json_mode=json_mode, quiet=state.quiet): diff --git a/aai_cli/clip_select.py b/aai_cli/clip_select.py index ab1d4b1e..1b4d21bb 100644 --- a/aai_cli/clip_select.py +++ b/aai_cli/clip_select.py @@ -3,14 +3,16 @@ Everything here turns user selectors into :class:`Segment` lists — parsing ``--range`` values, filtering diarized utterances for ``--speaker``/``--search``, rendering the timestamped listing an ``--llm`` model selects from, parsing the -model's reply, and merging the combined selection. The orchestration (transcript -fetch, LLM call, ffmpeg) lives in ``clip_exec``. +model's reply, merging the combined selection, and snapping the merged +boundaries into detected silence. The orchestration (transcript fetch, LLM +call, ffmpeg — including the silencedetect pass) lives in ``clip_exec``. """ from __future__ import annotations import json import math +import re from dataclasses import dataclass from aai_cli import jsonshape @@ -88,6 +90,76 @@ def merge_segments(segments: list[Segment], padding: float) -> list[Segment]: return merged +# One silencedetect log edge: "silence_start: 12.34" or "silence_end: 13.01". 
+_SILENCE_EDGE = re.compile(r"silence_(start|end):\s*(-?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?)")
+
+SNAP_REACH = 1.5  # how far (seconds) a boundary may move to reach silence
+SNAP_LEAD = 0.25  # silence kept next to the speech once a boundary snaps
+
+
+def parse_silences(detect_log: str) -> list[Segment]:
+    """The silence intervals in ffmpeg ``silencedetect`` output, in order.
+
+    A trailing ``silence_start`` with no matching end runs to end-of-file; a small negative
+    start (decoder priming samples) clamps to 0. Exponent forms (``2.5e-05``) parse whole.
+    """
+    silences: list[Segment] = []
+    pending: float | None = None
+    for kind, value in _SILENCE_EDGE.findall(detect_log):
+        if kind == "start":
+            pending = max(0.0, float(value))
+        elif pending is not None:
+            silences.append(Segment(pending, float(value)))
+            pending = None
+    if pending is not None:
+        silences.append(Segment(pending, math.inf))
+    return silences
+
+
+def _snap_start(t: float, silences: list[Segment]) -> float:
+    """A clip start moved back into the silence just before its speech.
+
+    A start already inside silence stays put (it honors --padding exactly);
+    one that lands mid-speech moves into the last silence before it, to
+    ``SNAP_LEAD`` before the speech resumes — only ever widening the clip, so
+    selected content is never dropped. With no silence within ``SNAP_REACH``
+    (continuous speech) it stays where asked. ``silences`` must be sorted.
+    """
+    gap = next((s for s in reversed(silences) if s.start <= t), None)
+    if gap is None:
+        return t  # before the first silence ever begins
+    if t <= gap.end:
+        return t  # already in silence
+    if t - gap.end > SNAP_REACH:
+        return t  # continuous speech as far back as snapping may reach
+    return max(gap.start, gap.end - SNAP_LEAD)
+
+
+def _snap_end(t: float, silences: list[Segment]) -> float:
+    """A clip end moved forward into the next silence (mirror of _snap_start)."""
+    gap = next((s for s in silences if s.end >= t), None)
+    if gap is None:
+        return t  # after the last silence ends
+    if gap.start <= t:
+        return t  # already in silence
+    if gap.start - t > SNAP_REACH:
+        return t
+    return min(gap.end, gap.start + SNAP_LEAD)
+
+
+def snap_to_silences(segments: list[Segment], silences: list[Segment]) -> list[Segment]:
+    """Segments with any boundary that lands on speech moved into adjacent
+    silence, so cuts don't fall mid-word; re-coalesced afterwards since a
+    start moving back can reach the previous segment's (moved) end."""
+    if not silences:
+        return segments
+    ordered = sorted(silences, key=lambda s: s.start)
+    snapped = [
+        Segment(_snap_start(seg.start, ordered), _snap_end(seg.end, ordered)) for seg in segments
+    ]
+    return merge_segments(snapped, 0.0)
+
+
 def matching_utterances(
     utterances: list[object], speakers: list[str], search: str | None
 ) -> list[object]:
diff --git a/aai_cli/commands/clip.py b/aai_cli/commands/clip.py
index 7a5b3d52..8f9ba866 100644
--- a/aai_cli/commands/clip.py
+++ b/aai_cli/commands/clip.py
@@ -95,6 +95,12 @@ def clip(
     padding: float = typer.Option(
         0.0, "--padding", min=0.0, help="Seconds of padding to add around each clip."
), + snap: bool = typer.Option( + True, + "--snap/--no-snap", + help="Snap clip boundaries into nearby silence (detected with ffmpeg) so cuts " + "don't land mid-word; --no-snap cuts at the exact selected times.", + ), out_dir: Path | None = typer.Option( None, "--out-dir", help="Directory for the clip files (default: next to the input)." ), @@ -104,10 +110,11 @@ def clip( --speaker and --search select from a diarized transcript (made on the fly, or reused with --transcript-id); --llm has an LLM Gateway model pick the - windows; --range adds explicit ones. Overlapping selections merge, and each - surviving segment is written as .clipNN using ffmpeg (which must - be installed). A YouTube/media-page source is downloaded first; its clips - land in --out-dir or the current directory. + windows; --range adds explicit ones. Overlapping selections merge, clip + boundaries snap into nearby silence so cuts don't land mid-word (--no-snap + disables), and each surviving segment is written as .clipNN + using ffmpeg (which must be installed). A YouTube/media-page source is + downloaded first; its clips land in --out-dir or the current directory. """ opts = clip_exec.ClipOptions( media=media, @@ -119,6 +126,7 @@ def clip( max_tokens=max_tokens, ranges=ranges, padding=padding, + snap=snap, out_dir=out_dir, ) run_command( diff --git a/aai_cli/skills/aai-cli/references/transcription.md b/aai_cli/skills/aai-cli/references/transcription.md index 1d05a300..6575d6ef 100644 --- a/aai_cli/skills/aai-cli/references/transcription.md +++ b/aai_cli/skills/aai-cli/references/transcription.md @@ -125,8 +125,9 @@ fly, or pass `-t/--transcript-id` (an id, or `-` to read an id or `transcribe --json` output from stdin). `--llm "instruction"` sends the timestamped utterances to LLM Gateway and the model picks the windows. `--range START-END` adds explicit windows (seconds or `[HH:]MM:SS`). -Overlapping selections merge; each surviving segment is written as -`.clipNN`. 
+Overlapping selections merge, and clip boundaries snap into nearby silence +(one ffmpeg `silencedetect` pass) so cuts don't land mid-word; each surviving +segment is written as `.clipNN`. High-value flags: @@ -134,7 +135,8 @@ High-value flags: `--llm "the best moments"` (composes with the filters), `--range 1:30-2:45` (repeatable). - LLM: `--model` (default `claude-haiku-4-5-20251001`), `--max-tokens N`. -- Shaping: `--padding 0.5` (seconds around each clip), `--out-dir clips/`. +- Shaping: `--padding 0.5` (seconds around each clip), `--no-snap` (cut at the + exact selected times instead of snapping into silence), `--out-dir clips/`. - Output: `--json` (paths + start/end/duration of each clip written). Examples: diff --git a/tests/__snapshots__/test_snapshots_help_run.ambr b/tests/__snapshots__/test_snapshots_help_run.ambr index d4f7c2b9..90848c89 100644 --- a/tests/__snapshots__/test_snapshots_help_run.ambr +++ b/tests/__snapshots__/test_snapshots_help_run.ambr @@ -76,10 +76,11 @@ --speaker and --search select from a diarized transcript (made on the fly, or reused with --transcript-id); --llm has an LLM Gateway model pick the - windows; --range adds explicit ones. Overlapping selections merge, and each - surviving segment is written as .clipNN using ffmpeg (which must - be installed). A YouTube/media-page source is downloaded first; its clips - land in --out-dir or the current directory. + windows; --range adds explicit ones. Overlapping selections merge, clip + boundaries snap into nearby silence so cuts don't land mid-word (--no-snap + disables), and each surviving segment is written as .clipNN + using ffmpeg (which must be installed). A YouTube/media-page source is + downloaded first; its clips land in --out-dir or the current directory. 
╭─ Arguments ──────────────────────────────────────────────────────────────────╮ │ * media TEXT Audio/video to cut clips from: a local file, or a │ @@ -87,30 +88,47 @@ │ [required] │ ╰──────────────────────────────────────────────────────────────────────────────╯ ╭─ Options ────────────────────────────────────────────────────────────────────╮ - │ --transcript-id -t TEXT Reuse an existing transcript │ - │ of this media instead of │ - │ transcribing it again: an id, │ - │ or '-' to read an id or │ - │ 'transcribe --json' output │ - │ from stdin. │ - │ --speaker TEXT Keep segments spoken by this │ - │ diarized speaker label │ - │ (repeatable, e.g. --speaker │ - │ A). │ - │ --search TEXT Keep segments whose text │ - │ contains this │ - │ (case-insensitive). │ - │ --range TEXT Keep an explicit START-END │ - │ window (seconds or │ - │ [HH:]MM:SS; repeatable). │ - │ --padding FLOAT RANGE [x>=0.0] Seconds of padding to add │ - │ around each clip. │ - │ [default: 0.0] │ - │ --out-dir PATH Directory for the clip files │ - │ (default: next to the input). │ - │ --json -j Emit JSON describing the │ - │ clips written. │ - │ --help Show this message and exit. │ + │ --transcript-id -t TEXT Reuse an existing │ + │ transcript of this │ + │ media instead of │ + │ transcribing it │ + │ again: an id, or '-' │ + │ to read an id or │ + │ 'transcribe --json' │ + │ output from stdin. │ + │ --speaker TEXT Keep segments spoken │ + │ by this diarized │ + │ speaker label │ + │ (repeatable, e.g. │ + │ --speaker A). │ + │ --search TEXT Keep segments whose │ + │ text contains this │ + │ (case-insensitive). │ + │ --range TEXT Keep an explicit │ + │ START-END window │ + │ (seconds or │ + │ [HH:]MM:SS; │ + │ repeatable). │ + │ --padding FLOAT RANGE [x>=0.0] Seconds of padding │ + │ to add around each │ + │ clip. 
│ + │ [default: 0.0] │ + │ --snap --no-snap Snap clip boundaries │ + │ into nearby silence │ + │ (detected with │ + │ ffmpeg) so cuts │ + │ don't land mid-word; │ + │ --no-snap cuts at │ + │ the exact selected │ + │ times. │ + │ [default: snap] │ + │ --out-dir PATH Directory for the │ + │ clip files (default: │ + │ next to the input). │ + │ --json -j Emit JSON describing │ + │ the clips written. │ + │ --help Show this message │ + │ and exit. │ ╰──────────────────────────────────────────────────────────────────────────────╯ ╭─ LLM Transform ──────────────────────────────────────────────────────────────╮ │ --llm TEXT Let an LLM Gateway model pick the windows to │ diff --git a/tests/_clip_helpers.py b/tests/_clip_helpers.py index 8dcdba6f..86e969fe 100644 --- a/tests/_clip_helpers.py +++ b/tests/_clip_helpers.py @@ -30,6 +30,7 @@ max_tokens=llm.DEFAULT_MAX_TOKENS, ranges=[], padding=0.0, + snap=True, out_dir=None, ) @@ -54,14 +55,20 @@ def fake_transcript(utterances): return SimpleNamespace(id="tr_123", utterances=utterances) -def record_ffmpeg(monkeypatch: pytest.MonkeyPatch) -> list[list[str]]: - """Resolve ffmpeg and record every invocation, succeeding with no output.""" +def record_ffmpeg(monkeypatch: pytest.MonkeyPatch, detect_log: str = "") -> list[list[str]]: + """Resolve ffmpeg and record every invocation, succeeding with no output. + + With snapping on (the default) the first recorded call is the silencedetect + pass; ``detect_log`` is what it reports on stderr (empty: no silences, so + snapping is a no-op and cut times stay exactly as selected). 
+ """ monkeypatch.setattr("shutil.which", lambda name: f"/usr/bin/{name}") calls: list[list[str]] = [] def run(args: list[str]) -> subprocess.CompletedProcess[str]: calls.append(args) - return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="") + stderr = detect_log if "-af" in args else "" + return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr=stderr) monkeypatch.setattr(clip_exec, "_run_ffmpeg", run) return calls diff --git a/tests/test_clip_command.py b/tests/test_clip_command.py index 9c7d0454..9cfafe0c 100644 --- a/tests/test_clip_command.py +++ b/tests/test_clip_command.py @@ -60,6 +60,7 @@ def test_clip_parses_every_flag_into_options(monkeypatch, tmp_path): "1:30-2:00", "--padding", "0.5", + "--no-snap", "--out-dir", str(tmp_path), "--json", @@ -76,6 +77,7 @@ def test_clip_parses_every_flag_into_options(monkeypatch, tmp_path): max_tokens=64, ranges=["5-10", "1:30-2:00"], padding=0.5, + snap=False, out_dir=tmp_path, ) assert captured["json_mode"] is True @@ -95,6 +97,7 @@ def test_clip_defaults_when_only_media_is_given(monkeypatch): max_tokens=llm.DEFAULT_MAX_TOKENS, ranges=[], padding=0.0, + snap=True, out_dir=None, ) assert captured["json_mode"] is False @@ -138,7 +141,8 @@ def fake_run(args: list[str]) -> subprocess.CompletedProcess[str]: monkeypatch.setattr(clip_exec, "_run_ffmpeg", fake_run) result = runner.invoke(app, ["clip", str(media), "--range", "1-2", "--json"]) assert result.exit_code == 0, result.output - assert calls[0][-1] == str(tmp_path / "talk.clip01.mp3") + # calls[0] is the silencedetect pass; calls[1] the cut. 
+ assert calls[1][-1] == str(tmp_path / "talk.clip01.mp3") payload = json.loads(result.output.strip().splitlines()[-1]) assert payload["clips"][0]["duration"] == 1.0 diff --git a/tests/test_clip_exec.py b/tests/test_clip_exec.py index 0e399495..2f365071 100644 --- a/tests/test_clip_exec.py +++ b/tests/test_clip_exec.py @@ -138,6 +138,18 @@ def test_run_clip_range_only_cuts_and_emits_json(media, fake_ffmpeg, capsys): clip_exec.run_clip(opts, AppState(), json_mode=True) dest = media.parent / "meeting.clip01.mp4" assert fake_ffmpeg == [ + [ + "/usr/bin/ffmpeg", + "-hide_banner", + "-nostats", + "-i", + str(media), + "-af", + "silencedetect=noise=-30dB:d=0.2", + "-f", + "null", + "-", + ], [ "/usr/bin/ffmpeg", "-hide_banner", @@ -151,7 +163,7 @@ def test_run_clip_range_only_cuts_and_emits_json(media, fake_ffmpeg, capsys): "-to", "12.500", str(dest), - ] + ], ] payload = json.loads(capsys.readouterr().out) assert payload == { @@ -182,7 +194,7 @@ def test_run_clip_human_mode_prints_one_line_per_clip(tmp_path, fake_ffmpeg, cap def test_run_clip_applies_padding_to_explicit_ranges(media, fake_ffmpeg, capsys): opts = dataclasses.replace(DEFAULTS, media=str(media), ranges=["5-10"], padding=1.0) clip_exec.run_clip(opts, AppState(), json_mode=True) - assert fake_ffmpeg[0][7:11] == ["-ss", "4.000", "-to", "11.000"] + assert fake_ffmpeg[1][7:11] == ["-ss", "4.000", "-to", "11.000"] clips = json.loads(capsys.readouterr().out)["clips"] assert (clips[0]["start"], clips[0]["end"]) == (4.0, 11.0) @@ -207,10 +219,63 @@ def test_run_clip_honors_out_dir(media, tmp_path, fake_ffmpeg, capsys): opts = dataclasses.replace(DEFAULTS, media=str(media), ranges=["1-2"], out_dir=out_dir) clip_exec.run_clip(opts, AppState(), json_mode=True) dest = out_dir / "meeting.clip01.mp4" - assert fake_ffmpeg[0][-1] == str(dest) + assert fake_ffmpeg[1][-1] == str(dest) assert json.loads(capsys.readouterr().out)["clips"][0]["path"] == str(dest) +# --- silence snapping 
--------------------------------------------------------- + +DETECT_LOG = ( + "[silencedetect @ 0x1] silence_start: 4\n" + "[silencedetect @ 0x1] silence_end: 4.6 | silence_duration: 0.6\n" + "[silencedetect @ 0x1] silence_start: 13\n" + "[silencedetect @ 0x1] silence_end: 14 | silence_duration: 1.0\n" +) + + +def test_run_clip_snaps_boundaries_into_detected_silence(media, capsys, monkeypatch): + # Both 5.0 and 12.5 land on speech: the start snaps back into the 4.0-4.6 + # silence (0.25 before speech resumes at 4.6), the end snaps forward into + # the 13.0-14.0 silence (0.25 past where speech stops at 13.0). + calls = record_ffmpeg(monkeypatch, detect_log=DETECT_LOG) + opts = dataclasses.replace(DEFAULTS, media=str(media), ranges=["5-12.5"]) + clip_exec.run_clip(opts, AppState(), json_mode=True) + assert calls[1][7:11] == ["-ss", "4.350", "-to", "13.250"] + clips = json.loads(capsys.readouterr().out)["clips"] + assert (clips[0]["start"], clips[0]["end"]) == (4.35, 13.25) + + +def test_run_clip_failed_silence_detection_cuts_at_selected_times(media, capsys, monkeypatch): + # Snapping is best-effort: a broken silencedetect pass must not fail (or + # shift) the cut. 
+ monkeypatch.setattr("shutil.which", lambda name: "/usr/bin/ffmpeg") + + def run(args: list[str]) -> subprocess.CompletedProcess[str]: + if "-af" in args: + return subprocess.CompletedProcess( + args=args, returncode=1, stdout="", stderr=DETECT_LOG + ) + return subprocess.CompletedProcess(args=args, returncode=0, stdout="", stderr="") + + monkeypatch.setattr(clip_exec, "_run_ffmpeg", run) + opts = dataclasses.replace(DEFAULTS, media=str(media), ranges=["5-12.5"]) + clip_exec.run_clip(opts, AppState(), json_mode=True) + clips = json.loads(capsys.readouterr().out)["clips"] + assert (clips[0]["start"], clips[0]["end"]) == (5.0, 12.5) + + +def test_run_clip_no_snap_skips_detection(media, capsys, monkeypatch): + calls = record_ffmpeg(monkeypatch, detect_log=DETECT_LOG) + opts = dataclasses.replace(DEFAULTS, media=str(media), ranges=["5-12.5"], snap=False) + clip_exec.run_clip(opts, AppState(), json_mode=True) + # Exactly one ffmpeg call: the cut, at the exact selected times. + assert len(calls) == 1 + assert "-af" not in calls[0] + assert calls[0][7:11] == ["-ss", "5.000", "-to", "12.500"] + clips = json.loads(capsys.readouterr().out)["clips"] + assert (clips[0]["start"], clips[0]["end"]) == (5.0, 12.5) + + def test_run_clip_surfaces_ffmpeg_failure(media, monkeypatch): monkeypatch.setattr("shutil.which", lambda name: "/usr/bin/ffmpeg") @@ -282,8 +347,8 @@ def fake_transcribe(api_key, audio, *, config): assert payload["transcript_id"] == "tr_123" # Speaker A's two utterances: 1.5-2.5s and 5-6s. 
assert [(c["start"], c["end"]) for c in payload["clips"]] == [(1.5, 2.5), (5.0, 6.0)] - assert fake_ffmpeg[0][-1] == str(media.parent / "meeting.clip01.mp4") - assert fake_ffmpeg[1][-1] == str(media.parent / "meeting.clip02.mp4") + assert fake_ffmpeg[1][-1] == str(media.parent / "meeting.clip01.mp4") + assert fake_ffmpeg[2][-1] == str(media.parent / "meeting.clip02.mp4") def test_run_clip_reuses_transcript_by_id(media, fake_ffmpeg, capsys, monkeypatch): @@ -359,4 +424,8 @@ def fake_status(message, *, json_mode, quiet): monkeypatch.setattr(clip_exec.output, "status", fake_status) opts = dataclasses.replace(DEFAULTS, media=str(media), speakers=["A"]) clip_exec.run_clip(opts, AppState(), json_mode=False) - assert messages == ["Transcribing for clip selection…", "Cutting 2 clip(s)…"] + assert messages == [ + "Transcribing for clip selection…", + "Detecting silence…", + "Cutting 2 clip(s)…", + ] diff --git a/tests/test_clip_select.py b/tests/test_clip_select.py index 7754ec7b..55e9ffc1 100644 --- a/tests/test_clip_select.py +++ b/tests/test_clip_select.py @@ -1,9 +1,11 @@ """Tests for the pure clip selection logic (aai_cli/clip_select.py): --range parsing, segment merging, utterance filtering, the LLM listing/reply contract, -and clock formatting.""" +silencedetect parsing, boundary snapping, and clock formatting.""" from __future__ import annotations +import math + import pytest from aai_cli import clip_select @@ -193,6 +195,120 @@ def test_segment_is_immutable(): setattr(segment, field_name, 5.0) +# --- silence detection & snapping --------------------------------------------- + + +def test_parse_silences_pairs_start_end_edges_in_order(): + log = ( + "Stream mapping: ...\n" + "[silencedetect @ 0x6000] silence_start: 4\n" + "[silencedetect @ 0x6000] silence_end: 4.6 | silence_duration: 0.6\n" + "[silencedetect @ 0x6000] silence_start: 13.25\n" + "[silencedetect @ 0x6000] silence_end: 14 | silence_duration: 0.75\n" + ) + assert clip_select.parse_silences(log) == 
[Segment(4.0, 4.6), Segment(13.25, 14.0)] + + +def test_parse_silences_trailing_start_runs_to_end_of_file(): + assert clip_select.parse_silences("silence_start: 7.5\n") == [Segment(7.5, math.inf)] + + +def test_parse_silences_clamps_negative_start_to_zero(): + # ffmpeg can report a small negative start from decoder priming samples. + log = "silence_start: -0.011\nsilence_end: 1.5 | silence_duration: 1.511\n" + assert clip_select.parse_silences(log) == [Segment(0.0, 1.5)] + + +def test_parse_silences_ignores_an_unpaired_end(): + log = "silence_start: 1\nsilence_end: 2\nsilence_end: 3\n" + assert clip_select.parse_silences(log) == [Segment(1.0, 2.0)] + + +def test_parse_silences_empty_log_finds_nothing(): + assert clip_select.parse_silences("") == [] + + +SILENCES = [Segment(4.0, 4.6), Segment(13.0, 14.0)] + + +def test_snap_moves_speech_boundaries_into_adjacent_silence(): + # 5.0 and 12.5 both land on speech: the start backs into the 4.0-4.6 + # silence (SNAP_LEAD before speech resumes at 4.6), the end runs forward + # into the 13.0-14.0 silence (SNAP_LEAD past where speech stops at 13.0). + snapped = clip_select.snap_to_silences([Segment(5.0, 12.5)], SILENCES) + assert snapped == [Segment(4.35, 13.25)] + + +def test_snap_clamps_inside_a_narrow_silence(): + # Silences narrower than SNAP_LEAD: the boundary stays within the silence. + silences = [Segment(4.5, 4.6), Segment(13.0, 13.1)] + snapped = clip_select.snap_to_silences([Segment(5.0, 12.5)], silences) + assert snapped == [Segment(4.5, 13.1)] + + +def test_snap_leaves_boundaries_already_in_silence_alone(): + # Both boundaries sit in silence already (one mid-silence, one exactly at + # a silence edge): they honor the selection (and --padding) exactly. 
+ snapped = clip_select.snap_to_silences([Segment(4.2, 13.0)], SILENCES) + assert snapped == [Segment(4.2, 13.0)] + snapped = clip_select.snap_to_silences([Segment(4.6, 14.0)], SILENCES) + assert snapped == [Segment(4.6, 14.0)] + + +def test_snap_prefers_the_silence_a_boundary_touches_over_a_nearby_one(): + # A start exactly at a silence's start (and an end exactly at a silence's + # end) belongs to that silence — not snapped toward the neighbouring one. + silences = [Segment(3.2, 4.4), Segment(4.6, 5.0), Segment(5.2, 6.0)] + snapped = clip_select.snap_to_silences([Segment(4.6, 5.0)], silences) + assert snapped == [Segment(4.6, 5.0)] + + +@pytest.mark.parametrize( + ("segment", "expected"), + [ + # Exactly SNAP_REACH from the silences on both sides: still snaps. + (Segment(5.0, 12.5), Segment(3.25, 14.25)), + # Just beyond SNAP_REACH on both sides: continuous speech, stays put. + (Segment(5.1, 12.4), Segment(5.1, 12.4)), + ], +) +def test_snap_reach_bounds_how_far_a_boundary_moves(segment, expected): + silences = [Segment(1.0, 3.5), Segment(14.0, 15.0)] + assert clip_select.snap_to_silences([segment], silences) == [expected] + + +def test_snap_boundaries_beyond_all_silences_stay_put(): + snapped = clip_select.snap_to_silences([Segment(0.5, 20.0)], SILENCES) + assert snapped == [Segment(0.5, 20.0)] + + +def test_snap_end_into_a_trailing_silence_that_runs_to_end_of_file(): + silences = [Segment(10.0, math.inf)] + snapped = clip_select.snap_to_silences([Segment(5.0, 9.9)], silences) + assert snapped == [Segment(5.0, 10.25)] + + +def test_snap_sorts_the_silences_it_is_given(): + silences = [Segment(13.0, 14.0), Segment(4.0, 4.6)] + snapped = clip_select.snap_to_silences([Segment(14.2, 20.0)], silences) + assert snapped == [Segment(13.75, 20.0)] + + +def test_snap_merges_segments_that_meet_inside_one_silence(): + # Both boundaries snap into the same 4.6-5.0 silence and now overlap, so + # the two clips fold into one instead of duplicating the pause. 
+ silences = [Segment(4.6, 5.0)] + snapped = clip_select.snap_to_silences([Segment(2.0, 4.5), Segment(5.2, 7.0)], silences) + assert snapped == [Segment(2.0, 7.0)] + + +def test_snap_without_silences_changes_nothing(): + # No detected silences (or a failed detection): segments pass through + # untouched — not even re-merged. + segments = [Segment(5.0, 6.0), Segment(5.5, 7.0)] + assert clip_select.snap_to_silences(segments, []) == segments + + # --- clock formatting -------------------------------------------------------- diff --git a/tests/test_clip_sources.py b/tests/test_clip_sources.py index 2601ae2e..9db83d01 100644 --- a/tests/test_clip_sources.py +++ b/tests/test_clip_sources.py @@ -60,9 +60,9 @@ def test_run_clip_downloads_youtube_audio_into_cwd( assert fake_download["url"] == YT_URL # ffmpeg reads the downloaded temp file; the clip lands in the cwd, named # after the download (the temp dir is gone after the run). - assert fake_ffmpeg[0][6] == str(fake_download["path"]) + assert fake_ffmpeg[1][6] == str(fake_download["path"]) dest = tmp_path / "vid123.clip01.m4a" - assert fake_ffmpeg[0][-1] == str(dest) + assert fake_ffmpeg[1][-1] == str(dest) payload = json.loads(capsys.readouterr().out) assert payload["source"] == YT_URL assert payload["clips"][0]["path"] == str(dest) @@ -73,7 +73,7 @@ def test_run_clip_youtube_honors_out_dir(tmp_path, fake_ffmpeg, fake_download, c out_dir.mkdir() opts = dataclasses.replace(DEFAULTS, media=YT_URL, ranges=["1-2"], out_dir=out_dir) clip_exec.run_clip(opts, AppState(), json_mode=True) - assert fake_ffmpeg[0][-1] == str(out_dir / "vid123.clip01.m4a") + assert fake_ffmpeg[1][-1] == str(out_dir / "vid123.clip01.m4a") def test_run_clip_youtube_transcribes_the_downloaded_file( @@ -112,7 +112,7 @@ def fake_status(message, *, json_mode, quiet): monkeypatch.setattr(clip_exec.output, "status", fake_status) opts = dataclasses.replace(DEFAULTS, media=YT_URL, ranges=["1-2"]) clip_exec.run_clip(opts, AppState(), json_mode=False) - assert 
messages == ["Downloading audio…", "Cutting 1 clip(s)…"] + assert messages == ["Downloading audio…", "Detecting silence…", "Cutting 1 clip(s)…"] # --- transcript piped on stdin (-t -) ------------------------------------------- @@ -216,7 +216,7 @@ def fake_transform(api_key, *, prompt, transcript_text, model, max_tokens): payload = json.loads(capsys.readouterr().out) assert payload["transcript_id"] == "tr_123" assert [(c["start"], c["end"]) for c in payload["clips"]] == [(1.5, 4.0)] - assert fake_ffmpeg[0][7:11] == ["-ss", "1.500", "-to", "4.000"] + assert fake_ffmpeg[1][7:11] == ["-ss", "1.500", "-to", "4.000"] def test_run_clip_llm_composes_with_speaker_filter(media, fake_ffmpeg, capsys, monkeypatch): @@ -290,5 +290,6 @@ def fake_status(message, *, json_mode, quiet): assert messages == [ "Transcribing for clip selection…", "Selecting segments with gpt-5…", + "Detecting silence…", "Cutting 1 clip(s)…", ]