diff --git a/.gitignore b/.gitignore index f47e72d..c323e0f 100644 --- a/.gitignore +++ b/.gitignore @@ -17,4 +17,11 @@ /target Cargo.lock -**.claude/ \ No newline at end of file +**.claude/ + +# Parity harness (manual; outputs are per-machine fixtures). +# `target/` and `Cargo.lock` are already covered by the rules above. +tests/parity/out/ +tests/parity/python/.venv/ +tests/parity/python/uv.lock +tests/parity/python/*.egg-info/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 1412816..ff21711 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,45 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.0] - 2026-05-02 + +### Changed + +- **Behaviour change** — `SpeechSegmenter::push_probability` now closes + speech segments when the silence counter matches the upstream Python + `silero-vad` package's semantics. Previously the crate's silence + counter was evaluated AFTER the current frame's contribution had been + added to `current_sample`, while upstream Python evaluates the + equivalent `cur_sample - temp_end` BEFORE the current frame is + consumed. The crate's counter therefore fired one model frame + (32 ms at 16 kHz / 512-sample windows) too early — at the default + `min_silence_duration_ms = 100`, the crate closed a segment after 4 + consecutive low-probability frames where Python tolerates the dip and + closes after 5. The same off-by-one applied to the + `min_silence_at_max_speech_samples` comparator on the same code path. + Discovered by the parity harness in `tests/parity/`. + +### Migration + +Callers who hand-tuned `min_silence_duration_ms` against the v0.2.x +response curve may want to subtract ~32 ms from their value to keep the +same effective behaviour against v0.3.0+. Default callers do not need +to change anything — defaults still match upstream silero-vad PyPI +defaults verbatim, and the response curve is now strictly closer to +upstream than it was in v0.2.x. 
+ +### Verified + +- `cargo test` +- `cargo test --no-default-features` +- `cargo build --release` +- `tests/parity/run.sh` on the five short dia parity fixtures + (`01_dialogue`, `02_pyannote_sample`, `03_dual_speaker`, + `04_three_speaker`, `05_four_speaker`): median IoU 1.0000 and + segment counts match exactly against upstream Python silero-vad + (51/51, 4/4, 14/14, 6/6, 14/14) WITHOUT the previous + `--min-silence-ms 132` override. + ## [0.2.0] - 2026-04-21 ### Added diff --git a/Cargo.toml b/Cargo.toml index 7b57d8f..bfbfe5e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "silero" -version = "0.2.1" +version = "0.3.0" edition = "2024" rust-version = "1.85" repository = "https://github.com/Findit-AI/silero" diff --git a/src/detector.rs b/src/detector.rs index fa4d7bf..4d0243c 100644 --- a/src/detector.rs +++ b/src/detector.rs @@ -176,8 +176,18 @@ impl SpeechSegmenter { return None; } + // Silence-counter is evaluated against `frame_start` (the start sample + // of the current frame), not `current_sample` (which is already the + // *end* of the current frame). This matches upstream Python + // `silero-vad`'s `sil_dur_now = cur_sample - temp_end` semantics, + // where `cur_sample` is read BEFORE the model consumes the current + // window. Without this, the comparator fires one frame early — a + // 4-frame (128 ms) silence dip would close a segment at default + // `min_silence_duration_ms = 100`, where Python tolerates it and + // closes after 5 consecutive low-probability frames. See the parity + // harness in `tests/parity/` and the v0.3.0 CHANGELOG entry. 
let silence_start = *self.tentative_end.get_or_insert(frame_start); - let silence_samples = self.current_sample.saturating_sub(silence_start); + let silence_samples = frame_start.saturating_sub(silence_start); if silence_samples > self.options.min_silence_at_max_speech_samples() { self.max_split_end = Some(silence_start); } @@ -390,6 +400,18 @@ mod tests { #[test] fn middle_band_frames_do_not_reset_tentative_end() { + // Verifies that mid-band probabilities (between the end_threshold and + // start_threshold, e.g. `0.4` against the default `0.5` start) do NOT + // reset the silence accumulator — they're treated as "not yet + // confirmed speech". + // + // Updated 0.3.0: post the silence-counter off-by-one fix, the segment + // closes after FIVE consecutive low-or-mid-band frames at the default + // `min_silence_duration_ms = 100` (1600 samples / 512 per frame = + // 3.125 → 4 prior frames + the close-firing 5th frame), matching + // upstream Python silero-vad. The pre-0.3.0 crate closed after FOUR + // frames (one frame too eager). See `tests/parity/README.md` and the + // 0.3.0 CHANGELOG entry for the full derivation. let config = SpeechOptions::default() .with_min_speech_duration(Duration::ZERO) .with_speech_pad(Duration::ZERO) @@ -397,24 +419,44 @@ mod tests { let mut segmenter = SpeechSegmenter::new(config); let mut probabilities = vec![0.9; 4]; - probabilities.extend([0.0, 0.4, 0.0, 0.0]); + // Five low/mid frames so the segment closes via push_probability. + // The mid-band 0.4 frame in the middle must NOT reset the silence + // accumulator — that's the actual property under test. 
+ probabilities.extend([0.0, 0.4, 0.0, 0.0, 0.0]); probabilities.extend(vec![0.9; 4]); let segments = collect(&mut segmenter, &probabilities); assert_eq!(segments.len(), 2); assert_eq!(segments[0].start_sample(), 0); assert_eq!(segments[0].end_sample(), 2_048); - assert_eq!(segments[1].start_sample(), 4_096); + // Segment two starts on the first speech frame after the closed + // silence (4 high + 5 silence = frame index 9, sample 4_608). + assert_eq!(segments[1].start_sample(), 4_608); } #[test] fn min_speech_duration_is_checked_before_padding() { + // A speech burst of 6 frames * 32 ms = 192 ms is shorter than the + // default `min_speech_duration_ms = 250`, so the segment that the + // trailing silence closes must be dropped — `min_speech` is checked + // against the raw speech window (raw_end - raw_start), not against + // the padded boundaries. + // + // Updated 0.3.0: post the silence-counter off-by-one fix, push-based + // close requires FIVE consecutive low-probability frames at the + // default `min_silence_duration_ms = 100` (was 4 pre-0.3.0). Trailing + // silence is extended from 4 to 5 frames so the close still fires + // via `push_probability` — otherwise `finish()` would emit the + // burst-plus-trailing-silence as a single trailing segment that + // satisfies the 250 ms duration check, which is a different (and + // correct, but separate) behaviour. See `tests/parity/README.md` + // and the 0.3.0 CHANGELOG entry. 
let config = SpeechOptions::default(); let mut segmenter = SpeechSegmenter::new(config); let mut probabilities = vec![0.0; 4]; probabilities.extend(vec![0.9; 6]); - probabilities.extend(vec![0.0; 4]); + probabilities.extend(vec![0.0; 5]); let segments = collect(&mut segmenter, &probabilities); assert!(segments.is_empty()); @@ -517,12 +559,25 @@ mod tests { #[test] fn force_split_during_silence_closes_without_restarting() { + // Updated 0.3.0: max_speech_duration bumped from 224 ms to 256 ms so + // the max-speech split fires one frame later, after `max_split_end` + // has been recorded by the silence-counter logic. With the + // off-by-one fix to that logic, `max_split_end` is now set on the + // 4th low-probability frame instead of the 3rd, so the test's + // pre-existing 224 ms ceiling would split at sample 3_584 with + // `max_split_end == None` (falling back to `frame_start` and + // closing at sample 3_584 instead of at the recorded silence + // boundary 2_048). Bumping the ceiling preserves the property under + // test — that a force-split during silence closes at the silence + // boundary, not at the current frame, and does NOT restart a new + // segment afterwards. See `tests/parity/README.md` and the 0.3.0 + // CHANGELOG entry. let config = SpeechOptions::default() .with_min_speech_duration(Duration::ZERO) .with_speech_pad(Duration::ZERO) .with_min_silence_duration(Duration::from_millis(10_000)) .with_min_silence_at_max_speech(Duration::from_millis(64)) - .with_max_speech_duration(Duration::from_millis(224)); + .with_max_speech_duration(Duration::from_millis(256)); let mut segmenter = SpeechSegmenter::new(config); let mut probabilities = vec![0.9; 4]; @@ -534,6 +589,79 @@ mod tests { assert_eq!(segments[0].end_sample(), 2_048); } + #[test] + fn four_frame_silence_dip_does_not_close_segment_at_default_min_silence() { + // Pinned in 0.3.0 as a regression guard for the silence-counter + // off-by-one fix. 
+ // + // At the default `min_silence_duration_ms = 100` (1600 samples at + // 16 kHz) and the default 32 ms / 512-sample frame, upstream Python + // `silero-vad` (`get_speech_timestamps`) closes a segment after + // FIVE consecutive low-probability frames — `sil_dur_now = + // cur_sample - temp_end` is evaluated BEFORE the current frame is + // consumed, so the comparator sees `(k-1) * 512` on the k-th + // low-prob frame and only crosses the 1600-sample threshold at + // k = 5. + // + // Pre-0.3.0 the silero crate evaluated the same counter AFTER the + // current frame was added to `current_sample`, so it saw `k * 512` + // and closed at k = 4. A 4-frame (128 ms) silence dip would + // therefore split a segment in the crate but be tolerated by Python. + // + // This test pins the post-fix behaviour: a 4-frame silence dip must + // be tolerated. The 30-frame speech runs ensure both halves + // individually clear `min_speech_duration_ms = 250` (8 frames), + // so neither would be dropped by the min-speech filter if the + // segment did split. + // + // See `tests/parity/README.md` "Off-by-one silence threshold finding" + // and the 0.3.0 CHANGELOG entry for the motivation. + let config = SpeechOptions::default(); + let mut segmenter = SpeechSegmenter::new(config.clone()); + + let mut probabilities = vec![1.0; 30]; + probabilities.extend(vec![0.0; 4]); + probabilities.extend(vec![1.0; 30]); + + let segments = collect(&mut segmenter, &probabilities); + assert_eq!( + segments.len(), + 1, + "4-frame silence dip must be tolerated at default min_silence_duration_ms = 100; \ + got {} segments", + segments.len() + ); + // Sanity: the (one) segment must start at 0 (the start-pad + // saturates against the timeline's zero) and span the full + // 30 + 4 + 30 = 64 frame window — at 512 samples / frame, that + // ends at 32_768. 
+ assert_eq!(segments[0].start_sample(), 0); + assert_eq!(segments[0].end_sample(), 32_768); + } + + #[test] + fn five_frame_silence_dip_closes_segment_at_default_min_silence() { + // Companion to `four_frame_silence_dip_does_not_close_segment_*`. + // Pinned in 0.3.0: at the same defaults, FIVE consecutive low-prob + // frames must close the segment — matching upstream Python + // silero-vad's `sil_dur_now >= 1600` firing on the 5th frame. + let config = SpeechOptions::default(); + let mut segmenter = SpeechSegmenter::new(config); + + let mut probabilities = vec![1.0; 30]; + probabilities.extend(vec![0.0; 5]); + probabilities.extend(vec![1.0; 30]); + + let segments = collect(&mut segmenter, &probabilities); + assert_eq!( + segments.len(), + 2, + "5-frame silence dip must close the segment at default \ + min_silence_duration_ms = 100; got {} segments", + segments.len() + ); + } + #[test] fn force_split_applies_speech_pad_to_split_boundaries() { let config = SpeechOptions::default() diff --git a/src/lib.rs b/src/lib.rs index 788d274..dbec68a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,6 +13,13 @@ mod stream; pub use detector::{SpeechDetector, SpeechSegment, SpeechSegmenter, detect_speech}; pub use error::{Error, Result}; pub use options::{GraphOptimizationLevel, SampleRate, SessionOptions, SpeechOptions}; + +/// Version string of the `silero` crate (`CARGO_PKG_VERSION`). +/// +/// Exposed so out-of-tree harnesses (e.g. the parity runner) can record +/// the exact silero version under test rather than the harness binary's +/// own version. 
+pub const VERSION: &str = env!("CARGO_PKG_VERSION"); #[cfg(feature = "bundled")] #[cfg_attr(docsrs, doc(cfg(feature = "bundled")))] pub use session::BUNDLED_MODEL; diff --git a/tests/parity/Cargo.toml b/tests/parity/Cargo.toml new file mode 100644 index 0000000..d234ec6 --- /dev/null +++ b/tests/parity/Cargo.toml @@ -0,0 +1,35 @@ +[workspace] + +[package] +name = "silero-parity-runner" +version = "0.0.0" +edition = "2024" +publish = false +description = "Manual parity harness: dump silero (Rust crate) VAD output to JSON for side-by-side comparison with upstream Python silero-vad." + +[dependencies] +# `silero` crate under test. `bundled` ships the ONNX model bytes +# in the binary so the harness has no external model dependency. +silero = { path = "../..", features = ["bundled"] } +# `ffmpeg-next` mirrors the production audio-loading path that +# downstream callers (whispery, dia) use, and matches how upstream +# Python `silero-vad` loads audio (it uses `torchaudio` which calls +# ffmpeg under the hood, then casts to f32 / 32768.0). Loading via +# the same backend on both sides keeps the f32 buffer the model +# sees byte-identical, so any output divergence is the model / +# segmenter — not the audio decode path. +# +# Pinned to `8` because the local toolchain ships system FFmpeg 8.x +# (Homebrew `ffmpeg 8.1`) which dropped `libavcodec/avfft.h`; the +# `ffmpeg-next 7.x` series still references that header in its +# bindgen pass and therefore fails to build against system FFmpeg 8. +# Same pin whispery's parity harness uses. 
+ffmpeg-next = "8" +serde_json = "1" +clap = { version = "4", features = ["derive"] } +sha2 = "0.10" +anyhow = "1" + +[[bin]] +name = "silero-parity-runner" +path = "src/main.rs" diff --git a/tests/parity/README.md b/tests/parity/README.md new file mode 100644 index 0000000..f674679 --- /dev/null +++ b/tests/parity/README.md @@ -0,0 +1,191 @@ +# silero parity test harness + +A side-by-side runner that compares this crate's VAD output against +upstream Python `silero-vad` on the same audio, reporting per-segment +IoU. Models the same approach `dia/tests/parity/` uses for pyannote +parity. + +The bundled ONNX model in `models/silero_vad.onnx` is the same network +upstream silero-vad ships, so this is genuinely a runtime comparison +(ORT inference + Rust segmenter vs PyTorch / ORT inference + Python +segmenter on identical bytes) — not a model-architecture comparison. + +## Layout + +- `Cargo.toml` / `src/main.rs` — Rust binary `silero-parity-runner` + that loads a 16 kHz mono WAV via `ffmpeg-next`, runs + `silero::detect_speech`, and emits JSON. +- `python/pyproject.toml` / `python/silero_vad_runner.py` — same CLI + shape, same JSON schema, runs upstream `silero_vad.get_speech_timestamps`. +- `python/score.py` — sequence-position pairing, per-segment IoU, + median + p10/p90 + worst-N report. +- `run.sh` — end-to-end driver (bring up venv → run both → score). + +## Prerequisites + +- `cargo` + Rust toolchain (the runner builds via `path = "../.."`). +- `uv` for Python virtualenv management (`brew install uv` or + `pip install uv`). +- `ffmpeg` on PATH — the Python runner shells out to it for audio + loading; the Rust runner uses `ffmpeg-next` (in-process bindings). + On macOS with Homebrew FFmpeg 8.x, `ffmpeg-next` is pinned to `8` + in `Cargo.toml` because the `7.x` series still references the + removed `libavcodec/avfft.h` header. +- A 16 kHz mono WAV (or any container ffmpeg can decode; will be + resampled). 
+ +ORT runtime: this crate (and therefore the runner) uses `ort` with its +default `download-binaries` + `copy-dylibs` features, so a prebuilt +ONNX Runtime ships next to the binary — `ORT_DYLIB_PATH` is **not** +required (unlike the whispery harness, which uses `load-dynamic`). + +## Run + +```bash +cd silero +./tests/parity/run.sh /path/to/clip_16k.wav +./tests/parity/run.sh /path/to/fixture-dir # uses clip_16k.wav inside +``` + +Outputs land in `tests/parity/out/`: +- `silero_rs_.json` — Rust runner output. +- `silero_py_.json` — Python runner output. +- `score_.json` — IoU summary. + +Exit code 0 iff median IoU >= 0.95 **and** segment counts match. + +## Canonical fixture set + +The dia parity fixtures double as the silero parity fixtures: they're +real-speech 16 kHz mono WAVs of varying length and speaker counts. + +``` +/Users/user/Develop/findit-studio/dia/tests/parity/fixtures/ +├── 01_dialogue/clip_16k.wav # ~120 s, 2 spk dialogue +├── 02_pyannote_sample/clip_16k.wav # ~30 s, pyannote sample +├── 03_dual_speaker/clip_16k.wav # ~60 s, 2 spk +├── 04_three_speaker/clip_16k.wav # 3 spk +├── 05_four_speaker/clip_16k.wav # 4 spk +└── 06_long_recording/clip_16k.wav # ~977 s, long-form +``` + +These are deliberately **not copied** into the silero repo (they're +large; dia is the source of truth for them). Pass the directory or +WAV path on the `run.sh` command line. + +For first validation we recommend the five short fixtures (skip +`06_long_recording` — at ~16 minutes it's slow to run and the short +fixtures cover all interesting boundary conditions). + +## Default parameter alignment + +Both runners default to the same parameter set, validated 2026-05 +against `silero-vad 6.2.1` source +(`src/silero_vad/utils_vad.py:get_speech_timestamps`): + +| Parameter | silero crate default | silero-vad-py default | Aligned? 
| +|------------------------------|----------------------|-----------------------|----------| +| `threshold` | 0.5 | 0.5 | yes | +| `min_speech_duration_ms` | 250 | 250 | yes | +| `min_silence_duration_ms` | 100 | 100 | yes | +| `speech_pad_ms` | 30 | 30 | yes | +| `min_silence_at_max_speech_ms`| 98 | 98 | yes | +| `max_speech_duration_s` | None (no limit) | `float('inf')` | yes | +| `sampling_rate` | 16 000 Hz | 16 000 Hz | yes | +| `window_size_samples` | 512 (chunk_samples) | 512 | yes | +| `neg_threshold` (end_thresh) | start - 0.15 (clamped to >=0.01) | start - 0.15 | yes | + +(See `silero/src/options.rs:default_*` constants and the upstream +`get_speech_timestamps` function signature.) + +### Off-by-one silence threshold finding (fixed in v0.3.0) + +> **Status: fixed in silero v0.3.0.** The harness no longer applies +> the `--min-silence-ms 132` workaround described below. Both runners +> now use upstream Python `silero-vad`'s defaults verbatim. + +**Historical context (preserved here as a record of how the bug was +characterised before the fix):** + +Up to and including silero v0.2.x the crate's +`SpeechSegmenter::push_probability` and Python's +`get_speech_timestamps` differed by exactly **one model frame +(32 ms at 16 kHz / 512-sample windows)** in how they computed the +"silence so far" counter: + +- **Python** (`silero_vad/utils_vad.py`): + - `temp_end` is set to `cur_sample` on the FIRST low-probability + frame. + - `sil_dur_now = cur_sample - temp_end` is computed BEFORE the + current frame is "consumed" (it's the frame's *start* sample). + - On the first low-prob frame, `sil_dur_now = 0`. On the k-th + consecutive low-prob frame, `sil_dur_now = (k-1) * 512`. + - Closes when `sil_dur_now >= 1600` → k = 5 frames. 
+ +- **silero crate (pre-v0.3.0)** (`silero/src/detector.rs:147-190`): + - `tentative_end` is set to `frame_start` on the first low-prob + frame; immediately after, `current_sample` is incremented by + `frame_samples` (so it represents the END of the current frame). + - `silence_samples = current_sample - silence_start = j * 512` after + the j-th consecutive low-prob frame (j ≥ 1). + - Closes when `silence_samples >= 1600` → j = 4 frames. + +So a 4-frame (128 ms) silence dip closed the pre-v0.3.0 crate's +segment but was *tolerated* by Python — Python kept it as one segment +until 5 consecutive low-prob frames had passed. On a clip with many +short silence dips (e.g. dialogue with quick turn-taking), the crate +produced measurably more segments than Python at the same nominal +`min_silence_duration_ms`. + +**Pre-v0.3.0 workaround (now removed)**: `run.sh` used to override +the crate side with `--min-silence-ms 132` (= 100 + 32), shifting the +close threshold by one frame so the two segmenters consumed the same +number of low-prob frames before closing. + +**Fix in v0.3.0**: `SpeechSegmenter::push_probability` now evaluates +the silence counter against `frame_start` (the start sample of the +current frame) instead of `current_sample` (the end). This mirrors +Python's "compute `cur_sample - temp_end` before consuming the +current frame" semantics literally. The same correction applies to +the `min_silence_at_max_speech_samples` comparator that lives on the +same code path. Both close-after-5-frames and the (4-frame, no-close) +boundary are now pinned by unit tests in +`silero/src/detector.rs::tests` — +`five_frame_silence_dip_closes_segment_at_default_min_silence` and +`four_frame_silence_dip_does_not_close_segment_at_default_min_silence`. + +**Migration note for callers**: this is a behaviour change. 
Anyone +who hand-tuned `min_silence_duration_ms` against the v0.2.x response +curve may want to subtract ~32 ms from their override to get the +same effective behaviour against v0.3.0+. + +The other parameters (start/end threshold, min-speech, speech-pad, +min-silence-at-max-speech) all lined up at defaults pre-fix too — +only the silence-counter equation diverged. + +## How parity is scored + +`score.py` pairs segments by **sequence position** (i-th from a vs +i-th from b) and computes time-range IoU per pair. This is the right +matcher when both runners are expected to produce the same boundaries +on the same audio — a single missing or extra segment will degrade +the metric instead of accidentally re-aligning everything around the +gap. + +Pass condition (default): median IoU >= 0.95 **and** `len(segments_a) +== len(segments_b)`. Pass `--allow-segment-count-mismatch` to soften +the count check (useful when diagnosing which side over- or +under-segments). + +The `clip_sha256` field on each runner output hashes the f32 PCM +bytes the model actually saw. If those hashes diverge, score.py warns +loudly because any IoU disagreement could then be a loader issue +rather than a model issue. + +## Notes + +- The harness is **NOT** part of `cargo test`. It's a manual run for + release-time validation and for diagnosing regressions. +- Don't commit binary fixtures or model files into this crate. +- Don't change anything in `silero/src/` from this harness — it's + read-only on the public crate API. diff --git a/tests/parity/python/pyproject.toml b/tests/parity/python/pyproject.toml new file mode 100644 index 0000000..f7fce25 --- /dev/null +++ b/tests/parity/python/pyproject.toml @@ -0,0 +1,31 @@ +[project] +name = "silero-parity-reference" +version = "0.0.0" +requires-python = ">=3.10" +# `silero-vad` is the upstream PyPI package this harness compares the +# Rust crate against. 
Pinned to `>=5.1` so we always pull a stable +# 5.x/6.x — the `get_speech_timestamps` API and parameter set has +# been stable since 5.x. Bump this exact pin (or tighten to `==`) on +# release branches if a behaviour change in upstream silero-vad +# regresses the parity numbers. +# +# `torch` is a transitive of `silero-vad` already; we list it +# explicitly so a one-off install of the parity venv works without +# relying on solver cascades. +dependencies = [ + "silero-vad >= 5.1", + "torch >= 2.0", + "numpy >= 1.26", + # `silero-vad`'s `load_silero_vad(onnx=True)` path uses + # onnxruntime — the runner defaults to that backend so both + # runners feed identical bytes to ORT. Without this dep, the ONNX + # path raises at load time. + "onnxruntime >= 1.18", +] + +# Disable setuptools auto-discovery: this project carries Python +# scripts only — no installable package layout — so an empty +# `packages` list lets `uv pip install -e .` install just the project +# metadata + dependencies without erroring on auto-discovery. +[tool.setuptools] +packages = [] diff --git a/tests/parity/python/score.py b/tests/parity/python/score.py new file mode 100644 index 0000000..d8fe005 --- /dev/null +++ b/tests/parity/python/score.py @@ -0,0 +1,237 @@ +"""Compare two parity-runner JSON outputs (one silero-rs, one +silero-vad-py) and report per-segment IoU statistics. + +Approach: +1. Pair segments by **sequence position** — silero VAD output is + ordered and the two implementations run the same segmenter logic + on the same audio, so the i-th segment from one runner corresponds + to the i-th segment from the other when both are well-aligned. If + the segment counts differ we still pair as far as the shorter list + goes and count the rest as drops on the longer side. +2. For each matched pair, compute time-range IoU on `[start_s, end_s]`. +3. Emit a JSON summary on stdout (or `--out`) and a human-readable + summary on stderr. 
+ +Why sequence-position pairing rather than text-aware Needleman-Wunsch +(the whispery harness does the latter): silero output has no text to +key on, and a near-bit-exact runner pair will produce near-identical +segment boundaries — sequence-position is the right matching when the +implementations are supposed to agree. If a Rust regression shifts the +segment count by ±1, the pairing degrades gracefully and the +`segment_count_*` fields in the summary make the divergence obvious. + +Pass/fail: median IoU >= `--threshold` (default 0.95) AND the segment +counts match. The 0.95 default reflects "near-bit-equivalent" — silero +running through ORT vs PyTorch on identical inputs should produce +boundaries that round to within one frame (32 ms at 16 kHz / 512-sample +windows). + +Usage: + uv run python score.py +""" + +from __future__ import annotations + +import argparse +import json +import statistics +import sys +from dataclasses import dataclass +from pathlib import Path + + +@dataclass +class Segment: + start_s: float + end_s: float + + @property + def duration_s(self) -> float: + return max(0.0, self.end_s - self.start_s) + + +def _load(path: Path) -> tuple[str, list[Segment], dict]: + payload = json.loads(path.read_text()) + segments = [ + Segment(start_s=float(s["start_s"]), end_s=float(s["end_s"])) + for s in payload["segments"] + ] + return payload.get("runner", path.stem), segments, payload + + +def _iou(a: Segment, b: Segment) -> float: + inter = max(0.0, min(a.end_s, b.end_s) - max(a.start_s, b.start_s)) + union = max(a.end_s, b.end_s) - min(a.start_s, b.start_s) + if union <= 0.0: + return 0.0 + return inter / union + + +def _stats(values: list[float]) -> dict[str, float | int]: + if not values: + return {"count": 0} + sv = sorted(values) + n = len(sv) + return { + "count": n, + "mean": float(statistics.fmean(sv)), + "median": float(statistics.median(sv)), + "p10": float(sv[max(0, int(0.10 * (n - 1)))]), + "p90": float(sv[min(n - 1, int(0.90 * (n - 
1)))]), + "min": float(sv[0]), + "max": float(sv[-1]), + "below_0.5": int(sum(1 for v in sv if v < 0.5)), + "below_0.9": int(sum(1 for v in sv if v < 0.9)), + } + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Score two silero parity-runner JSON outputs against each other." + ) + parser.add_argument("a_json", type=Path, help="First runner JSON (e.g. silero-rs).") + parser.add_argument("b_json", type=Path, help="Second runner JSON (e.g. silero-vad-py).") + parser.add_argument( + "--out", + type=Path, + default=None, + help="Write JSON summary here (default: stdout).", + ) + parser.add_argument( + "--threshold", + type=float, + default=0.95, + help="Median IoU threshold for exit-code 0 (default: 0.95).", + ) + parser.add_argument( + "--allow-segment-count-mismatch", + action="store_true", + help=( + "By default the score fails if the two runners produced different " + "segment counts (which usually means a real boundary disagreement, " + "not just a fractional shift). Pass this to soften the check to " + "median-IoU only." + ), + ) + args = parser.parse_args() + + name_a, segs_a, payload_a = _load(args.a_json) + name_b, segs_b, payload_b = _load(args.b_json) + + # Quick sanity: surface a clip_sha256 mismatch loudly. If the two + # runners disagree on the input bytes, the IoU number is comparing + # apples to oranges and any disagreement is the loader's fault, not + # the model's. 
+ sha_a = payload_a.get("clip_sha256") + sha_b = payload_b.get("clip_sha256") + sha_match = sha_a is not None and sha_b is not None and sha_a == sha_b + if not sha_match: + print( + f"[score] WARNING: clip_sha256 differs between runners: " + f"{name_a}={sha_a[:16] if sha_a else '(none)'} vs " + f"{name_b}={sha_b[:16] if sha_b else '(none)'} — IoU below " + f"may reflect loader divergence rather than VAD divergence", + file=sys.stderr, + ) + + pairs: list[tuple[int, int]] = [] + n_pairs = min(len(segs_a), len(segs_b)) + for i in range(n_pairs): + pairs.append((i, i)) + dropped_a = max(0, len(segs_a) - n_pairs) + dropped_b = max(0, len(segs_b) - n_pairs) + + matched = [(segs_a[i], segs_b[j], _iou(segs_a[i], segs_b[j])) for i, j in pairs] + iou_values = [iou for _, _, iou in matched] + iou_stats = _stats(iou_values) + + matched_sorted = sorted(matched, key=lambda t: t[2]) + worst = [ + { + "iou": round(iou, 4), + name_a: { + "start_s": round(sa.start_s, 3), + "end_s": round(sa.end_s, 3), + "dur_s": round(sa.duration_s, 3), + }, + name_b: { + "start_s": round(sb.start_s, 3), + "end_s": round(sb.end_s, 3), + "dur_s": round(sb.duration_s, 3), + }, + } + for sa, sb, iou in matched_sorted[:5] + ] + + counts_match = len(segs_a) == len(segs_b) + median_pass = iou_stats.get("median", 0.0) >= args.threshold and len(matched) > 0 + passed = bool(median_pass and (counts_match or args.allow_segment_count_mismatch)) + + summary = { + "runner_a": name_a, + "runner_b": name_b, + "clip_sha256_match": sha_match, + "segment_count_a": len(segs_a), + "segment_count_b": len(segs_b), + "matched_pairs": len(matched), + "dropped_by_a": dropped_a, + "dropped_by_b": dropped_b, + "iou": iou_stats, + "worst_5": worst, + "threshold_median_iou": args.threshold, + "counts_match": counts_match, + "passed": passed, + } + + serialized = json.dumps(summary, indent=2) + if args.out is None: + print(serialized) + else: + args.out.write_text(serialized + "\n") + + median = iou_stats.get("median", 0.0) + 
print( + f"\n[score] {name_a} ({len(segs_a)} segs) vs {name_b} ({len(segs_b)} segs)", + file=sys.stderr, + ) + print( + f" matched={len(matched)} dropped_a={dropped_a} dropped_b={dropped_b}", + file=sys.stderr, + ) + if iou_stats["count"] == 0: + print( + " no matched pairs — both runners produced empty segment lists", + file=sys.stderr, + ) + # Empty + empty is technically a match; only fail if either side + # had segments. + return 0 if (len(segs_a) == 0 and len(segs_b) == 0) else 1 + print( + f" IoU mean={iou_stats['mean']:.4f} median={iou_stats['median']:.4f} " + f"p10={iou_stats['p10']:.4f} p90={iou_stats['p90']:.4f} " + f"below_0.5={iou_stats['below_0.5']} below_0.9={iou_stats['below_0.9']}", + file=sys.stderr, + ) + if worst: + print(" worst 5 pairs:", file=sys.stderr) + for w in worst: + a = w[name_a] + b = w[name_b] + print( + f" iou={w['iou']:.3f} a=[{a['start_s']:.3f},{a['end_s']:.3f}] " + f"({a['dur_s']:.3f}s) b=[{b['start_s']:.3f},{b['end_s']:.3f}] " + f"({b['dur_s']:.3f}s)", + file=sys.stderr, + ) + + pass_str = "PASS" if summary["passed"] else "FAIL" + print( + f" {pass_str} (median IoU {median:.4f} vs threshold {args.threshold}, " + f"counts {len(segs_a)} vs {len(segs_b)})", + file=sys.stderr, + ) + return 0 if summary["passed"] else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/parity/python/silero_vad_runner.py b/tests/parity/python/silero_vad_runner.py new file mode 100644 index 0000000..2320819 --- /dev/null +++ b/tests/parity/python/silero_vad_runner.py @@ -0,0 +1,239 @@ +"""Run upstream Python `silero-vad` on a 16 kHz mono WAV; emit the raw +VAD segments as JSON in the same schema as the Rust +`silero-parity-runner`. + +Why this is structurally simple: +- We call `silero_vad.get_speech_timestamps(audio, model, ...)` directly. + That's the same entry point upstream documentation publishes; the + Rust crate's `SpeechSegmenter` is a port of the same logic. 
+- Defaults match between the two runners (validated 2026-05 against
+  silero-vad 6.2.1 source). See `tests/parity/README.md`.
+
+Usage:
+    uv run python silero_vad_runner.py <wav> --out <out.json>
+
+All knobs are exposed as CLI flags so `run.sh` can pass exactly the
+same parameter set to both runners.
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import math
+import subprocess
+import sys
+import time
+from importlib.metadata import version as pkg_version
+from pathlib import Path
+
+import numpy as np
+import torch
+from silero_vad import get_speech_timestamps, load_silero_vad
+
+
+def load_audio_16k_mono_f32(path: Path) -> np.ndarray:
+    """Decode `path` to 16 kHz mono `np.float32`.
+
+    Mirrors WhisperX's `load_audio` (whisperx/audio.py): shell out to
+    `ffmpeg -nostdin -threads 0 -i <path> -f s16le -ac 1 -acodec
+    pcm_s16le -ar 16000 -`, then `np.frombuffer(out, np.int16).astype(
+    np.float32) / 32768.0`. The Rust runner uses `ffmpeg-next` to do
+    exactly the same thing in-process. Doing the same conversion on
+    both sides keeps the f32 buffer the model sees byte-identical, so
+    `clip_sha256` matches across runners and any output divergence is
+    the model / segmenter rather than the loader.
+
+    `silero-vad`'s own `read_audio` uses `torchaudio.load` which goes
+    through ffmpeg/sox under the hood — close enough that segments
+    almost always agree, but the byte-identical path is what makes the
+    parity hash check meaningful.
+ """ + cmd = [ + "ffmpeg", + "-nostdin", + "-threads", + "0", + "-i", + str(path), + "-f", + "s16le", + "-ac", + "1", + "-acodec", + "pcm_s16le", + "-ar", + "16000", + "-", + ] + proc = subprocess.run(cmd, capture_output=True, check=True) + pcm = np.frombuffer(proc.stdout, dtype=np.int16) + return pcm.astype(np.float32) / 32768.0 + + +def sha256_f32_buffer(audio: np.ndarray) -> str: + h = hashlib.sha256() + h.update(audio.tobytes(order="C")) + return h.hexdigest() + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Run upstream Python silero-vad on a 16 kHz mono WAV; emit segments as JSON." + ) + parser.add_argument("wav_path", type=Path, help="16 kHz mono WAV (any container ffmpeg can decode).") + parser.add_argument("--out", type=Path, default=None, help="Output JSON path (default: stdout).") + # Defaults below match `silero_vad.get_speech_timestamps` exactly + # (validated against silero-vad 6.2.1 — see README). They also + # match the silero Rust crate's `SpeechOptions::default()`. Both + # runners therefore default to apples-to-apples comparison out of + # the box. + parser.add_argument("--threshold", type=float, default=0.5) + parser.add_argument("--min-speech-ms", type=int, default=250) + parser.add_argument("--min-silence-ms", type=int, default=100) + parser.add_argument("--speech-pad-ms", type=int, default=30) + parser.add_argument("--min-silence-at-max-speech-ms", type=int, default=98) + parser.add_argument( + "--max-speech-s", + type=float, + default=None, + help="Max speech duration in seconds before force-split. Default: no limit (math.inf).", + ) + parser.add_argument( + "--backend", + choices=("jit", "onnx"), + default="onnx", + help=( + "Silero model backend. Defaults to `onnx` so the Python side runs " + "the SAME ORT bytes the Rust crate runs (silero-vad's bundled " + "`data/silero_vad.onnx` is byte-identical to " + "`silero/models/silero_vad.onnx`). 
`jit` runs PyTorch JIT " + "instead — useful for measuring runtime drift, but those " + "numbers are NOT a fair Rust-vs-Python segmenter comparison " + "because PyTorch and ORT can disagree at the FP level." + ), + ) + args = parser.parse_args() + + wav_path = args.wav_path.resolve() + if not wav_path.is_file(): + print(f"WAV not found: {wav_path}", file=sys.stderr) + return 2 + + audio = load_audio_16k_mono_f32(wav_path) + sample_rate = 16_000 + duration_s = float(len(audio)) / sample_rate + clip_sha256 = sha256_f32_buffer(audio) + + print( + f"[silero-vad-py] wav={wav_path} dur={duration_s:.2f}s sha256={clip_sha256[:16]} " + f"threshold={args.threshold} max_speech_s={args.max_speech_s}", + file=sys.stderr, + ) + + t0 = time.monotonic() + # `load_silero_vad(onnx=...)` returns the VAD model from the + # bundled snapshot the silero-vad PyPI package ships (silero-vad 6.x + # bundles its own ONNX/JIT in the package itself rather than via + # torch.hub). We pass `onnx=True` by default so both runners feed + # identical bytes to ORT — same model, same backend — and any IoU + # disagreement is the segmenter logic, not the inference runtime. + use_onnx = args.backend == "onnx" + model = load_silero_vad(onnx=use_onnx) + backend_label = "silero_vad.onnx" if use_onnx else "silero_vad.jit" + t_load = time.monotonic() + + audio_t = torch.from_numpy(audio) + + kwargs = dict( + sampling_rate=sample_rate, + threshold=args.threshold, + min_speech_duration_ms=args.min_speech_ms, + min_silence_duration_ms=args.min_silence_ms, + speech_pad_ms=args.speech_pad_ms, + min_silence_at_max_speech=args.min_silence_at_max_speech_ms, + ) + if args.max_speech_s is not None: + kwargs["max_speech_duration_s"] = args.max_speech_s + else: + # Match the silero-vad default explicitly: `float('inf')`. Pass + # it through rather than relying on the library default so the + # JSON output records exactly what was used. 
+ kwargs["max_speech_duration_s"] = math.inf + + timestamps = get_speech_timestamps(audio_t, model, **kwargs) + t_vad = time.monotonic() + + print( + f"[silero-vad-py] load={t_load - t0:.2f}s vad={t_vad - t_load:.2f}s " + f"-> {len(timestamps)} segments", + file=sys.stderr, + ) + + # `get_speech_timestamps` returns dicts with int sample indices + # under `start`/`end` (since we don't pass `return_seconds=True`). + # Emit both sample- and second-coordinates so score.py can choose. + segments = [] + for ts in timestamps: + start_sample = int(ts["start"]) + end_sample = int(ts["end"]) + segments.append( + { + "start_s": start_sample / sample_rate, + "end_s": end_sample / sample_rate, + "start_sample": start_sample, + "end_sample": end_sample, + } + ) + + payload = { + "runner": "silero-vad-py", + "silero_vad_version": _resolve_version(), + "torch_version": torch.__version__, + "backend": backend_label, + "clip_path": str(wav_path), + "clip_sha256": clip_sha256, + "duration_s": duration_s, + "params": { + "threshold": args.threshold, + "min_speech_duration_ms": args.min_speech_ms, + "min_silence_duration_ms": args.min_silence_ms, + "speech_pad_ms": args.speech_pad_ms, + "min_silence_at_max_speech_ms": args.min_silence_at_max_speech_ms, + # Effective value (matches what was actually passed to + # `get_speech_timestamps`): the float when `--max-speech-s` + # was set, otherwise `math.inf`. `json.dumps` emits the + # latter as `Infinity`, which Python's `json.loads` round- + # trips correctly; use a JSON parser that accepts non-strict + # output if you read this field from a different stack. 
+ "max_speech_s": kwargs["max_speech_duration_s"], + "sampling_rate": sample_rate, + "window_size_samples": 512, + }, + "segment_count": len(segments), + "segments": segments, + } + + serialized = json.dumps(payload, indent=2) + if args.out is None: + print(serialized) + else: + args.out.write_text(serialized + "\n") + print( + f"[silero-vad-py] wrote {len(segments)} segments to {args.out}", + file=sys.stderr, + ) + + return 0 + + +def _resolve_version() -> str | None: + try: + return pkg_version("silero-vad") + except Exception: + return None + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/parity/run.sh b/tests/parity/run.sh new file mode 100755 index 0000000..0b2a884 --- /dev/null +++ b/tests/parity/run.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash +# silero-rs vs upstream Python silero-vad parity harness. +# +# Requires: +# - `cargo` + Rust toolchain (silero-parity-runner builds via path = "../..") +# - `uv` on PATH (https://docs.astral.sh/uv/) for the Python venv +# - `ffmpeg` on PATH (the Python runner shells out to it for audio loading) +# +# Usage: +# ./tests/parity/run.sh +# +# Accepts either a fixture directory (uses `clip_16k.wav` inside) or a +# direct WAV path. +# +# The canonical test set is dia's parity fixtures at +# /Users/user/Develop/findit-studio/dia/tests/parity/fixtures/, which +# we don't copy into this repo (they're large). See README for a +# pointer. + +set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT="$(cd "$SCRIPT_DIR/../.." 
&& pwd)"

usage() {
  echo "usage: $(basename "$0") <fixture-dir-or-wav>" >&2
  echo "" >&2
  echo "Examples:" >&2
  echo "  $(basename "$0") /path/to/dia/tests/parity/fixtures/01_dialogue" >&2
  echo "  $(basename "$0") /path/to/clip_16k.wav" >&2
  exit 64
}

if [ "$#" -ne 1 ]; then
  usage
fi

ARG="$1"
if [ -d "$ARG" ]; then
  CLIP="$ARG/clip_16k.wav"
elif [ -f "$ARG" ]; then
  CLIP="$ARG"
else
  echo "[run.sh] $ARG is neither a directory nor a WAV file" >&2
  exit 65
fi

if [ ! -f "$CLIP" ]; then
  echo "[run.sh] no clip at $CLIP" >&2
  exit 66
fi

ABS_CLIP="$(cd "$(dirname "$CLIP")" && pwd)/$(basename "$CLIP")"
FIXTURE_NAME="$(basename "$(dirname "$ABS_CLIP")")"
if [ "$FIXTURE_NAME" = "" ] || [ "$FIXTURE_NAME" = "/" ]; then
  FIXTURE_NAME="$(basename "$ABS_CLIP" .wav)"
fi

OUT_DIR="$SCRIPT_DIR/out"
mkdir -p "$OUT_DIR"
RUST_OUT="$OUT_DIR/silero_rs_${FIXTURE_NAME}.json"
PY_OUT="$OUT_DIR/silero_py_${FIXTURE_NAME}.json"
SCORE_OUT="$OUT_DIR/score_${FIXTURE_NAME}.json"

echo "[run.sh] clip: $ABS_CLIP"
echo "[run.sh] outputs: $RUST_OUT, $PY_OUT, $SCORE_OUT"

# 1) uv venv for the Python side. Cached when unchanged.
cd "$SCRIPT_DIR/python"
if [ ! -d .venv ]; then
  echo "[run.sh] creating uv venv at $(pwd)/.venv ..."
  uv venv
fi
echo "[run.sh] syncing python deps (cached when unchanged) ..."
uv pip install -e . > /dev/null

# 2) Rust runner. Builds in release mode with the bundled silero ONNX
# model. ort 2.0.0-rc.12's default features include `download-binaries`
# + `copy-dylibs`, so the prebuilt ONNX Runtime ships next to the
# binary — no need for `ORT_DYLIB_PATH` (unlike whispery's harness
# which uses load-dynamic).
#
# Both runners now use the upstream Python silero-vad defaults
# verbatim (threshold 0.5, min_speech_duration_ms 250,
# min_silence_duration_ms 100, speech_pad_ms 30,
# min_silence_at_max_speech_ms 98). The previous `--min-silence-ms 132`
# crate-side override compensated for an off-by-one in
# `SpeechSegmenter::push_probability`'s silence counter; that bug was
# fixed in silero v0.3.0, so the override is no longer required.
# `--min-silence-ms` remains a CLI flag on the runner for advanced
# users who want to override.
cd "$ROOT"
echo "[run.sh] running silero-parity-runner ..."
cargo run \
  --release \
  --quiet \
  --manifest-path tests/parity/Cargo.toml \
  -p silero-parity-runner \
  --bin silero-parity-runner \
  -- "$ABS_CLIP" \
  --out "$RUST_OUT"

# 3) Python runner. Defaults match upstream silero-vad PyPI defaults,
# and the crate (v0.3.0+) now matches them too.
cd "$SCRIPT_DIR/python"
echo "[run.sh] running silero_vad_runner.py ..."
uv run python silero_vad_runner.py "$ABS_CLIP" --out "$PY_OUT"

# 4) Score. Captures the score's exit code and propagates it.
cd "$SCRIPT_DIR/python"
echo "[run.sh] scoring ..."
set +e
uv run python score.py "$RUST_OUT" "$PY_OUT" --out "$SCORE_OUT"
SCORE_RC=$?
set -e

exit $SCORE_RC
diff --git a/tests/parity/src/main.rs b/tests/parity/src/main.rs
new file mode 100644
index 0000000..8283be5
--- /dev/null
+++ b/tests/parity/src/main.rs
@@ -0,0 +1,427 @@
+//! `silero-parity-runner` — load a 16 kHz mono WAV via `ffmpeg-next`,
+//! push it through `silero::detect_speech` (the production one-shot
+//! offline path), and dump the resulting speech segments as JSON.
+//! Pair with `python/silero_vad_runner.py` (same JSON schema,
+//! `runner = "silero-vad-py"`) and `python/score.py` for IoU
+//! comparison.
+//!
+//! This binary is **NOT** part of `cargo test`. It's invoked from the
+//! `run.sh` driver. Audio loading uses `ffmpeg-next` so the f32 buffer
+//! the silero ONNX model consumes is byte-identical to what the upstream
+//! Python `silero-vad` package consumes (which also goes through
+//! ffmpeg via `torchaudio` / `read_audio`).
+//!
+//! All `SpeechOptions` knobs are exposed via flags so the run.sh
+//! driver can pass parameters that match the Python runner exactly.
+//! Defaults match the silero crate's `SpeechOptions::default()`, which
+//! in turn match upstream silero-vad PyPI defaults (threshold 0.5,
+//! min_speech_duration_ms 250, min_silence_duration_ms 100,
+//! speech_pad_ms 30, min_silence_at_max_speech_ms 98).
+
+use std::{
+    fs,
+    io::Write,
+    path::{Path, PathBuf},
+    sync::OnceLock,
+    time::Duration,
+};
+
+use anyhow::{Context, Result, bail};
+use clap::Parser;
+use ffmpeg_next as ffmpeg;
+use serde_json::json;
+use sha2::{Digest, Sha256};
+use silero::{SampleRate, Session, SpeechOptions, detect_speech};
+
+// Take the version string from the silero crate itself (re-exported as
+// `silero::VERSION`) rather than `env!("CARGO_PKG_VERSION")`, which in
+// this binary resolves to the parity-runner's own version (0.0.0). The
+// JSON output should record the version of the crate under test.
+const SILERO_CRATE_VERSION: &str = silero::VERSION;
+// SHA-256 of the bundled ONNX model bytes. Computed on demand below.
+// Logged so a snapshot rebuild in the silero crate that swaps
+// `models/silero_vad.onnx` cannot silently invalidate the parity
+// numbers — the JSON output records exactly which model bytes ran.
+
+#[derive(Parser, Debug)]
+#[command(
+    about = "Run silero (Rust crate) VAD on a 16 kHz mono WAV; emit JSON for side-by-side comparison with upstream Python silero-vad."
+)]
+struct Args {
+    /// Path to a 16 kHz mono WAV (or any audio container ffmpeg can
+    /// decode; resampled to 16 kHz mono internally).
+    wav_path: PathBuf,
+
+    /// Output file (defaults to stdout).
+    #[arg(long)]
+    out: Option<PathBuf>,
+
+    /// Speech-onset probability threshold. Silero crate default: 0.5.
+    #[arg(long, default_value_t = 0.5)]
+    threshold: f32,
+
+    /// Minimum speech duration in milliseconds; shorter speech bursts are
+    /// dropped. Silero crate default: 250.
+    #[arg(long, default_value_t = 250)]
+    min_speech_ms: u64,
+
+    /// Minimum silence duration in milliseconds before a speech segment
+    /// is closed. Silero crate default: 100.
+    #[arg(long, default_value_t = 100)]
+    min_silence_ms: u64,
+
+    /// Speech padding (added at both ends of every emitted segment) in
+    /// milliseconds. Silero crate default: 30.
+    #[arg(long, default_value_t = 30)]
+    speech_pad_ms: u64,
+
+    /// Minimum silence used as a preferred split point when
+    /// `--max-speech-s` is hit, in milliseconds. Silero crate default: 98
+    /// (which matches upstream Python silero-vad's 0.098 s default).
+    #[arg(long, default_value_t = 98)]
+    min_silence_at_max_speech_ms: u64,
+
+    /// Maximum speech duration in seconds before the segmenter
+    /// force-splits a long segment. Defaults to "no limit" (matches both
+    /// the Rust crate and Python silero-vad defaults). Pass e.g. `30` to
+    /// match WhisperX-style chunking.
+    #[arg(long)]
+    max_speech_s: Option<f64>,
+}
+
+/// Idempotent guard for `ffmpeg::init()`. Persists the init outcome in
+/// a `OnceLock` so a failed first init keeps surfacing on subsequent
+/// calls (the previous `Once`-based version stored the error on the
+/// stack and silently returned `Ok(())` on later calls).
+fn ffmpeg_init() -> Result<()> {
+    // `ffmpeg::Error` is not `Clone`, so store the error as `String` —
+    // we only need the message on subsequent calls.
+    static INIT: OnceLock<Result<(), String>> = OnceLock::new();
+    match INIT.get_or_init(|| ffmpeg::init().map_err(|e| e.to_string())) {
+        Ok(()) => Ok(()),
+        Err(msg) => Err(anyhow::anyhow!("ffmpeg::init failed: {msg}")),
+    }
+}
+
+/// Load an audio file as 16 kHz mono f32 via ffmpeg-next.
+///
+/// This mirrors the loader in `whispery`'s parity runner. Decoding
+/// path: container open → audio decoder → resample to 16 kHz mono
+/// `s16` (signed 16-bit packed, little-endian) → cast each sample to
+/// `f32` and divide by exactly `32768.0`.
+///
+/// Why s16-then-divide rather than f32-direct: upstream Python
+/// silero-vad loads audio via `torchaudio.load` (or `whisperx.audio`'s
+/// ffmpeg shell-out) which lands on `np.float32 / 32768.0`. Doing the
+/// same conversion on the Rust side keeps the f32 buffer the model
+/// sees byte-identical, so a hash comparison on the JSON output's
+/// `clip_sha256` field can verify both runners decoded the audio the
+/// same way before flagging any output divergence as a model issue.
+///
+/// Returns `(samples, duration_s, sha256)`.
+fn read_audio_16k_mono_f32(path: &Path) -> Result<(Vec<f32>, f64, String)> {
+    use ffmpeg::format::sample::{Sample, Type as SampleType};
+    use ffmpeg::software::resampling::Context as Resampler;
+    use ffmpeg::{ChannelLayout, codec::context::Context as CodecContext, frame, media};
+
+    ffmpeg_init()?;
+
+    let mut ictx = ffmpeg::format::input(path)
+        .with_context(|| format!("open audio container at {}", path.display()))?;
+    let stream = ictx
+        .streams()
+        .best(media::Type::Audio)
+        .ok_or_else(|| anyhow::anyhow!("{}: no audio stream", path.display()))?;
+    let stream_index = stream.index();
+
+    let codec_ctx = CodecContext::from_parameters(stream.parameters())
+        .with_context(|| format!("decoder context for {}", path.display()))?;
+    let mut decoder = codec_ctx
+        .decoder()
+        .audio()
+        .with_context(|| format!("audio decoder for {}", path.display()))?;
+    decoder
+        .set_parameters(stream.parameters())
+        .with_context(|| format!("decoder set_parameters for {}", path.display()))?;
+
+    const TARGET_RATE: u32 = 16_000;
+    let target_format = Sample::I16(SampleType::Packed);
+    let target_layout = ChannelLayout::MONO;
+
+    // PCM/WAV decoders commonly emit frames with `ch_layout.order =
+    // UNSPEC` (only the channel count is set); libswresample's
+    // `swr_alloc_set_opts2` rejects that in FFmpeg 7+. Fall back to
+    // `ChannelLayout::default(channels)` if the source layout is empty.
+    let resolve_src_layout =
+        |layout: ChannelLayout, channels: i32| -> ChannelLayout {
+            if layout.is_empty() {
+                ChannelLayout::default(channels)
+            } else {
+                layout
+            }
+        };
+
+    let mut src_format = decoder.format();
+    let mut src_rate = decoder.rate();
+    let mut src_layout = resolve_src_layout(decoder.channel_layout(), decoder.channels() as i32);
+
+    let build_resampler = |src_format,
+                           src_layout,
+                           src_rate|
+     -> Result<Resampler> {
+        Resampler::get(
+            src_format,
+            src_layout,
+            src_rate,
+            target_format,
+            target_layout,
+            TARGET_RATE,
+        )
+        .with_context(|| format!("init libswresample for {}", path.display()))
+    };
+
+    let mut resampler = build_resampler(src_format, src_layout, src_rate)?;
+
+    let mut samples_f32: Vec<f32> = Vec::new();
+    let mut decoded = frame::Audio::empty();
+
+    // Push i16 samples from a packed-mono frame into `samples_f32`,
+    // dividing by the literal `32768.0` exactly as
+    // WhisperX/torchaudio does.
+    let push_resampled = |frame: &frame::Audio, dst: &mut Vec<f32>| {
+        let n = frame.samples();
+        if n == 0 {
+            return;
+        }
+        let plane: &[i16] = frame.plane::<i16>(0);
+        debug_assert!(plane.len() >= n);
+        dst.reserve(n);
+        for &s in &plane[..n] {
+            dst.push(s as f32 / 32768.0_f32);
+        }
+    };
+
+    // Run a decoded frame through the resampler. Handles
+    // `InputChanged` / `OutputChanged` by rebuilding the resampler
+    // against the new source params.
+    let run_resample = |decoded: &frame::Audio,
+                        resampler: &mut Resampler,
+                        samples_f32: &mut Vec<f32>,
+                        src_format: &mut Sample,
+                        src_layout: &mut ChannelLayout,
+                        src_rate: &mut u32|
+     -> Result<()> {
+        let mut resampled = frame::Audio::empty();
+        match resampler.run(decoded, &mut resampled) {
+            Ok(_) => {
+                push_resampled(&resampled, samples_f32);
+            }
+            Err(ffmpeg::Error::InputChanged | ffmpeg::Error::OutputChanged) => {
+                *src_format = decoded.format();
+                *src_layout = resolve_src_layout(
+                    decoded.channel_layout(),
+                    decoded.channels() as i32,
+                );
+                *src_rate = decoded.rate();
+                *resampler = build_resampler(*src_format, *src_layout, *src_rate)?;
+                let mut retried = frame::Audio::empty();
+                resampler
+                    .run(decoded, &mut retried)
+                    .context("libswresample::run after rebuild")?;
+                push_resampled(&retried, samples_f32);
+            }
+            Err(e) => return Err(anyhow::anyhow!("libswresample::run: {e}")),
+        }
+        Ok(())
+    };
+
+    let fixup_frame_layout = |frame: &mut frame::Audio, src_layout: ChannelLayout| {
+        if frame.channel_layout().is_empty() {
+            frame.set_channel_layout(src_layout);
+        }
+    };
+
+    for (s, packet) in ictx.packets() {
+        if s.index() != stream_index {
+            continue;
+        }
+        decoder.send_packet(&packet).context("decoder.send_packet")?;
+        while decoder.receive_frame(&mut decoded).is_ok() {
+            fixup_frame_layout(&mut decoded, src_layout);
+            run_resample(
+                &decoded,
+                &mut resampler,
+                &mut samples_f32,
+                &mut src_format,
+                &mut src_layout,
+                &mut src_rate,
+            )?;
+        }
+    }
+    decoder.send_eof().context("decoder.send_eof")?;
+    while decoder.receive_frame(&mut decoded).is_ok() {
+        fixup_frame_layout(&mut decoded, src_layout);
+        run_resample(
+            &decoded,
+            &mut resampler,
+            &mut samples_f32,
+            &mut src_format,
+            &mut src_layout,
+            &mut src_rate,
+        )?;
+    }
+
+    // Final libswresample flush. `OutputChanged` here means "no buffered
+    // samples" in the rate-1:1 case (which is what the dia 16 kHz mono
+    // PCM fixtures hit). Treat it as a no-op rather than a hard error.
+    loop {
+        let mut tail = frame::Audio::empty();
+        match resampler.flush(&mut tail) {
+            Ok(_) => {
+                if tail.samples() == 0 {
+                    break;
+                }
+                push_resampled(&tail, &mut samples_f32);
+            }
+            Err(ffmpeg::Error::OutputChanged) => break,
+            Err(e) => {
+                return Err(anyhow::anyhow!("libswresample::flush at EOF: {e}"));
+            }
+        }
+    }
+
+    if samples_f32.is_empty() {
+        bail!(
+            "{}: ffmpeg-next decoded zero samples; corrupt or empty audio?",
+            path.display()
+        );
+    }
+
+    let duration_s = samples_f32.len() as f64 / TARGET_RATE as f64;
+
+    // Hash the f32 bytes (LE) the model will see — same trick the
+    // whispery harness uses. Comparing this against the Python runner's
+    // own clip_sha256 is what catches loader-quantisation divergences.
+    let mut hasher = Sha256::new();
+    // Safety: `f32` is `Copy + 'static`, layout is well-defined as 4
+    // little-endian bytes per sample on every target this harness ships
+    // to (macOS / Linux x86_64+aarch64).
+    let bytes = unsafe {
+        std::slice::from_raw_parts(
+            samples_f32.as_ptr() as *const u8,
+            samples_f32.len() * std::mem::size_of::<f32>(),
+        )
+    };
+    hasher.update(bytes);
+    let sha = format!("{:x}", hasher.finalize());
+
+    Ok((samples_f32, duration_s, sha))
+}
+
+fn model_sha256() -> String {
+    let mut hasher = Sha256::new();
+    hasher.update(silero::BUNDLED_MODEL);
+    format!("{:x}", hasher.finalize())
+}
+
+fn main() -> Result<()> {
+    let args = Args::parse();
+
+    let (samples, duration_s, clip_sha256) = read_audio_16k_mono_f32(&args.wav_path)?;
+    eprintln!(
+        "[silero-parity] wav={} dur={:.2}s samples={} sha256={}",
+        args.wav_path.display(),
+        duration_s,
+        samples.len(),
+        &clip_sha256[..16]
+    );
+
+    // Build SpeechOptions from CLI flags. Every default matches the
+    // silero crate's `SpeechOptions::default()`, which in turn matches
+    // upstream Python silero-vad defaults.
+    let mut opts = SpeechOptions::new()
+        .with_sample_rate(SampleRate::Rate16k)
+        .with_start_threshold(args.threshold)
+        .with_min_speech_duration(Duration::from_millis(args.min_speech_ms))
+        .with_min_silence_duration(Duration::from_millis(args.min_silence_ms))
+        .with_speech_pad(Duration::from_millis(args.speech_pad_ms))
+        .with_min_silence_at_max_speech(Duration::from_millis(
+            args.min_silence_at_max_speech_ms,
+        ));
+    if let Some(s) = args.max_speech_s {
+        let ms = (s * 1000.0).round() as u64;
+        opts = opts.with_max_speech_duration(Duration::from_millis(ms));
+    }
+
+    eprintln!(
+        "[silero-parity] threshold={} min_speech_ms={} min_silence_ms={} \
+         pad_ms={} min_silence_at_max_speech_ms={} max_speech_s={:?}",
+        args.threshold,
+        args.min_speech_ms,
+        args.min_silence_ms,
+        args.speech_pad_ms,
+        args.min_silence_at_max_speech_ms,
+        args.max_speech_s,
+    );
+
+    let mut session = Session::bundled().context("load bundled silero ONNX session")?;
+    let segments = detect_speech(&mut session, &samples, opts).context("detect_speech")?;
+
+    eprintln!(
+        "[silero-parity] {} segments detected",
+        segments.len()
+    );
+
+    let segments_json: Vec<serde_json::Value> = segments
+        .iter()
+        .map(|s| {
+            json!({
+                "start_s": s.start_seconds(),
+                "end_s": s.end_seconds(),
+                "start_sample": s.start_sample(),
+                "end_sample": s.end_sample(),
+            })
+        })
+        .collect();
+
+    let payload = json!({
+        "runner": "silero-rs",
+        "silero_crate_version": SILERO_CRATE_VERSION,
+        "model_sha256": model_sha256(),
+        "clip_path": args.wav_path.display().to_string(),
+        "clip_sha256": clip_sha256,
+        "duration_s": duration_s,
+        "params": {
+            "threshold": args.threshold,
+            "min_speech_duration_ms": args.min_speech_ms,
+            "min_silence_duration_ms": args.min_silence_ms,
+            "speech_pad_ms": args.speech_pad_ms,
+            "min_silence_at_max_speech_ms": args.min_silence_at_max_speech_ms,
+            "max_speech_s": args.max_speech_s,
+            "sampling_rate": 16_000,
+            "window_size_samples": 512,
+        },
+        "segment_count": segments.len(),
+        "segments": segments_json,
+    });
+
+    let serialized = serde_json::to_string_pretty(&payload)?;
+    match args.out {
+        Some(path) => {
+            let mut f = fs::File::create(&path)
+                .with_context(|| format!("create output {}", path.display()))?;
+            f.write_all(serialized.as_bytes())?;
+            f.write_all(b"\n")?;
+            eprintln!(
+                "[silero-parity] wrote {} segments to {}",
+                segments.len(),
+                path.display()
+            );
+        }
+        None => {
+            println!("{serialized}");
+        }
+    }
+
+    Ok(())
+}