diff --git a/.gitignore b/.gitignore index f47e72d..c323e0f 100644 --- a/.gitignore +++ b/.gitignore @@ -17,4 +17,11 @@ /target Cargo.lock -**.claude/ \ No newline at end of file +**.claude/ + +# Parity harness (manual; outputs are per-machine fixtures). +# `target/` and `Cargo.lock` are already covered by the rules above. +tests/parity/out/ +tests/parity/python/.venv/ +tests/parity/python/uv.lock +tests/parity/python/*.egg-info/ diff --git a/CHANGELOG.md b/CHANGELOG.md index 1412816..ff21711 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -7,6 +7,45 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [Unreleased] +## [0.3.0] - 2026-05-02 + +### Changed + +- **Behaviour change** — `SpeechSegmenter::push_probability` now closes + speech segments when the silence counter matches the upstream Python + `silero-vad` package's semantics. Previously the crate's silence + counter was evaluated AFTER the current frame's contribution had been + added to `current_sample`, while upstream Python evaluates the + equivalent `cur_sample - temp_end` BEFORE the current frame is + consumed. The crate's counter therefore fired one model frame + (32 ms at 16 kHz / 512-sample windows) too early — at the default + `min_silence_duration_ms = 100`, the crate closed a segment after 4 + consecutive low-probability frames where Python tolerates the dip and + closes after 5. The same off-by-one applied to the + `min_silence_at_max_speech_samples` comparator on the same code path. + Discovered by the parity harness in `tests/parity/`. + +### Migration + +Callers who hand-tuned `min_silence_duration_ms` against the v0.2.x +response curve may want to subtract ~32 ms from their value to keep the +same effective behaviour against v0.3.0+. Default callers do not need +to change anything — defaults still match upstream silero-vad PyPI +defaults verbatim, and the response curve is now strictly closer to +upstream than it was in v0.2.x. 
+ +### Verified + +- `cargo test` +- `cargo test --no-default-features` +- `cargo build --release` +- `tests/parity/run.sh` on the five short dia parity fixtures + (`01_dialogue`, `02_pyannote_sample`, `03_dual_speaker`, + `04_three_speaker`, `05_four_speaker`): median IoU 1.0000 and + segment counts match exactly against upstream Python silero-vad + (51/51, 4/4, 14/14, 6/6, 14/14) WITHOUT the previous + `--min-silence-ms 132` override. + ## [0.2.0] - 2026-04-21 ### Added diff --git a/Cargo.toml b/Cargo.toml index 7b57d8f..bfbfe5e 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "silero" -version = "0.2.1" +version = "0.3.0" edition = "2024" rust-version = "1.85" repository = "https://github.com/Findit-AI/silero" diff --git a/src/detector.rs b/src/detector.rs index fa4d7bf..4d0243c 100644 --- a/src/detector.rs +++ b/src/detector.rs @@ -176,8 +176,18 @@ impl SpeechSegmenter { return None; } + // Silence-counter is evaluated against `frame_start` (the start sample + // of the current frame), not `current_sample` (which is already the + // *end* of the current frame). This matches upstream Python + // `silero-vad`'s `sil_dur_now = cur_sample - temp_end` semantics, + // where `cur_sample` is read BEFORE the model consumes the current + // window. Without this, the comparator fires one frame early — a + // 4-frame (128 ms) silence dip would close a segment at default + // `min_silence_duration_ms = 100`, where Python tolerates it and + // closes after 5 consecutive low-probability frames. See the parity + // harness in `tests/parity/` and the v0.3.0 CHANGELOG entry. 
let silence_start = *self.tentative_end.get_or_insert(frame_start); - let silence_samples = self.current_sample.saturating_sub(silence_start); + let silence_samples = frame_start.saturating_sub(silence_start); if silence_samples > self.options.min_silence_at_max_speech_samples() { self.max_split_end = Some(silence_start); } @@ -390,6 +400,18 @@ mod tests { #[test] fn middle_band_frames_do_not_reset_tentative_end() { + // Verifies that mid-band probabilities (between the end_threshold and + // start_threshold, e.g. `0.4` against the default `0.5` start) do NOT + // reset the silence accumulator — they're treated as "not yet + // confirmed speech". + // + // Updated 0.3.0: post the silence-counter off-by-one fix, the segment + // closes after FIVE consecutive low-or-mid-band frames at the default + // `min_silence_duration_ms = 100` (1600 samples / 512 per frame = + // 3.125 → 4 prior frames + the close-firing 5th frame), matching + // upstream Python silero-vad. The pre-0.3.0 crate closed after FOUR + // frames (one frame too eager). See `tests/parity/README.md` and the + // 0.3.0 CHANGELOG entry for the full derivation. let config = SpeechOptions::default() .with_min_speech_duration(Duration::ZERO) .with_speech_pad(Duration::ZERO) @@ -397,24 +419,44 @@ mod tests { let mut segmenter = SpeechSegmenter::new(config); let mut probabilities = vec![0.9; 4]; - probabilities.extend([0.0, 0.4, 0.0, 0.0]); + // Five low/mid frames so the segment closes via push_probability. + // The mid-band 0.4 frame in the middle must NOT reset the silence + // accumulator — that's the actual property under test. 
+ probabilities.extend([0.0, 0.4, 0.0, 0.0, 0.0]); probabilities.extend(vec![0.9; 4]); let segments = collect(&mut segmenter, &probabilities); assert_eq!(segments.len(), 2); assert_eq!(segments[0].start_sample(), 0); assert_eq!(segments[0].end_sample(), 2_048); - assert_eq!(segments[1].start_sample(), 4_096); + // Segment two starts on the first speech frame after the closed + // silence (4 high + 5 silence = frame index 9, sample 4_608). + assert_eq!(segments[1].start_sample(), 4_608); } #[test] fn min_speech_duration_is_checked_before_padding() { + // A speech burst of 6 frames * 32 ms = 192 ms is shorter than the + // default `min_speech_duration_ms = 250`, so the segment that the + // trailing silence closes must be dropped — `min_speech` is checked + // against the raw speech window (raw_end - raw_start), not against + // the padded boundaries. + // + // Updated 0.3.0: post the silence-counter off-by-one fix, push-based + // close requires FIVE consecutive low-probability frames at the + // default `min_silence_duration_ms = 100` (was 4 pre-0.3.0). Trailing + // silence is extended from 4 to 5 frames so the close still fires + // via `push_probability` — otherwise `finish()` would emit the + // burst-plus-trailing-silence as a single trailing segment that + // satisfies the 250 ms duration check, which is a different (and + // correct, but separate) behaviour. See `tests/parity/README.md` + // and the 0.3.0 CHANGELOG entry. 
let config = SpeechOptions::default(); let mut segmenter = SpeechSegmenter::new(config); let mut probabilities = vec![0.0; 4]; probabilities.extend(vec![0.9; 6]); - probabilities.extend(vec![0.0; 4]); + probabilities.extend(vec![0.0; 5]); let segments = collect(&mut segmenter, &probabilities); assert!(segments.is_empty()); @@ -517,12 +559,25 @@ mod tests { #[test] fn force_split_during_silence_closes_without_restarting() { + // Updated 0.3.0: max_speech_duration bumped from 224 ms to 256 ms so + // the max-speech split fires one frame later, after `max_split_end` + // has been recorded by the silence-counter logic. With the + // off-by-one fix to that logic, `max_split_end` is now set on the + // 4th low-probability frame instead of the 3rd, so the test's + // pre-existing 224 ms ceiling would split at sample 3_584 with + // `max_split_end == None` (falling back to `frame_start` and + // closing at sample 3_584 instead of at the recorded silence + // boundary 2_048). Bumping the ceiling preserves the property under + // test — that a force-split during silence closes at the silence + // boundary, not at the current frame, and does NOT restart a new + // segment afterwards. See `tests/parity/README.md` and the 0.3.0 + // CHANGELOG entry. let config = SpeechOptions::default() .with_min_speech_duration(Duration::ZERO) .with_speech_pad(Duration::ZERO) .with_min_silence_duration(Duration::from_millis(10_000)) .with_min_silence_at_max_speech(Duration::from_millis(64)) - .with_max_speech_duration(Duration::from_millis(224)); + .with_max_speech_duration(Duration::from_millis(256)); let mut segmenter = SpeechSegmenter::new(config); let mut probabilities = vec![0.9; 4]; @@ -534,6 +589,79 @@ mod tests { assert_eq!(segments[0].end_sample(), 2_048); } + #[test] + fn four_frame_silence_dip_does_not_close_segment_at_default_min_silence() { + // Pinned in 0.3.0 as a regression guard for the silence-counter + // off-by-one fix. 
+ // + // At the default `min_silence_duration_ms = 100` (1600 samples at + // 16 kHz) and the default 32 ms / 512-sample frame, upstream Python + // `silero-vad` (`get_speech_timestamps`) closes a segment after + // FIVE consecutive low-probability frames — `sil_dur_now = + // cur_sample - temp_end` is evaluated BEFORE the current frame is + // consumed, so the comparator sees `(k-1) * 512` on the k-th + // low-prob frame and only crosses the 1600-sample threshold at + // k = 5. + // + // Pre-0.3.0 the silero crate evaluated the same counter AFTER the + // current frame was added to `current_sample`, so it saw `k * 512` + // and closed at k = 4. A 4-frame (128 ms) silence dip would + // therefore split a segment in the crate but be tolerated by Python. + // + // This test pins the post-fix behaviour: a 4-frame silence dip must + // be tolerated. The 30-frame speech runs ensure both halves + // individually clear `min_speech_duration_ms = 250` (8 frames), + // so neither would be dropped by the min-speech filter if the + // segment did split. + // + // See `tests/parity/README.md` "Off-by-one silence threshold finding" + // and the 0.3.0 CHANGELOG entry for the motivation. + let config = SpeechOptions::default(); + let mut segmenter = SpeechSegmenter::new(config.clone()); + + let mut probabilities = vec![1.0; 30]; + probabilities.extend(vec![0.0; 4]); + probabilities.extend(vec![1.0; 30]); + + let segments = collect(&mut segmenter, &probabilities); + assert_eq!( + segments.len(), + 1, + "4-frame silence dip must be tolerated at default min_silence_duration_ms = 100; \ + got {} segments", + segments.len() + ); + // Sanity: the (one) segment must start at 0 (the start-pad + // saturates against the timeline's zero) and span the full + // 30 + 4 + 30 = 64 frame window — at 512 samples / frame, that + // ends at 32_768. 
+ assert_eq!(segments[0].start_sample(), 0); + assert_eq!(segments[0].end_sample(), 32_768); + } + + #[test] + fn five_frame_silence_dip_closes_segment_at_default_min_silence() { + // Companion to `four_frame_silence_dip_does_not_close_segment_*`. + // Pinned in 0.3.0: at the same defaults, FIVE consecutive low-prob + // frames must close the segment — matching upstream Python + // silero-vad's `sil_dur_now >= 1600` firing on the 5th frame. + let config = SpeechOptions::default(); + let mut segmenter = SpeechSegmenter::new(config); + + let mut probabilities = vec![1.0; 30]; + probabilities.extend(vec![0.0; 5]); + probabilities.extend(vec![1.0; 30]); + + let segments = collect(&mut segmenter, &probabilities); + assert_eq!( + segments.len(), + 2, + "5-frame silence dip must close the segment at default \ + min_silence_duration_ms = 100; got {} segments", + segments.len() + ); + } + #[test] fn force_split_applies_speech_pad_to_split_boundaries() { let config = SpeechOptions::default() diff --git a/src/lib.rs b/src/lib.rs index 788d274..dbec68a 100644 --- a/src/lib.rs +++ b/src/lib.rs @@ -13,6 +13,13 @@ mod stream; pub use detector::{SpeechDetector, SpeechSegment, SpeechSegmenter, detect_speech}; pub use error::{Error, Result}; pub use options::{GraphOptimizationLevel, SampleRate, SessionOptions, SpeechOptions}; + +/// Version string of the `silero` crate (`CARGO_PKG_VERSION`). +/// +/// Exposed so out-of-tree harnesses (e.g. the parity runner) can record +/// the exact silero version under test rather than the harness binary's +/// own version. 
+pub const VERSION: &str = env!("CARGO_PKG_VERSION"); #[cfg(feature = "bundled")] #[cfg_attr(docsrs, doc(cfg(feature = "bundled")))] pub use session::BUNDLED_MODEL; diff --git a/tests/parity/Cargo.toml b/tests/parity/Cargo.toml new file mode 100644 index 0000000..d234ec6 --- /dev/null +++ b/tests/parity/Cargo.toml @@ -0,0 +1,35 @@ +[workspace] + +[package] +name = "silero-parity-runner" +version = "0.0.0" +edition = "2024" +publish = false +description = "Manual parity harness: dump silero (Rust crate) VAD output to JSON for side-by-side comparison with upstream Python silero-vad." + +[dependencies] +# `silero` crate under test. `bundled` ships the ONNX model bytes +# in the binary so the harness has no external model dependency. +silero = { path = "../..", features = ["bundled"] } +# `ffmpeg-next` mirrors the production audio-loading path that +# downstream callers (whispery, dia) use, and matches how upstream +# Python `silero-vad` loads audio (it uses `torchaudio` which calls +# ffmpeg under the hood, then casts to f32 / 32768.0). Loading via +# the same backend on both sides keeps the f32 buffer the model +# sees byte-identical, so any output divergence is the model / +# segmenter — not the audio decode path. +# +# Pinned to `8` because the local toolchain ships system FFmpeg 8.x +# (Homebrew `ffmpeg 8.1`) which dropped `libavcodec/avfft.h`; the +# `ffmpeg-next 7.x` series still references that header in its +# bindgen pass and therefore fails to build against system FFmpeg 8. +# Same pin whispery's parity harness uses. 
+ffmpeg-next = "8" +serde_json = "1" +clap = { version = "4", features = ["derive"] } +sha2 = "0.10" +anyhow = "1" + +[[bin]] +name = "silero-parity-runner" +path = "src/main.rs" diff --git a/tests/parity/README.md b/tests/parity/README.md new file mode 100644 index 0000000..f674679 --- /dev/null +++ b/tests/parity/README.md @@ -0,0 +1,191 @@ +# silero parity test harness + +A side-by-side runner that compares this crate's VAD output against +upstream Python `silero-vad` on the same audio, reporting per-segment +IoU. Models the same approach `dia/tests/parity/` uses for pyannote +parity. + +The bundled ONNX model in `models/silero_vad.onnx` is the same network +upstream silero-vad ships, so this is genuinely a runtime comparison +(ORT inference + Rust segmenter vs PyTorch / ORT inference + Python +segmenter on identical bytes) — not a model-architecture comparison. + +## Layout + +- `Cargo.toml` / `src/main.rs` — Rust binary `silero-parity-runner` + that loads a 16 kHz mono WAV via `ffmpeg-next`, runs + `silero::detect_speech`, and emits JSON. +- `python/pyproject.toml` / `python/silero_vad_runner.py` — same CLI + shape, same JSON schema, runs upstream `silero_vad.get_speech_timestamps`. +- `python/score.py` — sequence-position pairing, per-segment IoU, + median + p10/p90 + worst-N report. +- `run.sh` — end-to-end driver (bring up venv → run both → score). + +## Prerequisites + +- `cargo` + Rust toolchain (the runner builds via `path = "../.."`). +- `uv` for Python virtualenv management (`brew install uv` or + `pip install uv`). +- `ffmpeg` on PATH — the Python runner shells out to it for audio + loading; the Rust runner uses `ffmpeg-next` (in-process bindings). + On macOS with Homebrew FFmpeg 8.x, `ffmpeg-next` is pinned to `8` + in `Cargo.toml` because the `7.x` series still references the + removed `libavcodec/avfft.h` header. +- A 16 kHz mono WAV (or any container ffmpeg can decode; will be + resampled). 
+ +ORT runtime: this crate (and therefore the runner) uses `ort` with its +default `download-binaries` + `copy-dylibs` features, so a prebuilt +ONNX Runtime ships next to the binary — `ORT_DYLIB_PATH` is **not** +required (unlike the whispery harness, which uses `load-dynamic`). + +## Run + +```bash +cd silero +./tests/parity/run.sh /path/to/clip_16k.wav +./tests/parity/run.sh /path/to/fixture-dir # uses clip_16k.wav inside +``` + +Outputs land in `tests/parity/out/`: +- `silero_rs_.json` — Rust runner output. +- `silero_py_.json` — Python runner output. +- `score_.json` — IoU summary. + +Exit code 0 iff median IoU >= 0.95 **and** segment counts match. + +## Canonical fixture set + +The dia parity fixtures double as the silero parity fixtures: they're +real-speech 16 kHz mono WAVs of varying length and speaker counts. + +``` +/Users/user/Develop/findit-studio/dia/tests/parity/fixtures/ +├── 01_dialogue/clip_16k.wav # ~120 s, 2 spk dialogue +├── 02_pyannote_sample/clip_16k.wav # ~30 s, pyannote sample +├── 03_dual_speaker/clip_16k.wav # ~60 s, 2 spk +├── 04_three_speaker/clip_16k.wav # 3 spk +├── 05_four_speaker/clip_16k.wav # 4 spk +└── 06_long_recording/clip_16k.wav # ~977 s, long-form +``` + +These are deliberately **not copied** into the silero repo (they're +large; dia is the source of truth for them). Pass the directory or +WAV path on the `run.sh` command line. + +For first validation we recommend the five short fixtures (skip +`06_long_recording` — at ~16 minutes it's slow to run and the short +fixtures cover all interesting boundary conditions). + +## Default parameter alignment + +Both runners default to the same parameter set, validated 2026-05 +against `silero-vad 6.2.1` source +(`src/silero_vad/utils_vad.py:get_speech_timestamps`): + +| Parameter | silero crate default | silero-vad-py default | Aligned? 
| +|------------------------------|----------------------|-----------------------|----------| +| `threshold` | 0.5 | 0.5 | yes | +| `min_speech_duration_ms` | 250 | 250 | yes | +| `min_silence_duration_ms` | 100 | 100 | yes | +| `speech_pad_ms` | 30 | 30 | yes | +| `min_silence_at_max_speech_ms`| 98 | 98 | yes | +| `max_speech_duration_s` | None (no limit) | `float('inf')` | yes | +| `sampling_rate` | 16 000 Hz | 16 000 Hz | yes | +| `window_size_samples` | 512 (chunk_samples) | 512 | yes | +| `neg_threshold` (end_thresh) | start - 0.15 (clamped to >=0.01) | start - 0.15 | yes | + +(See `silero/src/options.rs:default_*` constants and the upstream +`get_speech_timestamps` function signature.) + +### Off-by-one silence threshold finding (fixed in v0.3.0) + +> **Status: fixed in silero v0.3.0.** The harness no longer applies +> the `--min-silence-ms 132` workaround described below. Both runners +> now use upstream Python `silero-vad`'s defaults verbatim. + +**Historical context (preserved here as a record of how the bug was +characterised before the fix):** + +Up to and including silero v0.2.x the crate's +`SpeechSegmenter::push_probability` and Python's +`get_speech_timestamps` differed by exactly **one model frame +(32 ms at 16 kHz / 512-sample windows)** in how they computed the +"silence so far" counter: + +- **Python** (`silero_vad/utils_vad.py`): + - `temp_end` is set to `cur_sample` on the FIRST low-probability + frame. + - `sil_dur_now = cur_sample - temp_end` is computed BEFORE the + current frame is "consumed" (it's the frame's *start* sample). + - On the first low-prob frame, `sil_dur_now = 0`. On the k-th + consecutive low-prob frame, `sil_dur_now = (k-1) * 512`. + - Closes when `sil_dur_now >= 1600` → k = 5 frames. 
+ +- **silero crate (pre-v0.3.0)** (`silero/src/detector.rs:147-190`): + - `tentative_end` is set to `frame_start` on the first low-prob + frame; immediately after, `current_sample` is incremented by + `frame_samples` (so it represents the END of the current frame). + - `silence_samples = current_sample - silence_start = j * 512` after + the j-th consecutive low-prob frame (j ≥ 1). + - Closes when `silence_samples >= 1600` → j = 4 frames. + +So a 4-frame (128 ms) silence dip closed the pre-v0.3.0 crate's +segment but was *tolerated* by Python — Python kept it as one segment +until 5 consecutive low-prob frames had passed. On a clip with many +short silence dips (e.g. dialogue with quick turn-taking), the crate +produced measurably more segments than Python at the same nominal +`min_silence_duration_ms`. + +**Pre-v0.3.0 workaround (now removed)**: `run.sh` used to override +the crate side with `--min-silence-ms 132` (= 100 + 32), shifting the +close threshold by one frame so the two segmenters consumed the same +number of low-prob frames before closing. + +**Fix in v0.3.0**: `SpeechSegmenter::push_probability` now evaluates +the silence counter against `frame_start` (the start sample of the +current frame) instead of `current_sample` (the end). This mirrors +Python's "compute `cur_sample - temp_end` before consuming the +current frame" semantics literally. The same correction applies to +the `min_silence_at_max_speech_samples` comparator that lives on the +same code path. Both close-after-5-frames and the (4-frame, no-close) +boundary are now pinned by unit tests in +`silero/src/detector.rs::tests` — +`five_frame_silence_dip_closes_segment_at_default_min_silence` and +`four_frame_silence_dip_does_not_close_segment_at_default_min_silence`. + +**Migration note for callers**: this is a behaviour change. 
Anyone +who hand-tuned `min_silence_duration_ms` against the v0.2.x response +curve may want to subtract ~32 ms from their override to get the +same effective behaviour against v0.3.0+. + +The other parameters (start/end threshold, min-speech, speech-pad, +min-silence-at-max-speech) all lined up at defaults pre-fix too — +only the silence-counter equation diverged. + +## How parity is scored + +`score.py` pairs segments by **sequence position** (i-th from a vs +i-th from b) and computes time-range IoU per pair. This is the right +matcher when both runners are expected to produce the same boundaries +on the same audio — a single missing or extra segment will degrade +the metric instead of accidentally re-aligning everything around the +gap. + +Pass condition (default): median IoU >= 0.95 **and** `len(segments_a) +== len(segments_b)`. Pass `--allow-segment-count-mismatch` to soften +the count check (useful when diagnosing which side over- or +under-segments). + +The `clip_sha256` field on each runner output hashes the f32 PCM +bytes the model actually saw. If those hashes diverge, score.py warns +loudly because any IoU disagreement could then be a loader issue +rather than a model issue. + +## Notes + +- The harness is **NOT** part of `cargo test`. It's a manual run for + release-time validation and for diagnosing regressions. +- Don't commit binary fixtures or model files into this crate. +- Don't change anything in `silero/src/` from this harness — it's + read-only on the public crate API. diff --git a/tests/parity/python/pyproject.toml b/tests/parity/python/pyproject.toml new file mode 100644 index 0000000..f7fce25 --- /dev/null +++ b/tests/parity/python/pyproject.toml @@ -0,0 +1,31 @@ +[project] +name = "silero-parity-reference" +version = "0.0.0" +requires-python = ">=3.10" +# `silero-vad` is the upstream PyPI package this harness compares the +# Rust crate against. 
Pinned to `>=5.1` so we always pull a stable +# 5.x/6.x — the `get_speech_timestamps` API and parameter set has +# been stable since 5.x. Bump this exact pin (or tighten to `==`) on +# release branches if a behaviour change in upstream silero-vad +# regresses the parity numbers. +# +# `torch` is a transitive of `silero-vad` already; we list it +# explicitly so a one-off install of the parity venv works without +# relying on solver cascades. +dependencies = [ + "silero-vad >= 5.1", + "torch >= 2.0", + "numpy >= 1.26", + # `silero-vad`'s `load_silero_vad(onnx=True)` path uses + # onnxruntime — the runner defaults to that backend so both + # runners feed identical bytes to ORT. Without this dep, the ONNX + # path raises at load time. + "onnxruntime >= 1.18", +] + +# Disable setuptools auto-discovery: this project carries Python +# scripts only — no installable package layout — so an empty +# `packages` list lets `uv pip install -e .` install just the project +# metadata + dependencies without erroring on auto-discovery. +[tool.setuptools] +packages = [] diff --git a/tests/parity/python/score.py b/tests/parity/python/score.py new file mode 100644 index 0000000..d8fe005 --- /dev/null +++ b/tests/parity/python/score.py @@ -0,0 +1,237 @@ +"""Compare two parity-runner JSON outputs (one silero-rs, one +silero-vad-py) and report per-segment IoU statistics. + +Approach: +1. Pair segments by **sequence position** — silero VAD output is + ordered and the two implementations run the same segmenter logic + on the same audio, so the i-th segment from one runner corresponds + to the i-th segment from the other when both are well-aligned. If + the segment counts differ we still pair as far as the shorter list + goes and count the rest as drops on the longer side. +2. For each matched pair, compute time-range IoU on `[start_s, end_s]`. +3. Emit a JSON summary on stdout (or `--out`) and a human-readable + summary on stderr. 
+ +Why sequence-position pairing rather than text-aware Needleman-Wunsch +(the whispery harness does the latter): silero output has no text to +key on, and a near-bit-exact runner pair will produce near-identical +segment boundaries — sequence-position is the right matching when the +implementations are supposed to agree. If a Rust regression shifts the +segment count by ±1, the pairing degrades gracefully and the +`segment_count_*` fields in the summary make the divergence obvious. + +Pass/fail: median IoU >= `--threshold` (default 0.95) AND the segment +counts match. The 0.95 default reflects "near-bit-equivalent" — silero +running through ORT vs PyTorch on identical inputs should produce +boundaries that round to within one frame (32 ms at 16 kHz / 512-sample +windows). + +Usage: + uv run python score.py +""" + +from __future__ import annotations + +import argparse +import json +import statistics +import sys +from dataclasses import dataclass +from pathlib import Path + + +@dataclass +class Segment: + start_s: float + end_s: float + + @property + def duration_s(self) -> float: + return max(0.0, self.end_s - self.start_s) + + +def _load(path: Path) -> tuple[str, list[Segment], dict]: + payload = json.loads(path.read_text()) + segments = [ + Segment(start_s=float(s["start_s"]), end_s=float(s["end_s"])) + for s in payload["segments"] + ] + return payload.get("runner", path.stem), segments, payload + + +def _iou(a: Segment, b: Segment) -> float: + inter = max(0.0, min(a.end_s, b.end_s) - max(a.start_s, b.start_s)) + union = max(a.end_s, b.end_s) - min(a.start_s, b.start_s) + if union <= 0.0: + return 0.0 + return inter / union + + +def _stats(values: list[float]) -> dict[str, float | int]: + if not values: + return {"count": 0} + sv = sorted(values) + n = len(sv) + return { + "count": n, + "mean": float(statistics.fmean(sv)), + "median": float(statistics.median(sv)), + "p10": float(sv[max(0, int(0.10 * (n - 1)))]), + "p90": float(sv[min(n - 1, int(0.90 * (n - 
1)))]), + "min": float(sv[0]), + "max": float(sv[-1]), + "below_0.5": int(sum(1 for v in sv if v < 0.5)), + "below_0.9": int(sum(1 for v in sv if v < 0.9)), + } + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Score two silero parity-runner JSON outputs against each other." + ) + parser.add_argument("a_json", type=Path, help="First runner JSON (e.g. silero-rs).") + parser.add_argument("b_json", type=Path, help="Second runner JSON (e.g. silero-vad-py).") + parser.add_argument( + "--out", + type=Path, + default=None, + help="Write JSON summary here (default: stdout).", + ) + parser.add_argument( + "--threshold", + type=float, + default=0.95, + help="Median IoU threshold for exit-code 0 (default: 0.95).", + ) + parser.add_argument( + "--allow-segment-count-mismatch", + action="store_true", + help=( + "By default the score fails if the two runners produced different " + "segment counts (which usually means a real boundary disagreement, " + "not just a fractional shift). Pass this to soften the check to " + "median-IoU only." + ), + ) + args = parser.parse_args() + + name_a, segs_a, payload_a = _load(args.a_json) + name_b, segs_b, payload_b = _load(args.b_json) + + # Quick sanity: surface a clip_sha256 mismatch loudly. If the two + # runners disagree on the input bytes, the IoU number is comparing + # apples to oranges and any disagreement is the loader's fault, not + # the model's. 
+ sha_a = payload_a.get("clip_sha256") + sha_b = payload_b.get("clip_sha256") + sha_match = sha_a is not None and sha_b is not None and sha_a == sha_b + if not sha_match: + print( + f"[score] WARNING: clip_sha256 differs between runners: " + f"{name_a}={sha_a[:16] if sha_a else '(none)'} vs " + f"{name_b}={sha_b[:16] if sha_b else '(none)'} — IoU below " + f"may reflect loader divergence rather than VAD divergence", + file=sys.stderr, + ) + + pairs: list[tuple[int, int]] = [] + n_pairs = min(len(segs_a), len(segs_b)) + for i in range(n_pairs): + pairs.append((i, i)) + dropped_a = max(0, len(segs_a) - n_pairs) + dropped_b = max(0, len(segs_b) - n_pairs) + + matched = [(segs_a[i], segs_b[j], _iou(segs_a[i], segs_b[j])) for i, j in pairs] + iou_values = [iou for _, _, iou in matched] + iou_stats = _stats(iou_values) + + matched_sorted = sorted(matched, key=lambda t: t[2]) + worst = [ + { + "iou": round(iou, 4), + name_a: { + "start_s": round(sa.start_s, 3), + "end_s": round(sa.end_s, 3), + "dur_s": round(sa.duration_s, 3), + }, + name_b: { + "start_s": round(sb.start_s, 3), + "end_s": round(sb.end_s, 3), + "dur_s": round(sb.duration_s, 3), + }, + } + for sa, sb, iou in matched_sorted[:5] + ] + + counts_match = len(segs_a) == len(segs_b) + median_pass = iou_stats.get("median", 0.0) >= args.threshold and len(matched) > 0 + passed = bool(median_pass and (counts_match or args.allow_segment_count_mismatch)) + + summary = { + "runner_a": name_a, + "runner_b": name_b, + "clip_sha256_match": sha_match, + "segment_count_a": len(segs_a), + "segment_count_b": len(segs_b), + "matched_pairs": len(matched), + "dropped_by_a": dropped_a, + "dropped_by_b": dropped_b, + "iou": iou_stats, + "worst_5": worst, + "threshold_median_iou": args.threshold, + "counts_match": counts_match, + "passed": passed, + } + + serialized = json.dumps(summary, indent=2) + if args.out is None: + print(serialized) + else: + args.out.write_text(serialized + "\n") + + median = iou_stats.get("median", 0.0) + 
print( + f"\n[score] {name_a} ({len(segs_a)} segs) vs {name_b} ({len(segs_b)} segs)", + file=sys.stderr, + ) + print( + f" matched={len(matched)} dropped_a={dropped_a} dropped_b={dropped_b}", + file=sys.stderr, + ) + if iou_stats["count"] == 0: + print( + " no matched pairs — both runners produced empty segment lists", + file=sys.stderr, + ) + # Empty + empty is technically a match; only fail if either side + # had segments. + return 0 if (len(segs_a) == 0 and len(segs_b) == 0) else 1 + print( + f" IoU mean={iou_stats['mean']:.4f} median={iou_stats['median']:.4f} " + f"p10={iou_stats['p10']:.4f} p90={iou_stats['p90']:.4f} " + f"below_0.5={iou_stats['below_0.5']} below_0.9={iou_stats['below_0.9']}", + file=sys.stderr, + ) + if worst: + print(" worst 5 pairs:", file=sys.stderr) + for w in worst: + a = w[name_a] + b = w[name_b] + print( + f" iou={w['iou']:.3f} a=[{a['start_s']:.3f},{a['end_s']:.3f}] " + f"({a['dur_s']:.3f}s) b=[{b['start_s']:.3f},{b['end_s']:.3f}] " + f"({b['dur_s']:.3f}s)", + file=sys.stderr, + ) + + pass_str = "PASS" if summary["passed"] else "FAIL" + print( + f" {pass_str} (median IoU {median:.4f} vs threshold {args.threshold}, " + f"counts {len(segs_a)} vs {len(segs_b)})", + file=sys.stderr, + ) + return 0 if summary["passed"] else 1 + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/parity/python/silero_vad_runner.py b/tests/parity/python/silero_vad_runner.py new file mode 100644 index 0000000..2320819 --- /dev/null +++ b/tests/parity/python/silero_vad_runner.py @@ -0,0 +1,239 @@ +"""Run upstream Python `silero-vad` on a 16 kHz mono WAV; emit the raw +VAD segments as JSON in the same schema as the Rust +`silero-parity-runner`. + +Why this is structurally simple: +- We call `silero_vad.get_speech_timestamps(audio, model, ...)` directly. + That's the same entry point upstream documentation publishes; the + Rust crate's `SpeechSegmenter` is a port of the same logic. 
+- Defaults match between the two runners (validated 2026-05 against
+  silero-vad 6.2.1 source). See `tests/parity/README.md`.
+
+Usage:
+    uv run python silero_vad_runner.py <wav> --out <out.json>
+
+All knobs are exposed as CLI flags so `run.sh` can pass exactly the
+same parameter set to both runners.
+"""
+
+from __future__ import annotations
+
+import argparse
+import hashlib
+import json
+import math
+import subprocess
+import sys
+import time
+from importlib.metadata import version as pkg_version
+from pathlib import Path
+
+import numpy as np
+import torch
+from silero_vad import get_speech_timestamps, load_silero_vad
+
+
+def load_audio_16k_mono_f32(path: Path) -> np.ndarray:
+    """Decode `path` to 16 kHz mono `np.float32`.
+
+    Mirrors WhisperX's `load_audio` (whisperx/audio.py): shell out to
+    `ffmpeg -nostdin -threads 0 -i <path> -f s16le -ac 1 -acodec
+    pcm_s16le -ar 16000 -`, then `np.frombuffer(out, np.int16).astype(
+    np.float32) / 32768.0`. The Rust runner uses `ffmpeg-next` to do
+    exactly the same thing in-process. Doing the same conversion on
+    both sides keeps the f32 buffer the model sees byte-identical, so
+    `clip_sha256` matches across runners and any output divergence is
+    the model / segmenter rather than the loader.
+
+    `silero-vad`'s own `read_audio` uses `torchaudio.load` which goes
+    through ffmpeg/sox under the hood — close enough that segments
+    almost always agree, but the byte-identical path is what makes the
+    parity hash check meaningful.
+ """ + cmd = [ + "ffmpeg", + "-nostdin", + "-threads", + "0", + "-i", + str(path), + "-f", + "s16le", + "-ac", + "1", + "-acodec", + "pcm_s16le", + "-ar", + "16000", + "-", + ] + proc = subprocess.run(cmd, capture_output=True, check=True) + pcm = np.frombuffer(proc.stdout, dtype=np.int16) + return pcm.astype(np.float32) / 32768.0 + + +def sha256_f32_buffer(audio: np.ndarray) -> str: + h = hashlib.sha256() + h.update(audio.tobytes(order="C")) + return h.hexdigest() + + +def main() -> int: + parser = argparse.ArgumentParser( + description="Run upstream Python silero-vad on a 16 kHz mono WAV; emit segments as JSON." + ) + parser.add_argument("wav_path", type=Path, help="16 kHz mono WAV (any container ffmpeg can decode).") + parser.add_argument("--out", type=Path, default=None, help="Output JSON path (default: stdout).") + # Defaults below match `silero_vad.get_speech_timestamps` exactly + # (validated against silero-vad 6.2.1 — see README). They also + # match the silero Rust crate's `SpeechOptions::default()`. Both + # runners therefore default to apples-to-apples comparison out of + # the box. + parser.add_argument("--threshold", type=float, default=0.5) + parser.add_argument("--min-speech-ms", type=int, default=250) + parser.add_argument("--min-silence-ms", type=int, default=100) + parser.add_argument("--speech-pad-ms", type=int, default=30) + parser.add_argument("--min-silence-at-max-speech-ms", type=int, default=98) + parser.add_argument( + "--max-speech-s", + type=float, + default=None, + help="Max speech duration in seconds before force-split. Default: no limit (math.inf).", + ) + parser.add_argument( + "--backend", + choices=("jit", "onnx"), + default="onnx", + help=( + "Silero model backend. Defaults to `onnx` so the Python side runs " + "the SAME ORT bytes the Rust crate runs (silero-vad's bundled " + "`data/silero_vad.onnx` is byte-identical to " + "`silero/models/silero_vad.onnx`). 
`jit` runs PyTorch JIT " + "instead — useful for measuring runtime drift, but those " + "numbers are NOT a fair Rust-vs-Python segmenter comparison " + "because PyTorch and ORT can disagree at the FP level." + ), + ) + args = parser.parse_args() + + wav_path = args.wav_path.resolve() + if not wav_path.is_file(): + print(f"WAV not found: {wav_path}", file=sys.stderr) + return 2 + + audio = load_audio_16k_mono_f32(wav_path) + sample_rate = 16_000 + duration_s = float(len(audio)) / sample_rate + clip_sha256 = sha256_f32_buffer(audio) + + print( + f"[silero-vad-py] wav={wav_path} dur={duration_s:.2f}s sha256={clip_sha256[:16]} " + f"threshold={args.threshold} max_speech_s={args.max_speech_s}", + file=sys.stderr, + ) + + t0 = time.monotonic() + # `load_silero_vad(onnx=...)` returns the VAD model from the + # bundled snapshot the silero-vad PyPI package ships (silero-vad 6.x + # bundles its own ONNX/JIT in the package itself rather than via + # torch.hub). We pass `onnx=True` by default so both runners feed + # identical bytes to ORT — same model, same backend — and any IoU + # disagreement is the segmenter logic, not the inference runtime. + use_onnx = args.backend == "onnx" + model = load_silero_vad(onnx=use_onnx) + backend_label = "silero_vad.onnx" if use_onnx else "silero_vad.jit" + t_load = time.monotonic() + + audio_t = torch.from_numpy(audio) + + kwargs = dict( + sampling_rate=sample_rate, + threshold=args.threshold, + min_speech_duration_ms=args.min_speech_ms, + min_silence_duration_ms=args.min_silence_ms, + speech_pad_ms=args.speech_pad_ms, + min_silence_at_max_speech=args.min_silence_at_max_speech_ms, + ) + if args.max_speech_s is not None: + kwargs["max_speech_duration_s"] = args.max_speech_s + else: + # Match the silero-vad default explicitly: `float('inf')`. Pass + # it through rather than relying on the library default so the + # JSON output records exactly what was used. 
+ kwargs["max_speech_duration_s"] = math.inf + + timestamps = get_speech_timestamps(audio_t, model, **kwargs) + t_vad = time.monotonic() + + print( + f"[silero-vad-py] load={t_load - t0:.2f}s vad={t_vad - t_load:.2f}s " + f"-> {len(timestamps)} segments", + file=sys.stderr, + ) + + # `get_speech_timestamps` returns dicts with int sample indices + # under `start`/`end` (since we don't pass `return_seconds=True`). + # Emit both sample- and second-coordinates so score.py can choose. + segments = [] + for ts in timestamps: + start_sample = int(ts["start"]) + end_sample = int(ts["end"]) + segments.append( + { + "start_s": start_sample / sample_rate, + "end_s": end_sample / sample_rate, + "start_sample": start_sample, + "end_sample": end_sample, + } + ) + + payload = { + "runner": "silero-vad-py", + "silero_vad_version": _resolve_version(), + "torch_version": torch.__version__, + "backend": backend_label, + "clip_path": str(wav_path), + "clip_sha256": clip_sha256, + "duration_s": duration_s, + "params": { + "threshold": args.threshold, + "min_speech_duration_ms": args.min_speech_ms, + "min_silence_duration_ms": args.min_silence_ms, + "speech_pad_ms": args.speech_pad_ms, + "min_silence_at_max_speech_ms": args.min_silence_at_max_speech_ms, + # Effective value (matches what was actually passed to + # `get_speech_timestamps`): the float when `--max-speech-s` + # was set, otherwise `math.inf`. `json.dumps` emits the + # latter as `Infinity`, which Python's `json.loads` round- + # trips correctly; use a JSON parser that accepts non-strict + # output if you read this field from a different stack. 
+ "max_speech_s": kwargs["max_speech_duration_s"], + "sampling_rate": sample_rate, + "window_size_samples": 512, + }, + "segment_count": len(segments), + "segments": segments, + } + + serialized = json.dumps(payload, indent=2) + if args.out is None: + print(serialized) + else: + args.out.write_text(serialized + "\n") + print( + f"[silero-vad-py] wrote {len(segments)} segments to {args.out}", + file=sys.stderr, + ) + + return 0 + + +def _resolve_version() -> str | None: + try: + return pkg_version("silero-vad") + except Exception: + return None + + +if __name__ == "__main__": + sys.exit(main()) diff --git a/tests/parity/run.sh b/tests/parity/run.sh new file mode 100755 index 0000000..0b2a884 --- /dev/null +++ b/tests/parity/run.sh @@ -0,0 +1,116 @@ +#!/usr/bin/env bash +# silero-rs vs upstream Python silero-vad parity harness. +# +# Requires: +# - `cargo` + Rust toolchain (silero-parity-runner builds via path = "../..") +# - `uv` on PATH (https://docs.astral.sh/uv/) for the Python venv +# - `ffmpeg` on PATH (the Python runner shells out to it for audio loading) +# +# Usage: +# ./tests/parity/run.sh +# +# Accepts either a fixture directory (uses `clip_16k.wav` inside) or a +# direct WAV path. +# +# The canonical test set is dia's parity fixtures at +# /Users/user/Develop/findit-studio/dia/tests/parity/fixtures/, which +# we don't copy into this repo (they're large). See README for a +# pointer. + +set -euo pipefail +SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)" +ROOT="$(cd "$SCRIPT_DIR/../.." 
&& pwd)"

usage() {
  echo "usage: $(basename "$0") <fixture-dir-or-wav>" >&2
  echo "" >&2
  echo "Examples:" >&2
  echo "  $(basename "$0") /path/to/dia/tests/parity/fixtures/01_dialogue" >&2
  echo "  $(basename "$0") /path/to/clip_16k.wav" >&2
  exit 64
}

if [ "$#" -ne 1 ]; then
  usage
fi

ARG="$1"
if [ -d "$ARG" ]; then
  CLIP="$ARG/clip_16k.wav"
elif [ -f "$ARG" ]; then
  CLIP="$ARG"
else
  echo "[run.sh] $ARG is neither a directory nor a WAV file" >&2
  exit 65
fi

if [ ! -f "$CLIP" ]; then
  echo "[run.sh] no clip at $CLIP" >&2
  exit 66
fi

ABS_CLIP="$(cd "$(dirname "$CLIP")" && pwd)/$(basename "$CLIP")"
FIXTURE_NAME="$(basename "$(dirname "$ABS_CLIP")")"
if [ "$FIXTURE_NAME" = "" ] || [ "$FIXTURE_NAME" = "/" ]; then
  FIXTURE_NAME="$(basename "$ABS_CLIP" .wav)"
fi

OUT_DIR="$SCRIPT_DIR/out"
mkdir -p "$OUT_DIR"
RUST_OUT="$OUT_DIR/silero_rs_${FIXTURE_NAME}.json"
PY_OUT="$OUT_DIR/silero_py_${FIXTURE_NAME}.json"
SCORE_OUT="$OUT_DIR/score_${FIXTURE_NAME}.json"

echo "[run.sh] clip: $ABS_CLIP"
echo "[run.sh] outputs: $RUST_OUT, $PY_OUT, $SCORE_OUT"

# 1) uv venv for the Python side. Cached when unchanged.
cd "$SCRIPT_DIR/python"
if [ ! -d .venv ]; then
  echo "[run.sh] creating uv venv at $(pwd)/.venv ..."
  uv venv
fi
echo "[run.sh] syncing python deps (cached when unchanged) ..."
uv pip install -e . > /dev/null

# 2) Rust runner. Builds in release mode with the bundled silero ONNX
# model. ort 2.0.0-rc.12's default features include `download-binaries`
# + `copy-dylibs`, so the prebuilt ONNX Runtime ships next to the
# binary — no need for `ORT_DYLIB_PATH` (unlike whispery's harness
# which uses load-dynamic).
#
# Both runners now use the upstream Python silero-vad defaults
# verbatim (threshold 0.5, min_speech_duration_ms 250,
# min_silence_duration_ms 100, speech_pad_ms 30,
# min_silence_at_max_speech_ms 98). The previous `--min-silence-ms 132`
# crate-side override compensated for an off-by-one in
# `SpeechSegmenter::push_probability`'s silence counter; that bug was
# fixed in silero v0.3.0, so the override is no longer required.
# `--min-silence-ms` remains a CLI flag on the runner for advanced
# users who want to override.
cd "$ROOT"
echo "[run.sh] running silero-parity-runner ..."
cargo run \
  --release \
  --quiet \
  --manifest-path tests/parity/Cargo.toml \
  -p silero-parity-runner \
  --bin silero-parity-runner \
  -- "$ABS_CLIP" \
  --out "$RUST_OUT"

# 3) Python runner. Defaults match upstream silero-vad PyPI defaults,
# and the crate (v0.3.0+) now matches them too.
cd "$SCRIPT_DIR/python"
echo "[run.sh] running silero_vad_runner.py ..."
uv run python silero_vad_runner.py "$ABS_CLIP" --out "$PY_OUT"

# 4) Score. Captures the score's exit code and propagates it.
cd "$SCRIPT_DIR/python"
echo "[run.sh] scoring ..."
set +e
uv run python score.py "$RUST_OUT" "$PY_OUT" --out "$SCORE_OUT"
SCORE_RC=$?
set -e

exit $SCORE_RC
diff --git a/tests/parity/src/main.rs b/tests/parity/src/main.rs
new file mode 100644
index 0000000..8283be5
--- /dev/null
+++ b/tests/parity/src/main.rs
@@ -0,0 +1,427 @@
+//! `silero-parity-runner` — load a 16 kHz mono WAV via `ffmpeg-next`,
+//! push it through `silero::detect_speech` (the production one-shot
+//! offline path), and dump the resulting speech segments as JSON.
+//! Pair with `python/silero_vad_runner.py` (same JSON schema,
+//! `runner = "silero-vad-py"`) and `python/score.py` for IoU
+//! comparison.
+//!
+//! This binary is **NOT** part of `cargo test`. It's invoked from the
+//! `run.sh` driver. Audio loading uses `ffmpeg-next` so the f32 buffer
+//! the silero ONNX model consumes is byte-identical to what the upstream
+//! Python `silero-vad` package consumes (which also goes through
+//! ffmpeg via `torchaudio` / `read_audio`).
+//!
+//! All `SpeechOptions` knobs are exposed via flags so the run.sh
+//! driver can pass parameters that match the Python runner exactly.
+//! Defaults match the silero crate's `SpeechOptions::default()`, which
+//! in turn match upstream silero-vad PyPI defaults (threshold 0.5,
+//! min_speech_duration_ms 250, min_silence_duration_ms 100,
+//! speech_pad_ms 30, min_silence_at_max_speech_ms 98).
+
+use std::{
+    fs,
+    io::Write,
+    path::{Path, PathBuf},
+    sync::OnceLock,
+    time::Duration,
+};
+
+use anyhow::{Context, Result, bail};
+use clap::Parser;
+use ffmpeg_next as ffmpeg;
+use serde_json::json;
+use sha2::{Digest, Sha256};
+use silero::{SampleRate, Session, SpeechOptions, detect_speech};
+
+// Take the version string from the silero crate itself (re-exported as
+// `silero::VERSION`) rather than `env!("CARGO_PKG_VERSION")`, which in
+// this binary resolves to the parity-runner's own version (0.0.0). The
+// JSON output should record the version of the crate under test.
+const SILERO_CRATE_VERSION: &str = silero::VERSION;
+// SHA-256 of the bundled ONNX model bytes. Computed on demand below.
+// Logged so a snapshot rebuild in the silero crate that swaps
+// `models/silero_vad.onnx` cannot silently invalidate the parity
+// numbers — the JSON output records exactly which model bytes ran.
+
+#[derive(Parser, Debug)]
+#[command(
+    about = "Run silero (Rust crate) VAD on a 16 kHz mono WAV; emit JSON for side-by-side comparison with upstream Python silero-vad."
+)]
+struct Args {
+    /// Path to a 16 kHz mono WAV (or any audio container ffmpeg can
+    /// decode; resampled to 16 kHz mono internally).
+    wav_path: PathBuf,
+
+    /// Output file (defaults to stdout).
+    #[arg(long)]
+    out: Option<PathBuf>,
+
+    /// Speech-onset probability threshold. Silero crate default: 0.5.
+    #[arg(long, default_value_t = 0.5)]
+    threshold: f32,
+
+    /// Minimum speech duration in milliseconds; shorter speech bursts are
+    /// dropped. Silero crate default: 250.
+    #[arg(long, default_value_t = 250)]
+    min_speech_ms: u64,
+
+    /// Minimum silence duration in milliseconds before a speech segment
+    /// is closed. Silero crate default: 100.
+    #[arg(long, default_value_t = 100)]
+    min_silence_ms: u64,
+
+    /// Speech padding (added at both ends of every emitted segment) in
+    /// milliseconds. Silero crate default: 30.
+    #[arg(long, default_value_t = 30)]
+    speech_pad_ms: u64,
+
+    /// Minimum silence used as a preferred split point when
+    /// `--max-speech-s` is hit, in milliseconds. Silero crate default: 98
+    /// (which matches upstream Python silero-vad's 0.098 s default).
+    #[arg(long, default_value_t = 98)]
+    min_silence_at_max_speech_ms: u64,
+
+    /// Maximum speech duration in seconds before the segmenter
+    /// force-splits a long segment. Defaults to "no limit" (matches both
+    /// the Rust crate and Python silero-vad defaults). Pass e.g. `30` to
+    /// match WhisperX-style chunking.
+    #[arg(long)]
+    max_speech_s: Option<f64>,
+}
+
+/// Idempotent guard for `ffmpeg::init()`. Persists the init outcome in
+/// a `OnceLock` so a failed first init keeps surfacing on subsequent
+/// calls (the previous `Once`-based version stored the error on the
+/// stack and silently returned `Ok(())` on later calls).
+fn ffmpeg_init() -> Result<()> {
+    // `ffmpeg::Error` is not `Clone`, so store the error as `String` —
+    // we only need the message on subsequent calls.
+    static INIT: OnceLock<Result<(), String>> = OnceLock::new();
+    match INIT.get_or_init(|| ffmpeg::init().map_err(|e| e.to_string())) {
+        Ok(()) => Ok(()),
+        Err(msg) => Err(anyhow::anyhow!("ffmpeg::init failed: {msg}")),
+    }
+}
+
+/// Load an audio file as 16 kHz mono f32 via ffmpeg-next.
+///
+/// This mirrors the loader in `whispery`'s parity runner. Decoding
+/// path: container open → audio decoder → resample to 16 kHz mono
+/// `s16` (signed 16-bit packed, little-endian) → cast each sample to
+/// `f32` and divide by exactly `32768.0`.
+///
+/// Why s16-then-divide rather than f32-direct: upstream Python
+/// silero-vad loads audio via `torchaudio.load` (or `whisperx.audio`'s
+/// ffmpeg shell-out) which lands on `np.float32 / 32768.0`. Doing the
+/// same conversion on the Rust side keeps the f32 buffer the model
+/// sees byte-identical, so a hash comparison on the JSON output's
+/// `clip_sha256` field can verify both runners decoded the audio the
+/// same way before flagging any output divergence as a model issue.
+///
+/// Returns `(samples, duration_s, sha256)`.
+fn read_audio_16k_mono_f32(path: &Path) -> Result<(Vec<f32>, f64, String)> {
+    use ffmpeg::format::sample::{Sample, Type as SampleType};
+    use ffmpeg::software::resampling::Context as Resampler;
+    use ffmpeg::{ChannelLayout, codec::context::Context as CodecContext, frame, media};
+
+    ffmpeg_init()?;
+
+    let mut ictx = ffmpeg::format::input(path)
+        .with_context(|| format!("open audio container at {}", path.display()))?;
+    let stream = ictx
+        .streams()
+        .best(media::Type::Audio)
+        .ok_or_else(|| anyhow::anyhow!("{}: no audio stream", path.display()))?;
+    let stream_index = stream.index();
+
+    let codec_ctx = CodecContext::from_parameters(stream.parameters())
+        .with_context(|| format!("decoder context for {}", path.display()))?;
+    let mut decoder = codec_ctx
+        .decoder()
+        .audio()
+        .with_context(|| format!("audio decoder for {}", path.display()))?;
+    decoder
+        .set_parameters(stream.parameters())
+        .with_context(|| format!("decoder set_parameters for {}", path.display()))?;
+
+    const TARGET_RATE: u32 = 16_000;
+    let target_format = Sample::I16(SampleType::Packed);
+    let target_layout = ChannelLayout::MONO;
+
+    // PCM/WAV decoders commonly emit frames with `ch_layout.order =
+    // UNSPEC` (only the channel count is set); libswresample's
+    // `swr_alloc_set_opts2` rejects that in FFmpeg 7+. Fall back to
+    // `ChannelLayout::default(channels)` if the source layout is empty.
+    let resolve_src_layout =
+        |layout: ChannelLayout, channels: i32| -> ChannelLayout {
+            if layout.is_empty() {
+                ChannelLayout::default(channels)
+            } else {
+                layout
+            }
+        };
+
+    let mut src_format = decoder.format();
+    let mut src_rate = decoder.rate();
+    let mut src_layout = resolve_src_layout(decoder.channel_layout(), decoder.channels() as i32);
+
+    let build_resampler = |src_format,
+                           src_layout,
+                           src_rate|
+     -> Result<Resampler> {
+        Resampler::get(
+            src_format,
+            src_layout,
+            src_rate,
+            target_format,
+            target_layout,
+            TARGET_RATE,
+        )
+        .with_context(|| format!("init libswresample for {}", path.display()))
+    };
+
+    let mut resampler = build_resampler(src_format, src_layout, src_rate)?;
+
+    let mut samples_f32: Vec<f32> = Vec::new();
+    let mut decoded = frame::Audio::empty();
+
+    // Push i16 samples from a packed-mono frame into `samples_f32`,
+    // dividing by the literal `32768.0` exactly as
+    // WhisperX/torchaudio does.
+    let push_resampled = |frame: &frame::Audio, dst: &mut Vec<f32>| {
+        let n = frame.samples();
+        if n == 0 {
+            return;
+        }
+        let plane: &[i16] = frame.plane::<i16>(0);
+        debug_assert!(plane.len() >= n);
+        dst.reserve(n);
+        for &s in &plane[..n] {
+            dst.push(s as f32 / 32768.0_f32);
+        }
+    };
+
+    // Run a decoded frame through the resampler. Handles
+    // `InputChanged` / `OutputChanged` by rebuilding the resampler
+    // against the new source params.
+    let run_resample = |decoded: &frame::Audio,
+                        resampler: &mut Resampler,
+                        samples_f32: &mut Vec<f32>,
+                        src_format: &mut Sample,
+                        src_layout: &mut ChannelLayout,
+                        src_rate: &mut u32|
+     -> Result<()> {
+        let mut resampled = frame::Audio::empty();
+        match resampler.run(decoded, &mut resampled) {
+            Ok(_) => {
+                push_resampled(&resampled, samples_f32);
+            }
+            Err(ffmpeg::Error::InputChanged | ffmpeg::Error::OutputChanged) => {
+                *src_format = decoded.format();
+                *src_layout = resolve_src_layout(
+                    decoded.channel_layout(),
+                    decoded.channels() as i32,
+                );
+                *src_rate = decoded.rate();
+                *resampler = build_resampler(*src_format, *src_layout, *src_rate)?;
+                let mut retried = frame::Audio::empty();
+                resampler
+                    .run(decoded, &mut retried)
+                    .context("libswresample::run after rebuild")?;
+                push_resampled(&retried, samples_f32);
+            }
+            Err(e) => return Err(anyhow::anyhow!("libswresample::run: {e}")),
+        }
+        Ok(())
+    };
+
+    let fixup_frame_layout = |frame: &mut frame::Audio, src_layout: ChannelLayout| {
+        if frame.channel_layout().is_empty() {
+            frame.set_channel_layout(src_layout);
+        }
+    };
+
+    for (s, packet) in ictx.packets() {
+        if s.index() != stream_index {
+            continue;
+        }
+        decoder.send_packet(&packet).context("decoder.send_packet")?;
+        while decoder.receive_frame(&mut decoded).is_ok() {
+            fixup_frame_layout(&mut decoded, src_layout);
+            run_resample(
+                &decoded,
+                &mut resampler,
+                &mut samples_f32,
+                &mut src_format,
+                &mut src_layout,
+                &mut src_rate,
+            )?;
+        }
+    }
+    decoder.send_eof().context("decoder.send_eof")?;
+    while decoder.receive_frame(&mut decoded).is_ok() {
+        fixup_frame_layout(&mut decoded, src_layout);
+        run_resample(
+            &decoded,
+            &mut resampler,
+            &mut samples_f32,
+            &mut src_format,
+            &mut src_layout,
+            &mut src_rate,
+        )?;
+    }
+
+    // Final libswresample flush. `OutputChanged` here means "no buffered
+    // samples" in the rate-1:1 case (which is what the dia 16 kHz mono
+    // PCM fixtures hit). Treat it as a no-op rather than a hard error.
+    loop {
+        let mut tail = frame::Audio::empty();
+        match resampler.flush(&mut tail) {
+            Ok(_) => {
+                if tail.samples() == 0 {
+                    break;
+                }
+                push_resampled(&tail, &mut samples_f32);
+            }
+            Err(ffmpeg::Error::OutputChanged) => break,
+            Err(e) => {
+                return Err(anyhow::anyhow!("libswresample::flush at EOF: {e}"));
+            }
+        }
+    }
+
+    if samples_f32.is_empty() {
+        bail!(
+            "{}: ffmpeg-next decoded zero samples; corrupt or empty audio?",
+            path.display()
+        );
+    }
+
+    let duration_s = samples_f32.len() as f64 / TARGET_RATE as f64;
+
+    // Hash the f32 bytes (LE) the model will see — same trick the
+    // whispery harness uses. Comparing this against the Python runner's
+    // own clip_sha256 is what catches loader-quantisation divergences.
+    let mut hasher = Sha256::new();
+    // Safety: `f32` is `Copy + 'static`, layout is well-defined as 4
+    // little-endian bytes per sample on every target this harness ships
+    // to (macOS / Linux x86_64+aarch64).
+    let bytes = unsafe {
+        std::slice::from_raw_parts(
+            samples_f32.as_ptr() as *const u8,
+            samples_f32.len() * std::mem::size_of::<f32>(),
+        )
+    };
+    hasher.update(bytes);
+    let sha = format!("{:x}", hasher.finalize());
+
+    Ok((samples_f32, duration_s, sha))
+}
+
+fn model_sha256() -> String {
+    let mut hasher = Sha256::new();
+    hasher.update(silero::BUNDLED_MODEL);
+    format!("{:x}", hasher.finalize())
+}
+
+fn main() -> Result<()> {
+    let args = Args::parse();
+
+    let (samples, duration_s, clip_sha256) = read_audio_16k_mono_f32(&args.wav_path)?;
+    eprintln!(
+        "[silero-parity] wav={} dur={:.2}s samples={} sha256={}",
+        args.wav_path.display(),
+        duration_s,
+        samples.len(),
+        &clip_sha256[..16]
+    );
+
+    // Build SpeechOptions from CLI flags. Every default matches the
+    // silero crate's `SpeechOptions::default()`, which in turn matches
+    // upstream Python silero-vad defaults.
+    let mut opts = SpeechOptions::new()
+        .with_sample_rate(SampleRate::Rate16k)
+        .with_start_threshold(args.threshold)
+        .with_min_speech_duration(Duration::from_millis(args.min_speech_ms))
+        .with_min_silence_duration(Duration::from_millis(args.min_silence_ms))
+        .with_speech_pad(Duration::from_millis(args.speech_pad_ms))
+        .with_min_silence_at_max_speech(Duration::from_millis(
+            args.min_silence_at_max_speech_ms,
+        ));
+    if let Some(s) = args.max_speech_s {
+        let ms = (s * 1000.0).round() as u64;
+        opts = opts.with_max_speech_duration(Duration::from_millis(ms));
+    }
+
+    eprintln!(
+        "[silero-parity] threshold={} min_speech_ms={} min_silence_ms={} \
+         pad_ms={} min_silence_at_max_speech_ms={} max_speech_s={:?}",
+        args.threshold,
+        args.min_speech_ms,
+        args.min_silence_ms,
+        args.speech_pad_ms,
+        args.min_silence_at_max_speech_ms,
+        args.max_speech_s,
+    );
+
+    let mut session = Session::bundled().context("load bundled silero ONNX session")?;
+    let segments = detect_speech(&mut session, &samples, opts).context("detect_speech")?;
+
+    eprintln!(
+        "[silero-parity] {} segments detected",
+        segments.len()
+    );
+
+    let segments_json: Vec<serde_json::Value> = segments
+        .iter()
+        .map(|s| {
+            json!({
+                "start_s": s.start_seconds(),
+                "end_s": s.end_seconds(),
+                "start_sample": s.start_sample(),
+                "end_sample": s.end_sample(),
+            })
+        })
+        .collect();
+
+    let payload = json!({
+        "runner": "silero-rs",
+        "silero_crate_version": SILERO_CRATE_VERSION,
+        "model_sha256": model_sha256(),
+        "clip_path": args.wav_path.display().to_string(),
+        "clip_sha256": clip_sha256,
+        "duration_s": duration_s,
+        "params": {
+            "threshold": args.threshold,
+            "min_speech_duration_ms": args.min_speech_ms,
+            "min_silence_duration_ms": args.min_silence_ms,
+            "speech_pad_ms": args.speech_pad_ms,
+            "min_silence_at_max_speech_ms": args.min_silence_at_max_speech_ms,
+            "max_speech_s": args.max_speech_s,
+            "sampling_rate": 16_000,
+            "window_size_samples": 512,
+        },
+        "segment_count": segments.len(),
+        "segments": segments_json,
+    });
+
+    let serialized = serde_json::to_string_pretty(&payload)?;
+    match args.out {
+        Some(path) => {
+            let mut f = fs::File::create(&path)
+                .with_context(|| format!("create output {}", path.display()))?;
+            f.write_all(serialized.as_bytes())?;
+            f.write_all(b"\n")?;
+            eprintln!(
+                "[silero-parity] wrote {} segments to {}",
+                segments.len(),
+                path.display()
+            );
+        }
+        None => {
+            println!("{serialized}");
+        }
+    }
+
+    Ok(())
+}