diff --git a/examples/rust-onnx-example/.gitignore b/examples/rust-onnx-example/.gitignore new file mode 100644 index 0000000..9552704 --- /dev/null +++ b/examples/rust-onnx-example/.gitignore @@ -0,0 +1,2 @@ +/target +recorder.wav diff --git a/examples/rust-onnx-example/Cargo.toml b/examples/rust-onnx-example/Cargo.toml new file mode 100644 index 0000000..9a1ecad --- /dev/null +++ b/examples/rust-onnx-example/Cargo.toml @@ -0,0 +1,8 @@ +[package] +name = "rust-onnx-example" +version = "0.1.0" +edition = "2021" + +[dependencies] +wavekat-vad = { version = "0.1", features = ["firered"] } +hound = "3" diff --git a/examples/rust-onnx-example/README.md b/examples/rust-onnx-example/README.md new file mode 100644 index 0000000..d0293cd --- /dev/null +++ b/examples/rust-onnx-example/README.md @@ -0,0 +1,34 @@ +# Rust example using wavekat-vad + +Uses [wavekat-vad](https://github.com/wavekat/wavekat-vad), a Rust library that provides a unified interface for multiple VAD backends including FireRedVAD. The FireRedVAD ONNX model and CMVN stats are downloaded and embedded in the binary at compile time — no manual model setup needed. + +## Features + +- Pure Rust Mel filterbank + CMVN preprocessing (no C dependencies for audio features) +- Automatic resampling from any sample rate to 16kHz +- `FrameAdapter` handles frame buffering (feed any chunk size, get correctly sized 10ms frames) +- Works with integer PCM WAV files (mono or multi-channel — only the first channel is analyzed — at any sample rate) + +## Usage + +```sh +cargo run -- /path/to/audio.wav +``` + +Sample output: + +``` +File: audio.wav (16000Hz, 1ch, 16bit) +Duration: 3.50s (56000 samples at 16000Hz) + +FireRedVAD — frame: 160 samples (10ms) + + 0ms 0.000 + 10ms 0.000 + 20ms 0.012 + 30ms 0.008 + 40ms 0.245 + 50ms 0.876 ################################### SPEECH + 60ms 0.923 #################################### SPEECH + ...
+```
diff --git a/examples/rust-onnx-example/src/main.rs b/examples/rust-onnx-example/src/main.rs
new file mode 100644
index 0000000..418caa5
--- /dev/null
+++ b/examples/rust-onnx-example/src/main.rs
@@ -0,0 +1,85 @@
+use wavekat_vad::backends::firered::FireRedVad;
+use wavekat_vad::{FrameAdapter, VoiceActivityDetector};
+
+/// Runs FireRedVAD over a WAV file and prints one speech probability per VAD frame.
+///
+/// The WAV path is taken from the first command-line argument and defaults to
+/// `recorder.wav`. Only integer PCM input is accepted; for multi-channel files
+/// only the first channel is analyzed. Any failure (unreadable file, unsupported
+/// sample format, VAD error) aborts the program with a panic.
+fn main() {
+    let audio_path = std::env::args()
+        .nth(1)
+        .unwrap_or_else(|| String::from("recorder.wav"));
+
+    // Open WAV file
+    let mut reader = hound::WavReader::open(&audio_path).expect("failed to open WAV file");
+    let spec = reader.spec();
+    println!(
+        "File: {audio_path} ({}Hz, {}ch, {}bit)",
+        spec.sample_rate, spec.channels, spec.bits_per_sample
+    );
+
+    if spec.sample_format != hound::SampleFormat::Int {
+        panic!("Unsupported sample format. Expect Int.");
+    }
+
+    // Read samples (first channel only for multi-channel files)
+    let samples: Vec<i16> = reader
+        .samples::<i16>()
+        .step_by(spec.channels as usize)
+        .map(|s| s.expect("failed to read sample"))
+        .collect();
+
+    // Resample to 16kHz if needed
+    let target_rate = 16000;
+    let samples = if spec.sample_rate != target_rate {
+        println!("Resampling {}Hz -> {}Hz", spec.sample_rate, target_rate);
+        use wavekat_vad::preprocessing::AudioResampler;
+        let mut resampler =
+            AudioResampler::new(spec.sample_rate, target_rate).expect("failed to create resampler");
+        resampler.process(&samples)
+    } else {
+        samples
+    };
+
+    let duration_s = samples.len() as f64 / target_rate as f64;
+    println!(
+        "Duration: {duration_s:.2}s ({} samples at {target_rate}Hz)\n",
+        samples.len()
+    );
+
+    // Create FireRedVAD — the ONNX model + CMVN stats are embedded in the binary at compile time
+    let vad = FireRedVad::new().expect("failed to create FireRedVAD");
+    let caps = vad.capabilities();
+    println!(
+        "FireRedVAD — frame: {} samples ({}ms)\n",
+        caps.frame_size, caps.frame_duration_ms
+    );
+    // Each probability printed below covers one VAD frame, so timestamps advance by the
+    // frame duration (read here, before `vad` is moved into the adapter)
+    let frame_ms = caps.frame_duration_ms as f64;
+
+    // FrameAdapter handles automatic frame buffering so you can feed any chunk size
+    let mut adapter = FrameAdapter::new(vad);
+
+    // Process in 20ms chunks (arbitrary — the adapter buffers to the required 10ms frame size)
+    let chunk_size = target_rate as usize / 50; // 320 samples = 20ms
+    let mut time_ms = 0.0;
+
+    for chunk in samples.chunks(chunk_size) {
+        let results = adapter
+            .process_all(chunk, target_rate)
+            .expect("failed to process audio chunk");
+        for prob in results {
+            let bar = "#".repeat((prob * 40.0) as usize);
+            let label = if prob > 0.5 { " SPEECH" } else { "" };
+            println!("{time_ms:8.0}ms {prob:.3} {bar}{label}");
+            // Advance per emitted frame: a 20ms chunk yields several results, and bumping
+            // the clock once per chunk would print them all with the same timestamp
+            time_ms += frame_ms;
+        }
+    }
+
+    println!("\nFinished.");
+}
diff --git a/runtime/README.md b/runtime/README.md
index 9fd9e23..e736701 100644
--- a/runtime/README.md
+++ b/runtime/README.md
@@ -11,4 +11,7 @@ What’s included
 More details in [NCNN README.md](ncnn/)
 
 ## onnxruntime
-TODO
\ No newline at end of file
+
+### Rust
+
+- [Rust ONNX Example](../examples/rust-onnx-example/) — Stream VAD using [wavekat-vad](https://github.com/wavekat/wavekat-vad) with pure Rust Mel filterbank + CMVN preprocessing
\ No newline at end of file