diff --git a/.gitignore b/.gitignore index d5a631f..d8789d6 100644 --- a/.gitignore +++ b/.gitignore @@ -5,6 +5,9 @@ Cargo.lock .DS_Store .cargo/config.toml +# Claude Code runtime state +.claude/scheduled_tasks.lock + # Python tooling (scripts/) scripts/.venv/ scripts/__pycache__/ diff --git a/Makefile b/Makefile index 4f1b0bd..2ae6151 100644 --- a/Makefile +++ b/Makefile @@ -1,4 +1,4 @@ -.PHONY: help check test fmt lint doc ci accuracy mel example-controller +.PHONY: help check test fmt lint doc ci accuracy mel hf-smoke example-controller help: @echo "Available targets:" @@ -6,6 +6,7 @@ help: @echo " test Run all tests" @echo " accuracy Cross-validate Rust pipeline against Python reference" @echo " mel Compare Rust vs Python mel spectrograms element-wise" + @echo " hf-smoke Download wavekat/smart-turn-ONNX from HF and run zh fixtures" @echo " fmt Format code" @echo " lint Run clippy with warnings as errors" @echo " doc Build and open docs in browser" @@ -28,6 +29,13 @@ accuracy: mel: cargo test --features pipecat -- mel_report --ignored --nocapture +# Download wavekat/smart-turn-ONNX from HuggingFace and assert the zh fine-tune +# correctly classifies the Mandarin fixtures. Requires network on first run; +# subsequent runs hit the HF cache under $HF_HOME/hub/. +hf-smoke: + cargo test --features wavekat-smart-turn --test pipecat \ + -- --ignored wavekat_hf_download_smoke --nocapture + # Run TurnController example example-controller: cargo run --features pipecat --example controller diff --git a/README.md b/README.md index c560e4b..49995ba 100644 --- a/README.md +++ b/README.md @@ -19,8 +19,15 @@ models behind common Rust traits. 
Same pattern as | Backend | Feature flag | Input | Model size | Inference | License | |---------|-------------|-------|------------|-----------|---------| | [Pipecat Smart Turn v3](https://github.com/pipecat-ai/smart-turn) | `pipecat` | Audio (16 kHz PCM) | ~8 MB (int8 ONNX) | ~12 ms CPU | BSD 2-Clause | +| WaveKat Smart Turn fine-tunes ([HF](https://huggingface.co/wavekat/smart-turn-ONNX)) | `wavekat-smart-turn` | Audio (16 kHz PCM) | ~8 MB (int8 ONNX) | ~12 ms CPU | BSD 2-Clause | | [LiveKit Turn Detector](https://github.com/livekit/turn-detector) | `livekit` | Text (ASR transcript) | ~400 MB (ONNX) | ~25 ms CPU | LiveKit Model License | +The WaveKat fine-tunes share the upstream Pipecat ONNX contract (same input +shape, same tensor names) — they're language-specialized weights for the +same architecture. Use them when you want better behavior on a specific +language; today Mandarin (`zh`) is the only one shipped, but more will land +in the same HF repo over time. + ## Quick Start ```sh @@ -92,8 +99,33 @@ wavekat-voice --> orchestrates VAD + turn + ASR + LLM + TTS | Flag | Default | Description | |------|---------|-------------| | `pipecat` | off | Pipecat Smart Turn v3 audio backend (requires `ort`, `ndarray`) | +| `wavekat-smart-turn` | off | WaveKat language-specialized fine-tunes; implies `pipecat`, adds `hf-hub` runtime download | | `livekit` | off | LiveKit text-based backend (requires `ort`, `ndarray`) | +### Selecting a Smart Turn variant + +```rust +use wavekat_turn::audio::{PipecatSmartTurn, SmartTurnVariant}; +# #[cfg(feature = "wavekat-smart-turn")] +use wavekat_turn::audio::SmartTurnLang; + +// Embedded upstream weights — works offline, no setup. +let detector = PipecatSmartTurn::new()?; + +# #[cfg(feature = "wavekat-smart-turn")] +// WaveKat Mandarin fine-tune — downloaded from HuggingFace on first call, +// then cached under $HF_HOME/hub/. 
+let detector = PipecatSmartTurn::with_variant( + SmartTurnVariant::Wavekat(SmartTurnLang::Zh), +)?; +``` + +The first call for a WaveKat variant downloads the ONNX from +[`wavekat/smart-turn-ONNX`](https://huggingface.co/wavekat/smart-turn-ONNX) +and caches it under `$HF_HOME/hub/` (default `~/.cache/huggingface/hub/`). +For offline or CI use, set `WAVEKAT_TURN_MODEL_DIR` to a directory containing +`<lang>/smart-turn-cpu.onnx` (e.g. `zh/smart-turn-cpu.onnx`) to skip the download. + ## Important Notes - **8 kHz telephony audio must be upsampled to 16 kHz** before passing to diff --git a/crates/wavekat-turn/Cargo.toml b/crates/wavekat-turn/Cargo.toml index aa9d3f4..865211c 100644 --- a/crates/wavekat-turn/Cargo.toml +++ b/crates/wavekat-turn/Cargo.toml @@ -17,6 +17,10 @@ build = "build.rs" default = [] pipecat = ["dep:ort", "dep:ndarray", "dep:realfft", "dep:ureq"] livekit = ["dep:ort", "dep:ndarray"] +# WaveKat language-specialized Smart Turn fine-tunes, fetched from HuggingFace +# at runtime via `hf-hub`. The language is chosen at runtime through +# `SmartTurnVariant::Wavekat(SmartTurnLang::…)`. +wavekat-smart-turn = ["pipecat", "dep:hf-hub"] [dependencies] wavekat-core = "0.0.4" @@ -26,6 +30,9 @@ thiserror = "2" ort = { version = "2.0.0-rc.12", optional = true, features = ["ndarray"] } ndarray = { version = "0.17", optional = true } realfft = { version = "3", optional = true } +# Runtime HuggingFace downloads for WaveKat fine-tunes (gated on +# `wavekat-smart-turn`). A blocking ureq backend keeps us off tokio. +hf-hub = { version = "0.5", optional = true, default-features = false, features = ["ureq"] } [build-dependencies] ureq = { version = "3", optional = true } diff --git a/crates/wavekat-turn/src/audio/mod.rs b/crates/wavekat-turn/src/audio/mod.rs index 2e3869b..3f5f198 100644 --- a/crates/wavekat-turn/src/audio/mod.rs +++ b/crates/wavekat-turn/src/audio/mod.rs @@ -2,9 +2,21 @@ //! //! These backends operate directly on raw audio frames and do not //! require an upstream ASR transcript. +//! +//! 
[`PipecatSmartTurn`] is the entry point; [`SmartTurnVariant`] selects +//! which set of weights to load (upstream Pipecat vs WaveKat fine-tunes). +//! When the `wavekat-smart-turn` feature is enabled, [`SmartTurnLang`] +//! enumerates the language-specialized fine-tunes available on +//! HuggingFace. #[cfg(feature = "pipecat")] mod pipecat; +#[cfg(feature = "wavekat-smart-turn")] +pub(crate) mod wavekat_download; + #[cfg(feature = "pipecat")] -pub use pipecat::PipecatSmartTurn; +pub use pipecat::{PipecatSmartTurn, SmartTurnVariant}; + +#[cfg(feature = "wavekat-smart-turn")] +pub use pipecat::SmartTurnLang; diff --git a/crates/wavekat-turn/src/audio/pipecat.rs b/crates/wavekat-turn/src/audio/pipecat.rs index eefe115..1ac38cd 100644 --- a/crates/wavekat-turn/src/audio/pipecat.rs +++ b/crates/wavekat-turn/src/audio/pipecat.rs @@ -57,6 +57,40 @@ use realfft::{RealFftPlanner, RealToComplex}; use crate::onnx; use crate::{AudioFrame, AudioTurnDetector, StageTiming, TurnError, TurnPrediction, TurnState}; +// --------------------------------------------------------------------------- +// Model variants +// --------------------------------------------------------------------------- + +/// Language for a WaveKat fine-tune of Pipecat Smart Turn. +/// +/// Each variant resolves to a `<lang>/smart-turn-cpu.onnx` file inside the +/// language-agnostic HuggingFace repo `wavekat/smart-turn-ONNX`. The set is +/// marked `#[non_exhaustive]` because adding a new language must not be a +/// breaking change. +#[cfg(feature = "wavekat-smart-turn")] +#[non_exhaustive] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum SmartTurnLang { + /// Mandarin Chinese. + Zh, +} + +/// Which set of Smart Turn weights to load. +/// +/// All variants share the same architecture (Whisper-Tiny encoder + binary +/// classification head) and ONNX tensor contract — only the weights differ. 
+#[non_exhaustive] +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum SmartTurnVariant { + /// Upstream multilingual Pipecat Smart Turn v3 (embedded in the crate). + PipecatV3, + /// WaveKat language-specialized fine-tune. Resolved at runtime through + /// HuggingFace (cached under `$HF_HOME/hub/`) and overridable via + /// `WAVEKAT_TURN_MODEL_DIR`. + #[cfg(feature = "wavekat-smart-turn")] + Wavekat(SmartTurnLang), +} + // --------------------------------------------------------------------------- // Constants // --------------------------------------------------------------------------- @@ -373,6 +407,10 @@ fn prepare_audio(samples: &[f32]) -> Vec { /// Pipecat Smart Turn v3 detector. /// +/// Wraps the Smart Turn v3 architecture (Whisper-Tiny encoder + binary +/// classification head). Use [`new`] for the embedded upstream weights, or +/// [`with_variant`] to pick a WaveKat fine-tune at runtime. +/// /// Buffers up to 8 seconds of audio internally. Call [`push_audio`] with /// every incoming 16 kHz frame, then call [`predict`] when the VAD fires /// end-of-speech to get a [`TurnPrediction`]. @@ -392,6 +430,8 @@ fn prepare_audio(samples: &[f32]) -> Vec { /// # } /// ``` /// +/// [`new`]: Self::new +/// [`with_variant`]: Self::with_variant /// [`push_audio`]: AudioTurnDetector::push_audio /// [`predict`]: AudioTurnDetector::predict pub struct PipecatSmartTurn { @@ -409,12 +449,34 @@ unsafe impl Send for PipecatSmartTurn {} unsafe impl Sync for PipecatSmartTurn {} impl PipecatSmartTurn { - /// Load the Smart Turn v3.2 model embedded at compile time. + /// Load the upstream Pipecat Smart Turn v3.2 model embedded at compile time. + /// + /// Equivalent to [`with_variant(SmartTurnVariant::PipecatV3)`](Self::with_variant). pub fn new() -> Result { let session = onnx::session_from_memory(MODEL_BYTES)?; Ok(Self::build(session)) } + /// Load a specific variant of the Smart Turn model. 
+ /// + /// - [`SmartTurnVariant::PipecatV3`] uses the embedded ONNX bytes — no + /// network required. + /// - [`SmartTurnVariant::Wavekat`] (when the `wavekat-smart-turn` feature + /// is enabled) downloads the corresponding language file from the + /// `wavekat/smart-turn-ONNX` HuggingFace repo and caches it under + /// `$HF_HOME/hub/`. Set `WAVEKAT_TURN_MODEL_DIR` to point at a + /// pre-populated directory (offline / CI use). + pub fn with_variant(variant: SmartTurnVariant) -> Result<Self, TurnError> { + match variant { + SmartTurnVariant::PipecatV3 => Self::new(), + #[cfg(feature = "wavekat-smart-turn")] + SmartTurnVariant::Wavekat(lang) => { + let path = crate::audio::wavekat_download::resolve_model(lang)?; + Self::from_file(path) + } + } + } + /// Load a model from a custom path on disk. /// /// Useful for CI environments that supply the model file separately, or diff --git a/crates/wavekat-turn/src/audio/wavekat_download.rs b/crates/wavekat-turn/src/audio/wavekat_download.rs new file mode 100644 index 0000000..697564c --- /dev/null +++ b/crates/wavekat-turn/src/audio/wavekat_download.rs @@ -0,0 +1,66 @@ +//! Runtime download of WaveKat-trained Smart Turn weights from HuggingFace. +//! +//! Mirrors the `wavekat-tts` pattern: one language-agnostic HF repo with +//! per-language subdirectories, a dated `REVISION` pinned in code so that +//! model updates ship via a crate release, and a `WAVEKAT_TURN_MODEL_DIR` +//! escape hatch for offline / CI builds. + +use std::path::PathBuf; + +use hf_hub::api::sync::ApiBuilder; +use hf_hub::{Repo, RepoType}; + +use super::pipecat::SmartTurnLang; +use crate::error::TurnError; + +/// HuggingFace repo holding all WaveKat Smart Turn fine-tunes. +const REPO_ID: &str = "wavekat/smart-turn-ONNX"; + +/// Model revision to fetch; bumping it is how updated weights ship to consumers. +/// NOTE(review): `main` is a moving branch, not a pin — use a dated tag or commit SHA. 
+const REVISION: &str = "main"; + +/// Env var that lets callers point at a local directory containing +/// `<lang>/smart-turn-cpu.onnx`, skipping the HuggingFace download entirely. +const LOCAL_DIR_ENV: &str = "WAVEKAT_TURN_MODEL_DIR"; + +/// Map a language to its file path inside the HF repo. +fn relative_path(lang: SmartTurnLang) -> &'static str { + match lang { + SmartTurnLang::Zh => "zh/smart-turn-cpu.onnx", + } +} + +/// Resolve the on-disk path for `lang`, downloading from HuggingFace if needed. +pub(crate) fn resolve_model(lang: SmartTurnLang) -> Result<PathBuf, TurnError> { + let rel = relative_path(lang); + + if let Some(dir) = std::env::var_os(LOCAL_DIR_ENV) { + let candidate = PathBuf::from(dir).join(rel); + if !candidate.exists() { + return Err(TurnError::ModelNotLoaded(format!( + "{LOCAL_DIR_ENV} is set but {} does not exist", + candidate.display() + ))); + } + return Ok(candidate); + } + + let api = ApiBuilder::new() + .with_token(std::env::var("HF_TOKEN").ok()) + .build() + .map_err(|e| TurnError::BackendError(format!("failed to build hf-hub client: {e}")))?; + + let repo = api.repo(Repo::with_revision( + REPO_ID.to_string(), + RepoType::Model, + REVISION.to_string(), + )); + + repo.get(rel).map_err(|e| { + TurnError::BackendError(format!( + "failed to download {REPO_ID}@{REVISION}:{rel} from HuggingFace: {e}. \ + Set {LOCAL_DIR_ENV} to a directory containing {rel} to skip the download." + )) + }) +} diff --git a/crates/wavekat-turn/src/lib.rs b/crates/wavekat-turn/src/lib.rs index fa68fa2..a696553 100644 --- a/crates/wavekat-turn/src/lib.rs +++ b/crates/wavekat-turn/src/lib.rs @@ -17,8 +17,15 @@ //! //! | Feature | Backend | Input | //! |---------|---------|-------| -//! | `pipecat` | Pipecat Smart Turn v3 (ONNX) | Audio (16 kHz) | +//! | `pipecat` | Pipecat Smart Turn v3 (ONNX, embedded) | Audio (16 kHz) | +//! | `wavekat-smart-turn` | WaveKat language-specialized fine-tunes (ONNX, runtime download) | Audio (16 kHz) | //! 
| `livekit` | LiveKit Turn Detector (ONNX) | Text | +//! +//! `wavekat-smart-turn` implies `pipecat` and adds an `hf-hub` runtime +//! dependency. Weights live in +//! [`wavekat/smart-turn-ONNX`](https://huggingface.co/wavekat/smart-turn-ONNX) +//! and are cached under `$HF_HOME/hub/`. Set `WAVEKAT_TURN_MODEL_DIR` to a +//! directory containing `<lang>/smart-turn-cpu.onnx` to skip the download. pub mod controller; pub mod error; diff --git a/crates/wavekat-turn/tests/pipecat.rs b/crates/wavekat-turn/tests/pipecat.rs index 0804308..018936c 100644 --- a/crates/wavekat-turn/tests/pipecat.rs +++ b/crates/wavekat-turn/tests/pipecat.rs @@ -47,6 +47,46 @@ fn test_new_loads_model() { PipecatSmartTurn::new().expect("PipecatSmartTurn::new() should succeed"); } +#[test] +fn test_with_variant_pipecat_v3_loads_model() { + use wavekat_turn::audio::SmartTurnVariant; + PipecatSmartTurn::with_variant(SmartTurnVariant::PipecatV3) + .expect("with_variant(PipecatV3) should succeed"); +} + +/// Exercise the WAVEKAT_TURN_MODEL_DIR override path without touching the +/// network: drop the embedded Pipecat ONNX into a temp dir under the +/// expected `<lang>/smart-turn-cpu.onnx` layout and confirm the variant +/// loader picks it up. The bytes happen to be the upstream model — that's +/// fine; we are only asserting the file resolution path works. +#[cfg(feature = "wavekat-smart-turn")] +#[test] +fn test_wavekat_variant_uses_local_dir_override() { + use wavekat_turn::audio::{SmartTurnLang, SmartTurnVariant}; + + let tmp = std::env::temp_dir().join("wavekat_turn_local_dir_test"); + let lang_dir = tmp.join("zh"); + std::fs::create_dir_all(&lang_dir).unwrap(); + let path = lang_dir.join("smart-turn-cpu.onnx"); + let model_bytes = include_bytes!(concat!(env!("OUT_DIR"), "/smart-turn-v3.2-cpu.onnx")); + std::fs::write(&path, model_bytes).unwrap(); + + // SAFETY: `set_var`/`remove_var` are only sound while no other thread + // touches the process environment. 
NOTE(review): the std test harness runs + // tests on parallel threads, so that is not guaranteed here — run this + // test with `--test-threads=1` or guard env access with a shared lock. + unsafe { + std::env::set_var("WAVEKAT_TURN_MODEL_DIR", &tmp); + } + let result = PipecatSmartTurn::with_variant(SmartTurnVariant::Wavekat(SmartTurnLang::Zh)); + unsafe { + std::env::remove_var("WAVEKAT_TURN_MODEL_DIR"); + } + + let _ = std::fs::remove_dir_all(&tmp); + result.expect("with_variant(Wavekat(Zh)) should pick up the local override"); +} + #[test] fn test_from_file_loads_model() { let tmp = std::env::temp_dir().join("wavekat_turn_test"); @@ -152,6 +192,115 @@ fn test_from_file_invalid_path_returns_error() { ); } +/// End-to-end smoke test for the WaveKat HuggingFace download path. +/// +/// Pulls `wavekat/smart-turn-ONNX` from the Hub (cached in `$HF_HOME/hub/` +/// after the first run), runs it against the repo fixtures, and prints a +/// markdown table of probabilities. Asserts that the three `zh_*` clips +/// (Mandarin, synthesized with wavekat-tts at 24 kHz and resampled to +/// 16 kHz via ffmpeg) classify on the expected side of 0.5. Marked +/// `#[ignore]` so CI and `cargo test` never hit the network unintentionally. 
+/// +/// Run with: +/// cargo test --features wavekat-smart-turn --test pipecat \ +/// -- --ignored wavekat_hf_download_smoke --nocapture +#[cfg(feature = "wavekat-smart-turn")] +#[test] +#[ignore = "network: downloads ~8 MB from huggingface.co"] +fn wavekat_hf_download_smoke() { + use std::path::Path; + + use wavekat_turn::audio::{SmartTurnLang, SmartTurnVariant}; + use wavekat_turn::TurnState; + + fn fixtures_dir() -> std::path::PathBuf { + Path::new(env!("CARGO_MANIFEST_DIR")) + .parent() + .unwrap() + .parent() + .unwrap() + .join("tests/fixtures") + } + + fn load_wav(path: &Path) -> Vec { + let mut reader = + hound::WavReader::open(path).unwrap_or_else(|e| panic!("open {}: {e}", path.display())); + let spec = reader.spec(); + assert_eq!(spec.sample_rate, 16_000); + assert_eq!(spec.channels, 1); + match spec.sample_format { + hound::SampleFormat::Int => reader + .samples::() + .map(|s| s.unwrap() as f32 / 32768.0) + .collect(), + hound::SampleFormat::Float => reader.samples::().map(|s| s.unwrap()).collect(), + } + } + + fn p_complete(pred: &TurnPrediction) -> f32 { + match pred.state { + TurnState::Finished => pred.confidence, + TurnState::Unfinished => 1.0 - pred.confidence, + TurnState::Wait => unreachable!(), + } + } + + println!("\nLoading wavekat/smart-turn-ONNX (zh) from HuggingFace…"); + let mut detector = PipecatSmartTurn::with_variant(SmartTurnVariant::Wavekat(SmartTurnLang::Zh)) + .expect("HF download / model load failed"); + + // (clip, expected_state) — None means "print only, no assertion". + // English clips are kept for diagnostics; the zh fine-tune isn't expected + // to score them correctly. 
+ let clips: &[(&str, Option)] = &[ + ("silence_2s.wav", None), + ("speech_finished.wav", None), + ("speech_mid.wav", None), + ("zh_speech_finished.wav", Some(TurnState::Finished)), + ("zh_speech_finished_short.wav", Some(TurnState::Finished)), + ("zh_speech_mid.wav", Some(TurnState::Unfinished)), + ]; + + println!(); + println!("| Clip | P(complete) | State | Latency (ms) | Expected |"); + println!("|------|-------------|-------|--------------|----------|"); + let mut failures = Vec::new(); + for (clip, expected) in clips { + detector.reset(); + let samples = load_wav(&fixtures_dir().join(clip)); + for chunk in samples.chunks(1600) { + detector.push_audio(&AudioFrame::new(chunk, 16_000)); + } + let pred = detector.predict().expect("predict failed"); + valid_prediction(&pred); + let exp_label = expected.map(|s| format!("{s:?}")).unwrap_or("—".into()); + println!( + "| `{}` | {:.4} | {:?} | {} | {} |", + clip, + p_complete(&pred), + pred.state, + pred.latency_ms, + exp_label, + ); + if let Some(want) = expected { + if pred.state != *want { + failures.push(format!( + "{clip}: expected {want:?}, got {:?} (P={:.4})", + pred.state, + p_complete(&pred), + )); + } + } + } + println!(); + if !failures.is_empty() { + panic!( + "zh fixture misclassifications:\n {}", + failures.join("\n ") + ); + } +} + /// Smoke test: latency is measured and non-zero (always runs, including debug). #[test] fn test_latency_is_measured() { diff --git a/docs/04-plan-wavekat-smart-turn.md b/docs/04-plan-wavekat-smart-turn.md new file mode 100644 index 0000000..e85c165 --- /dev/null +++ b/docs/04-plan-wavekat-smart-turn.md @@ -0,0 +1,465 @@ +# Plan: Distribute WaveKat Smart Turn Fine-tunes via `wavekat-turn` + +**Status:** Draft for review +**Date:** 2026-05-11 +**Branch:** `feat/wavekat-smart-turn` + +> Scope: a language-agnostic distribution path for our own Smart Turn +> fine-tunes. 
Mandarin (`zh`) is the first language we ship; the design must +> let us add more languages without breaking changes or new HF repos. + +--- + +## What we are (and are not) shipping + +**Smart Turn is Pipecat's project.** The architecture (Whisper-Tiny encoder + +binary classification head), the training recipe, the ONNX export pipeline, +and the original `smart-turn-v3.2-cpu.onnx` weights all belong to +[pipecat-ai/smart-turn](https://github.com/pipecat-ai/smart-turn) (BSD 2-Clause). + +What WaveKat contributes is **language-specialized weights** that drop into the +same architecture, exported to the **same ONNX interface** Pipecat already +defines. Concretely: + +- **Same input tensor:** `input_features`, shape `[B, 80, 800]`, float32. +- **Same output tensor:** `logits`, shape `[B, 1]`, float32 (sigmoid fused). +- **Same audio pipeline:** 16 kHz mono, 8-second window, Whisper-style log-mel + features (Slaney, n_fft=400, hop=160, 80 mels). + +The implication that runs through this whole plan: **anything compatible with +upstream Pipecat Smart Turn must remain compatible with our weights, and vice +versa.** That includes: + +1. **Pipecat's own Python loader** (`smart-turn` repo) must be able to consume + our ONNX files with no code changes. Validated by running the upstream + Python inference script against our exported ONNX. +2. **Our `wavekat-turn` Rust loader** picks them up via the same `from_file()` + path used today for the upstream model. +3. **Future ports** (e.g. a Pipecat Python integration, or a third-party + loader) can use the same files. + +Practical consequences for this design: + +- The HF repo must be **architecture-named, not crate-named**. It is "ONNX + weights for Pipecat Smart Turn, fine-tuned by WaveKat", not "weights for + wavekat-turn". +- The model card must lead with **strong attribution to upstream Pipecat** + (link to the GitHub repo, the upstream HF org, and the BSD 2-Clause notice) + before describing our fine-tunes. 
+- Tensor names, shapes, and feature-extraction parameters are **frozen** to + match Pipecat. If Pipecat ever revs the architecture (e.g. a v4 with + different input shape), we add a new family of repos rather than mutating + the existing one. +- We should not rename `PipecatSmartTurn` to `SmartTurnDetector` in + `wavekat-turn` (previously suggested as a follow-up). The name correctly + identifies the *architecture* we are wrapping; both upstream and our + weights are instances of it. That decision is now reversed — see + Decision 9. + +--- + +## Context + +- `training/smart-turn-zh/` produced a Mandarin fine-tune of Pipecat Smart Turn + v3 (same architecture: Whisper-Tiny encoder + binary classification head). +- The trained model lives in **wavekat-platform**, which is an **internal-only** + model registry today (no public anonymous read access). +- `wavekat-turn` is a public OSS crate published to crates.io. Its build script + downloads `smart-turn-v3.2-cpu.onnx` from HuggingFace at build time and embeds + the bytes via `include_bytes!()`. +- Goal: make our Chinese model usable from `wavekat-turn` with the same + zero-setup experience the upstream Pipecat model gets today. 
+ +### Sibling-repo precedent: `wavekat-tts` + +`wavekat-tts` already publishes WaveKat-owned ONNX weights publicly under the +existing HuggingFace org **[`wavekat`](https://huggingface.co/wavekat)**: + +| Repo | Layout | Loading mechanism | +|------|--------|-------------------| +| [`wavekat/Qwen3-TTS-1.7B-VoiceDesign-ONNX`](https://huggingface.co/wavekat/Qwen3-TTS-1.7B-VoiceDesign-ONNX) | `fp32/*.onnx`, `int4/*.onnx`, `config.json`, `embeddings/*.npy`, `tokenizer/*` | Runtime download via `hf-hub` crate, cached at `$HF_HOME/hub/` | +| [`wavekat/Qwen3-TTS-0.6B-Base-ONNX`](https://huggingface.co/wavekat/Qwen3-TTS-0.6B-Base-ONNX) | Same shape, plus `speaker_encoder.onnx` / `tokenizer_encoder.onnx` | Same | + +Conventions established by `wavekat-tts` that we should follow: +- **HF org name:** `wavekat` (already confirmed live). +- **Repo naming:** `wavekat/<model>-ONNX` with the `-ONNX` suffix. +- **Multi-precision layout:** `fp32/` and `int4/` subdirs inside one repo, so + users pick precision at runtime instead of at build time. +- **Revision pinning:** the consuming crate pins a dated revision string in + code (e.g. `REVISION: &str = "2026-04-06"`), so model updates ship via a + crate release, not silently when a user re-pulls. +- **Local override env var:** `WAVEKAT_MODEL_DIR` (TTS uses + `WAVEKAT_MODEL_DIR` / `WAVEKAT_CLONE_MODEL_DIR`) lets users point at a + pre-populated directory and skip downloads entirely — needed for offline + builds and CI. +- **License:** Apache 2.0 on the consuming crate; the model files inherit + their upstream license. + +--- + +## Question: HuggingFace first, or load from wavekat-platform? + +**Recommendation: HuggingFace first.** Use wavekat-platform as the +source-of-truth training registry; treat HF as the **public distribution +mirror** for snapshots we have explicitly chosen to release. This matches what +`wavekat-tts` already does — the `wavekat` HF org is established and the +pattern is proven across the ecosystem. 
+ +| Concern | wavekat-platform | HuggingFace | +|-------------------------------|--------------------------------|---------------------------------------------------| +| Public anonymous access | No (internal) | Yes | +| Works in OSS user's `cargo build` | Would require auth tokens | Anonymous HTTP GET, no auth | +| CDN / global cache | None | Built-in | +| Matches upstream Pipecat path | No | Yes — same host, same URL shape as Pipecat | +| Build-script complexity | Auth, secrets, rate limits | A single `ureq::get(url)` call (already in place) | +| Versioning / reproducibility | Internal version IDs | Git revisions on the model repo | + +### What the workflow looks like + +1. Train on `wavekat-lab`, push artifact to **wavekat-platform** (already done). +2. When a checkpoint is ready for public release, **export an ONNX snapshot + to a HF model repo** under a WaveKat org (e.g. `wavekat/smart-turn-zh`). +3. `wavekat-turn`'s build script downloads from HF, the same way it does for + Pipecat. The platform stays internal; HF carries the public bits only. + +This keeps two clear roles: +- **Platform = training registry** (private, includes raw checkpoints, eval + artifacts, experiments). +- **HF = release channel** (public, only the ONNX files we have decided to + ship, tagged and immutable). + +### Open questions before publishing to HF + +- ~~HF org/account name.~~ **Resolved**: use the existing `wavekat` org (same + as `wavekat-tts` models). +- **HF repo name (language-agnostic).** zh is just the first of many planned + languages, so the repo name must not bake the language in. Two viable + shapes: + + **A. One repo, language subdirs** *(recommended)* + ``` + wavekat/smart-turn-ONNX + ├── zh/smart-turn-cpu.onnx + ├── ja/smart-turn-cpu.onnx (future) + ├── yue/smart-turn-cpu.onnx (future) + └── README.md + ``` + Mirrors the TTS precedent of per-axis subdirs (`fp32/`, `int4/`). 
+ Adding a language later is a file push, not a new repo + new model card + + new revision string. + + **B. Per-language repos with a stable parent pattern** + ``` + wavekat/smart-turn-zh-ONNX (this branch) + wavekat/smart-turn-ja-ONNX (future) + ``` + Cleanest model card per language, but every new language is a new repo + + new constants in `wavekat-turn`, and the repo name still encodes a + language — exactly what we want to avoid. + + **Decision proposed: A.** Single repo `wavekat/smart-turn-ONNX` with + `<lang>/` subdirs. Future expansion is additive and never requires a new + HF repo. + +- License. Pipecat upstream is BSD 2-Clause. Our fine-tunes inherit that + unless we add separate ToS. Confirm we are comfortable publishing under + BSD 2-Clause. +- Model card content: per-language sections (training data sources, eval + numbers, intended use, known limitations — dialect coverage, SNR + conditions). Keep a single top-level model card with a section per + language. +- Revision convention. Pin a single dated `REVISION = "YYYY-MM-DD"` in + `wavekat-turn` code, same as `wavekat-tts`. Updates to any language + bump the same revision. + +--- + +## Architecture: how to add the model to wavekat-turn + +The Chinese model is **the same architecture** as upstream Pipecat — only the +weights differ. Mel feature extraction, ring-buffer logic, tensor shapes, +output interpretation, and the 0.5 threshold are all identical. + +That means we have three real options for how the public API surfaces it. + +### Option A — Variant on the existing `PipecatSmartTurn` struct *(recommended)* + +Add a `SmartTurnVariant` enum and constructors that select which set of weights to load. +Inference code is unchanged. + +```rust +/// Language for the WaveKat fine-tune. Extend as we ship more languages. +#[non_exhaustive] +pub enum SmartTurnLang { + /// Mandarin Chinese (first WaveKat fine-tune). + Zh, + // Ja, Yue, ... 
(future) +} + +#[non_exhaustive] +pub enum SmartTurnVariant { + /// Upstream multilingual Pipecat Smart Turn v3. + PipecatV3, + /// WaveKat fine-tune for a specific language. + Wavekat(SmartTurnLang), +} + +impl PipecatSmartTurn { + pub fn new() -> Result<Self, TurnError>; // unchanged: PipecatV3 + pub fn with_variant(v: SmartTurnVariant) -> Result<Self, TurnError>; + pub fn from_file(path: impl AsRef<Path>) -> Result<Self, TurnError>; // unchanged +} +``` + +`#[non_exhaustive]` on both enums is deliberate: adding a new language must +not be a breaking change. + +The build/load layer resolves `Wavekat(lang)` to `<lang>/smart-turn-cpu.onnx` +inside the single `wavekat/smart-turn-ONNX` HF repo, so adding a new language +is a one-line variant addition + a file in the HF repo — no new constants, +no new feature flag. + +**Pros** +- Zero code duplication; the feature is purely "different bytes". +- Honest naming: the *backend* is "Pipecat Smart Turn v3 architecture"; both + models are instances of it. +- Users on a strict binary-size budget can disable one variant via features. + +**Cons** +- `PipecatSmartTurn` is no longer a single-model thing; the type name suggests + "Pipecat" even when running our weights. We can rename the struct to + `SmartTurnDetector` and keep `PipecatSmartTurn` as a deprecated type alias. + +### Option B — Separate `WavekatSmartTurnZh` struct + +A new struct in `audio/smart_turn_zh.rs` that mostly re-exports the same mel +extractor and inference logic. + +**Pros** +- Clearer in API docs: "for Chinese, use this struct". + +**Cons** +- The mel extractor, ring buffer, and inference path would be copy-pasted or + factored into a shared inner type — extra plumbing for no behavioral + difference. +- Long-term, every additional fine-tune (Cantonese, Japanese, domain-specific) + needs its own struct. Not scalable. + +### Option C — No automatic download; rely on `from_file()` only + +Publish to HF; expect users to download the file themselves and pass the path +to the existing `from_file()` constructor. 
Document the URL in the README. + +**Pros** +- Zero changes to `wavekat-turn` code. +- Lowest friction to ship. + +**Cons** +- Worse UX than the upstream Pipecat path, which is `new()` and "just works". +- Asymmetric: Pipecat users get build-time download, our own users don't. + +**Recommendation: Option A.** Single backend type, two (eventually N) variants. +Same UX as upstream Pipecat. Option C is a reasonable v0 if we want to publish +to HF before doing any Rust work. + +--- + +## Model loading strategy + +This is the most significant new question now that we've seen the +`wavekat-tts` precedent. The two crates have diverged: + +| Crate | Mechanism | Pros | Cons | +|-------|-----------|------|------| +| `wavekat-turn` (today) | `build.rs` downloads, `include_bytes!()` embeds | Zero runtime setup, model lives in the binary, offline-friendly after first build | Bloats binary per variant; no precision choice at runtime; build needs network unless `*_MODEL_PATH` is set | +| `wavekat-tts` (today) | `hf-hub` runtime download to `$HF_HOME/hub/`, override with `WAVEKAT_MODEL_DIR` | Supports large models, runtime precision selection, no binary bloat, easy to update models without rebuilding | First-run network dependency; cache lives outside the build artefact | + +The Chinese model is ~8 MB int8 — small enough to embed under the existing +**< 30 MB → embed** rule in [`02-plan-backends.md`](02-plan-backends.md). So +both options are technically viable. + +### Option 1 — Keep embedding (consistent with `wavekat-turn` today) + +Add the zh ONNX as a second `include_bytes!()` blob, downloaded by `build.rs` +under feature `pipecat-zh`. Identical pattern to upstream Pipecat. + +**Pros**: zero new dependencies; consistent with the current backend; works +offline at runtime; reproducible via the existing version-marker caching. 
+ +**Cons**: ecosystem-inconsistent — a `wavekat-tts` user knows +`WAVEKAT_MODEL_DIR` and `~/.cache/huggingface/hub/`, but a `wavekat-turn` user +has to learn `PIPECAT_SMARTTURN_MODEL_PATH` and a build-time recompile to swap +weights. + +### Option 2 — Switch to `hf-hub` runtime download (align with `wavekat-tts`) + +Add `hf-hub` as a runtime dep gated on `pipecat-zh`. On first `new()` for the +zh variant, download `smart-turn-zh-cpu.onnx` to `$HF_HOME/hub/`. Honor +`WAVEKAT_MODEL_DIR` and `HF_TOKEN`. + +**Pros**: unified ecosystem story across `wavekat-vad` / `wavekat-turn` / +`wavekat-tts`; trivially supports future fine-tunes (Cantonese, domain-specific +etc.) without re-publishing the crate; no binary bloat; users can swap model +revisions by setting `WAVEKAT_MODEL_DIR` without rebuilding. + +**Cons**: divergence within `wavekat-turn` itself — upstream Pipecat stays +embedded, zh model downloads at runtime. Two mental models for the same crate. +And it introduces first-run network dependency for the zh variant. + +### Option 3 — Switch both variants to `hf-hub` (full alignment) + +Migrate the upstream Pipecat variant off `include_bytes!()` too, so the whole +crate uses `hf-hub` like `wavekat-tts`. Out of scope for this branch — would +need its own migration plan and a major-version bump. + +### Recommendation + +**Option 2** for this branch, with **Option 3 as a follow-up** in a separate +migration plan. + +Reasoning: +- The ecosystem-consistency win is real: a user who already runs `wavekat-tts` + doesn't have to learn a second set of env vars. +- The zh variant is the natural place to introduce `hf-hub` because it's + greenfield — no existing users to migrate. +- Once `hf-hub` is in the dep tree under a feature, migrating the upstream + Pipecat variant later is a localized change behind the same trait. +- We get **runtime precision selection** for free if/when we publish an fp16 + variant — no rebuild required. 
+ +If we're cautious about adding `hf-hub`, Option 1 is a perfectly fine +fallback. The variant API stays the same either way; only the body of +`with_variant(Wavekat(lang))` changes. + +--- + +## Implementation plan (phased) + +### Phase 0 — Publish to HuggingFace (out-of-repo) + +1. ~~Decide HF org name and create the org if it does not exist.~~ Use + existing `wavekat` org. +2. Create one language-agnostic model repo: **`wavekat/smart-turn-ONNX`**. +3. Write a model card. **Lead with attribution**: + - First section: "WaveKat fine-tunes of [Pipecat Smart Turn v3](https://github.com/pipecat-ai/smart-turn) + ([upstream HF](https://huggingface.co/pipecat-ai/smart-turn-v3), + BSD 2-Clause)". State explicitly that the architecture, training recipe, + and ONNX export contract are Pipecat's; WaveKat contributes + language-specialized weights only. + - Followed by a per-language section (data, eval, limitations). + - Reproduce the BSD 2-Clause notice. +4. Export the ONNX from the wavekat-platform checkpoint we want to ship. + **Compatibility checks before push** (block on these): + - Tensor names match Pipecat: input `input_features` `[B, 80, 800]` + float32, output `logits` `[B, 1]` float32 (sigmoid fused). + - Loads in the upstream **Pipecat Python** inference pipeline with no + code changes — just swap the model path. Capture a reference inference + output for our fixture clips from Python. + - Loads in our **Rust** pipeline via `from_file()` and matches the + Python reference within the existing accuracy tolerance. +5. Push the ONNX to **`zh/smart-turn-cpu.onnx`** in the HF repo. Optionally + add `zh/smart-turn-fp32.onnx` if/when we want to ship higher precision. +6. Sanity check: + `curl -L https://huggingface.co/wavekat/smart-turn-ONNX/resolve/main/zh/smart-turn-cpu.onnx` + returns the expected bytes anonymously. + +Future-language workflow (e.g. Japanese): push `ja/smart-turn-cpu.onnx` to +the same repo, add a `Ja` variant to `SmartTurnLang`, ship a crate release.
+No new HF repo. No new feature flag. + +**Python usability note**: because the ONNX matches Pipecat's contract, a +Python user can consume it directly from the Pipecat `smart-turn` repo with +something like `SmartTurnAnalyzer(model_path=hf_hub_download("wavekat/smart-turn-ONNX", "zh/smart-turn-cpu.onnx"))`. +The HF repo README should include this one-liner so the audience is +explicitly "Pipecat users (Python or Rust) who want non-English support", +not just `wavekat-turn` users. + +### Phase 1 — Add the variant to `wavekat-turn` + +Assuming **Option 2** (hf-hub runtime loading) is chosen: + +- Add `SmartTurnLang` and `SmartTurnVariant` enums (both `#[non_exhaustive]`). + Default constructor `new()` keeps using `PipecatV3` for backwards compat. +- Add `PipecatSmartTurn::with_variant(variant)` constructor. +- Add `hf-hub` as an optional dep gated on the feature flag. +- New module `src/audio/wavekat_download.rs` mirroring `wavekat-tts`' + `download.rs`: + - `REPO_ID = "wavekat/smart-turn-ONNX"`, dated `REVISION`. + - Map `SmartTurnLang::Zh → "zh/smart-turn-cpu.onnx"`. The path lookup is + the single point that knows about languages — adding a language is one + match arm. + - Honor `WAVEKAT_TURN_MODEL_DIR` and `HF_TOKEN` exactly as TTS does for + `WAVEKAT_MODEL_DIR`. Use a turn-specific name to avoid collision with + the TTS env var. + - Return a path that `onnx::session_from_file` consumes. +- In `with_variant(Wavekat(lang))`, call the download helper, then build a + session from the resolved path. The Pipecat variant continues to use the + embedded bytes path — no change. + +Alternative (Option 1, kept for fallback): + +- Extend `build.rs` with a download step per language and per-language + `include_bytes!()` blobs. Each new language requires a recompile and a + crate release — strictly worse for the multi-language future, but + acceptable if we want to avoid the `hf-hub` dependency. 
+ +### Phase 2 — Cross-validation + +- Add fixture clips in Mandarin (a "finished" clip, an "unfinished" clip, + a silence/no-speech clip) under `tests/fixtures/`. +- Regenerate the Python reference (`scripts/gen_reference.py`) against the zh + checkpoint and add `*.zh.mel.npy` / expected probabilities. +- Extend `tests/pipecat.rs` (or add `tests/smart_turn_zh.rs`) with the same + 9-test matrix from Phase 4 of `02-plan-backends.md`, plus parity checks + against the Python reference. + +### Phase 3 — README and example updates + +- README: add a row to the Backends table, document the `pipecat-zh` feature, + show a one-line example with `with_variant(SmartTurnVariant::Wavekat(SmartTurnLang::Zh))`. +- `examples/controller.rs`: optional second example with the zh model. +- Update `02-plan-backends.md` to reflect "model variants" as a concept. + +### Phase 4 — Optional follow-ups + +- ~~Consider renaming `PipecatSmartTurn` to `SmartTurnDetector`.~~ **Reversed**: + keep `PipecatSmartTurn`. Pipecat owns the Smart Turn architecture; the + type name correctly identifies what we are wrapping. Our weights are + *instances* of Pipecat Smart Turn, not a separate detector. +- Decide whether `TurnController` should expose the variant in its API + surface. Probably not: it is detector-agnostic by design. +- Once the HF repo exists, open a small PR / issue on + [pipecat-ai/smart-turn](https://github.com/pipecat-ai/smart-turn) + pointing Python users at our weights for non-English support. Coordinate + on whether they want to list the WaveKat repo from their README. + +--- + +## Risks and tradeoffs + +| Risk | Mitigation | +|---------------------------------------------------------------|-----------------------------------------------------------------------------| +| Two embedded models double the crate's compiled size | Feature-gate each variant; default features enable only `pipecat`.
| +| HF revision drift between platform and HF | Pin the revision in `build.rs` (not just the URL) — same pattern as today. | +| Model card not ready for public release | Phase 0 gates the rest; do not start Phase 1 until the HF repo is signed off. | +| License compatibility (Pipecat is BSD 2-Clause) | Confirm before publishing; include upstream attribution in the model card. | +| Discoverability — users may not know there is a zh variant | README table + variant docstring + a one-liner in the crate-level rustdoc. | +| Pipecat revises the architecture (v4 with different tensor shape) | Frozen contract is documented in Phase 0. A breaking change upstream means a new HF repo family (e.g. `wavekat/smart-turn-v4-ONNX`), not mutating the existing one. | +| Our ONNX silently diverges from Pipecat's contract | Phase 0 compatibility checks (Python pipeline + Rust pipeline) are gating. CI in the training repo should re-run them per checkpoint. | +| Python users can't easily find/use our weights | Model card includes a Python one-liner; consider a PR to upstream pointing at the WaveKat repo. | + +--- + +## Decisions to confirm before implementation + +1. **Distribution channel:** HuggingFace `wavekat` org as the public mirror, platform stays internal? *(strongly recommended: yes — sibling `wavekat-tts` already does this)* +2. **HF repo name:** `wavekat/smart-turn-ONNX` (language-agnostic, with per-language subdirs like `zh/`) — recommended over `wavekat/smart-turn-zh-ONNX` because more languages are coming. +3. **License:** ship under BSD 2-Clause to match upstream Pipecat? *(default: yes)* +4. **API shape:** variant enum on `PipecatSmartTurn` (Option A), separate struct (Option B), or `from_file()`-only (Option C)? *(recommended: A)* +5. **Loading mechanism:** keep `build.rs` + `include_bytes!()` (Option 1), or switch to `hf-hub` runtime download (Option 2)? *(recommended: 2 — aligns with `wavekat-tts`)* +6. **Feature flag name:** language-agnostic, e.g.
`wavekat-smart-turn` or `smart-turn-wavekat`, rather than `pipecat-zh` / `smart-turn-zh` (one flag gates *all* WaveKat fine-tunes, language is chosen at runtime via `SmartTurnLang`). +7. **Default features:** does the zh variant ship in default features, or stay opt-in? *(recommended: opt-in — keeps default install lean)* +8. **Env var name:** `WAVEKAT_TURN_MODEL_DIR` (recommended, turn-specific, no collision with `WAVEKAT_MODEL_DIR` from TTS). Applies to whichever language is selected. +9. ~~Rename `PipecatSmartTurn` → `SmartTurnDetector`?~~ **Resolved (keep `PipecatSmartTurn`)** — the architecture is Pipecat's; renaming would obscure that. +10. **Future migration:** do we want to plan now for moving the upstream Pipecat variant off `include_bytes!()` to `hf-hub` too (Option 3), or leave that for later? +11. **Coordination with Pipecat upstream:** do we want to proactively notify pipecat-ai/smart-turn maintainers (issue / PR linking our HF repo) so Python users discover the weights? *(recommended: yes, after Phase 0 ships)* diff --git a/tests/fixtures/zh_speech_finished.wav b/tests/fixtures/zh_speech_finished.wav new file mode 100644 index 0000000..52da116 Binary files /dev/null and b/tests/fixtures/zh_speech_finished.wav differ diff --git a/tests/fixtures/zh_speech_finished_short.wav b/tests/fixtures/zh_speech_finished_short.wav new file mode 100644 index 0000000..1832e03 Binary files /dev/null and b/tests/fixtures/zh_speech_finished_short.wav differ diff --git a/tests/fixtures/zh_speech_mid.wav b/tests/fixtures/zh_speech_mid.wav new file mode 100644 index 0000000..2633a0c Binary files /dev/null and b/tests/fixtures/zh_speech_mid.wav differ