From 6b2d905b29e64d4711aee3993bdffb4bce8a6420 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Fri, 15 May 2026 11:07:51 +1200 Subject: [PATCH 1/9] feat(audio-lab): wire wavekat-asr into the audio pipeline (M1) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a sherpa-onnx ASR backend that fans out alongside the existing VAD and turn-detection pipelines. Each AsrConfig runs in its own worker thread (sherpa-onnx is sync + holds model state); a tokio task bridges the audio broadcast in, and a blocking_send loop bridges transcript events back to the websocket. WS surface: ListAsrBackends / SetAsrConfigs client messages, AsrBackends + Asr server messages. Asr events carry a `kind` field (ready, speech_started, speech_ended, partial, final, warning) with optional ts_ms/end_ms/text/confidence/message. M1 scope: backend only — no frontend yet. cargo check + clippy clean, existing tests still pass. Co-Authored-By: Claude Opus 4.7 (1M context) --- tools/audio-lab/backend/Cargo.toml | 1 + tools/audio-lab/backend/src/asr.rs | 290 ++++++++++++++++++++++++++++ tools/audio-lab/backend/src/main.rs | 1 + tools/audio-lab/backend/src/ws.rs | 148 +++++++++++++- 4 files changed, 439 insertions(+), 1 deletion(-) create mode 100644 tools/audio-lab/backend/src/asr.rs diff --git a/tools/audio-lab/backend/Cargo.toml b/tools/audio-lab/backend/Cargo.toml index f094332..2f28dce 100644 --- a/tools/audio-lab/backend/Cargo.toml +++ b/tools/audio-lab/backend/Cargo.toml @@ -13,6 +13,7 @@ wavekat-vad = { version = "0.1", features = [ "serde", ] } wavekat-turn = { version = "0.0.8", features = ["pipecat", "wavekat-smart-turn"] } +wavekat-asr = { version = "0.0.4", features = ["sherpa-onnx"] } axum = { version = "0.8", features = ["ws", "multipart"] } tokio = { workspace = true } tower-http = { version = "0.6", features = ["cors", "fs"] } diff --git a/tools/audio-lab/backend/src/asr.rs b/tools/audio-lab/backend/src/asr.rs new file mode 100644 index 
0000000..ff8779d --- /dev/null +++ b/tools/audio-lab/backend/src/asr.rs @@ -0,0 +1,290 @@ +use crate::audio_source::AudioFrame; +use serde::{Deserialize, Serialize}; +use std::collections::HashMap; +use tokio::sync::{broadcast, mpsc}; +use wavekat_asr::backends::sherpa_onnx::{ + ModelPreset, SherpaOnnxAsr, BILINGUAL_ZH_EN, PARAFORMER_BILINGUAL_ZH_EN, PARAFORMER_ZH, + ZIPFORMER_EN, +}; +use wavekat_asr::{AudioFrame as AsrAudioFrame, Channel, StreamingAsr, TranscriptEvent}; + +/// ASR target sample rate. Sherpa-onnx wants 16 kHz f32. +const ASR_RATE: u32 = 16_000; + +/// Configuration for a single ASR instance. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub struct AsrConfig { + /// Unique identifier for this config. + pub id: String, + /// Human-readable label. + pub label: String, + /// Backend name: currently only "sherpa-onnx". + pub backend: String, + /// Backend-specific parameters (e.g. `{"preset": "bilingual"}`). + pub params: HashMap, +} + +/// One transcript event tagged with the config that produced it. +#[derive(Debug, Clone, Serialize)] +#[serde(tag = "kind", rename_all = "snake_case")] +pub enum AsrServerEvent { + /// ASR backend has finished initialising (model loaded). Sent once + /// per config at startup so the frontend can move past "loading model…". + Ready { + config_id: String, + }, + SpeechStarted { + config_id: String, + ts_ms: f64, + }, + SpeechEnded { + config_id: String, + ts_ms: f64, + }, + Partial { + config_id: String, + ts_ms: f64, + text: String, + }, + Final { + config_id: String, + ts_ms: f64, + end_ms: f64, + text: String, + confidence: f32, + }, + Warning { + config_id: String, + message: String, + }, +} + +/// Pick a `ModelPreset` from a preset name string. 
+fn pick_preset(preset: &str) -> ModelPreset { + match preset { + "" | "bilingual" | "bilingual-zh-en" => BILINGUAL_ZH_EN, + "en" | "english" | "zipformer-en" => ZIPFORMER_EN, + "zh" | "chinese" | "paraformer-zh" => PARAFORMER_ZH, + "paraformer-zh-en" | "paraformer-bilingual" => PARAFORMER_BILINGUAL_ZH_EN, + _ => BILINGUAL_ZH_EN, + } +} + +/// Linear-interpolation resample of f32 samples. Matches the quality of the +/// resampler used in `pipeline.rs` for VAD; adequate for a lab tool. If WER +/// regressions show up, swap for rubato + a higher-quality kernel. +fn resample_linear_f32(samples: &[f32], from_rate: u32, to_rate: u32) -> Vec { + if from_rate == to_rate || samples.is_empty() { + return samples.to_vec(); + } + let ratio = to_rate as f64 / from_rate as f64; + let output_len = (samples.len() as f64 * ratio).round() as usize; + let mut output = Vec::with_capacity(output_len); + for i in 0..output_len { + let src_pos = i as f64 / ratio; + let src_idx = src_pos as usize; + let frac = (src_pos - src_idx as f64) as f32; + let s0 = samples[src_idx.min(samples.len() - 1)]; + let s1 = samples[(src_idx + 1).min(samples.len() - 1)]; + output.push(s0 + frac * (s1 - s0)); + } + output +} + +/// Convert i16 PCM to f32 in `[-1.0, 1.0]`. +fn i16_to_f32(samples: &[i16]) -> Vec { + samples + .iter() + .map(|s| *s as f32 / i16::MAX as f32) + .collect() +} + +/// Run the ASR pipeline: fan out audio to each active ASR config. +/// +/// Each config gets its own dedicated OS thread (sherpa-onnx is sync + holds +/// model state). A tokio task per config forwards broadcast frames into a +/// blocking channel feeding that thread. Transcript events are bridged back +/// onto the returned tokio mpsc. 
+pub fn run_asr_pipeline( + configs: &[AsrConfig], + audio_tx: &broadcast::Sender, + sample_rate: u32, +) -> mpsc::Receiver { + let (result_tx, result_rx) = mpsc::channel::(256); + + for config in configs { + if config.backend != "sherpa-onnx" { + tracing::warn!( + config_id = %config.id, + backend = %config.backend, + "unknown ASR backend, skipping" + ); + continue; + } + + let preset_name = config + .params + .get("preset") + .and_then(|v| v.as_str()) + .unwrap_or("bilingual") + .to_string(); + let preset = pick_preset(&preset_name); + + let config_id = config.id.clone(); + let label = config.label.clone(); + let result_tx_for_thread = result_tx.clone(); + + // Bridge tokio broadcast → std mpsc, so the worker thread can block + // on `recv()` without entering an async context. + let (audio_in_tx, audio_in_rx) = std::sync::mpsc::sync_channel::(256); + + let mut audio_rx = audio_tx.subscribe(); + let audio_in_tx_clone = audio_in_tx.clone(); + tokio::spawn(async move { + while let Ok(frame) = audio_rx.recv().await { + if audio_in_tx_clone.send(frame).is_err() { + break; + } + } + drop(audio_in_tx_clone); + }); + // Drop our copy so the worker exits when the broadcast closes. 
+ drop(audio_in_tx); + + let config_id_for_thread = config_id.clone(); + std::thread::Builder::new() + .name(format!("asr-{}", config.id)) + .spawn(move || { + tracing::info!( + config_id = %config_id_for_thread, + label = %label, + preset = %preset_name, + "loading sherpa-onnx model (may download on first run)" + ); + + let (mut asr, asr_rx) = match SherpaOnnxAsr::with_preset(preset) { + Ok(pair) => pair, + Err(e) => { + let _ = result_tx_for_thread.blocking_send(AsrServerEvent::Warning { + config_id: config_id_for_thread.clone(), + message: format!("failed to init ASR: {e}"), + }); + return; + } + }; + + tracing::info!(config_id = %config_id_for_thread, "ASR model loaded"); + let _ = result_tx_for_thread.blocking_send(AsrServerEvent::Ready { + config_id: config_id_for_thread.clone(), + }); + + while let Ok(frame) = audio_in_rx.recv() { + let f32_samples = i16_to_f32(&frame.samples); + let resampled = resample_linear_f32(&f32_samples, sample_rate, ASR_RATE); + let asr_frame = AsrAudioFrame::new(resampled.as_slice(), ASR_RATE); + + if let Err(e) = asr.push_audio(&asr_frame, Channel::Local) { + tracing::warn!( + config_id = %config_id_for_thread, + "push_audio error: {e}" + ); + } + drain_events(&asr_rx, &config_id_for_thread, &result_tx_for_thread); + } + + // Audio stream closed — flush remaining transcript. + if let Err(e) = asr.finish() { + tracing::warn!( + config_id = %config_id_for_thread, + "finish error: {e}" + ); + } + drain_events(&asr_rx, &config_id_for_thread, &result_tx_for_thread); + }) + .expect("spawn asr worker thread"); + } + + drop(result_tx); + result_rx +} + +/// Drain any pending transcript events from the synchronous receiver and +/// forward them on the tokio mpsc. Non-blocking — uses `try_iter`. +fn drain_events( + asr_rx: &std::sync::mpsc::Receiver, + config_id: &str, + result_tx: &mpsc::Sender, +) { + for evt in asr_rx.try_iter() { + let mapped = match evt { + TranscriptEvent::SpeechStarted { ts_ms, .. 
} => AsrServerEvent::SpeechStarted { + config_id: config_id.to_string(), + ts_ms: ts_ms as f64, + }, + TranscriptEvent::SpeechEnded { ts_ms, .. } => AsrServerEvent::SpeechEnded { + config_id: config_id.to_string(), + ts_ms: ts_ms as f64, + }, + TranscriptEvent::Partial { ts_ms, text, .. } => AsrServerEvent::Partial { + config_id: config_id.to_string(), + ts_ms: ts_ms as f64, + text, + }, + TranscriptEvent::Final { + ts_ms, + end_ms, + text, + confidence, + .. + } => AsrServerEvent::Final { + config_id: config_id.to_string(), + ts_ms: ts_ms as f64, + end_ms: end_ms as f64, + text, + confidence, + }, + TranscriptEvent::Warning(message) => AsrServerEvent::Warning { + config_id: config_id.to_string(), + message, + }, + }; + if result_tx.blocking_send(mapped).is_err() { + // Receiver dropped — nothing more to do. + return; + } + } +} + +/// Available ASR backends and their configurable parameters. +pub fn available_asr_backends() -> HashMap> { + use crate::pipeline::{ParamInfo, ParamType, SelectOption}; + + let mut backends = HashMap::new(); + backends.insert( + "sherpa-onnx".to_string(), + vec![ParamInfo { + name: "preset".to_string(), + description: "Model preset".to_string(), + param_type: ParamType::Select(vec![ + SelectOption { + value: "bilingual".into(), + label: "Bilingual ZH+EN (default)".into(), + }, + SelectOption { + value: "en".into(), + label: "English (Zipformer)".into(), + }, + SelectOption { + value: "zh".into(), + label: "Chinese (Paraformer)".into(), + }, + SelectOption { + value: "paraformer-zh-en".into(), + label: "Bilingual ZH+EN (Paraformer)".into(), + }, + ]), + default: serde_json::json!("bilingual"), + }], + ); + backends +} diff --git a/tools/audio-lab/backend/src/main.rs b/tools/audio-lab/backend/src/main.rs index 60d499c..17fdaa3 100644 --- a/tools/audio-lab/backend/src/main.rs +++ b/tools/audio-lab/backend/src/main.rs @@ -1,3 +1,4 @@ +mod asr; mod audio_source; mod pipeline; mod session; diff --git a/tools/audio-lab/backend/src/ws.rs 
b/tools/audio-lab/backend/src/ws.rs index 2bdc325..72441f1 100644 --- a/tools/audio-lab/backend/src/ws.rs +++ b/tools/audio-lab/backend/src/ws.rs @@ -3,6 +3,7 @@ use futures::{SinkExt, StreamExt}; use serde::{Deserialize, Serialize}; use tokio::sync::broadcast; +use crate::asr::{self, AsrConfig, AsrServerEvent}; use crate::audio_source::{self, AudioDevice, AudioFrame, ChannelSelect}; use crate::pipeline; use crate::session::VadConfig; @@ -46,6 +47,11 @@ pub enum ClientMessage { SetPipelineConfigs { configs: Vec, }, + ListAsrBackends, + /// Set the active ASR configs (replaces previous list). + SetAsrConfigs { + configs: Vec, + }, } /// Messages sent from the server to the client. @@ -124,12 +130,97 @@ pub enum ServerMessage { turn_latency_ms: Option, audio_duration_ms: Option, }, + AsrBackends { + backends: std::collections::HashMap>, + }, + /// ASR transcript event from a specific config. `kind` is one of + /// `ready` | `speech_started` | `speech_ended` | `partial` | `final` | + /// `warning`. Optional fields are populated based on `kind`. + Asr { + config_id: String, + kind: String, + ts_ms: Option, + end_ms: Option, + text: Option, + confidence: Option, + message: Option, + }, Done, Error { message: String, }, } +/// Convert an internal `AsrServerEvent` into the flat wire shape. 
+fn asr_event_to_server_msg(evt: AsrServerEvent) -> ServerMessage { + match evt { + AsrServerEvent::Ready { config_id } => ServerMessage::Asr { + config_id, + kind: "ready".into(), + ts_ms: None, + end_ms: None, + text: None, + confidence: None, + message: None, + }, + AsrServerEvent::SpeechStarted { config_id, ts_ms } => ServerMessage::Asr { + config_id, + kind: "speech_started".into(), + ts_ms: Some(ts_ms), + end_ms: None, + text: None, + confidence: None, + message: None, + }, + AsrServerEvent::SpeechEnded { config_id, ts_ms } => ServerMessage::Asr { + config_id, + kind: "speech_ended".into(), + ts_ms: Some(ts_ms), + end_ms: None, + text: None, + confidence: None, + message: None, + }, + AsrServerEvent::Partial { + config_id, + ts_ms, + text, + } => ServerMessage::Asr { + config_id, + kind: "partial".into(), + ts_ms: Some(ts_ms), + end_ms: None, + text: Some(text), + confidence: None, + message: None, + }, + AsrServerEvent::Final { + config_id, + ts_ms, + end_ms, + text, + confidence, + } => ServerMessage::Asr { + config_id, + kind: "final".into(), + ts_ms: Some(ts_ms), + end_ms: Some(end_ms), + text: Some(text), + confidence: Some(confidence), + message: None, + }, + AsrServerEvent::Warning { config_id, message } => ServerMessage::Asr { + config_id, + kind: "warning".into(), + ts_ms: None, + end_ms: None, + text: None, + confidence: None, + message: Some(message), + }, + } +} + fn send_msg(msg: &ServerMessage) -> Message { Message::Text(serde_json::to_string(msg).unwrap().into()) } @@ -140,6 +231,7 @@ pub async fn handle_ws(socket: WebSocket) { let mut configs: Vec = Vec::new(); let mut turn_configs: Vec = Vec::new(); let mut pipeline_configs: Vec = Vec::new(); + let mut asr_configs: Vec = Vec::new(); let mut stop_tx: Option> = None; let mut spectrum_bins: usize = DEFAULT_OUTPUT_BINS; @@ -213,6 +305,20 @@ pub async fn handle_ws(socket: WebSocket) { pipeline_configs = new_pipeline_configs; } + ClientMessage::ListAsrBackends => { + let backends = 
asr::available_asr_backends(); + let _ = ws_tx + .send(send_msg(&ServerMessage::AsrBackends { backends })) + .await; + } + + ClientMessage::SetAsrConfigs { + configs: new_asr_configs, + } => { + tracing::info!(count = new_asr_configs.len(), "asr configs updated"); + asr_configs = new_asr_configs; + } + ClientMessage::SetSpectrumBins { bins: new_bins } => { // Validate bins (must be power of 2 and divide 512 evenly) let valid_bins = [32, 64, 128, 256, 512]; @@ -292,6 +398,13 @@ pub async fn handle_ws(socket: WebSocket) { None }; + // Start ASR pipeline + let mut asr_rx = if !asr_configs.is_empty() { + Some(asr::run_asr_pipeline(&asr_configs, &audio_tx, sample_rate)) + } else { + None + }; + // Collect messages from both audio and pipeline into one channel let (msg_tx, mut msg_rx) = tokio::sync::mpsc::channel::(512); @@ -323,6 +436,19 @@ pub async fn handle_ws(socket: WebSocket) { } }); + // Forward ASR transcript events + if let Some(mut asr_rx) = asr_rx.take() { + let msg_tx_asr = msg_tx.clone(); + tokio::spawn(async move { + while let Some(evt) = asr_rx.recv().await { + let msg = asr_event_to_server_msg(evt); + if msg_tx_asr.send(msg).await.is_err() { + break; + } + } + }); + } + // Forward turn detection results if let Some(mut turn_rx) = turn_rx.take() { let msg_tx_turn = msg_tx.clone(); @@ -558,6 +684,13 @@ pub async fn handle_ws(socket: WebSocket) { None }; + // Start ASR pipeline BEFORE emitting frames + let asr_rx = if !asr_configs.is_empty() { + Some(asr::run_asr_pipeline(&asr_configs, &audio_tx, sample_rate)) + } else { + None + }; + // Emit all frames at full speed (no sleep) audio_source::emit_frames( &loaded.samples, @@ -671,7 +804,20 @@ pub async fn handle_ws(socket: WebSocket) { }); } - // Task 4: Forward VAD + preprocessed results + // Task 4: Forward ASR transcript events + if let Some(mut asr_rx) = asr_rx { + let msg_tx_asr = msg_tx.clone(); + tokio::spawn(async move { + while let Some(evt) = asr_rx.recv().await { + let msg = 
asr_event_to_server_msg(evt); + if msg_tx_asr.send(msg).await.is_err() { + break; + } + } + }); + } + + // Task 5: Forward VAD + preprocessed results let msg_tx_vad = msg_tx; let vad_bins = spectrum_bins; let mut result_rx = result_rx; From f0109f0b1ab1ffbc5c8abc879adc0cdbd98559ca Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Fri, 15 May 2026 11:18:53 +1200 Subject: [PATCH 2/9] feat(audio-lab): live ASR transcript panel (M2) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Frontend half of the ASR integration: - New AsrConfigPanel mirrors TurnConfigPanel — backend + preset + label. - New AsrTranscript card renders finals (with [mm:ss.s–mm:ss.s] prefix) plus a dimmed trailing partial that overwrites until the final lands. Footer shows last confidence, count of finals, average segment duration. "loading model…" until the backend's `ready` event arrives. Copy-all button concatenates final text to the clipboard. - App.tsx wires list_asr_backends on connect, persists asr configs to localStorage, pushes set_asr_configs on change + before start / load_file, resets transcripts on new session. - websocket.ts: new AsrConfig / AsrEventKind types, asr_backends + asr server messages, list_asr_backends + set_asr_configs client messages. Log panel batches `partial` events (matching how `vad` is batched) and inlines finals / warnings. cargo isn't touched — backend already merged on feat/asr-backend. npm run lint clean (no new warnings); npm run build clean. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- tools/audio-lab/Cargo.lock | 378 +++++++++++++++++- tools/audio-lab/frontend/src/App.tsx | 121 ++++++ .../src/components/AsrConfigPanel.tsx | 207 ++++++++++ .../frontend/src/components/AsrTranscript.tsx | 154 +++++++ tools/audio-lab/frontend/src/lib/websocket.ts | 60 ++- 5 files changed, 907 insertions(+), 13 deletions(-) create mode 100644 tools/audio-lab/frontend/src/components/AsrConfigPanel.tsx create mode 100644 tools/audio-lab/frontend/src/components/AsrTranscript.tsx diff --git a/tools/audio-lab/Cargo.lock b/tools/audio-lab/Cargo.lock index 832229b..ee010c9 100644 --- a/tools/audio-lab/Cargo.lock +++ b/tools/audio-lab/Cargo.lock @@ -143,6 +143,7 @@ dependencies = [ "tower-http", "tracing", "tracing-subscriber", + "wavekat-asr", "wavekat-turn", "wavekat-vad", ] @@ -260,6 +261,26 @@ version = "1.11.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1e748733b7cbc798e1434b6ac524f0c1ff2ab456fe201501e6497c8417a4fc33" +[[package]] +name = "bzip2" +version = "0.4.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bdb116a6ef3f6c3698828873ad02c3014b3c85cadb88496095628e3ef1e347f8" +dependencies = [ + "bzip2-sys", + "libc", +] + +[[package]] +name = "bzip2-sys" +version = "0.1.13+1.0.8" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "225bff33b2141874fe80d71e07d6eec4f85c5c216453dd96388240f96e1acc14" +dependencies = [ + "cc", + "pkg-config", +] + [[package]] name = "cc" version = "1.2.57" @@ -404,6 +425,16 @@ dependencies = [ "url", ] +[[package]] +name = "core-foundation" +version = "0.9.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "91e195e091a93c46f7102ec7818a2aa394e1e1771c3ab4825963fa03e45afb8f" +dependencies = [ + "core-foundation-sys", + "libc", +] + [[package]] name = "core-foundation" version = "0.10.1" @@ -743,6 +774,16 @@ version = "2.3.0" source = 
"registry+https://github.com/rust-lang/crates.io-index" checksum = "37909eebbb50d72f9059c3b6d82c0463f2ff062c9e95845c43a6c9c0355411be" +[[package]] +name = "filetime" +version = "0.2.29" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "5c287a33c7f0a620c38e641e7f60827713987b3c0f26e8ddc9462cc69cf75759" +dependencies = [ + "cfg-if", + "libc", +] + [[package]] name = "find-msvc-tools" version = "0.1.9" @@ -759,6 +800,12 @@ dependencies = [ "miniz_oxide", ] +[[package]] +name = "fnv" +version = "1.0.7" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "3f9eec918d3f24069decb9af1554cad7c880e2da24a9afd88aca000531ab82c1" + [[package]] name = "foreign-types" version = "0.3.2" @@ -915,6 +962,25 @@ dependencies = [ "wasip2", ] +[[package]] +name = "h2" +version = "0.4.14" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "171fefbc92fe4a4de27e0698d6a5b392d6a0e333506bc49133760b3bcf948733" +dependencies = [ + "atomic-waker", + "bytes", + "fnv", + "futures-core", + "futures-sink", + "http", + "indexmap 2.13.0", + "slab", + "tokio", + "tokio-util", + "tracing", +] + [[package]] name = "hashbrown" version = "0.12.3" @@ -953,11 +1019,13 @@ dependencies = [ "indicatif", "libc", "log", + "native-tls", "rand", + "reqwest", "serde", "serde_json", "thiserror 2.0.18", - "ureq", + "ureq 3.3.0", "windows-sys 0.61.2", ] @@ -1034,6 +1102,7 @@ dependencies = [ "bytes", "futures-channel", "futures-core", + "h2", "http", "http-body", "httparse", @@ -1043,6 +1112,38 @@ dependencies = [ "pin-utils", "smallvec", "tokio", + "want", +] + +[[package]] +name = "hyper-rustls" +version = "0.27.9" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "33ca68d021ef39cf6463ab54c1d0f5daf03377b70561305bb89a8f83aab66e0f" +dependencies = [ + "http", + "hyper", + "hyper-util", + "rustls", + "tokio", + "tokio-rustls", + "tower-service", +] + +[[package]] +name = "hyper-tls" +version = "0.6.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "70206fc6890eaca9fde8a0bf71caa2ddfc9fe045ac9e5c70df101a7dbde866e0" +dependencies = [ + "bytes", + "http-body-util", + "hyper", + "hyper-util", + "native-tls", + "tokio", + "tokio-native-tls", + "tower-service", ] [[package]] @@ -1051,13 +1152,23 @@ version = "0.1.20" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "96547c2556ec9d12fb1578c4eaf448b04993e7fb79cbaad930a656880a6bdfa0" dependencies = [ + "base64", "bytes", + "futures-channel", + "futures-util", "http", "http-body", "hyper", + "ipnet", + "libc", + "percent-encoding", "pin-project-lite", + "socket2", + "system-configuration", "tokio", "tower-service", + "tracing", + "windows-registry", ] [[package]] @@ -1196,6 +1307,22 @@ dependencies = [ "web-time", ] +[[package]] +name = "ipnet" +version = "2.12.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "d98f6fed1fde3f8c21bc40a1abb88dd75e67924f9cffc3ef95607bad8017f8e2" + +[[package]] +name = "iri-string" +version = "0.7.12" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "25e659a4bb38e810ebc252e53b5814ff908a8c58c2a9ce2fae1bbec24cbf4e20" +dependencies = [ + "memchr", + "serde", +] + [[package]] name = "is_terminal_polyfill" version = "1.70.2" @@ -1713,7 +1840,7 @@ dependencies = [ "ort-sys", "smallvec", "tracing", - "ureq", + "ureq 3.3.0", ] [[package]] @@ -1724,7 +1851,7 @@ checksum = "d7b497d21a8b6fbb4b5a544f8fadb77e801a09ae0add9e411d31c6f89e3c1e90" dependencies = [ "hmac-sha256", "lzma-rust2", - "ureq", + "ureq 3.3.0", ] [[package]] @@ -1951,6 +2078,49 @@ version = "0.8.10" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "dc897dd8d9e8bd1ed8cdad82b5966c3e0ecae09fb1907d58efaa013543185d0a" +[[package]] +name = "reqwest" +version = "0.12.28" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "eddd3ca559203180a307f12d114c268abf583f59b03cb906fd0b3ff8646c1147" 
+dependencies = [ + "base64", + "bytes", + "encoding_rs", + "futures-core", + "futures-util", + "h2", + "http", + "http-body", + "http-body-util", + "hyper", + "hyper-rustls", + "hyper-tls", + "hyper-util", + "js-sys", + "log", + "mime", + "native-tls", + "percent-encoding", + "pin-project-lite", + "rustls-pki-types", + "serde", + "serde_json", + "serde_urlencoded", + "sync_wrapper", + "tokio", + "tokio-native-tls", + "tokio-util", + "tower", + "tower-http", + "tower-service", + "url", + "wasm-bindgen", + "wasm-bindgen-futures", + "wasm-streams", + "web-sys", +] + [[package]] name = "ring" version = "0.17.14" @@ -2116,7 +2286,7 @@ source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "b7f4bc775c73d9a02cde8bf7b2ec4c9d12743edf609006c7facc23998404cd1d" dependencies = [ "bitflags 2.11.0", - "core-foundation", + "core-foundation 0.10.1", "core-foundation-sys", "libc", "security-framework-sys", @@ -2229,6 +2399,28 @@ dependencies = [ "lazy_static", ] +[[package]] +name = "sherpa-onnx" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "f70620e4fa58e4cb1acf4e0a9c2cbc7496ea8284f80e55be23d443b92e563e49" +dependencies = [ + "serde", + "serde_json", + "sherpa-onnx-sys", +] + +[[package]] +name = "sherpa-onnx-sys" +version = "1.13.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "c7f3fe4987367b162336027b5d1ffca6dcd627bee6a324e46f80e82dfcb4365b" +dependencies = [ + "bzip2", + "tar", + "ureq 2.12.1", +] + [[package]] name = "shlex" version = "1.3.0" @@ -2336,6 +2528,9 @@ name = "sync_wrapper" version = "1.0.2" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "0bf256ce5efdfa370213c1dabab5935a12e49f2c58d15e9eac2870d3b4f27263" +dependencies = [ + "futures-core", +] [[package]] name = "synstructure" @@ -2348,6 +2543,38 @@ dependencies = [ "syn", ] +[[package]] +name = "system-configuration" +version = "0.7.0" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "a13f3d0daba03132c0aa9767f98351b3488edc2c100cda2d2ec2b04f3d8d3c8b" +dependencies = [ + "bitflags 2.11.0", + "core-foundation 0.9.4", + "system-configuration-sys", +] + +[[package]] +name = "system-configuration-sys" +version = "0.6.0" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "8e1d1b10ced5ca923a1fcb8d03e96b8d3268065d724548c0211415ff6ac6bac4" +dependencies = [ + "core-foundation-sys", + "libc", +] + +[[package]] +name = "tar" +version = "0.4.45" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "22692a6476a21fa75fdfc11d452fda482af402c008cdbaf3476414e122040973" +dependencies = [ + "filetime", + "libc", + "xattr", +] + [[package]] name = "tempfile" version = "3.27.0" @@ -2494,6 +2721,26 @@ dependencies = [ "syn", ] +[[package]] +name = "tokio-native-tls" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bbae76ab933c85776efabc971569dd6119c580d8f5d448769dec1764bf796ef2" +dependencies = [ + "native-tls", + "tokio", +] + +[[package]] +name = "tokio-rustls" +version = "0.26.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "1729aa945f29d91ba541258c8df89027d5792d85a8841fb65e8bf0f4ede4ef61" +dependencies = [ + "rustls", + "tokio", +] + [[package]] name = "tokio-tungstenite" version = "0.28.0" @@ -2580,12 +2827,14 @@ dependencies = [ "http-body-util", "http-range-header", "httpdate", + "iri-string", "mime", "mime_guess", "percent-encoding", "pin-project-lite", "tokio", "tokio-util", + "tower", "tower-layer", "tower-service", "tracing", @@ -2675,6 +2924,12 @@ dependencies = [ "strength_reduce", ] +[[package]] +name = "try-lock" +version = "0.2.5" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "e421abadd41a4225275504ea4d6566923418b7f05506fbc9c0fe86ba7396114b" + [[package]] name = "tungstenite" version = "0.28.0" @@ -2728,6 +2983,22 @@ 
version = "0.9.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "8ecb6da28b8a351d773b68d5825ac39017e680750f980f3a1a85cd8dd28a47c1" +[[package]] +name = "ureq" +version = "2.12.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02d1a66277ed75f640d608235660df48c8e3c19f3b4edb6a263315626cc3c01d" +dependencies = [ + "base64", + "flate2", + "log", + "once_cell", + "rustls", + "rustls-pki-types", + "url", + "webpki-roots 0.26.11", +] + [[package]] name = "ureq" version = "3.3.0" @@ -2749,7 +3020,7 @@ dependencies = [ "ureq-proto", "utf8-zero", "webpki-root-certs", - "webpki-roots", + "webpki-roots 1.0.6", ] [[package]] @@ -2828,6 +3099,15 @@ dependencies = [ "winapi-util", ] +[[package]] +name = "want" +version = "0.3.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "bfa7760aed19e106de2c7c0b581b509f2f25d3dacaf737cb82ac61bc6d760b0e" +dependencies = [ + "try-lock", +] + [[package]] name = "wasi" version = "0.11.1+wasi-snapshot-preview1" @@ -2902,12 +3182,44 @@ dependencies = [ "unicode-ident", ] +[[package]] +name = "wasm-streams" +version = "0.4.2" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "15053d8d85c7eccdbefef60f06769760a563c7f0a9d6902a13d35c7800b0ad65" +dependencies = [ + "futures-util", + "js-sys", + "wasm-bindgen", + "wasm-bindgen-futures", + "web-sys", +] + +[[package]] +name = "wavekat-asr" +version = "0.0.4" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "b595b5dae1c58edfc5dcc903b5d61d4e22ba2900120cba1f0efc7f797c61ed0a" +dependencies = [ + "hf-hub", + "sherpa-onnx", + "thiserror 2.0.18", + "tracing", + "wavekat-core 0.0.8", +] + [[package]] name = "wavekat-core" version = "0.0.4" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1a7c163994a3476a01081470692a54e87f03fccc1c1ba13149c9b3a00a21b097" +[[package]] +name = "wavekat-core" +version = "0.0.8" +source = 
"registry+https://github.com/rust-lang/crates.io-index" +checksum = "88969f770ebc8905f450d2c0783a8c529e3fc86b166bb8c9989728f5d2b01103" + [[package]] name = "wavekat-turn" version = "0.0.8" @@ -2919,8 +3231,8 @@ dependencies = [ "ort", "realfft", "thiserror 2.0.18", - "ureq", - "wavekat-core", + "ureq 3.3.0", + "wavekat-core 0.0.4", ] [[package]] @@ -2936,7 +3248,7 @@ dependencies = [ "rubato", "serde", "thiserror 2.0.18", - "ureq", + "ureq 3.3.0", "webrtc-vad", ] @@ -2969,6 +3281,15 @@ dependencies = [ "rustls-pki-types", ] +[[package]] +name = "webpki-roots" +version = "0.26.11" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "521bc38abb08001b01866da9f51eb7c5d647a19260e00054a8c7fd5f9e57f7a9" +dependencies = [ + "webpki-roots 1.0.6", +] + [[package]] name = "webpki-roots" version = "1.0.6" @@ -3034,7 +3355,7 @@ version = "0.54.0" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "12661b9c89351d684a50a8a643ce5f608e20243b9fb84687800163429f161d65" dependencies = [ - "windows-result", + "windows-result 0.1.2", "windows-targets 0.52.6", ] @@ -3044,6 +3365,17 @@ version = "0.2.1" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "f0805222e57f7521d6a62e36fa9163bc891acd422f971defe97d64e70d0a4fe5" +[[package]] +name = "windows-registry" +version = "0.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "02752bf7fbdcce7f2a27a742f798510f3e5ad88dbe84871e5168e2120c3d5720" +dependencies = [ + "windows-link", + "windows-result 0.4.1", + "windows-strings", +] + [[package]] name = "windows-result" version = "0.1.2" @@ -3053,6 +3385,24 @@ dependencies = [ "windows-targets 0.52.6", ] +[[package]] +name = "windows-result" +version = "0.4.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7781fa89eaf60850ac3d2da7af8e5242a5ea78d1a11c49bf2910bb5a73853eb5" +dependencies = [ + "windows-link", +] + +[[package]] +name = "windows-strings" +version = 
"0.5.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "7837d08f69c77cf6b07689544538e017c1bfcf57e34b4c0ff58e6c2cd3b37091" +dependencies = [ + "windows-link", +] + [[package]] name = "windows-sys" version = "0.45.0" @@ -3222,6 +3572,16 @@ version = "0.6.3" source = "registry+https://github.com/rust-lang/crates.io-index" checksum = "1ffae5123b2d3fc086436f8834ae3ab053a283cfac8fe0a0b8eaae044768a4c4" +[[package]] +name = "xattr" +version = "1.6.1" +source = "registry+https://github.com/rust-lang/crates.io-index" +checksum = "32e45ad4206f6d2479085147f02bc2ef834ac85886624a23575ae137c8aa8156" +dependencies = [ + "libc", + "rustix", +] + [[package]] name = "yoke" version = "0.8.2" diff --git a/tools/audio-lab/frontend/src/App.tsx b/tools/audio-lab/frontend/src/App.tsx index 759438a..3c5fa0e 100644 --- a/tools/audio-lab/frontend/src/App.tsx +++ b/tools/audio-lab/frontend/src/App.tsx @@ -19,6 +19,11 @@ import { STATE_COLORS as TURN_STATE_COLORS } from "@/lib/turnColors"; import { TurnConfigPanel } from "@/components/TurnConfigPanel"; import { PipelineConfigPanel } from "@/components/PipelineConfigPanel"; import { PipelineTimeline } from "@/components/PipelineTimeline"; +import { AsrConfigPanel } from "@/components/AsrConfigPanel"; +import { + AsrTranscript, + type AsrTranscriptState, +} from "@/components/AsrTranscript"; import { type Viewport, createDefaultViewport, @@ -32,6 +37,7 @@ import { type TurnConfig, type PipelineConfig, type PipelineResultPoint, + type AsrConfig, type ParamInfo, type ServerMessage, type ConnectionState, @@ -259,6 +265,17 @@ function App() { const pipelineSeededRef = useRef(false); const [pipelineResults, setPipelineResults] = useState>({}); + const [asrBackends, setAsrBackends] = useState>({}); + const [asrConfigs, setAsrConfigs] = useState(() => { + try { + const saved = localStorage.getItem("lab-asr-configs"); + if (saved !== null) return JSON.parse(saved) as AsrConfig[]; + } catch { /* ignore */ } + return []; + 
}); + const [asrTranscripts, setAsrTranscripts] = useState>({}); + const [asrOpen, setAsrOpen] = useState(true); + // Preprocessed data per config const [preprocessedSamples, setPreprocessedSamples] = useState>({}); const [preprocessedSpectrumData, setPreprocessedSpectrumData] = useState< @@ -297,6 +314,18 @@ function App() { setPipelineConfigs(createDefaultPipelineConfigs(configs, turnConfigs)); }, [configs, turnConfigs, pipelineConfigs]); + // Persist asr configs to localStorage + useEffect(() => { + localStorage.setItem("lab-asr-configs", JSON.stringify(asrConfigs)); + }, [asrConfigs]); + + // Push asr config changes to the backend so the next start picks them up + useEffect(() => { + const socket = socketRef.current; + if (!socket || !connected) return; + socket.send({ type: "set_asr_configs", configs: asrConfigs }); + }, [asrConfigs, connected]); + // Resolve playback samples based on selected source const playbackSamples = playbackSource === "original" ? samples : (preprocessedSamples[playbackSource] ?? []); @@ -341,6 +370,7 @@ function App() { socket.send({ type: "list_devices" }); socket.send({ type: "list_backends" }); socket.send({ type: "list_turn_backends" }); + socket.send({ type: "list_asr_backends" }); }, []); const handleMessage = useCallback((msg: ServerMessage) => { @@ -508,6 +538,59 @@ function App() { })); break; + case "asr_backends": + setAsrBackends(msg.backends); + break; + + case "asr": + setAsrTranscripts((prev) => { + const existing: AsrTranscriptState = prev[msg.config_id] ?? { + ready: false, + finals: [], + partial: null, + warning: null, + }; + switch (msg.kind) { + case "ready": + return { + ...prev, + [msg.config_id]: { ...existing, ready: true, warning: null }, + }; + case "partial": + return { + ...prev, + [msg.config_id]: { ...existing, partial: msg.text ?? null }, + }; + case "final": + return { + ...prev, + [msg.config_id]: { + ...existing, + partial: null, + finals: [ + ...existing.finals, + { + ts_ms: msg.ts_ms ?? 
0, + end_ms: msg.end_ms ?? msg.ts_ms ?? 0, + text: msg.text ?? "", + confidence: msg.confidence ?? 1, + }, + ], + }, + }; + case "warning": + return { + ...prev, + [msg.config_id]: { ...existing, warning: msg.message ?? null }, + }; + case "speech_started": + case "speech_ended": + default: + return prev; + } + }); + break; + case "done": recordingRef.current = false; setRecording(false); @@ -563,6 +646,7 @@ function App() { setTurnResults({}); setTurnTiming({}); setPipelineResults({}); + setAsrTranscripts({}); setPlaybackSource("original"); setTotalDurationMs(0); setSampleRate(null); @@ -579,6 +663,7 @@ function App() { socket.send({ type: "set_configs", configs }); socket.send({ type: "set_turn_configs", configs: turnConfigs }); socket.send({ type: "set_pipeline_configs", configs: pipelineConfigs }); + socket.send({ type: "set_asr_configs", configs: asrConfigs }); socket.send({ type: "start_recording", device_index: parseInt(selectedDevice), @@ -612,6 +697,7 @@ function App() { setTurnResults({}); setTurnTiming({}); setPipelineResults({}); + setAsrTranscripts({}); setPlaybackSource("original"); setTotalDurationMs(0); setSampleRate(null); @@ -620,6 +706,7 @@ function App() { socket.send({ type: "set_configs", configs }); socket.send({ type: "set_turn_configs", configs: turnConfigs }); socket.send({ type: "set_pipeline_configs", configs: pipelineConfigs }); + socket.send({ type: "set_asr_configs", configs: asrConfigs }); socket.send({ type: "load_file", path, channel }); setLoadingFile(true); }; @@ -1054,6 +1141,10 @@ function App() { /> ))} + {asrConfigs.length > 0 && ( + + )} + {/* Preprocessed Waveforms/Spectrograms/VAD - only for configs with showPreprocessed enabled */} {vadOpen && configs.filter((c) => showPreprocessed[c.id]).map((config) => { const configIndex = configs.findIndex((c) => c.id === config.id); @@ -1218,6 +1309,36 @@ function App() { )} + {/* ASR Config Panel */} +
+
+ +
+ {asrOpen && ( + { + const backendNames = Object.keys(asrBackends); + if (backendNames.length === 0) return; + const backend = backendNames[0]; + const params: Record = {}; + for (const p of asrBackends[backend]) { + params[p.name] = p.default; + } + setAsrConfigs([{ id: "asr-1", label: "asr-1", backend, params }]); + }} + /> + )} +
+ {/* Logs */} diff --git a/tools/audio-lab/frontend/src/components/AsrConfigPanel.tsx b/tools/audio-lab/frontend/src/components/AsrConfigPanel.tsx new file mode 100644 index 0000000..fa25924 --- /dev/null +++ b/tools/audio-lab/frontend/src/components/AsrConfigPanel.tsx @@ -0,0 +1,207 @@ +import { useMemo } from "react"; +import { Button } from "@/components/ui/button"; +import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"; +import { Input } from "@/components/ui/input"; +import { Label } from "@/components/ui/label"; +import { + Select, + SelectContent, + SelectItem, + SelectTrigger, + SelectValue, +} from "@/components/ui/select"; +import type { AsrConfig, ParamInfo } from "@/lib/websocket"; + +export type { AsrConfig }; + +interface AsrConfigPanelProps { + configs: AsrConfig[]; + backends: Record; + onConfigsChange: (configs: AsrConfig[]) => void; + onResetDefaults: () => void; +} + +export function AsrConfigPanel({ + configs, + backends, + onConfigsChange, + onResetDefaults, +}: AsrConfigPanelProps) { + const nextId = useMemo(() => { + let max = 0; + for (const c of configs) { + const match = c.id.match(/^asr-(\d+)$/); + if (match) { + max = Math.max(max, parseInt(match[1], 10)); + } + } + return max + 1; + }, [configs]); + + const addConfig = () => { + const backendNames = Object.keys(backends); + if (backendNames.length === 0) return; + + const backend = backendNames[0]; + const params: Record = {}; + for (const p of backends[backend]) { + params[p.name] = p.default; + } + + const id = `asr-${nextId}`; + onConfigsChange([ + ...configs, + { id, label: `asr-${nextId}`, backend, params }, + ]); + }; + + const removeConfig = (id: string) => { + onConfigsChange(configs.filter((c) => c.id !== id)); + }; + + const cloneConfig = (config: AsrConfig) => { + const id = `asr-${nextId}`; + onConfigsChange([ + ...configs, + { ...config, id, label: `${config.label} (copy)`, params: { ...config.params } }, + ]); + }; + + const updateConfig = (id: 
string, updates: Partial) => { + onConfigsChange( + configs.map((c) => { + if (c.id !== id) return c; + const updated = { ...c, ...updates }; + + if (updates.backend && updates.backend !== c.backend) { + const newParams: Record = {}; + for (const p of backends[updates.backend] ?? []) { + newParams[p.name] = p.default; + } + updated.params = newParams; + } + + return updated; + }) + ); + }; + + const updateParam = (configId: string, paramName: string, value: unknown) => { + onConfigsChange( + configs.map((c) => { + if (c.id !== configId) return c; + return { ...c, params: { ...c.params, [paramName]: value } }; + }) + ); + }; + + return ( +
+
+ + +
+ +
+ {configs.map((config) => ( + + +
+ + updateConfig(config.id, { label: e.target.value })} + /> + +
+ + +
+
+
+ +
+ + +
+ + {(backends[config.backend] ?? []).map((param) => ( +
+ + {param.param_type.type === "Select" && ( + + )} + {param.param_type.type === "Float" && ( + { + const val = parseFloat(e.target.value); + if (!isNaN(val)) { + updateParam(config.id, param.name, val); + } + }} + className="h-8 text-xs w-24" + /> + )} +
+ ))} +
+
+ ))} +
+
+ ); +} diff --git a/tools/audio-lab/frontend/src/components/AsrTranscript.tsx b/tools/audio-lab/frontend/src/components/AsrTranscript.tsx new file mode 100644 index 0000000..ac0a45d --- /dev/null +++ b/tools/audio-lab/frontend/src/components/AsrTranscript.tsx @@ -0,0 +1,154 @@ +import { useEffect, useMemo, useRef } from "react"; +import { Button } from "@/components/ui/button"; +import { Card, CardContent, CardHeader, CardTitle } from "@/components/ui/card"; +import type { AsrConfig } from "@/lib/websocket"; + +export interface AsrTranscriptFinal { + ts_ms: number; + end_ms: number; + text: string; + confidence: number; +} + +export interface AsrTranscriptState { + ready: boolean; + finals: AsrTranscriptFinal[]; + partial: string | null; + warning: string | null; +} + +interface AsrTranscriptProps { + configs: AsrConfig[]; + states: Record; +} + +function formatMs(ms: number): string { + const totalSeconds = ms / 1000; + const minutes = Math.floor(totalSeconds / 60); + const seconds = totalSeconds - minutes * 60; + return `${minutes.toString().padStart(2, "0")}:${seconds.toFixed(1).padStart(4, "0")}`; +} + +export function AsrTranscript({ configs, states }: AsrTranscriptProps) { + if (configs.length === 0) return null; + + return ( +
+ {configs.map((config) => ( + + ))} +
+ ); +} + +interface AsrTranscriptCardProps { + config: AsrConfig; + state: AsrTranscriptState | undefined; +} + +function AsrTranscriptCard({ config, state }: AsrTranscriptCardProps) { + const ready = state?.ready ?? false; + const finals = useMemo(() => state?.finals ?? [], [state?.finals]); + const partial = state?.partial ?? null; + const warning = state?.warning ?? null; + const preset = + typeof config.params.preset === "string" ? config.params.preset : "—"; + + const stats = useMemo(() => { + if (finals.length === 0) return null; + const lastFinal = finals[finals.length - 1]; + const avgDuration = + finals.reduce((sum, f) => sum + (f.end_ms - f.ts_ms), 0) / finals.length; + return { + count: finals.length, + lastConfidence: lastFinal.confidence, + avgDurationMs: avgDuration, + }; + }, [finals]); + + const scrollRef = useRef(null); + const autoScrollRef = useRef(true); + + useEffect(() => { + const el = scrollRef.current; + if (!el || !autoScrollRef.current) return; + el.scrollTop = el.scrollHeight; + }, [finals.length, partial]); + + const onScroll = () => { + const el = scrollRef.current; + if (!el) return; + const atBottom = el.scrollHeight - el.scrollTop - el.clientHeight < 20; + autoScrollRef.current = atBottom; + }; + + const copyAll = () => { + const text = finals.map((f) => f.text).join("\n"); + navigator.clipboard.writeText(text).catch(() => {}); + }; + + return ( + + +
+ + ASR: {config.label}{" "} + + · {config.backend} · {preset} + + + +
+
+ +
+ {!ready && !warning && ( +
loading model…
+ )} + {warning && ( +
⚠ {warning}
+ )} + {finals.map((f, i) => ( +
+ + [{formatMs(f.ts_ms)}–{formatMs(f.end_ms)}] + {" "} + {f.text} +
+ ))} + {partial && ( +
+ partial: {partial} +
+ )} +
+
+ {stats ? ( + + conf {stats.lastConfidence.toFixed(2)} · {stats.count} finals · avg{" "} + {(stats.avgDurationMs / 1000).toFixed(1)}s/segment + + ) : ( + {ready ? "waiting for speech…" : "—"} + )} +
+
+
+ ); +} diff --git a/tools/audio-lab/frontend/src/lib/websocket.ts b/tools/audio-lab/frontend/src/lib/websocket.ts index f885659..dc607be 100644 --- a/tools/audio-lab/frontend/src/lib/websocket.ts +++ b/tools/audio-lab/frontend/src/lib/websocket.ts @@ -24,6 +24,21 @@ export interface TurnConfig { params: Record; } +export interface AsrConfig { + id: string; + label: string; + backend: string; + params: Record; +} + +export type AsrEventKind = + | "ready" + | "speech_started" + | "speech_ended" + | "partial" + | "final" + | "warning"; + export interface PipelineConfig { id: string; label: string; @@ -64,6 +79,8 @@ export type ServerMessage = | { type: "turn_backends"; backends: Record } | { type: "turn"; config_id: string; timestamp_ms: number; state: string; confidence: number; latency_ms: number; stage_times: Array<{ name: string; us: number }> } | { type: "pipeline"; config_id: string; timestamp_ms: number; event: string; turn_state?: string; turn_confidence?: number; turn_latency_ms?: number; audio_duration_ms?: number } + | { type: "asr_backends"; backends: Record } + | { type: "asr"; config_id: string; kind: AsrEventKind; ts_ms?: number; end_ms?: number; text?: string; confidence?: number; message?: string } | { type: "done" } | { type: "error"; message: string }; @@ -78,7 +95,9 @@ export type ClientMessage = | { type: "set_spectrum_bins"; bins: number } | { type: "list_turn_backends" } | { type: "set_turn_configs"; configs: TurnConfig[] } - | { type: "set_pipeline_configs"; configs: PipelineConfig[] }; + | { type: "set_pipeline_configs"; configs: PipelineConfig[] } + | { type: "list_asr_backends" } + | { type: "set_asr_configs"; configs: AsrConfig[] }; export type MessageHandler = (msg: ServerMessage) => void; @@ -106,6 +125,7 @@ interface StreamBatch { preprocessedAudioFrames: Map; preprocessedSpectrumFrames: Map; vad: Map; + asrPartials: Map; } export class VadLabSocket { @@ -247,8 +267,14 @@ export class VadLabSocket { } private logServerMessage(msg: 
ServerMessage) { - if (msg.type === "audio" || msg.type === "vad" || msg.type === "spectrum" || - msg.type === "preprocessed_audio" || msg.type === "preprocessed_spectrum") { + if ( + msg.type === "audio" || + msg.type === "vad" || + msg.type === "spectrum" || + msg.type === "preprocessed_audio" || + msg.type === "preprocessed_spectrum" || + (msg.type === "asr" && msg.kind === "partial") + ) { this.addToBatch(msg); } else { // Flush any pending batch before logging a non-streaming message @@ -257,7 +283,7 @@ export class VadLabSocket { } } - private addToBatch(msg: ServerMessage & { type: "audio" | "vad" | "spectrum" | "preprocessed_audio" | "preprocessed_spectrum" }) { + private addToBatch(msg: ServerMessage & { type: "audio" | "vad" | "spectrum" | "preprocessed_audio" | "preprocessed_spectrum" | "asr" }) { if (!this.streamBatch) { this.streamBatch = { audioFrames: 0, @@ -267,6 +293,7 @@ export class VadLabSocket { preprocessedAudioFrames: new Map(), preprocessedSpectrumFrames: new Map(), vad: new Map(), + asrPartials: new Map(), }; this.startBatchTimer(); } @@ -288,6 +315,11 @@ export class VadLabSocket { msg.config_id, (batch.preprocessedSpectrumFrames.get(msg.config_id) ?? 0) + 1 ); + } else if (msg.type === "asr") { + batch.asrPartials.set( + msg.config_id, + (batch.asrPartials.get(msg.config_id) ?? 
0) + 1 + ); } else { const existing = batch.vad.get(msg.config_id); if (existing) { @@ -363,6 +395,10 @@ export class VadLabSocket { } } + for (const [configId, count] of batch.asrPartials) { + parts.push(`asr [${configId}]: ${count} partials`); + } + if (parts.length > 0) { this.emitLog("recv", parts.join(" | ")); } @@ -382,6 +418,20 @@ function summarizeServer(msg: ServerMessage): string { case "turn_backends": return `turn_backends (${Object.keys(msg.backends).length})`; case "turn": return `turn [${msg.config_id}] t=${msg.timestamp_ms.toFixed(0)}ms state=${msg.state} conf=${msg.confidence.toFixed(2)} lat=${msg.latency_ms}ms`; case "pipeline": return `pipeline [${msg.config_id}] t=${msg.timestamp_ms.toFixed(0)}ms ${msg.event}${msg.turn_state ? ` ${msg.turn_state} ${((msg.turn_confidence ?? 0) * 100).toFixed(0)}%` : ""}`; + case "asr_backends": return `asr_backends (${Object.keys(msg.backends).length})`; + case "asr": { + const where = msg.ts_ms !== undefined ? ` t=${msg.ts_ms.toFixed(0)}ms` : ""; + if (msg.kind === "final") { + return `asr [${msg.config_id}] final${where} "${msg.text ?? ""}" conf=${(msg.confidence ?? 0).toFixed(2)}`; + } + if (msg.kind === "partial") { + return `asr [${msg.config_id}] partial${where} "${msg.text ?? ""}"`; + } + if (msg.kind === "warning") { + return `asr [${msg.config_id}] warning: ${msg.message ?? 
""}`; + } + return `asr [${msg.config_id}] ${msg.kind}${where}`; + } case "done": return "done"; case "error": return `error: ${msg.message}`; } @@ -399,5 +449,7 @@ function summarizeClient(msg: ClientMessage): string { case "list_turn_backends": return "list_turn_backends"; case "set_turn_configs": return `set_turn_configs (${msg.configs.length})`; case "set_pipeline_configs": return `set_pipeline_configs (${msg.configs.length})`; + case "list_asr_backends": return "list_asr_backends"; + case "set_asr_configs": return `set_asr_configs (${msg.configs.length})`; } } From e26181712a36168476d9a73c050b5574a73242ff Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Fri, 15 May 2026 11:23:33 +1200 Subject: [PATCH 3/9] docs(audio-lab): README updates for ASR backend (M3) - New "ASR" subsection under Supported Backends with the sherpa-onnx preset table and a NOTE about the first-run model download (~75 MB to \$HF_HOME). - Mention "live transcripts" in the audio-lab What It Does list. - Top-level README: include ASR in the audio-lab description + tool layout blurb. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 6 +++--- tools/audio-lab/README.md | 14 +++++++++++++- 2 files changed, 16 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 11f624c..2ffccb5 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ A research repo for the [WaveKat](https://github.com/wavekat) project — intera ``` wavekat-lab/ ├── tools/ -│ ├── audio-lab/ Real-time VAD + Turn Detection comparison app (Rust + React) +│ ├── audio-lab/ Real-time VAD + Turn Detection + ASR comparison app (Rust + React) │ └── cv-explorer/ Mozilla Common Voice dataset browser (Cloudflare Workers + React) ├── notebooks/ Jupyter notebooks (training, validation, dataset splits) └── docs/ Plans and design docs @@ -30,9 +30,9 @@ Each tool is self-contained — its own Makefile, lockfiles, and build setup liv ### [Audio Lab](tools/audio-lab/) — `tools/audio-lab/` -Web app for testing and comparing WaveKat library backends side by side in real time. Live mic capture, WAV upload, multi-config fan-out, VAD-gated pipeline mode, waveform + spectrogram + probability timelines. +Web app for testing and comparing WaveKat library backends side by side in real time. Live mic capture, WAV upload, multi-config fan-out, VAD-gated pipeline mode, live ASR transcripts, waveform + spectrogram + probability timelines. -Backends: webrtc-vad, silero-vad, ten-vad, firered-vad, pipecat smart-turn. [Details →](tools/audio-lab/README.md) +Backends: webrtc-vad, silero-vad, ten-vad, firered-vad, pipecat smart-turn, sherpa-onnx ASR. 
[Details →](tools/audio-lab/README.md) ### [Common Voice Explorer](tools/cv-explorer/) — `tools/cv-explorer/` diff --git a/tools/audio-lab/README.md b/tools/audio-lab/README.md index faa988d..1d098b3 100644 --- a/tools/audio-lab/README.md +++ b/tools/audio-lab/README.md @@ -1,6 +1,6 @@ # Audio Lab -A web-based experimentation tool for testing and comparing [WaveKat](https://github.com/wavekat) library backends — VAD, turn detection, and more — side by side in real time. +A web-based experimentation tool for testing and comparing [WaveKat](https://github.com/wavekat) library backends — VAD, turn detection, ASR, and more — side by side in real time. > [!WARNING] > Early development. Things may change. @@ -10,6 +10,7 @@ A web-based experimentation tool for testing and comparing [WaveKat](https://git - **Live recording** — capture audio from your microphone server-side, stream results to the browser in real time - **File analysis** — upload a WAV file and run multiple configs against it at full speed - **Side-by-side comparison** — fan out audio to N configurations simultaneously and compare outputs +- **Live transcripts** — stream partial + final ASR transcripts per config as audio plays - **Preprocessing exploration** — apply high-pass filters, RNNoise denoising, or normalization per-config - **Interactive visualization** — waveform, spectrogram, and probability timelines with synchronized zoom, pan, and hover @@ -60,6 +61,17 @@ Each config can also enable per-config preprocessing: high-pass filter, RNNoise The `wavekat-zh` ONNX is downloaded from HuggingFace on first use and cached under `$HF_HOME/hub/` (default `~/.cache/huggingface/hub/`). For offline runs, set `WAVEKAT_TURN_MODEL_DIR` to a directory containing `zh/smart-turn-cpu.onnx`. +### ASR + +Streaming speech-to-text via [`wavekat-asr`](https://github.com/wavekat/wavekat-asr). 
Each ASR config gets a per-config transcript card stacked under the timelines: committed finals with `[mm:ss.s–mm:ss.s]` timestamps, a dimmed trailing line for the live partial, and footer stats (last confidence, count, average segment duration). + +| Backend | Description | Preset (`preset` param) | +|---------|-------------|-------------------------| +| **sherpa-onnx** | Local streaming Zipformer / Paraformer via [sherpa-onnx](https://github.com/k2-fsa/sherpa-onnx) | `bilingual` (default, ZH+EN) · `en` · `zh` · `paraformer-zh-en` | + +> [!NOTE] +> The first time you record or load a file with an ASR config enabled, sherpa-onnx downloads the chosen model from HuggingFace (~75 MB for `bilingual`) into `$HF_HOME/hub/` (default `~/.cache/huggingface/hub/`). The transcript card shows `loading model…` until the model is ready; subsequent runs reuse the cached model and skip the download. + ## Architecture The Rust backend handles all audio capture and processing; the React frontend is embedded in the binary and handles visualization only. From 6bfb659b311b4d6d0566cb7448266139f1616e46 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Fri, 15 May 2026 11:36:06 +1200 Subject: [PATCH 4/9] feat(audio-lab): two-column layout with config sidebar Wraps the post-controls body in a flex container so all config panels (VAD, Turn, Pipeline, ASR) stack in a left aside (w-80 on lg+) and the waveform / spectrum / timelines / ASR transcript / preprocessed sections fill a flex-1 main column. Matches the layout sketch in docs/05-plan-asr.md. Collapses to a single column on screens narrower than lg. npm run lint and npm run build clean. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- tools/audio-lab/frontend/src/App.tsx | 220 ++++++++++++++------------- 1 file changed, 115 insertions(+), 105 deletions(-) diff --git a/tools/audio-lab/frontend/src/App.tsx b/tools/audio-lab/frontend/src/App.tsx index 3c5fa0e..adae826 100644 --- a/tools/audio-lab/frontend/src/App.tsx +++ b/tools/audio-lab/frontend/src/App.tsx @@ -942,8 +942,119 @@ function App() { - {/* Waveform and VAD Timelines */} -
+
+ + +
+ {/* Waveform and VAD Timelines */} +

); })} -

- - - - {/* VAD Config Panel */} -
-
- -
- {vadOpen && ( - setConfigs(createDefaultConfigs())} - showPreprocessed={showPreprocessed} - onShowPreprocessedChange={(configId, show) => - setShowPreprocessed((prev) => ({ ...prev, [configId]: show })) - } - /> - )} -
- - {/* Turn Detection Config Panel */} -
-
- -
- {turnOpen && ( - { - const defaults = buildDefaultTurnConfigs(turnBackends); - if (defaults.length > 0) setTurnConfigs(defaults); - }} - /> - )} -
- - {/* Pipeline Mode Config Panel */} -
-
- -
- {pipelineOpen && ( - - )} -
- - {/* ASR Config Panel */} -
-
- -
- {asrOpen && ( - { - const backendNames = Object.keys(asrBackends); - if (backendNames.length === 0) return; - const backend = backendNames[0]; - const params: Record = {}; - for (const p of asrBackends[backend]) { - params[p.name] = p.default; - } - setAsrConfigs([{ id: "asr-1", label: "asr-1", backend, params }]); - }} - /> - )} +
+
From 0e18c69e3e208c55b86bcc9c7c16d19c47b46dd2 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Fri, 15 May 2026 11:43:57 +1200 Subject: [PATCH 5/9] chore(audio-lab): make dev runs backend + frontend concurrently Co-Authored-By: Claude Opus 4.7 (1M context) --- tools/audio-lab/Makefile | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tools/audio-lab/Makefile b/tools/audio-lab/Makefile index 740784b..73988a3 100644 --- a/tools/audio-lab/Makefile +++ b/tools/audio-lab/Makefile @@ -16,6 +16,7 @@ help: @echo " install-frontend Install npm dependencies (uses .nvmrc)" @echo "" @echo "Development:" + @echo " dev Run backend + frontend concurrently (Ctrl-C stops both)" @echo " dev-backend Run backend with auto-rebuild" @echo " dev-frontend Run frontend dev server" @echo "" @@ -49,7 +50,11 @@ dev-frontend: $(NVM) cd frontend && nvm use --silent && npm run dev dev: - @echo "Run 'make dev-backend' and 'make dev-frontend' in separate terminals" + @echo "Starting audio-lab dev stack (backend + frontend). Ctrl-C to stop both." + @trap 'kill 0' INT TERM; \ + $(MAKE) --no-print-directory dev-backend & \ + $(MAKE) --no-print-directory dev-frontend & \ + wait # ─── Quality ────────────────────────────────────────────────────────────────── From 8edccbb71f0e466b38468c3a99b51c49ff30e3b1 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Fri, 15 May 2026 11:52:29 +1200 Subject: [PATCH 6/9] fix(audio-lab): stack config cards in sidebar Sidebar config panels used a 3-column grid that clipped titles and wrapped labels in the narrow aside. Stack cards vertically, let the title input fill remaining width, and widen the sidebar to 384px. 
Co-Authored-By: Claude Opus 4.7 (1M context) --- tools/audio-lab/frontend/src/App.tsx | 2 +- .../frontend/src/components/AsrConfigPanel.tsx | 10 +++++----- .../audio-lab/frontend/src/components/ConfigPanel.tsx | 10 +++++----- .../frontend/src/components/PipelineConfigPanel.tsx | 10 +++++----- .../frontend/src/components/TurnConfigPanel.tsx | 10 +++++----- 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/tools/audio-lab/frontend/src/App.tsx b/tools/audio-lab/frontend/src/App.tsx index adae826..5a72fe4 100644 --- a/tools/audio-lab/frontend/src/App.tsx +++ b/tools/audio-lab/frontend/src/App.tsx @@ -943,7 +943,7 @@ function App() {
-