From b191a68b00141d3606883a98398e58e912953271 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Thu, 14 May 2026 16:00:09 +1200 Subject: [PATCH 1/5] feat: add AudioSource/AudioSink traits + G.711 codec MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two upstream-shaped additions for the WaveKat audio ecosystem: - `audio_io::{AudioSource, AudioSink}` — the producer/consumer seam the whole audio pipeline is drawn against. Frames keyed on `AudioFrame` so the trait doesn't have to know about the codec. - `codec::g711` — PCMU / PCMA encode/decode + `G711Codec` enum. Codecs are consumer-layer (`wavekat-sip` stays codec-agnostic) but benefit from a shared home so future consumers (wavekat-asr, wavekat-voice, an eventual `wavekat-agent`) don't each reimplement. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/wavekat-core/Cargo.toml | 1 + crates/wavekat-core/src/audio_io.rs | 80 ++++++++ crates/wavekat-core/src/codec/g711.rs | 263 ++++++++++++++++++++++++++ crates/wavekat-core/src/codec/mod.rs | 8 + crates/wavekat-core/src/lib.rs | 3 + 5 files changed, 355 insertions(+) create mode 100644 crates/wavekat-core/src/audio_io.rs create mode 100644 crates/wavekat-core/src/codec/g711.rs create mode 100644 crates/wavekat-core/src/codec/mod.rs diff --git a/crates/wavekat-core/Cargo.toml b/crates/wavekat-core/Cargo.toml index 8d37555..81b35d0 100644 --- a/crates/wavekat-core/Cargo.toml +++ b/crates/wavekat-core/Cargo.toml @@ -21,6 +21,7 @@ hound = { version = "3.5", optional = true } rubato = { version = "2.0", optional = true } [dev-dependencies] +tokio = { version = "1", features = ["macros", "rt"] } [package.metadata.docs.rs] all-features = true diff --git a/crates/wavekat-core/src/audio_io.rs b/crates/wavekat-core/src/audio_io.rs new file mode 100644 index 0000000..bb856fd --- /dev/null +++ b/crates/wavekat-core/src/audio_io.rs @@ -0,0 +1,80 @@ +//! Audio source/sink traits. +//! +//! These are the seam the WaveKat audio pipeline is drawn against: +//! whatever produces audio (a microphone, a TTS engine, a WAV file) +//! implements [`AudioSource`]; whatever consumes it (a speaker, an +//! RTP encoder, an ASR worker) implements [`AudioSink`]. Concrete +//! impls live in the consuming crates — cpal-backed mic/speaker in +//! `wavekat-voice`, a future agent-driven impl in `wavekat-agent`, +//! and so on — so that adding a new producer or consumer is "implement +//! the trait" rather than "rewrite the RTP path." +//! +//! The traits speak in [`AudioFrame<'static>`]: sample-rate-tagged +//! frames so consumers can resample to whatever rate the codec wants +//! without either side of the trait having to know the codec exists. + +use core::future::Future; + +use crate::AudioFrame; + +/// Produces owned [`AudioFrame`]s. `next_frame().await` returns the +/// next frame when one is available, or `None` once the source has +/// run out (file ended, device closed, dialogue terminated). Each +/// frame's [`AudioFrame::sample_rate`] is set by the implementation — +/// consumers resample as needed. +pub trait AudioSource: Send { + fn next_frame(&mut self) -> impl Future>> + Send; +} + +/// Consumes audio frames. Implementations may drop frames on +/// backpressure rather than block the caller; the alternative — +/// stalling — is worse on the RTP receive path, where it stalls the +/// whole pipeline. +pub trait AudioSink: Send { + fn write_frame(&mut self, frame: AudioFrame<'_>) -> impl Future + Send; +} + +#[cfg(test)] +mod tests { + use super::*; + + /// Smallest possible impl-pair: confirms the traits can be + /// implemented (in particular, that the `impl Future` returns are + /// `Send`) and that frames flow through them end-to-end. + #[derive(Default)] + struct VecSink { + frames: Vec>, + } + + impl AudioSink for VecSink { + async fn write_frame(&mut self, frame: AudioFrame<'_>) { + self.frames.push(frame.into_owned()); + } + } + + struct OnceSource { + frame: Option>, + } + + impl AudioSource for OnceSource { + async fn next_frame(&mut self) -> Option> { + self.frame.take() + } + } + + #[tokio::test] + async fn traits_compose_end_to_end() { + let mut source = OnceSource { + frame: Some(AudioFrame::from_vec(vec![0.5, -0.5], 8000)), + }; + let mut sink = VecSink::default(); + + let frame = source.next_frame().await.expect("frame"); + sink.write_frame(frame).await; + assert!(source.next_frame().await.is_none()); + + assert_eq!(sink.frames.len(), 1); + assert_eq!(sink.frames[0].samples(), &[0.5, -0.5]); + assert_eq!(sink.frames[0].sample_rate(), 8000); + } +} diff --git a/crates/wavekat-core/src/codec/g711.rs b/crates/wavekat-core/src/codec/g711.rs new file mode 100644 index 0000000..e19196c --- /dev/null +++ b/crates/wavekat-core/src/codec/g711.rs @@ -0,0 +1,263 @@ +//! G.711 μ-law (PCMU) and A-law (PCMA) codecs. +//! +//! All four functions are byte-for-byte conversions: one 16-bit PCM +//! sample ↔ one 8-bit codeword. A 20 ms RTP frame at 8 kHz is therefore +//! 160 samples / 160 bytes — no length surprises. +//! +//! The tables follow ITU-T G.711; see +//! for the recommendation and +//! for a readable summary. +//! Implementations are cross-checked against the reference vectors in +//! Sun Microsystems' `g711.c` and SpanDSP's reference. +//! +//! G.711 lives in `wavekat-core` (not `wavekat-sip`) because codecs are +//! a consumer-layer choice — `wavekat-sip` deliberately stays +//! codec-agnostic; SDP advertises both PCMU and PCMA and the consumer +//! picks one after answering. + +/// SDP / RTP static payload type for μ-law (G.711U). +pub const PCMU_PAYLOAD_TYPE: u8 = 0; +/// SDP / RTP static payload type for A-law (G.711A). +pub const PCMA_PAYLOAD_TYPE: u8 = 8; + +/// Sample rate of every static G.711 stream. The wire format does not +/// carry the rate; both endpoints just know. +pub const G711_SAMPLE_RATE: u32 = 8000; +/// Samples in a 20 ms G.711 frame (the standard RTP packetization +/// interval). +pub const G711_FRAME_SAMPLES: usize = 160; + +const CLIP: i32 = 32635; +const BIAS: i32 = 0x84; +const SIGN_BIT: u8 = 0x80; +const QUANT_MASK: u8 = 0x0F; +const SEG_SHIFT: u8 = 4; +const SEG_MASK: u8 = 0x70; + +#[inline] +fn seg_for(pcm: i32, seg_end: &[i32; 8]) -> usize { + for (i, &end) in seg_end.iter().enumerate() { + if pcm <= end { + return i; + } + } + seg_end.len() +} + +/// Encode one 16-bit PCM sample to a μ-law byte (G.711U). +pub fn linear_to_ulaw(pcm: i16) -> u8 { + const SEG_END: [i32; 8] = [0xFF, 0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF]; + + let mut pcm = pcm as i32; + let sign = if pcm < 0 { + pcm = -pcm; + 0x7F + } else { + 0xFF + }; + if pcm > CLIP { + pcm = CLIP; + } + pcm += BIAS; + + let seg = seg_for(pcm, &SEG_END); + if seg >= 8 { + 0x7F ^ sign + } else { + let mantissa = ((pcm >> (seg + 3)) & 0x0F) as u8; + let coded = ((seg as u8) << 4) | mantissa; + coded ^ sign + } +} + +/// Decode one μ-law byte to a 16-bit PCM sample. +pub fn ulaw_to_linear(ulaw: u8) -> i16 { + let ulaw = !ulaw; + let sign = (ulaw & SIGN_BIT) != 0; + let exponent = (ulaw & SEG_MASK) >> SEG_SHIFT; + let mantissa = ulaw & QUANT_MASK; + let mut sample = (((mantissa as i32) << 3) + BIAS) << exponent; + sample -= BIAS; + if sign { + -sample as i16 + } else { + sample as i16 + } +} + +/// Encode one 16-bit PCM sample to an A-law byte (G.711A). +pub fn linear_to_alaw(pcm: i16) -> u8 { + const SEG_END: [i32; 8] = [0xFF, 0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF]; + + let (pcm, mask) = if pcm >= 0 { + (pcm as i32, 0xD5u8) + } else { + (((!pcm) as i32) & 0x7FFF, 0x55u8) + }; + + let seg = seg_for(pcm, &SEG_END); + if seg >= 8 { + 0x7F ^ mask + } else { + let mantissa = if seg < 1 { + ((pcm >> 4) & 0x0F) as u8 + } else { + ((pcm >> (seg + 3)) & 0x0F) as u8 + }; + let coded = ((seg as u8) << 4) | mantissa; + coded ^ mask + } +} + +/// Decode one A-law byte to a 16-bit PCM sample. +/// +/// A-law's sign-bit convention is opposite to μ-law's: after XOR with +/// `0x55`, sign bit set means *positive* (see ITU-T G.711 §2.3, or +/// SpanDSP's reference implementation). +pub fn alaw_to_linear(alaw: u8) -> i16 { + let alaw = alaw ^ 0x55; + let sign_set = (alaw & SIGN_BIT) != 0; + let exponent = (alaw & SEG_MASK) >> SEG_SHIFT; + let mantissa = alaw & QUANT_MASK; + let mut sample = ((mantissa as i32) << 4) + 8; + if exponent != 0 { + sample = (sample + 0x100) << (exponent - 1); + } + if sign_set { + sample as i16 + } else { + -sample as i16 + } +} + +/// Codec selection for a session. The wire payload-type number +/// (`0`/`8`) is the canonical identifier; this enum is the typed +/// version we pass around in code. +#[derive(Debug, Clone, Copy, PartialEq, Eq)] +pub enum G711Codec { + Pcmu, + Pcma, +} + +impl G711Codec { + pub fn payload_type(self) -> u8 { + match self { + G711Codec::Pcmu => PCMU_PAYLOAD_TYPE, + G711Codec::Pcma => PCMA_PAYLOAD_TYPE, + } + } + + /// Resolve from a SIP/RTP payload type number. Returns `None` for + /// any non-G.711 payload type — the caller decides whether to fall + /// through (e.g. accept anyway, ask for re-INVITE, reject). + pub fn from_payload_type(pt: u8) -> Option { + match pt { + PCMU_PAYLOAD_TYPE => Some(G711Codec::Pcmu), + PCMA_PAYLOAD_TYPE => Some(G711Codec::Pcma), + _ => None, + } + } + + /// Encode a slice of 16-bit PCM samples into G.711 bytes, one byte + /// per sample. Appends to `out`. + pub fn encode(self, pcm: &[i16], out: &mut Vec) { + out.reserve(pcm.len()); + match self { + G711Codec::Pcmu => out.extend(pcm.iter().map(|&s| linear_to_ulaw(s))), + G711Codec::Pcma => out.extend(pcm.iter().map(|&s| linear_to_alaw(s))), + } + } + + /// Decode G.711 bytes into 16-bit PCM samples, one sample per byte. + /// Appends to `out`. + pub fn decode(self, encoded: &[u8], out: &mut Vec) { + out.reserve(encoded.len()); + match self { + G711Codec::Pcmu => out.extend(encoded.iter().map(|&b| ulaw_to_linear(b))), + G711Codec::Pcma => out.extend(encoded.iter().map(|&b| alaw_to_linear(b))), + } + } +} + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn ulaw_round_trip_silence() { + assert_eq!(linear_to_ulaw(0), 0xFF); + // μ-law silence (0xFF) decodes to a small non-zero residue — the + // codec is not loss-free near zero. The residue should round back + // to 0xFF on re-encode, which is the property that matters for + // end-to-end stability. + let s = ulaw_to_linear(0xFF); + assert_eq!(linear_to_ulaw(s), 0xFF); + } + + #[test] + fn alaw_round_trip_silence() { + let encoded = linear_to_alaw(0); + let s = alaw_to_linear(encoded); + assert_eq!(linear_to_alaw(s), encoded); + } + + #[test] + fn ulaw_handles_full_scale() { + assert_eq!(linear_to_ulaw(i16::MAX), 0x80); + assert_eq!(linear_to_ulaw(i16::MIN), 0x00); + } + + #[test] + fn alaw_handles_full_scale() { + assert_eq!(linear_to_alaw(i16::MAX), 0xD5 ^ 0x7F); + assert_eq!(linear_to_alaw(i16::MIN), 0x55 ^ 0x7F); + } + + #[test] + fn codec_encode_decode_length_matches_samples() { + let pcm: Vec = (0..160).map(|i| (i * 200) as i16).collect(); + let mut encoded = Vec::new(); + G711Codec::Pcmu.encode(&pcm, &mut encoded); + assert_eq!(encoded.len(), pcm.len()); + let mut decoded = Vec::new(); + G711Codec::Pcmu.decode(&encoded, &mut decoded); + assert_eq!(decoded.len(), encoded.len()); + } + + #[test] + fn payload_type_round_trips() { + assert_eq!(G711Codec::from_payload_type(0), Some(G711Codec::Pcmu)); + assert_eq!(G711Codec::from_payload_type(8), Some(G711Codec::Pcma)); + assert_eq!(G711Codec::from_payload_type(127), None); + assert_eq!(G711Codec::Pcmu.payload_type(), 0); + assert_eq!(G711Codec::Pcma.payload_type(), 8); + } + + #[test] + fn ulaw_round_trip_preserves_loud_samples_within_codec_step() { + let inputs: &[i16] = &[1000, -1000, 8000, -8000, 16000, -16000]; + for &s in inputs { + let encoded = linear_to_ulaw(s); + let decoded = ulaw_to_linear(encoded); + let diff = (s as i32 - decoded as i32).abs(); + assert!( + diff < 400, + "μ-law round-trip drift too large: {s} → {decoded} (Δ={diff})" + ); + } + } + + #[test] + fn alaw_round_trip_preserves_loud_samples_within_codec_step() { + let inputs: &[i16] = &[1000, -1000, 8000, -8000, 16000, -16000]; + for &s in inputs { + let encoded = linear_to_alaw(s); + let decoded = alaw_to_linear(encoded); + let diff = (s as i32 - decoded as i32).abs(); + assert!( + diff < 400, + "A-law round-trip drift too large: {s} → {decoded} (Δ={diff})" + ); + } + } +} diff --git a/crates/wavekat-core/src/codec/mod.rs b/crates/wavekat-core/src/codec/mod.rs new file mode 100644 index 0000000..45c3d0c --- /dev/null +++ b/crates/wavekat-core/src/codec/mod.rs @@ -0,0 +1,8 @@ +//! Telephony / streaming audio codecs. +//! +//! Each codec lives in a submodule so the public surface stays +//! deliberately granular: a consumer that only needs G.711 imports +//! `wavekat_core::codec::g711`. Future additions (Opus, iLBC, …) live +//! beside it and stay independently selectable. + +pub mod g711; diff --git a/crates/wavekat-core/src/lib.rs b/crates/wavekat-core/src/lib.rs index 5798562..f0d22af 100644 --- a/crates/wavekat-core/src/lib.rs +++ b/crates/wavekat-core/src/lib.rs @@ -25,7 +25,10 @@ //! ``` mod audio; +pub mod audio_io; +pub mod codec; mod error; pub use audio::{AudioFrame, IntoSamples}; +pub use audio_io::{AudioSink, AudioSource}; pub use error::CoreError; From f1aa31385770608e60a4ea0e260046b69247b3e5 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Thu, 14 May 2026 16:08:12 +1200 Subject: [PATCH 2/5] test: expand audio_io and g711 coverage - audio_io: source-drain ordering, sink backpressure pattern, per-frame sample-rate preservation, compile-time Send check. - g711: per-codeword decode fixed-point sweep, full-codeword-space no-panic, distinctness sanity, PCMU vs PCMA cross-check, slice encode/decode buffer-append behavior, sine-wave round-trip with a mean-error bound, RFC 3551 payload-type pinning. Also document the new modules under "What Belongs Here" and the repo-structure tree in CLAUDE.md. Co-Authored-By: Claude Opus 4.7 (1M context) --- CLAUDE.md | 8 +- crates/wavekat-core/src/audio_io.rs | 110 ++++++++++++++++++- crates/wavekat-core/src/codec/g711.rs | 148 ++++++++++++++++++++++++++ 3 files changed, 262 insertions(+), 4 deletions(-) diff --git a/CLAUDE.md b/CLAUDE.md index 0fb9af1..015be26 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -13,6 +13,8 @@ Provide the common audio primitives so that all WaveKat crates speak the same la - Common constants (sample rates, format standards) - Shared error types (only when genuinely shared) - Format normalisation operations on `AudioFrame` (i16→f32, sample-rate conversion) — gated behind optional features +- The `AudioSource` / `AudioSink` traits — the producer/consumer seam any audio pipeline composes against. Concrete impls (cpal mic, AI agent, RTP receive) live in their consuming crates; the trait shape lives here so they all interoperate. +- Telephony codecs (`codec::g711`: PCMU + PCMA). Codecs are consumer-layer for `wavekat-sip` (which stays codec-agnostic), but multiple downstream crates need them — keep one canonical implementation here. ## What Does NOT Belong Here @@ -59,7 +61,11 @@ wavekat-core/ │ └── wavekat-core/ # library crate │ ├── src/ │ │ ├── lib.rs # public API, re-exports -│ │ ├── audio.rs # AudioFrame, IntoSamples, resample +│ │ ├── audio.rs # AudioFrame, IntoSamples, resample, wav +│ │ ├── audio_io.rs # AudioSource / AudioSink traits +│ │ ├── codec/ # telephony codecs (g711 today) +│ │ │ ├── mod.rs +│ │ │ └── g711.rs # PCMU + PCMA encode/decode, G711Codec enum │ │ └── error.rs # CoreError │ └── Cargo.toml ├── docs/ # design documents diff --git a/crates/wavekat-core/src/audio_io.rs b/crates/wavekat-core/src/audio_io.rs index bb856fd..20efd3d 100644 --- a/crates/wavekat-core/src/audio_io.rs +++ b/crates/wavekat-core/src/audio_io.rs @@ -38,9 +38,7 @@ pub trait AudioSink: Send { mod tests { use super::*; - /// Smallest possible impl-pair: confirms the traits can be - /// implemented (in particular, that the `impl Future` returns are - /// `Send`) and that frames flow through them end-to-end. + /// In-memory sink: collects frames in a Vec. #[derive(Default)] struct VecSink { frames: Vec>, @@ -52,6 +50,7 @@ mod tests { } } + /// Source that yields a single frame then ends. struct OnceSource { frame: Option>, } @@ -62,6 +61,37 @@ mod tests { } } + /// Source that drains a queue of pre-loaded frames, then signals + /// exhaustion with `None`. Mirrors what a file-backed or + /// agent-backed source looks like in practice. + struct QueueSource { + frames: std::collections::VecDeque>, + } + + impl AudioSource for QueueSource { + async fn next_frame(&mut self) -> Option> { + self.frames.pop_front() + } + } + + /// Sink that drops every frame past `cap`. Models the "drop on + /// backpressure" pattern the trait docs call out. + struct CapSink { + cap: usize, + frames: Vec>, + dropped: usize, + } + + impl AudioSink for CapSink { + async fn write_frame(&mut self, frame: AudioFrame<'_>) { + if self.frames.len() >= self.cap { + self.dropped += 1; + return; + } + self.frames.push(frame.into_owned()); + } + } + #[tokio::test] async fn traits_compose_end_to_end() { let mut source = OnceSource { @@ -77,4 +107,78 @@ mod tests { assert_eq!(sink.frames[0].samples(), &[0.5, -0.5]); assert_eq!(sink.frames[0].sample_rate(), 8000); } + + #[tokio::test] + async fn source_drains_in_order_then_returns_none() { + let mut source = QueueSource { + frames: vec![ + AudioFrame::from_vec(vec![0.1], 8000), + AudioFrame::from_vec(vec![0.2], 8000), + AudioFrame::from_vec(vec![0.3], 8000), + ] + .into(), + }; + let mut sink = VecSink::default(); + while let Some(f) = source.next_frame().await { + sink.write_frame(f).await; + } + assert_eq!(sink.frames.len(), 3); + assert_eq!(sink.frames[0].samples(), &[0.1]); + assert_eq!(sink.frames[1].samples(), &[0.2]); + assert_eq!(sink.frames[2].samples(), &[0.3]); + + // Past exhaustion the source must keep returning None — callers + // rely on this to unblock their drain loop. + assert!(source.next_frame().await.is_none()); + assert!(source.next_frame().await.is_none()); + } + + #[tokio::test] + async fn sink_with_capacity_drops_overflow() { + let mut sink = CapSink { + cap: 2, + frames: Vec::new(), + dropped: 0, + }; + for i in 0..5 { + sink.write_frame(AudioFrame::from_vec(vec![i as f32], 8000)) + .await; + } + assert_eq!(sink.frames.len(), 2); + assert_eq!(sink.dropped, 3); + // Cap policy is FIFO-keep / tail-drop here; the first two went + // through, the rest got dropped. + assert_eq!(sink.frames[0].samples(), &[0.0]); + assert_eq!(sink.frames[1].samples(), &[1.0]); + } + + #[tokio::test] + async fn frame_sample_rate_round_trips_through_sink() { + // Different impls emit at their native rates — the sink must + // preserve `sample_rate` so the consumer can resample + // downstream without losing the original rate. + let mut sink = VecSink::default(); + sink.write_frame(AudioFrame::from_vec(vec![0.0], 8000)) + .await; + sink.write_frame(AudioFrame::from_vec(vec![0.0], 16_000)) + .await; + sink.write_frame(AudioFrame::from_vec(vec![0.0], 48_000)) + .await; + let rates: Vec = sink.frames.iter().map(|f| f.sample_rate()).collect(); + assert_eq!(rates, vec![8000, 16_000, 48_000]); + } + + /// Compile-time check that the trait bounds (the trait is `Send`, + /// and so are the returned futures) actually compose. If anything + /// drops the `Send` bound, this test stops compiling — which is + /// what we want, because the RTP path holds these impls across + /// `.await` points in a multi-threaded runtime. + #[test] + fn impls_are_send() { + fn assert_send(_: &T) {} + let source = OnceSource { frame: None }; + let sink = VecSink::default(); + assert_send(&source); + assert_send(&sink); + } } diff --git a/crates/wavekat-core/src/codec/g711.rs b/crates/wavekat-core/src/codec/g711.rs index e19196c..b1f0d5f 100644 --- a/crates/wavekat-core/src/codec/g711.rs +++ b/crates/wavekat-core/src/codec/g711.rs @@ -260,4 +260,152 @@ mod tests { ); } } + + #[test] + fn ulaw_decode_is_a_fixed_point_for_every_codeword() { + // The right invariant: a decoded sample is the canonical form + // of its codeword, so decode(encode(decode(b))) must equal + // decode(b) for every codeword. The weaker variant + // (encode→decode→encode == encode for every i16) fails near + // zero because μ-law has separate +0/-0 codewords that + // collapse to the same decoded sample; that's a property of + // the codec, not a bug. + for b in 0u8..=255 { + let mid = ulaw_to_linear(b); + let again = ulaw_to_linear(linear_to_ulaw(mid)); + assert_eq!(again, mid, "μ-law decode not fixed-point at {b:#x}"); + } + } + + #[test] + fn alaw_decode_is_a_fixed_point_for_every_codeword() { + for b in 0u8..=255 { + let mid = alaw_to_linear(b); + let again = alaw_to_linear(linear_to_alaw(mid)); + assert_eq!(again, mid, "A-law decode not fixed-point at {b:#x}"); + } + } + + #[test] + fn ulaw_decode_covers_full_codeword_space_without_panic() { + // 256 possible codewords. Decoding all of them must not panic + // and must stay in i16 range. + for b in 0u8..=255 { + let _ = ulaw_to_linear(b); + } + } + + #[test] + fn alaw_decode_covers_full_codeword_space_without_panic() { + for b in 0u8..=255 { + let _ = alaw_to_linear(b); + } + } + + #[test] + fn ulaw_zero_is_distinct_from_full_scale() { + // Sanity: a non-trivial codec maps 0 and i16::MAX to different + // bytes. Guards against a stub impl that returns a constant. + assert_ne!(linear_to_ulaw(0), linear_to_ulaw(i16::MAX)); + assert_ne!(linear_to_ulaw(0), linear_to_ulaw(i16::MIN)); + } + + #[test] + fn alaw_zero_is_distinct_from_full_scale() { + assert_ne!(linear_to_alaw(0), linear_to_alaw(i16::MAX)); + assert_ne!(linear_to_alaw(0), linear_to_alaw(i16::MIN)); + } + + #[test] + fn pcmu_and_pcma_produce_different_bytes_for_the_same_input() { + // Guards against accidentally aliasing the two paths (e.g. a + // typo wiring Pcma to linear_to_ulaw). PCMU and PCMA share the + // shape but pick different quantisation tables and silence + // codewords; they should not match on a non-trivial sample. + let s = 12345i16; + assert_ne!(linear_to_ulaw(s), linear_to_alaw(s)); + } + + #[test] + fn codec_enum_dispatches_to_the_right_path() { + // Crossing the enum boundary must end up at the matching + // function — not swapped, not aliased. + let pcm = vec![1000i16, -2000, 3000]; + + let mut a = Vec::new(); + G711Codec::Pcmu.encode(&pcm, &mut a); + let mut b = Vec::new(); + for &s in &pcm { + b.push(linear_to_ulaw(s)); + } + assert_eq!(a, b); + + let mut c = Vec::new(); + G711Codec::Pcma.encode(&pcm, &mut c); + let mut d = Vec::new(); + for &s in &pcm { + d.push(linear_to_alaw(s)); + } + assert_eq!(c, d); + } + + #[test] + fn slice_encode_then_decode_recovers_signal_within_codec_drift() { + // Twenty-millisecond G.711 frame of a small sine — encode, + // decode, and compare against the input. The codec is lossy + // (log-PCM quantisation), so we allow a per-sample drift, but + // the average error must be small for an "audible" path. + let samples: Vec = (0..G711_FRAME_SAMPLES) + .map(|i| { + let t = i as f32 / G711_SAMPLE_RATE as f32; + ((t * 440.0 * 2.0 * std::f32::consts::PI).sin() * 8000.0) as i16 + }) + .collect(); + + for codec in [G711Codec::Pcmu, G711Codec::Pcma] { + let mut encoded = Vec::new(); + codec.encode(&samples, &mut encoded); + assert_eq!(encoded.len(), G711_FRAME_SAMPLES); + + let mut decoded = Vec::new(); + codec.decode(&encoded, &mut decoded); + assert_eq!(decoded.len(), G711_FRAME_SAMPLES); + + let mean_abs_error: f64 = samples + .iter() + .zip(decoded.iter()) + .map(|(a, b)| (*a as i32 - *b as i32).abs() as f64) + .sum::() + / samples.len() as f64; + // 200 i16 units ≈ 0.6% of full scale — comfortably below + // perceptible degradation for telephony. + assert!( + mean_abs_error < 200.0, + "{codec:?}: mean abs error {mean_abs_error} too high" + ); + } + } + + #[test] + fn encode_appends_rather_than_replacing() { + // The slice-level encode/decode take `&mut Vec<…>` and append. + // Verifying that explicitly so callers can reuse buffers + // across RTP packets without per-packet alloc. + let mut buf = vec![0xFFu8, 0xFEu8]; + let pcm = vec![0i16; 3]; + G711Codec::Pcmu.encode(&pcm, &mut buf); + assert_eq!(buf.len(), 5); + assert_eq!(&buf[..2], &[0xFF, 0xFE]); + } + + #[test] + fn payload_type_constants_match_rfc3551() { + // RFC 3551 §6 pins PCMU=0 and PCMA=8. Hard-coding these + // numbers in tests protects against a casual rename that would + // silently break SDP negotiation against any real PBX. + assert_eq!(PCMU_PAYLOAD_TYPE, 0); + assert_eq!(PCMA_PAYLOAD_TYPE, 8); + assert_eq!(G711_SAMPLE_RATE, 8000); + assert_eq!(G711_FRAME_SAMPLES, 160); + } } From ba648862816ef1842c4f991f80f99e833c0ebc5e Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Thu, 14 May 2026 16:14:52 +1200 Subject: [PATCH 3/5] refactor: drop dead saturation guards in g711 Replace seg_for's loop with leading_zeros bit-math so it can't fall through, and remove the unreachable seg >= 8 arms in linear_to_ulaw and linear_to_alaw. Inputs are already bounded to [0, 0x7FFF], so the guards never fired and showed up as missing patch coverage. Co-Authored-By: Claude Opus 4.7 (1M context) --- crates/wavekat-core/src/codec/g711.rs | 50 +++++++++++---------------- 1 file changed, 21 insertions(+), 29 deletions(-) diff --git a/crates/wavekat-core/src/codec/g711.rs b/crates/wavekat-core/src/codec/g711.rs index b1f0d5f..dd8fa41 100644 --- a/crates/wavekat-core/src/codec/g711.rs +++ b/crates/wavekat-core/src/codec/g711.rs @@ -34,20 +34,22 @@ const QUANT_MASK: u8 = 0x0F; const SEG_SHIFT: u8 = 4; const SEG_MASK: u8 = 0x70; +// G.711 segment index for `pcm` in `[0, 0x7FFF]`. The segment is the +// position of the highest set bit above bit 7, clamped to 0 for +// `pcm < 0x100`. Callers in this file bound their inputs to `≤ 0x7FFF` +// (μ-law clips to CLIP+BIAS=0x7FFF; A-law masks with 0x7FFF), so we +// pick the bit-math form that needs no out-of-range fallback. #[inline] -fn seg_for(pcm: i32, seg_end: &[i32; 8]) -> usize { - for (i, &end) in seg_end.iter().enumerate() { - if pcm <= end { - return i; - } +fn seg_for(pcm: i32) -> u32 { + if pcm < 0x100 { + 0 + } else { + 31 - (pcm as u32).leading_zeros() - 7 } - seg_end.len() } /// Encode one 16-bit PCM sample to a μ-law byte (G.711U). pub fn linear_to_ulaw(pcm: i16) -> u8 { - const SEG_END: [i32; 8] = [0xFF, 0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF]; - let mut pcm = pcm as i32; let sign = if pcm < 0 { pcm = -pcm; @@ -60,14 +62,10 @@ pub fn linear_to_ulaw(pcm: i16) -> u8 { } pcm += BIAS; - let seg = seg_for(pcm, &SEG_END); - if seg >= 8 { - 0x7F ^ sign - } else { - let mantissa = ((pcm >> (seg + 3)) & 0x0F) as u8; - let coded = ((seg as u8) << 4) | mantissa; - coded ^ sign - } + let seg = seg_for(pcm); + let mantissa = ((pcm >> (seg + 3)) & 0x0F) as u8; + let coded = ((seg as u8) << 4) | mantissa; + coded ^ sign } /// Decode one μ-law byte to a 16-bit PCM sample. @@ -87,26 +85,20 @@ pub fn ulaw_to_linear(ulaw: u8) -> i16 { /// Encode one 16-bit PCM sample to an A-law byte (G.711A). pub fn linear_to_alaw(pcm: i16) -> u8 { - const SEG_END: [i32; 8] = [0xFF, 0x1FF, 0x3FF, 0x7FF, 0xFFF, 0x1FFF, 0x3FFF, 0x7FFF]; - let (pcm, mask) = if pcm >= 0 { (pcm as i32, 0xD5u8) } else { (((!pcm) as i32) & 0x7FFF, 0x55u8) }; - let seg = seg_for(pcm, &SEG_END); - if seg >= 8 { - 0x7F ^ mask + let seg = seg_for(pcm); + let mantissa = if seg < 1 { + ((pcm >> 4) & 0x0F) as u8 } else { - let mantissa = if seg < 1 { - ((pcm >> 4) & 0x0F) as u8 - } else { - ((pcm >> (seg + 3)) & 0x0F) as u8 - }; - let coded = ((seg as u8) << 4) | mantissa; - coded ^ mask - } + ((pcm >> (seg + 3)) & 0x0F) as u8 + }; + let coded = ((seg as u8) << 4) | mantissa; + coded ^ mask } /// Decode one A-law byte to a 16-bit PCM sample. From f0955132579b51d4cf266f7facee46cbf5c1a06d Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Thu, 14 May 2026 16:17:16 +1200 Subject: [PATCH 4/5] docs: document audio_io + g711 in README, fill missing rustdoc Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 59 +++++++++++++++++++++++++++ crates/wavekat-core/src/codec/g711.rs | 4 ++ 2 files changed, 63 insertions(+) diff --git a/README.md b/README.md index e3453c3..0738933 100644 --- a/README.md +++ b/README.md @@ -19,6 +19,8 @@ Shared types for the WaveKat audio processing ecosystem. |------|-------------| | `AudioFrame` | Audio samples with sample rate, accepts `i16` and `f32` in slice, Vec, or array form | | `IntoSamples` | Trait for transparent sample format conversion | +| `AudioSource` / `AudioSink` | Async producer/consumer traits — the seam every WaveKat audio pipeline composes against | +| `codec::g711` | G.711 μ-law (PCMU) and A-law (PCMA) — telephony codecs for SIP/RTP | ## Quick Start @@ -65,6 +67,63 @@ Your audio (any format) +---> wavekat-asr (future) ``` +## Audio Sources and Sinks + +`AudioSource` and `AudioSink` are the producer/consumer seam every WaveKat +audio pipeline composes against. Concrete impls live in their consuming +crates (cpal mic/speaker in `wavekat-voice`, an agent-backed source in +`wavekat-agent`, …) so adding a new producer or consumer is "implement +the trait" rather than "rewrite the RTP path." + +```rust +use wavekat_core::{AudioFrame, AudioSink, AudioSource}; + +struct FileSource { /* … */ } +struct RtpSink { /* … */ } + +impl AudioSource for FileSource { + async fn next_frame(&mut self) -> Option> { + // None signals end-of-stream; callers stop draining. + todo!() + } +} + +impl AudioSink for RtpSink { + async fn write_frame(&mut self, frame: AudioFrame<'_>) { + // Implementations may drop on backpressure rather than block — + // stalling the RTP path is worse than dropping a frame. + todo!() + } +} +``` + +## G.711 Telephony Codec + +PCMU (μ-law) and PCMA (A-law) — the two static codecs every SIP endpoint +speaks. One 16-bit PCM sample ↔ one 8-bit codeword; a 20 ms RTP frame at +8 kHz is 160 samples / 160 bytes. + +```rust +use wavekat_core::codec::g711::{ + G711Codec, G711_FRAME_SAMPLES, G711_SAMPLE_RATE, +}; + +// Resolve the codec from a SIP/RTP payload type. +let codec = G711Codec::from_payload_type(0).unwrap(); // 0 = PCMU, 8 = PCMA + +// Encode a 20 ms frame of PCM into G.711 bytes. +let pcm: Vec = vec![0; G711_FRAME_SAMPLES]; +let mut bytes = Vec::with_capacity(G711_FRAME_SAMPLES); +codec.encode(&pcm, &mut bytes); +assert_eq!(bytes.len(), G711_FRAME_SAMPLES); + +// Decode the other direction. +let mut decoded = Vec::with_capacity(bytes.len()); +codec.decode(&bytes, &mut decoded); + +assert_eq!(G711_SAMPLE_RATE, 8000); +``` + ## Optional Features ### `wav` diff --git a/crates/wavekat-core/src/codec/g711.rs b/crates/wavekat-core/src/codec/g711.rs index dd8fa41..4419d9c 100644 --- a/crates/wavekat-core/src/codec/g711.rs +++ b/crates/wavekat-core/src/codec/g711.rs @@ -127,11 +127,15 @@ pub fn alaw_to_linear(alaw: u8) -> i16 { /// version we pass around in code. #[derive(Debug, Clone, Copy, PartialEq, Eq)] pub enum G711Codec { + /// μ-law (G.711U) — North America / Japan default, RTP payload type `0`. Pcmu, + /// A-law (G.711A) — Europe / rest-of-world default, RTP payload type `8`. Pcma, } impl G711Codec { + /// The static RTP payload-type number for this codec — `0` for PCMU, + /// `8` for PCMA (RFC 3551 §6). pub fn payload_type(self) -> u8 { match self { G711Codec::Pcmu => PCMU_PAYLOAD_TYPE, From 410618e6abe7c37c65b32eb2009420df11e51396 Mon Sep 17 00:00:00 2001 From: Eason WaveKat Date: Thu, 14 May 2026 16:19:41 +1200 Subject: [PATCH 5/5] docs: drop sibling crate names from audio_io section Co-Authored-By: Claude Opus 4.7 (1M context) --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 0738933..991af58 100644 --- a/README.md +++ b/README.md @@ -70,10 +70,10 @@ Your audio (any format) ## Audio Sources and Sinks `AudioSource` and `AudioSink` are the producer/consumer seam every WaveKat -audio pipeline composes against. Concrete impls live in their consuming -crates (cpal mic/speaker in `wavekat-voice`, an agent-backed source in -`wavekat-agent`, …) so adding a new producer or consumer is "implement -the trait" rather than "rewrite the RTP path." +audio pipeline composes against. Concrete impls (cpal-backed mic/speaker, +agent-driven sources, RTP-driven sinks, …) live in the consuming crates so +that adding a new producer or consumer is "implement the trait" rather than +"rewrite the RTP path." ```rust use wavekat_core::{AudioFrame, AudioSink, AudioSource};