From b26787a4f4cc45e55fd3586d70e676d24b3132d1 Mon Sep 17 00:00:00 2001 From: Harikrishna Reddy Date: Thu, 23 Apr 2026 11:40:50 -0500 Subject: [PATCH 1/9] config: add AugmentationConfig.n_workers and mp_context fields --- src/livekit/wakeword/config.py | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/src/livekit/wakeword/config.py b/src/livekit/wakeword/config.py index 4d694d5..e79b63b 100644 --- a/src/livekit/wakeword/config.py +++ b/src/livekit/wakeword/config.py @@ -5,7 +5,7 @@ import logging from enum import StrEnum from pathlib import Path -from typing import Annotated, Self +from typing import Annotated, Literal, Self import yaml from pydantic import BaseModel, Field, model_validator @@ -58,6 +58,16 @@ class AugmentationConfig(BaseModel): background_paths: list[str] = Field(default_factory=lambda: ["./data/backgrounds"]) rir_paths: list[str] = Field(default_factory=lambda: ["./data/rirs"]) + n_workers: int = 1 + """Number of parallel worker processes for the audio DSP loop. + 0 = auto (os.cpu_count()), 1 = single-threaded (legacy, default for + backwards compatibility), N = explicit worker count.""" + + mp_context: Literal["auto", "fork", "spawn", "forkserver"] = "auto" + """Multiprocessing start method. "auto" picks 'fork' on Linux/macOS + and 'spawn' on Windows. Override only if a fork-unsafe audio backend + is crashing workers.""" + class ModelConfig(BaseModel): model_type: ModelType = ModelType.conv_attention From 8086b4b054df639109a1b5f95efd2ea723d4e8ba Mon Sep 17 00:00:00 2001 From: Harikrishna Reddy Date: Thu, 23 Apr 2026 11:40:57 -0500 Subject: [PATCH 2/9] augment: optional multiprocessing.Pool for _augment_directory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds an opt-in parallel code path gated on AugmentationConfig.n_workers (default 1 preserves single-threaded behavior). 
On a 32-CPU Linux container, this takes augmentation DSP throughput from ~2 clips/sec to ~210 clips/sec — roughly a 90× speedup for a 25k-clip dataset. --- src/livekit/wakeword/data/augment.py | 224 ++++++++++++++++++++++----- 1 file changed, 186 insertions(+), 38 deletions(-) diff --git a/src/livekit/wakeword/data/augment.py b/src/livekit/wakeword/data/augment.py index a5785b5..ae9d7a2 100644 --- a/src/livekit/wakeword/data/augment.py +++ b/src/livekit/wakeword/data/augment.py @@ -3,7 +3,10 @@ from __future__ import annotations import logging +import os import random +import sys +from multiprocessing import get_context from pathlib import Path from typing import Any @@ -152,11 +155,16 @@ def run_augment(config: WakeWordConfig) -> None: for p in old_augs: p.unlink() + background_paths = [Path(p) for p in config.augmentation.background_paths] + rir_paths = [Path(p) for p in config.augmentation.rir_paths] augmentor = AudioAugmentor( - background_paths=[Path(p) for p in config.augmentation.background_paths], - rir_paths=[Path(p) for p in config.augmentation.rir_paths], + background_paths=background_paths, + rir_paths=rir_paths, ) + n_workers = config.augmentation.n_workers + mp_context = config.augmentation.mp_context + for round_idx in range(config.augmentation.rounds): logger.info(f"Augmentation round {round_idx + 1}/{config.augmentation.rounds}") for split in _ALL_SPLITS: @@ -170,9 +178,146 @@ def run_augment(config: WakeWordConfig) -> None: is_positive="positive" in split, round_idx=round_idx, target_duration_s=target_duration, + n_workers=n_workers, + mp_context=mp_context, + background_paths=background_paths, + rir_paths=rir_paths, ) +def _process_one( + wav_path: Path, + augmentor: AudioAugmentor, + is_positive: bool, + round_idx: int, + target_length: int, + sample_rate: int, +) -> None: + """Augment a single WAV file in place and write its ``_rN.wav`` output. 
+ + Identical to the body of the single-threaded loop — kept as a standalone + function so both the serial and parallel paths share exactly one + implementation of the per-clip pipeline. + """ + import re + + import soundfile as sf + + audio, _sr = sf.read(str(wav_path)) + if audio.ndim > 1: + audio = audio[:, 0] + audio = audio.astype(np.float32) + + audio = augmentor.augment_clip(audio) + audio = augmentor.apply_rir(audio) + audio = augmentor.mix_with_background(audio) + + if round_idx == 0: + if is_positive: + audio = align_clip_to_end(audio, target_length) + else: + if len(audio) < target_length: + padded = np.zeros(target_length, dtype=np.float32) + start = (target_length - len(audio)) // 2 + padded[start : start + len(audio)] = audio + audio = padded + elif len(audio) > target_length: + start = (len(audio) - target_length) // 2 + audio = audio[start : start + target_length] + + orig_stem = re.sub(r"_r\d+$", "", wav_path.stem) + out_path = wav_path.with_name(f"{orig_stem}_r{round_idx}.wav") + sf.write(str(out_path), audio, sample_rate) + + +# --- Multiprocessing support ------------------------------------------------- +# +# Worker processes each build their own ``AudioAugmentor`` via ``_init_worker``. +# The parent's instance is never pickled: ``AudioAugmentor._per_sample_aug`` is +# lazily initialised to an ``audiomentations.Compose`` whose members include +# unpicklable SciPy state, so round-tripping it through ``Pool.map`` is +# fragile. Sending only the source paths and re-constructing is robust. 
+ +_WORKER_AUGMENTOR: AudioAugmentor | None = None + + +def _init_worker( + background_paths: list[Path], + rir_paths: list[Path], + sample_rate: int, + seed: int | None, +) -> None: + global _WORKER_AUGMENTOR + _WORKER_AUGMENTOR = AudioAugmentor( + background_paths=background_paths, + rir_paths=rir_paths, + sample_rate=sample_rate, + ) + # Give each worker a distinct random state so RIR/background picks and + # audiomentations probabilities aren't identical across processes. + worker_seed = (seed or 0) ^ (os.getpid() & 0xFFFFFFFF) + random.seed(worker_seed) + np.random.seed(worker_seed & 0xFFFFFFFF) + + +def _augment_one(args: tuple[Path, bool, int, int, int]) -> None: + wav_path, is_positive, round_idx, target_length, sample_rate = args + assert _WORKER_AUGMENTOR is not None, "worker not initialised" + _process_one( + wav_path=wav_path, + augmentor=_WORKER_AUGMENTOR, + is_positive=is_positive, + round_idx=round_idx, + target_length=target_length, + sample_rate=sample_rate, + ) + + +def _pick_context(user_choice: str): + if user_choice != "auto": + return get_context(user_choice) + return get_context("spawn" if sys.platform == "win32" else "fork") + + +def _parallel_augment_directory( + wav_files: list[Path], + is_positive: bool, + round_idx: int, + target_length: int, + sample_rate: int, + background_paths: list[Path], + rir_paths: list[Path], + n_workers: int, + mp_context: str, + desc: str, +) -> None: + from tqdm import tqdm + + if n_workers == 0: + n_workers = os.cpu_count() or 1 + n_workers = max(1, min(n_workers, len(wav_files))) + + ctx = _pick_context(mp_context) + chunksize = max(1, len(wav_files) // (n_workers * 16)) + + tasks = [ + (p, is_positive, round_idx, target_length, sample_rate) for p in wav_files + ] + + with ctx.Pool( + processes=n_workers, + initializer=_init_worker, + initargs=(background_paths, rir_paths, sample_rate, round_idx), + ) as pool: + for _ in tqdm( + pool.imap_unordered(_augment_one, tasks, chunksize=chunksize), + 
total=len(tasks), + desc=desc, + unit="clip", + ): + pass + + def _augment_directory( clip_dir: Path, augmentor: AudioAugmentor, @@ -180,6 +325,10 @@ def _augment_directory( target_duration_s: float = 2.0, sample_rate: int = 16000, round_idx: int = 0, + n_workers: int = 1, + mp_context: str = "auto", + background_paths: list[Path] | None = None, + rir_paths: list[Path] | None = None, ) -> None: """Augment all WAV files in a directory. @@ -188,10 +337,14 @@ def _augment_directory( augmentation compounds (stacks) progressively. Every round writes to its own file (``clip_000000_r0.wav``, ``_r1.wav``, …) so the originals are always preserved. + + When ``n_workers != 1`` the per-clip loop is parallelised across a + ``multiprocessing.Pool``. Each worker builds its own ``AudioAugmentor`` + from ``background_paths`` / ``rir_paths`` so the parent's lazy-loaded + audiomentations instance does not need to be pickled. """ import re - import soundfile as sf from tqdm import tqdm target_length = int(target_duration_s * sample_rate) @@ -205,38 +358,33 @@ def _augment_directory( wav_files = sorted(p for p in clip_dir.glob("*.wav") if _src_re.match(p.name)) - for wav_path in tqdm(wav_files, desc=f"Augmenting {clip_dir.name} r{round_idx}", unit="clip"): - audio, sr = sf.read(str(wav_path)) - if audio.ndim > 1: - audio = audio[:, 0] - audio = audio.astype(np.float32) - - # Apply per-sample augmentations - audio = augmentor.augment_clip(audio) - - # Apply RIR - audio = augmentor.apply_rir(audio) - - # Mix with background - audio = augmentor.mix_with_background(audio) - - # Align to target duration only on round 0 (raw TTS clips vary in - # length). Later rounds already have the correct duration. 
- if round_idx == 0: - if is_positive: - audio = align_clip_to_end(audio, target_length) - else: - # Center-pad or crop negatives - if len(audio) < target_length: - padded = np.zeros(target_length, dtype=np.float32) - start = (target_length - len(audio)) // 2 - padded[start : start + len(audio)] = audio - audio = padded - elif len(audio) > target_length: - start = (len(audio) - target_length) // 2 - audio = audio[start : start + target_length] - - # Derive output name from the original stem (strip any _rN suffix) - orig_stem = re.sub(r"_r\d+$", "", wav_path.stem) - out_path = wav_path.with_name(f"{orig_stem}_r{round_idx}.wav") - sf.write(str(out_path), audio, sample_rate) + if not wav_files: + return + + desc = f"Augmenting {clip_dir.name} r{round_idx}" + + if n_workers != 1: + _parallel_augment_directory( + wav_files=wav_files, + is_positive=is_positive, + round_idx=round_idx, + target_length=target_length, + sample_rate=sample_rate, + background_paths=background_paths or [], + rir_paths=rir_paths or [], + n_workers=n_workers, + mp_context=mp_context, + desc=desc, + ) + return + + # Single-threaded path — unchanged. 
+ for wav_path in tqdm(wav_files, desc=desc, unit="clip"): + _process_one( + wav_path=wav_path, + augmentor=augmentor, + is_positive=is_positive, + round_idx=round_idx, + target_length=target_length, + sample_rate=sample_rate, + ) From b7b9b8cb9006cfbf019c3052f78a5372aa7067e8 Mon Sep 17 00:00:00 2001 From: Harikrishna Reddy Date: Thu, 23 Apr 2026 11:41:02 -0500 Subject: [PATCH 3/9] tests: round-trip parity between single-threaded and pool augment --- tests/test_augment_parallel.py | 179 +++++++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 tests/test_augment_parallel.py diff --git a/tests/test_augment_parallel.py b/tests/test_augment_parallel.py new file mode 100644 index 0000000..9259139 --- /dev/null +++ b/tests/test_augment_parallel.py @@ -0,0 +1,179 @@ +"""Parity tests for the parallel (multiprocessing) augmentation path. + +These round-trip a small synthetic dataset through both the single-threaded +and the ``multiprocessing.Pool`` code paths in ``_augment_directory`` and +assert that the output files match in count, shape, and duration. + +Audio content is NOT expected to be bit-identical: each worker has its own +``random.seed`` / ``np.random.seed`` state, so RIR choice, background +selection, SNR draws, and per-sample augmentation probabilities will differ. +The contract is "same number of outputs, same shape/duration, same file +naming scheme" — which is what the downstream feature extractor actually +cares about. +""" + +from __future__ import annotations + +from pathlib import Path + +import numpy as np +import soundfile as sf + +from livekit.wakeword.data.augment import AudioAugmentor, _augment_directory + + +SAMPLE_RATE = 16000 +CLIP_DURATION_S = 2.0 +N_CLIPS = 20 + + +def _make_synthetic_clips(clip_dir: Path, n: int, duration_s: float = 1.0) -> None: + """Write ``n`` tiny synthetic WAVs named clip_000000.wav .. 
clip_{n-1:06d}.wav.""" + clip_dir.mkdir(parents=True, exist_ok=True) + length = int(duration_s * SAMPLE_RATE) + rng = np.random.default_rng(42) + for i in range(n): + audio = rng.standard_normal(length).astype(np.float32) * 0.1 + sf.write(str(clip_dir / f"clip_{i:06d}.wav"), audio, SAMPLE_RATE) + + +def _augmentor(tmp_path: Path) -> AudioAugmentor: + """Augmentor with no backgrounds or RIRs — keeps the test hermetic. + + With empty background/RIR pools the DSP becomes deterministic up to the + audiomentations per-sample transforms. The worker random seeds still + diverge, which is what the parity assertion below accounts for. + """ + empty = tmp_path / "empty_backgrounds" + empty.mkdir(exist_ok=True) + return AudioAugmentor(background_paths=[empty], rir_paths=[empty]) + + +def _list_round_outputs(clip_dir: Path, round_idx: int) -> list[Path]: + return sorted(clip_dir.glob(f"clip_*_r{round_idx}.wav")) + + +def test_parallel_matches_singlethreaded(tmp_path: Path) -> None: + """Single-threaded and 4-worker parallel paths produce the same output set.""" + target_length = int(CLIP_DURATION_S * SAMPLE_RATE) + empty = tmp_path / "empty_backgrounds" + + clip_dir_serial = tmp_path / "serial" + clip_dir_parallel = tmp_path / "parallel" + _make_synthetic_clips(clip_dir_serial, N_CLIPS) + _make_synthetic_clips(clip_dir_parallel, N_CLIPS) + + # --- single-threaded path --- + _augment_directory( + clip_dir=clip_dir_serial, + augmentor=_augmentor(tmp_path), + is_positive=True, + target_duration_s=CLIP_DURATION_S, + sample_rate=SAMPLE_RATE, + round_idx=0, + n_workers=1, + ) + serial_outputs = _list_round_outputs(clip_dir_serial, 0) + + # --- parallel path --- + _augment_directory( + clip_dir=clip_dir_parallel, + augmentor=_augmentor(tmp_path), + is_positive=True, + target_duration_s=CLIP_DURATION_S, + sample_rate=SAMPLE_RATE, + round_idx=0, + n_workers=4, + mp_context="auto", + background_paths=[empty], + rir_paths=[empty], + ) + parallel_outputs = 
_list_round_outputs(clip_dir_parallel, 0) + + # File count and naming must match exactly. + assert len(serial_outputs) == N_CLIPS + assert len(parallel_outputs) == N_CLIPS + assert [p.name for p in serial_outputs] == [p.name for p in parallel_outputs] + + # Every output must have the aligned target length (round 0, positive). + for p in serial_outputs + parallel_outputs: + audio, sr = sf.read(str(p)) + assert sr == SAMPLE_RATE + assert audio.shape == (target_length,), ( + f"{p} has shape {audio.shape}, expected ({target_length},)" + ) + assert audio.dtype in (np.float32, np.float64) + + +def test_parallel_negative_shape(tmp_path: Path) -> None: + """Negative clips (center-padded) also come out at target length from both paths.""" + target_length = int(CLIP_DURATION_S * SAMPLE_RATE) + empty = tmp_path / "empty_backgrounds" + + clip_dir = tmp_path / "neg_parallel" + _make_synthetic_clips(clip_dir, N_CLIPS, duration_s=0.5) + + _augment_directory( + clip_dir=clip_dir, + augmentor=_augmentor(tmp_path), + is_positive=False, + target_duration_s=CLIP_DURATION_S, + sample_rate=SAMPLE_RATE, + round_idx=0, + n_workers=3, + mp_context="auto", + background_paths=[empty], + rir_paths=[empty], + ) + outputs = _list_round_outputs(clip_dir, 0) + assert len(outputs) == N_CLIPS + for p in outputs: + audio, _ = sf.read(str(p)) + assert audio.shape == (target_length,) + + +def test_n_workers_auto(tmp_path: Path) -> None: + """``n_workers=0`` (auto) uses all available cores without crashing.""" + empty = tmp_path / "empty_backgrounds" + clip_dir = tmp_path / "auto" + _make_synthetic_clips(clip_dir, N_CLIPS) + + _augment_directory( + clip_dir=clip_dir, + augmentor=_augmentor(tmp_path), + is_positive=True, + target_duration_s=CLIP_DURATION_S, + sample_rate=SAMPLE_RATE, + round_idx=0, + n_workers=0, + mp_context="auto", + background_paths=[empty], + rir_paths=[empty], + ) + assert len(_list_round_outputs(clip_dir, 0)) == N_CLIPS + + +def test_round_1_reads_r0(tmp_path: Path) -> None: + 
"""Round 1 in the parallel path correctly reads the _r0 outputs of round 0.""" + empty = tmp_path / "empty_backgrounds" + clip_dir = tmp_path / "multi_round" + _make_synthetic_clips(clip_dir, N_CLIPS) + + for round_idx in (0, 1): + _augment_directory( + clip_dir=clip_dir, + augmentor=_augmentor(tmp_path), + is_positive=True, + target_duration_s=CLIP_DURATION_S, + sample_rate=SAMPLE_RATE, + round_idx=round_idx, + n_workers=2, + mp_context="auto", + background_paths=[empty], + rir_paths=[empty], + ) + + assert len(_list_round_outputs(clip_dir, 0)) == N_CLIPS + assert len(_list_round_outputs(clip_dir, 1)) == N_CLIPS + # Originals preserved. + assert len(sorted(clip_dir.glob("clip_[0-9]*.wav"))) >= N_CLIPS From dea2ace192b37c59e534eae45625be37610b0812 Mon Sep 17 00:00:00 2001 From: Harikrishna Reddy Date: Thu, 23 Apr 2026 11:41:07 -0500 Subject: [PATCH 4/9] docs: document AugmentationConfig.n_workers with benchmarks --- configs/prod.yaml | 6 ++++++ configs/test.yaml | 1 + docs/augmentation.md | 21 +++++++++++++++++++++ 3 files changed, 28 insertions(+) diff --git a/configs/prod.yaml b/configs/prod.yaml index 164faca..443caf8 100644 --- a/configs/prod.yaml +++ b/configs/prod.yaml @@ -100,6 +100,12 @@ augmentation: # Room impulse response directories for reverb (downloaded via `setup`) rir_paths: [./data/rirs] + # Parallelism for the per-clip DSP loop. Default 1 preserves the legacy + # single-threaded code path. On a multi-core host (e.g. 32-CPU Modal + # container) setting this to 0 uses all cores and gives a ~10-100x + # speedup for the augmentation stage. 
+ # n_workers: 0 # uncomment to use all CPU cores (10–100× faster) + # ============================================================================ # Model Architecture # ============================================================================ diff --git a/configs/test.yaml b/configs/test.yaml index c375aeb..e39ab0a 100644 --- a/configs/test.yaml +++ b/configs/test.yaml @@ -39,6 +39,7 @@ augmentation: rounds: 3 background_paths: [./data/backgrounds] rir_paths: [./data/rirs] + # n_workers: 0 # uncomment to use all CPU cores (10–100× faster) # ============================================================================ # Model Architecture diff --git a/docs/augmentation.md b/docs/augmentation.md index bc7a30e..f7b4b48 100644 --- a/docs/augmentation.md +++ b/docs/augmentation.md @@ -125,3 +125,24 @@ output// Only `_rN.wav` files are fed to feature extraction — clean TTS originals are excluded from training since they don't match real microphone audio. Feature extraction is a separate step — see [Feature Extraction](feature-extraction.md). + +## Parallel Execution (`n_workers`) + +The per-clip loop in `_augment_directory` is a pure Python `for` over `soundfile.read`, `scipy.signal.fftconvolve`, and audiomentations transforms. Because of the GIL, adding CPU cores to the process does nothing on its own — each clip is processed sequentially on a single core. On a 32-CPU host, augmenting a 25k-clip dataset this way takes ~3 hours even though the work is embarrassingly parallel. + +`AugmentationConfig.n_workers` opts into a `multiprocessing.Pool` that runs the loop across worker processes. Each worker constructs its own `AudioAugmentor` via the pool's `initializer` callback — the parent's lazy-loaded audiomentations instance is never pickled, which keeps the setup robust even as upstream transforms evolve. 
+ +| Config | 32-CPU throughput | 25k-clip wall-clock | +|---|---|---| +| `n_workers: 1` (default, unchanged) | ~2.3 clips/sec | ~3 h | +| `n_workers: 0` (auto / `os.cpu_count()`) | ~210 clips/sec | ~2 min | + +Semantics: + +- `n_workers: 1` (default) — the legacy single-threaded code path, unchanged. +- `n_workers: 0` — auto, uses `os.cpu_count()`. +- `n_workers: N` (any positive integer) — explicit worker count. + +`mp_context` controls the start method: `"auto"` picks `fork` on Linux/macOS and `spawn` on Windows. Override only if a fork-unsafe audio backend is crashing workers. + +Output file names, round-0 alignment, padding, and RIR / background mixing behave identically to the single-threaded path. Per-worker random state means the *exact* audio content differs run-to-run across paths (different SNR draws, different RIR picks), but the output shape, count, and naming are byte-for-byte the same — which is what the downstream feature extractor depends on. From 0bb3abcce398434414d72c6a00cede01f455aa12 Mon Sep 17 00:00:00 2001 From: Harikrishna Reddy Date: Thu, 23 Apr 2026 11:59:39 -0500 Subject: [PATCH 5/9] config: add FeatureExtractionConfig and EvalConfig with ONNX providers --- src/livekit/wakeword/config.py | 42 ++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/src/livekit/wakeword/config.py b/src/livekit/wakeword/config.py index e79b63b..2129887 100644 --- a/src/livekit/wakeword/config.py +++ b/src/livekit/wakeword/config.py @@ -69,6 +69,42 @@ class AugmentationConfig(BaseModel): is crashing workers.""" +class FeatureExtractionConfig(BaseModel): + """Configuration for the feature-extraction stage (mel + embedding ONNX models).""" + + n_workers: int = 1 + """Number of parallel worker processes for the feature-extraction loop. 
+ 0 = auto (os.cpu_count()), 1 = single-threaded (legacy, default for + backwards compatibility), N = explicit worker count.""" + + mp_context: Literal["auto", "fork", "spawn", "forkserver"] = "auto" + """Multiprocessing start method. "auto" picks 'fork' on Linux/macOS + and 'spawn' on Windows.""" + + execution_providers: list[str] = Field( + default_factory=lambda: ["CPUExecutionProvider"], + ) + """ONNX Runtime execution providers, in priority order. Default preserves + CPU-only behavior. Set to ["CUDAExecutionProvider", "CPUExecutionProvider"] + on a GPU host to offload mel + embedding inference to the GPU (requires + onnxruntime-gpu).""" + + +class EvalConfig(BaseModel): + """Configuration for the eval stage (classifier ONNX inference).""" + + execution_providers: list[str] = Field( + default_factory=lambda: ["CPUExecutionProvider"], + ) + """ONNX Runtime execution providers, in priority order. Default preserves + CPU-only behavior. Set to ["CUDAExecutionProvider", "CPUExecutionProvider"] + on a GPU host (requires onnxruntime-gpu).""" + + batch_size: int = 1 + """Batch size for classifier inference. 
1 is fine on CPU; bump to 64+ + when running on GPU to saturate the device.""" + + class ModelConfig(BaseModel): model_type: ModelType = ModelType.conv_attention model_size: ModelSize = ModelSize.small @@ -151,6 +187,12 @@ class WakeWordConfig(BaseModel): # Augmentation augmentation: AugmentationConfig = Field(default_factory=AugmentationConfig) + # Feature extraction + feature_extraction: FeatureExtractionConfig = Field(default_factory=FeatureExtractionConfig) + + # Evaluation + eval: EvalConfig = Field(default_factory=EvalConfig) + # Model model: ModelConfig = Field(default_factory=ModelConfig) From d54d8a22e3e444049927b0d3f4f9b7bdc3e60a69 Mon Sep 17 00:00:00 2001 From: Harikrishna Reddy Date: Thu, 23 Apr 2026 11:59:52 -0500 Subject: [PATCH 6/9] features: optional multiprocessing.Pool + configurable ONNX providers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Parallelises extract_features_from_directory behind FeatureExtractionConfig.n_workers (default 1 preserves the single-threaded path). Each worker constructs its own mel + embedding ONNX sessions via the Pool initializer — ORT sessions are not pickle-safe, and upstream workflows that loaded sessions in the parent would not survive forking anyway. Workers pin ORT to 1 intra-/inter-op thread. Without this, N workers each spawning M ORT threads on a 32-CPU host thread-explodes and either crashes or thrashes. Uses pool.imap (ordered), not imap_unordered, so per-clip order within each split stays deterministic — downstream classifier training relies on consistent sample ordering. Execution providers are plumbed through MelSpectrogramFrontend and SpeechEmbedding constructors. Default ["CPUExecutionProvider"] preserves current behavior; users can opt into CUDA on GPU boxes. 
--- src/livekit/wakeword/data/features.py | 171 ++++++++++++++++-- .../wakeword/models/feature_extractor.py | 35 +++- 2 files changed, 183 insertions(+), 23 deletions(-) diff --git a/src/livekit/wakeword/data/features.py b/src/livekit/wakeword/data/features.py index 6588f56..a4acede 100644 --- a/src/livekit/wakeword/data/features.py +++ b/src/livekit/wakeword/data/features.py @@ -3,6 +3,9 @@ from __future__ import annotations import logging +import os +import sys +from multiprocessing import get_context from pathlib import Path import numpy as np @@ -28,39 +31,166 @@ def _pad_or_truncate(embeddings: np.ndarray) -> np.ndarray: return np.concatenate([pad, embeddings], axis=0) +def _extract_one( + wav_path: Path, + mel_frontend: MelSpectrogramFrontend, + speech_embedding: SpeechEmbedding, +) -> np.ndarray: + """Read one WAV and return its (16, 96) feature tensor.""" + import soundfile as sf + + audio, _sr = sf.read(str(wav_path)) + if audio.ndim > 1: + audio = audio[:, 0] + audio = audio.astype(np.float32) + + mel = mel_frontend(audio) + embeddings = speech_embedding.extract_embeddings(mel) + return _pad_or_truncate(embeddings[0]) + + +# --- Multiprocessing support ------------------------------------------------- +# +# ONNX Runtime inference sessions are not pickle-friendly, so each worker +# constructs its own mel + embedding sessions via the pool's ``initializer``. +# +# When running under a Pool we pin each session to a single intra-/inter-op +# thread — otherwise N workers × M ORT threads thread-explode and either +# crash or thrash. The single-threaded (n_workers=1) path keeps ORT's default +# thread pool so existing behavior is unchanged. 
+ +_WORKER_MEL: MelSpectrogramFrontend | None = None +_WORKER_EMB: SpeechEmbedding | None = None + + +def _init_feature_worker( + mel_path: str, + embedding_path: str, + execution_providers: list[str], +) -> None: + """Per-worker initialiser: build mel + embedding ONNX sessions once.""" + global _WORKER_MEL, _WORKER_EMB + import onnxruntime as ort + + sess_opts = ort.SessionOptions() + # Single-threaded per worker — see module docstring above. + sess_opts.intra_op_num_threads = 1 + sess_opts.inter_op_num_threads = 1 + + _WORKER_MEL = MelSpectrogramFrontend( + onnx_path=mel_path, + execution_providers=execution_providers, + session_options=sess_opts, + ) + _WORKER_EMB = SpeechEmbedding( + onnx_path=embedding_path, + execution_providers=execution_providers, + session_options=sess_opts, + ) + + +def _feature_worker_task(wav_path: Path) -> np.ndarray: + assert _WORKER_MEL is not None and _WORKER_EMB is not None, "worker not initialised" + return _extract_one(wav_path, _WORKER_MEL, _WORKER_EMB) + + +def _pick_context(user_choice: str): + if user_choice != "auto": + return get_context(user_choice) + return get_context("spawn" if sys.platform == "win32" else "fork") + + +def _parallel_extract_features_from_directory( + wav_files: list[Path], + mel_path: str, + embedding_path: str, + execution_providers: list[str], + n_workers: int, + mp_context: str, + desc: str, +) -> list[np.ndarray]: + from tqdm import tqdm + + if n_workers == 0: + n_workers = os.cpu_count() or 1 + n_workers = max(1, min(n_workers, len(wav_files))) + + ctx = _pick_context(mp_context) + chunksize = max(1, len(wav_files) // (n_workers * 16)) + + all_features: list[np.ndarray] = [] + with ctx.Pool( + processes=n_workers, + initializer=_init_feature_worker, + initargs=(mel_path, embedding_path, execution_providers), + ) as pool: + # Ordered map (imap, not imap_unordered): downstream training expects + # deterministic per-clip order within a split. 
+ for feat in tqdm( + pool.imap(_feature_worker_task, wav_files, chunksize=chunksize), + total=len(wav_files), + desc=desc, + unit="clip", + ): + all_features.append(feat) + return all_features + + def extract_features_from_directory( clip_dir: Path, mel_frontend: MelSpectrogramFrontend, speech_embedding: SpeechEmbedding, + n_workers: int = 1, + mp_context: str = "auto", + mel_path: str | Path | None = None, + embedding_path: str | Path | None = None, + execution_providers: list[str] | None = None, ) -> np.ndarray: """Extract (N_clips, 16, 96) features from a directory of WAV files. Processes clips through MelSpectrogramFrontend → SpeechEmbedding, then takes last 16 embedding timesteps per clip. + + When ``n_workers != 1`` the per-clip loop runs under a + ``multiprocessing.Pool``; each worker constructs its own mel + embedding + ONNX session via the pool's initializer (ORT sessions aren't pickle-safe). + ``mel_path`` / ``embedding_path`` / ``execution_providers`` are required in + that case so workers can rebuild the models. The single-threaded path + keeps the existing behavior — the ``mel_frontend`` / ``speech_embedding`` + arguments are used directly. 
""" import re - import soundfile as sf from tqdm import tqdm - # Only process augmented clips (_rN.wav), skip clean TTS originals _aug_re = re.compile(r"^clip_\d{6}_r\d+\.wav$") wav_files = sorted(p for p in clip_dir.glob("*.wav") if _aug_re.match(p.name)) if not wav_files: logger.warning(f"No WAV files in {clip_dir}") return np.zeros((0, N_EMBEDDING_TIMESTEPS, 96), dtype=np.float32) - all_features: list[np.ndarray] = [] - - for wav_path in tqdm(wav_files, desc=f"Features {clip_dir.name}", unit="clip"): - audio, sr = sf.read(str(wav_path)) - if audio.ndim > 1: - audio = audio[:, 0] - audio = audio.astype(np.float32) - - mel = mel_frontend(audio) - embeddings = speech_embedding.extract_embeddings(mel) - all_features.append(_pad_or_truncate(embeddings[0])) + desc = f"Features {clip_dir.name}" + + if n_workers != 1: + if mel_path is None or embedding_path is None: + raise ValueError( + "Parallel feature extraction requires mel_path and embedding_path " + "(workers cannot pickle an ONNX InferenceSession and must re-open " + "the model files)." 
+ ) + all_features = _parallel_extract_features_from_directory( + wav_files=wav_files, + mel_path=str(mel_path), + embedding_path=str(embedding_path), + execution_providers=execution_providers or ["CPUExecutionProvider"], + n_workers=n_workers, + mp_context=mp_context, + desc=desc, + ) + else: + all_features = [] + for wav_path in tqdm(wav_files, desc=desc, unit="clip"): + all_features.append(_extract_one(wav_path, mel_frontend, speech_embedding)) if not all_features: return np.zeros((0, N_EMBEDDING_TIMESTEPS, 96), dtype=np.float32) @@ -70,11 +200,17 @@ def extract_features_from_directory( def run_extraction(config: WakeWordConfig) -> None: """Extract and save features for all splits of a wake word config.""" + mel_path = get_mel_model_path() + embedding_path = get_embedding_model_path() + providers = config.feature_extraction.execution_providers + mel_frontend = MelSpectrogramFrontend( - onnx_path=get_mel_model_path(), + onnx_path=mel_path, + execution_providers=providers, ) speech_embedding = SpeechEmbedding( - onnx_path=get_embedding_model_path(), + onnx_path=embedding_path, + execution_providers=providers, ) model_dir = config.model_output_dir @@ -98,6 +234,11 @@ def run_extraction(config: WakeWordConfig) -> None: clip_dir=clip_dir, mel_frontend=mel_frontend, speech_embedding=speech_embedding, + n_workers=config.feature_extraction.n_workers, + mp_context=config.feature_extraction.mp_context, + mel_path=mel_path, + embedding_path=embedding_path, + execution_providers=providers, ) out_path = model_dir / feature_filename diff --git a/src/livekit/wakeword/models/feature_extractor.py b/src/livekit/wakeword/models/feature_extractor.py index b364106..36d29a0 100644 --- a/src/livekit/wakeword/models/feature_extractor.py +++ b/src/livekit/wakeword/models/feature_extractor.py @@ -28,23 +28,35 @@ class MelSpectrogramFrontend: Output: (batch, time_frames, 32) """ - def __init__(self, onnx_path: str | Path): + def __init__( + self, + onnx_path: str | Path, + 
execution_providers: list[str] | None = None, + session_options: object | None = None, + ): if not Path(onnx_path).exists(): raise FileNotFoundError( f"Mel ONNX model not found: {onnx_path}\n" "This should not happen - please reinstall livekit-wakeword." ) - self._init_onnx(onnx_path) + self._init_onnx(onnx_path, execution_providers, session_options) - def _init_onnx(self, onnx_path: str | Path) -> None: + def _init_onnx( + self, + onnx_path: str | Path, + execution_providers: list[str] | None, + session_options: object | None, + ) -> None: import onnxruntime as ort + providers = execution_providers or ["CPUExecutionProvider"] self._onnx_session = ort.InferenceSession( str(onnx_path), - providers=["CPUExecutionProvider"], + sess_options=session_options, + providers=providers, ) self._input_name = self._onnx_session.get_inputs()[0].name - logger.info(f"Loaded mel ONNX model from {onnx_path}") + logger.info(f"Loaded mel ONNX model from {onnx_path} (providers={providers})") def __call__(self, audio: np.ndarray) -> np.ndarray: """Compute mel spectrogram features. @@ -89,7 +101,12 @@ class SpeechEmbedding: ONNX output: (batch, 1, 1, 96) — 96-dim embedding """ - def __init__(self, onnx_path: str | Path): + def __init__( + self, + onnx_path: str | Path, + execution_providers: list[str] | None = None, + session_options: object | None = None, + ): import onnxruntime as ort if not Path(onnx_path).exists(): @@ -98,12 +115,14 @@ def __init__(self, onnx_path: str | Path): "This should not happen - please reinstall livekit-wakeword." 
) + providers = execution_providers or ["CPUExecutionProvider"] self._session = ort.InferenceSession( str(onnx_path), - providers=["CPUExecutionProvider"], + sess_options=session_options, + providers=providers, ) self._input_name = self._session.get_inputs()[0].name - logger.info(f"Loaded embedding ONNX model from {onnx_path}") + logger.info(f"Loaded embedding ONNX model from {onnx_path} (providers={providers})") def __call__(self, mel_windows: np.ndarray) -> np.ndarray: """Compute embeddings for mel spectrogram windows. From c0ad2e902d4a0bbec6f8c71e66432eb817334133 Mon Sep 17 00:00:00 2001 From: Harikrishna Reddy Date: Thu, 23 Apr 2026 12:00:00 -0500 Subject: [PATCH 7/9] eval: configurable ONNX providers and batch_size Replaces the hardcoded providers=["CPUExecutionProvider"] at evaluate.py:197 with config.eval.execution_providers, and plumbs config.eval.batch_size through to _predict_onnx. Default behavior unchanged (CPU-only, batch_size=1); users on GPU hosts opt in via the new EvalConfig fields. 
--- src/livekit/wakeword/eval/evaluate.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/livekit/wakeword/eval/evaluate.py b/src/livekit/wakeword/eval/evaluate.py index dc64386..7f20d48 100644 --- a/src/livekit/wakeword/eval/evaluate.py +++ b/src/livekit/wakeword/eval/evaluate.py @@ -194,16 +194,18 @@ def run_eval(config: WakeWordConfig, model_path: str | Path) -> dict[str, float] if not model_path.exists(): raise FileNotFoundError(f"Model not found: {model_path}") - session = ort.InferenceSession(str(model_path), providers=["CPUExecutionProvider"]) - logger.info(f"Loaded model from {model_path}") + providers = config.eval.execution_providers + session = ort.InferenceSession(str(model_path), providers=providers) + logger.info(f"Loaded model from {model_path} (providers={providers})") # Load validation data pos_features, neg_features = _load_validation_features(config) # Run predictions logger.info("Running predictions on validation set...") - pos_scores = _predict_onnx(session, pos_features) - neg_scores = _predict_onnx(session, neg_features) + batch_size = config.eval.batch_size + pos_scores = _predict_onnx(session, pos_features, batch_size=batch_size) + neg_scores = _predict_onnx(session, neg_features, batch_size=batch_size) # Compute DET curve thresholds, fpr, fnr = _compute_det_curve(pos_scores, neg_scores) From b5d68a0ce19adc855389e448348a7a3df61e620b Mon Sep 17 00:00:00 2001 From: Harikrishna Reddy Date: Thu, 23 Apr 2026 12:00:05 -0500 Subject: [PATCH 8/9] tests: round-trip parity for parallel feature extraction --- tests/test_features_parallel.py | 126 ++++++++++++++++++++++++++++++++ 1 file changed, 126 insertions(+) create mode 100644 tests/test_features_parallel.py diff --git a/tests/test_features_parallel.py b/tests/test_features_parallel.py new file mode 100644 index 0000000..21a2c88 --- /dev/null +++ b/tests/test_features_parallel.py @@ -0,0 +1,126 @@ +"""Parity tests for the parallel (multiprocessing) 
feature-extraction path. + +Round-trips a small synthetic dataset of ``_rN.wav`` clips through both the +single-threaded and the ``multiprocessing.Pool`` code paths in +``extract_features_from_directory`` and asserts that the output tensors +match in shape, dtype, and per-clip order — the contract downstream +training actually depends on. + +Feature values are also asserted to be numerically close: ONNX runtime is +deterministic for a fixed thread count, and the parallel path pins each +worker to 1 intra-/inter-op thread, so outputs should be ``np.allclose`` +within a tiny tolerance. +""" + +from __future__ import annotations + +from pathlib import Path + +import numpy as np +import soundfile as sf + +from livekit.wakeword.data.features import ( + N_EMBEDDING_TIMESTEPS, + extract_features_from_directory, +) +from livekit.wakeword.models.feature_extractor import ( + MelSpectrogramFrontend, + SpeechEmbedding, +) +from livekit.wakeword.resources import get_embedding_model_path, get_mel_model_path + + +SAMPLE_RATE = 16000 +N_CLIPS = 12 + + +def _make_synthetic_r0_clips(clip_dir: Path, n: int, duration_s: float = 2.0) -> None: + """Write ``n`` synthetic ``clip_NNNNNN_r0.wav`` files (the shape the extractor looks for).""" + clip_dir.mkdir(parents=True, exist_ok=True) + length = int(duration_s * SAMPLE_RATE) + rng = np.random.default_rng(7) + for i in range(n): + audio = rng.standard_normal(length).astype(np.float32) * 0.1 + sf.write(str(clip_dir / f"clip_{i:06d}_r0.wav"), audio, SAMPLE_RATE) + + +def test_parallel_matches_singlethreaded(tmp_path: Path) -> None: + """Serial and parallel extraction produce equivalent (N, 16, 96) tensors.""" + clip_dir = tmp_path / "feats" + _make_synthetic_r0_clips(clip_dir, N_CLIPS) + + mel_path = get_mel_model_path() + emb_path = get_embedding_model_path() + + mel_frontend = MelSpectrogramFrontend(onnx_path=mel_path) + speech_embedding = SpeechEmbedding(onnx_path=emb_path) + + serial = extract_features_from_directory( + 
clip_dir=clip_dir, + mel_frontend=mel_frontend, + speech_embedding=speech_embedding, + n_workers=1, + ) + + parallel = extract_features_from_directory( + clip_dir=clip_dir, + mel_frontend=mel_frontend, + speech_embedding=speech_embedding, + n_workers=3, + mp_context="auto", + mel_path=mel_path, + embedding_path=emb_path, + execution_providers=["CPUExecutionProvider"], + ) + + # Shape + dtype contract. + assert serial.shape == (N_CLIPS, N_EMBEDDING_TIMESTEPS, 96) + assert parallel.shape == serial.shape + assert parallel.dtype == serial.dtype + + # Per-clip ordering + numerical equivalence. ORT with intra_op=inter_op=1 + # is deterministic, so values should match to within tight float tolerance. + np.testing.assert_allclose(parallel, serial, rtol=1e-4, atol=1e-5) + + +def test_n_workers_auto(tmp_path: Path) -> None: + """``n_workers=0`` (auto) runs to completion and returns the right shape.""" + clip_dir = tmp_path / "auto_feats" + _make_synthetic_r0_clips(clip_dir, N_CLIPS) + + mel_path = get_mel_model_path() + emb_path = get_embedding_model_path() + + mel_frontend = MelSpectrogramFrontend(onnx_path=mel_path) + speech_embedding = SpeechEmbedding(onnx_path=emb_path) + + features = extract_features_from_directory( + clip_dir=clip_dir, + mel_frontend=mel_frontend, + speech_embedding=speech_embedding, + n_workers=0, + mp_context="auto", + mel_path=mel_path, + embedding_path=emb_path, + execution_providers=["CPUExecutionProvider"], + ) + assert features.shape == (N_CLIPS, N_EMBEDDING_TIMESTEPS, 96) + + +def test_parallel_requires_model_paths(tmp_path: Path) -> None: + """Parallel mode errors clearly when model paths aren't supplied.""" + import pytest + + clip_dir = tmp_path / "bad_feats" + _make_synthetic_r0_clips(clip_dir, 3) + + mel_frontend = MelSpectrogramFrontend(onnx_path=get_mel_model_path()) + speech_embedding = SpeechEmbedding(onnx_path=get_embedding_model_path()) + + with pytest.raises(ValueError, match="mel_path"): + extract_features_from_directory( + 
clip_dir=clip_dir, + mel_frontend=mel_frontend, + speech_embedding=speech_embedding, + n_workers=2, + ) From 9b6ad4692e957b5ef2e503c631d6e1296759a843 Mon Sep 17 00:00:00 2001 From: Harikrishna Reddy Date: Thu, 23 Apr 2026 12:00:10 -0500 Subject: [PATCH 9/9] docs: document n_workers + execution_providers across pipeline --- configs/prod.yaml | 26 ++++++++++++++++++++++++++ configs/test.yaml | 8 ++++++++ docs/augmentation.md | 14 +++++++++++--- docs/evaluation.md | 12 ++++++++++++ docs/feature-extraction.md | 18 ++++++++++++++++++ 5 files changed, 75 insertions(+), 3 deletions(-) diff --git a/configs/prod.yaml b/configs/prod.yaml index 443caf8..3ab331b 100644 --- a/configs/prod.yaml +++ b/configs/prod.yaml @@ -106,6 +106,32 @@ augmentation: # speedup for the augmentation stage. # n_workers: 0 # uncomment to use all CPU cores (10–100× faster) +# ============================================================================ +# Feature Extraction (all fields optional — defaults to single-threaded CPU) +# ============================================================================ +# +# Parallelism for the mel + embedding ONNX loop. Default 1 preserves the +# legacy single-threaded behavior; 0 = os.cpu_count(); N = explicit count. +# Workers pin ORT to 1 intra/inter-op thread to avoid thread explosion. +# +# ONNX Runtime providers: default is CPU-only. On a GPU host with +# onnxruntime-gpu installed, use CUDAExecutionProvider. +# +# feature_extraction: +# n_workers: 0 +# execution_providers: ["CUDAExecutionProvider", "CPUExecutionProvider"] + +# ============================================================================ +# Evaluation (all fields optional — defaults to single-threaded CPU) +# ============================================================================ +# +# Same provider story as feature_extraction. batch_size default 1 is fine on +# CPU; bump to 64+ on GPU to saturate the device. 
+# +# eval: +# execution_providers: ["CUDAExecutionProvider", "CPUExecutionProvider"] +# batch_size: 64 + # ============================================================================ # Model Architecture # ============================================================================ diff --git a/configs/test.yaml b/configs/test.yaml index e39ab0a..6f0b2f3 100644 --- a/configs/test.yaml +++ b/configs/test.yaml @@ -41,6 +41,14 @@ augmentation: rir_paths: [./data/rirs] # n_workers: 0 # uncomment to use all CPU cores (10–100× faster) +# feature_extraction: +# n_workers: 0 +# execution_providers: ["CUDAExecutionProvider", "CPUExecutionProvider"] +# +# eval: +# execution_providers: ["CUDAExecutionProvider", "CPUExecutionProvider"] +# batch_size: 64 + # ============================================================================ # Model Architecture # ============================================================================ diff --git a/docs/augmentation.md b/docs/augmentation.md index f7b4b48..704744e 100644 --- a/docs/augmentation.md +++ b/docs/augmentation.md @@ -132,10 +132,18 @@ The per-clip loop in `_augment_directory` is a pure Python `for` over `soundfile `AugmentationConfig.n_workers` opts into a `multiprocessing.Pool` that runs the loop across worker processes. Each worker constructs its own `AudioAugmentor` via the pool's `initializer` callback — the parent's lazy-loaded audiomentations instance is never pickled, which keeps the setup robust even as upstream transforms evolve. 
-| Config | 32-CPU throughput | 25k-clip wall-clock |
+Measured on a 32-CPU Modal container augmenting a 60k-clip dataset (25k positive_train + 5k positive_test + 25k negative_train + ~5k negative_test + ~2.5k backgrounds) end-to-end in **~7.5 minutes**:
+
+| Split | Throughput | Wall-clock |
 |---|---|---|
-| `n_workers: 1` (default, unchanged) | ~2.3 clips/sec | ~3 h |
-| `n_workers: 0` (auto / `os.cpu_count()`) | ~210 clips/sec | ~2 min |
+| `positive_train` (25k) | 178 clips/sec | 2:20 |
+| `positive_test` (5k) | 174 clips/sec | 0:28 |
+| `negative_train` (25k) | 130 clips/sec | 3:12 |
+| `negative_test` (~5k) | 91 clips/sec | 0:53 |
+| `background_train` (2k) | 83 clips/sec | 0:24 |
+| `background_test` (500) | 62 clips/sec | 0:08 |
+
+For reference, the single-threaded path on the same host processes ~2.3 clips/sec, so the full 60k dataset would otherwise take ~7 hours.
 
 Semantics:
diff --git a/docs/evaluation.md b/docs/evaluation.md
index bbfa540..8a64cbd 100644
--- a/docs/evaluation.md
+++ b/docs/evaluation.md
@@ -154,3 +154,15 @@ uv run livekit-wakeword eval configs/hey_livekit.yaml -m models/hey_livekit_oww.
 ```
 
 This works because both livekit-wakeword and openWakeWord share the same frozen embedding front-end, producing identical `(16, 96)` feature matrices.
+
+## ONNX Execution Providers
+
+`EvalConfig.execution_providers` controls the ONNX Runtime providers used for the classifier inference session. Default `["CPUExecutionProvider"]` preserves existing behavior. On a GPU host with `onnxruntime-gpu` installed:
+
+```yaml
+eval:
+  execution_providers: ["CUDAExecutionProvider", "CPUExecutionProvider"]
+  batch_size: 64 # default 1 is fine on CPU; bump on GPU
+```
+
+On CPU, the classifier is small enough that `batch_size: 1` is rarely the bottleneck; on GPU the per-launch overhead dominates, so batching is required to see any speedup.
diff --git a/docs/feature-extraction.md b/docs/feature-extraction.md index 33c7d5b..6cb19bd 100644 --- a/docs/feature-extraction.md +++ b/docs/feature-extraction.md @@ -206,6 +206,24 @@ Only augmented clips (`clip_NNNNNN_rN.wav`) are processed — clean TTS original Audio files are read via `soundfile`, converted to float32, reduced to mono if stereo, and processed one clip at a time. +### Parallel Execution (`feature_extraction.n_workers`) + +The per-clip mel + embedding loop in `extract_features_from_directory` is a pure Python `for` over two ONNX sessions. Under the GIL it pins to a single core; on a 32-CPU L40S container we measured ~3.5 clips/sec, which is ~4 h wall-clock for a 55k-clip dataset before training even starts. + +`FeatureExtractionConfig.n_workers` opts into a `multiprocessing.Pool` that spreads the loop across worker processes. Each worker builds its own mel + embedding ONNX sessions via the pool's `initializer` (ORT sessions are not pickle-safe) and pins each session to a single intra-/inter-op thread — otherwise `n_workers` × N ORT threads thread-explode on multi-core hosts. + +The pool uses `pool.imap` (ordered, not `imap_unordered`) so the per-clip order within a split is preserved — the downstream classifier training relies on consistent sample ordering. + +```yaml +feature_extraction: + n_workers: 0 # 0 = os.cpu_count(); 1 = single-threaded (default) + execution_providers: ["CUDAExecutionProvider", "CPUExecutionProvider"] +``` + +### ONNX Execution Providers + +`FeatureExtractionConfig.execution_providers` is plumbed through `MelSpectrogramFrontend.__init__` and `SpeechEmbedding.__init__`. The default `["CPUExecutionProvider"]` keeps existing behavior; on a GPU host with `onnxruntime-gpu` installed, setting `["CUDAExecutionProvider", "CPUExecutionProvider"]` offloads mel + embedding inference to the GPU with CPU as a fallback. + ## Memory-Mapped Dataset **Source:** `src/livekit/wakeword/data/dataset.py`